In [27]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, r2_score
from scipy.stats import spearmanr


def print_metrics(y_true, y_pred):
    
    print("================================================")
    print("Spearman's correlation coef: " + str(spearmanr(y_true, y_pred)[0]))
    print("================================================")
    
    print("-----------")
    print("R^2 = " + str(r2_score(y_true, y_pred)))
    print("R = " + str(np.sqrt(r2_score(y_true, y_pred))))
    print("-----------")
    
  

In [29]:
from scipy.stats import spearmanr
from sklearn.model_selection import KFold
import numpy as np


def grid_search_cv_for_ensembles(model, max_depth_values, n_estimators_values, X, y, scoring_function, k=5, verbose=0):
   
    
    best_score = 0.0
    best_n_estimators = 1
    best_max_depth = 1
    
    for max_depth in max_depth_values: 
        for n_estimators in n_estimators_values:
            
            kf = KFold(n_splits=k, random_state=None, shuffle=True)

            fold = 1
            scores = []
            for train_index, test_index in kf.split(X):

                # get train and test set for the i-th fold
                X_train, X_test = X.loc[train_index], X.loc[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # train and predict
                model.set_hyperparams(max_depth, n_estimators)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                scores.append(scoring_function(y_test, y_pred))

                fold += 1
              
            score = np.mean(scores)
            
            if verbose > 0:
                print("score=" + str(score) + " | max_depth=" + str(max_depth) + " n_estimators=" + str(n_estimators))

            if score > best_score:
                best_score = score
                best_n_estimators = n_estimators
                best_max_depth = max_depth

    return best_max_depth, best_n_estimators


def find_best_C(model, c_values, X, y, scoring_function, k=5, verbose=0):
    
    best_score = 0.0
    best_c = 1.0
    
    for c in c_values: 
            
        kf = KFold(n_splits=k, random_state=None, shuffle=True)

        fold = 1
        scores = []
        for train_index, test_index in kf.split(X):

            # get train and test set for the i-th fold
            X_train, X_test = X.loc[train_index], X.loc[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # train and predict
            model.set_hyperparams('linear', c)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            scores.append(scoring_function(y_test, y_pred))

            fold += 1

        score = np.mean(scores)

        if verbose > 0:
            print("score=" + str(score) + " | C=" + str(c))

        if score > best_score:
            best_score = score
            best_c = c

    return best_c

In [31]:
import numpy as np


def discretize(y_pred):
    
    
    for i in range(len(y_pred)):    
        if y_pred[i] < 0.5:
            y_pred[i] = 0.0
        elif y_pred[i] < 1.5:
            y_pred[i] = 1.0
        elif y_pred[i] < 2.5:
            y_pred[i] = 2.0
        elif y_pred[i] < 3.5:
            y_pred[i] = 3.0
        else:
            y_pred[i] = 4.0
            
    return y_pred

In [68]:
from sklearn.ensemble import RandomForestRegressor
import pickle





class RandomForest():
  
    
    def __init__(self, max_depth=20, n_estimators=100, save_model=False, use_saved_model=False, model_path='rf.pickle'):
        self.model_path = model_path
        self.save_model = save_model
        
        if use_saved_model:
            with open(self.model_path, 'rb') as file:
                self.model = pickle.load(file)
        else:
            self.model = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators)    
    
    
    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)
        
        if self.save_model:
            with open(self.model_path, 'wb') as handle:
                pickle.dump(self.model, handle)
    
    
    def predict(self, X_test):
        return discretize(self.model.predict(X_test))
    
    
    def set_hyperparams(self, max_depth, n_estimators):
        
        self.model = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators)  

In [74]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout


import numpy as np
import tensorflow as tf



class MultilayerPerceptron():
  
    
    def __init__(self, input_dim=None, verbose=0, save_model=False, use_saved_model=False, model_path='mlp.h5'):
        self.model_path = model_path
        self.save_model = save_model
        
        self.input_dim = input_dim
        self.verbose = verbose

        
        if use_saved_model:
            self.model = load_model(model_path)
    
    
    def fit(self, X_train, y_train):
        self.model = self._make_model()
        
        y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=5)
        
        self.model.fit(
            X_train, y_train_cat,
            epochs=150,
            batch_size=64,
            verbose=self.verbose)
        
        if self.save_model:
            self.model.save(self.model_path)
    
    
    def predict(self, X_test): 
        y_pred_cat = self.model.predict(X_test)
        y_pred = np.argmax(y_pred_cat, axis=1)
        
        return discretize(y_pred)
    
    
    def _make_model(self):
        
        # architecture
        model = Sequential()
        model.add(Dense(64, activation='relu', input_dim=self.input_dim))
        model.add(Dropout(0.2))
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(5, activation='linear'))
        
        # opitimizer
        adam = tf.keras.optimizers.Adam(lr=0.001)
        
        model.compile(optimizer=adam,
              loss='mse',
              metrics=['mse'])
        
        return model

In [73]:
from xgboost import XGBRegressor
import pickle




class XGBoost():
    
    
    def __init__(self, max_depth=30, n_estimators=200, save_model=False, use_saved_model=False, model_path='xgboost.pickle'):
        self.model_path = model_path
        self.save_model = save_model
        
        if use_saved_model:
            with open(self.model_path, 'rb') as file:
                self.model = pickle.load(file)
        else:
            self.model = xgboost = XGBRegressor(max_depth=max_depth, n_estimators=n_estimators, objective="reg:squarederror")  
    
    
    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)
        
        if self.save_model:
            with open(self.model_path, 'wb') as handle:
                pickle.dump(self.model, handle)
    
    
    def predict(self, X_test):
        return discretize(self.model.predict(X_test))
    
    
    def set_hyperparams(self, max_depth, n_estimators):
       
        
        self.model = XGBRegressor(max_depth=max_depth, n_estimators=n_estimators, objective="reg:squarederror")  

In [72]:
from sklearn.svm import SVR
import pickle




class SupportVectorMachine():
 
    
    def __init__(self, kernel='linear', C=10.0, save_model=False, use_saved_model=False, model_path='svm.pickle'):
        self.model_path = model_path
        self.save_model = save_model
        
        if use_saved_model:
            with open(self.model_path, 'rb') as file:
                self.model = pickle.load(file)
        else:
            self.model = SVR(kernel=kernel, C=C)
    
    
    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)
        
        if self.save_model:
            with open(self.model_path, 'wb') as handle:
                pickle.dump(self.model, handle)
    
    
    def predict(self, X_test):
        return discretize(self.model.predict(X_test))
    
    
    def set_hyperparams(self, kernel, c):
       
        
        self.model = SVR(kernel=kernel, C=c)

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr

X_train = pd.read_csv("weebit_train_with_features.csv", index_col=0)
X_test = pd.read_csv("weebit_test_with_features.csv", index_col=0)

# get Y
y_train = X_train["Level"]
y_test = X_test["Level"]

# remove Y and Text columns 
X_train.drop(columns=['Text', 'Level'], inplace=True)
X_test.drop(columns=['Text', 'Level'], inplace=True)

# whole set; used in cross-validation
X = pd.concat([X_train, X_test]).reset_index(drop=True)
y = pd.concat([y_train, y_test]).reset_index(drop=True)

In [43]:
scoring_function = lambda y_true, y_pred: spearmanr(y_true, y_pred)[0]
max_depth_values = [5, 10, 15, 20]
n_estimators_values = [10, 50, 100]

max_depth, n_estimators = grid_search_cv_for_ensembles(RandomForest(), max_depth_values, n_estimators_values, X_train, y_train, scoring_function, k=3, verbose=1)

print()
print("Mejores hiperparámetros son: max_depth=" + str(max_depth) + " n_estimators=" + str(n_estimators))

score=0.6727937210959603 | max_depth=5 n_estimators=10
score=0.7010830544173632 | max_depth=5 n_estimators=50
score=0.6871396643175932 | max_depth=5 n_estimators=100
score=0.7144643305955798 | max_depth=10 n_estimators=10
score=0.7483417235339506 | max_depth=10 n_estimators=50
score=0.7551947294969076 | max_depth=10 n_estimators=100
score=0.7163050154274743 | max_depth=15 n_estimators=10
score=0.7466707942146229 | max_depth=15 n_estimators=50
score=0.7561007116148158 | max_depth=15 n_estimators=100
score=0.7000136596460722 | max_depth=20 n_estimators=10
score=0.7436165639474487 | max_depth=20 n_estimators=50
score=0.761314858219543 | max_depth=20 n_estimators=100

Mejores hiperparámetros son: max_depth=20 n_estimators=100


In [70]:
rf = RandomForest(max_depth=max_depth, n_estimators=n_estimators, save_model=True)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print_metrics(y_test, y_pred)


Spearman's correlation coef: 0.6631028976774571
-----------
R^2 = 0.43731577492319496
R = 0.6612985520347031
-----------


In [50]:

max_depth_values = [5, 10, 15, 20, 30]
n_estimators_values = [10, 50, 100, 200]


max_depth, n_estimators = grid_search_cv_for_ensembles(XGBoost(), max_depth_values, n_estimators_values, X_train, y_train, scoring_function, k=3, verbose=1)

print()
print("Mejores hiperparámetros son: max_depth=" + str(max_depth) + " n_estimators=" + str(n_estimators))

score=0.7217380217276164 | max_depth=5 n_estimators=10
score=0.7511812564494326 | max_depth=5 n_estimators=50
score=0.7428138700219561 | max_depth=5 n_estimators=100
score=0.7526284555324304 | max_depth=5 n_estimators=200
score=0.6975442755384388 | max_depth=10 n_estimators=10
score=0.6952075050462417 | max_depth=10 n_estimators=50
score=0.7123745459327208 | max_depth=10 n_estimators=100
score=0.7030752030529602 | max_depth=10 n_estimators=200
score=0.6944836602959455 | max_depth=15 n_estimators=10
score=0.6804872131434797 | max_depth=15 n_estimators=50
score=0.69516659500275 | max_depth=15 n_estimators=100
score=0.7111820103464739 | max_depth=15 n_estimators=200
score=0.6935425335460758 | max_depth=20 n_estimators=10
score=0.6936676766614323 | max_depth=20 n_estimators=50
score=0.6978170748606759 | max_depth=20 n_estimators=100
score=0.6856781404788693 | max_depth=20 n_estimators=200
score=0.6800584971530986 | max_depth=30 n_estimators=10
score=0.6927421973892133 | max_depth=30 n_esti

In [75]:
xgboost = XGBoost(save_model=True)

xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)
print_metrics(y_test, y_pred)


Spearman's correlation coef: 0.6874196870544353
-----------
R^2 = 0.455032532639719
R = 0.6745609925275245
-----------


In [55]:
c_values = [1.0, 2.0, 5.0, 10.0, 20.0]

best_c = find_best_C(SupportVectorMachine(), c_values, X_train, y_train, scoring_function, k=3, verbose=1)

print()
print("El mejor C es " + str(best_c))

score=0.7136750274997627 | C=1.0
score=0.7269684259553048 | C=2.0
score=0.7301882882433711 | C=5.0
score=0.7293531806037441 | C=10.0
score=0.733927492529622 | C=20.0

El mejor C es 20.0


In [76]:

svm = SupportVectorMachine(C=best_c, save_model=True)

svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print_metrics(y_test, y_pred)


Spearman's correlation coef: 0.7112324607545297
-----------
R^2 = 0.48479668560347955
R = 0.6962734273282871
-----------


In [77]:

mlp = MultilayerPerceptron(input_dim=X_train.shape[1], save_model=True, verbose=0)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
print_metrics(y_test, y_pred)


  super(Adam, self).__init__(name, **kwargs)


Spearman's correlation coef: 0.713240039046694
-----------
R^2 = 0.43448109368855103
R = 0.6591517986689797
-----------
