In [1]:
import numpy as np


def bootstrap_significance_testing(y_true, y_predA, y_predB, metric, n=int(1e5)):
    """Estimate a one-sided paired-bootstrap p-value for "system A beats system B".

    The observed gain is ``metric(y_true, y_predA) - metric(y_true, y_predB)``.
    The data is then resampled with replacement ``n`` times (same indices for
    the labels and both prediction sets) and the function counts how often the
    resampled gain exceeds *twice* the observed gain.

    Args:
        y_true: Ground-truth values (any 1-D sequence, ndarray or pandas Series).
        y_predA: Predictions/scores of system A, aligned with ``y_true``.
        y_predB: Predictions/scores of system B, aligned with ``y_true``.
        metric: Callable ``metric(y_true, y_pred) -> float``; larger is better.
        n: Number of bootstrap resamples.

    Returns:
        Fraction of resamples whose gain exceeded twice the observed gain.
    """
    # Bootstrap indices are positions. Indexing a pandas Series with them would
    # be a label lookup (wrong or KeyError on a non-default index), so work on
    # plain arrays instead.
    y_true = np.asarray(y_true)
    y_predA = np.asarray(y_predA)
    y_predB = np.asarray(y_predB)

    observed_gain = metric(y_true, y_predA) - metric(y_true, y_predB)
    threshold = 2 * observed_gain

    n_samples = len(y_true)
    exceed_count = 0
    for _ in range(n):
        # Sample n_samples positions with replacement.
        idx = np.random.choice(n_samples, n_samples)

        resampled_true = y_true[idx]
        resampled_gain = metric(resampled_true, y_predA[idx]) - metric(resampled_true, y_predB[idx])

        if resampled_gain > threshold:
            exceed_count += 1

    return exceed_count / n

In [3]:
import sys
from scipy.stats import spearmanr

def metric(predA, predB):
    """Return the absolute value of Spearman's rank correlation of the two inputs."""
    rho = spearmanr(predA, predB)[0]
    return abs(rho)


# Number of bootstrap resamples passed to the significance tests below.
n = 10_000

def flesch(df):
    """Add a "Flesch" (reading-ease) column to ``df`` and return the same frame.

    Reads the "Avg_words_per_sentence" and "Avg_syllables_per_word" columns.
    The frame is modified in place.
    """
    sentence_length_term = 1.015 * df["Avg_words_per_sentence"]
    syllable_term = 84.6 * df["Avg_syllables_per_word"]

    df["Flesch"] = 206.835 - sentence_length_term - syllable_term
    return df


# DALE-CHALL


def dale_chall(df):
    """Add a "Dale_Chall" readability column to ``df`` and return the same frame.

    Reads "Difficult_word_percent" (used as a fraction: it is multiplied by
    100 in the formula) and "Avg_words_per_sentence". The frame is modified
    in place.
    """
    difficult_share = df["Difficult_word_percent"]
    sentence_length = df["Avg_words_per_sentence"]

    df["Dale_Chall"] = 0.1579 * (difficult_share * 100) + 0.0496 * sentence_length

    # Rows with more than 5% difficult words get the constant adjustment.
    needs_adjustment = difficult_share > 0.05
    df.loc[needs_adjustment, "Dale_Chall"] = df.loc[needs_adjustment, "Dale_Chall"] + 3.6365

    return df


# GUNNING FOG


def gunning_fog(df):
    """Add a "Gunning_fog" readability column to ``df`` and return the same frame.

    Reads "Avg_words_per_sentence" and "Complex_word_percent" (scaled by 100
    inside the formula). The frame is modified in place.
    """
    complex_word_term = 100 * df["Complex_word_percent"]
    df["Gunning_fog"] = 0.4 * (df["Avg_words_per_sentence"] + complex_word_term)
    return df

In [4]:
import pandas as pd

# Load the train/test tables; the first CSV column becomes the index.
X_train = pd.read_csv("weebit_train_with_features.csv", index_col=0)
X_test = pd.read_csv("weebit_test_with_features.csv", index_col=0)

# The "Level" column is the target.
y_train, y_test = X_train["Level"], X_test["Level"]

# Keep only feature columns: drop the target and the "Text" column.
X_train = X_train.drop(columns=['Text', 'Level'])
X_test = X_test.drop(columns=['Text', 'Level'])

# Train and test stacked into one set with a fresh 0..N-1 index.
X = pd.concat([X_train, X_test]).reset_index(drop=True)
y = pd.concat([y_train, y_test]).reset_index(drop=True)

In [5]:
# Append the Flesch, Dale-Chall and Gunning-Fog score columns (in that order).
X = gunning_fog(dale_chall(flesch(X)))

In [6]:
# |Spearman rho| between the true levels and the Dale-Chall score (train + test combined).
metric(y, X["Dale_Chall"])


0.38278239601713643

In [7]:
# |Spearman rho| between the true levels and the Flesch score (train + test combined).
metric(y, X["Flesch"])


0.359949119283807

In [8]:
# Paired bootstrap with Flesch as system A and Dale-Chall as system B.
# NOTE(review): the scores shown above give Flesch the lower |rho|, so this
# one-sided "A beats B" test is expected to yield a large p-value; swap A and B
# to test the better-scoring formula.
p_value = bootstrap_significance_testing(
    y_true=y,
    y_predA=X['Flesch'],
    y_predB=X['Dale_Chall'],
    metric=metric,
    n=n,
)
print(f"Estimated p-value: {p_value}")

Estimated p-value: 0.9025


In [9]:
# |Spearman rho| for the Flesch score again, shown next to Gunning-Fog for comparison.
metric(y, X["Flesch"])


0.359949119283807

In [10]:
# |Spearman rho| between the true levels and the Gunning-Fog score (train + test combined).
metric(y, X["Gunning_fog"])


0.3730664167001242

In [11]:
# Paired bootstrap with Gunning-Fog as system A and Flesch as system B.
p_value = bootstrap_significance_testing(
    y_true=y,
    y_predA=X['Gunning_fog'],
    y_predB=X['Flesch'],
    metric=metric,
    n=n,
)
print(f"Estimated p-value: {p_value}")

Estimated p-value: 0.1294


In [12]:
# |Spearman rho| for the Dale-Chall score again, shown next to Gunning-Fog for comparison.
metric(y, X["Dale_Chall"])


0.38278239601713643

In [13]:
# |Spearman rho| for the Gunning-Fog score again, shown next to Dale-Chall for comparison.
metric(y, X["Gunning_fog"])


0.3730664167001242

In [14]:
# Paired bootstrap with Gunning-Fog as system A and Dale-Chall as system B.
# NOTE(review): the scores shown above give Gunning-Fog the lower |rho|, so this
# one-sided "A beats B" test is expected to yield a large p-value; swap A and B
# to test the better-scoring formula.
p_value = bootstrap_significance_testing(
    y_true=y,
    y_predA=X['Gunning_fog'],
    y_predB=X['Dale_Chall'],
    metric=metric,
    n=n,
)
print(f"Estimated p-value: {p_value}")

Estimated p-value: 0.6984


In [22]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, r2_score
from scipy.stats import spearmanr


def print_metrics(y_true, y_pred):
    """Print Spearman's correlation, R^2 and R (= sqrt(R^2)) for the predictions.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        None. The report is written to stdout.
    """
    rho = spearmanr(y_true, y_pred)[0]

    # Compute R^2 once and reuse it for both report lines.
    r_squared = r2_score(y_true, y_pred)

    # R^2 is negative when the predictions fit worse than a constant mean
    # predictor. The square root is undefined there, so report NaN explicitly
    # rather than letting np.sqrt emit an invalid-value RuntimeWarning.
    r = np.sqrt(r_squared) if r_squared >= 0 else float("nan")

    print("================================================")
    print("Spearman's correlation coef: " + str(rho))
    print("================================================")

    print("-----------")
    print("R^2 = " + str(r_squared))
    print("R = " + str(r))
    print("-----------")
    
from scipy.stats import spearmanr
from sklearn.model_selection import KFold
import numpy as np


def grid_search_cv_for_ensembles(model, max_depth_values, n_estimators_values, X, y, scoring_function, k=5, verbose=0):
    """Grid-search ``max_depth`` and ``n_estimators`` with k-fold cross-validation.

    Every combination is scored by the mean of ``scoring_function(y_test, y_pred)``
    over ``k`` shuffled folds; the combination with the highest mean wins.

    Args:
        model: Wrapper exposing ``set_hyperparams(max_depth, n_estimators)``,
            ``fit(X, y)`` and ``predict(X)``.
        max_depth_values: Iterable of candidate ``max_depth`` values.
        n_estimators_values: Iterable of candidate ``n_estimators`` values.
        X: Feature table (pandas DataFrame or array-like indexable by an
            integer array).
        y: Targets aligned with ``X`` (pandas Series or ndarray).
        scoring_function: Callable ``(y_true, y_pred) -> float``; larger is better.
        k: Number of folds.
        verbose: If greater than 0, print the mean score of each combination.

    Returns:
        Tuple ``(best_max_depth, best_n_estimators)``. Falls back to ``(1, 1)``
        when the grid is empty or no combination produced a comparable
        (non-NaN) score.
    """

    def take_rows(data, positions):
        # KFold.split yields *positional* indices, so pandas objects must be
        # indexed with .iloc. Label-based .loc / [] only coincides with this
        # when the index happens to be 0..N-1.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return data[positions]

    # Start below any achievable score so that grids whose scores are all
    # <= 0 still return their best member instead of the (1, 1) fallback.
    best_score = -np.inf
    best_n_estimators = 1
    best_max_depth = 1

    for max_depth in max_depth_values:
        for n_estimators in n_estimators_values:

            # Unseeded shuffle: each combination is evaluated on its own
            # random fold assignment.
            kf = KFold(n_splits=k, random_state=None, shuffle=True)

            scores = []
            for train_index, test_index in kf.split(X):

                # Train and test set for the current fold.
                X_train, X_test = take_rows(X, train_index), take_rows(X, test_index)
                y_train, y_test = take_rows(y, train_index), take_rows(y, test_index)

                # set_hyperparams is called per fold so each fit starts from
                # a fresh model.
                model.set_hyperparams(max_depth, n_estimators)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                scores.append(scoring_function(y_test, y_pred))

            score = np.mean(scores)

            if verbose > 0:
                print("score=" + str(score) + " | max_depth=" + str(max_depth) + " n_estimators=" + str(n_estimators))

            if score > best_score:
                best_score = score
                best_n_estimators = n_estimators
                best_max_depth = max_depth

    return best_max_depth, best_n_estimators


def find_best_C(model, c_values, X, y, scoring_function, k=5, verbose=0):
    """Pick the regularisation parameter ``C`` with k-fold cross-validation.

    Every candidate is scored by the mean of ``scoring_function(y_test, y_pred)``
    over ``k`` shuffled folds, always with the ``'linear'`` kernel; the
    candidate with the highest mean wins.

    Args:
        model: Wrapper exposing ``set_hyperparams(kernel, c)``, ``fit(X, y)``
            and ``predict(X)``.
        c_values: Iterable of candidate ``C`` values.
        X: Feature table (pandas DataFrame or array-like indexable by an
            integer array).
        y: Targets aligned with ``X`` (pandas Series or ndarray).
        scoring_function: Callable ``(y_true, y_pred) -> float``; larger is better.
        k: Number of folds.
        verbose: If greater than 0, print the mean score of each candidate.

    Returns:
        The best ``C``. Falls back to ``1.0`` when ``c_values`` is empty or no
        candidate produced a comparable (non-NaN) score.
    """

    def take_rows(data, positions):
        # KFold.split yields *positional* indices, so pandas objects must be
        # indexed with .iloc. Label-based .loc / [] only coincides with this
        # when the index happens to be 0..N-1.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return data[positions]

    # Start below any achievable score so that candidate lists whose scores
    # are all <= 0 still return their best member instead of the fallback.
    best_score = -np.inf
    best_c = 1.0

    for c in c_values:

        # Unseeded shuffle: each candidate is evaluated on its own random
        # fold assignment.
        kf = KFold(n_splits=k, random_state=None, shuffle=True)

        scores = []
        for train_index, test_index in kf.split(X):

            # Train and test set for the current fold.
            X_train, X_test = take_rows(X, train_index), take_rows(X, test_index)
            y_train, y_test = take_rows(y, train_index), take_rows(y, test_index)

            # set_hyperparams is called per fold so each fit starts from a
            # fresh model.
            model.set_hyperparams('linear', c)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            scores.append(scoring_function(y_test, y_pred))

        score = np.mean(scores)

        if verbose > 0:
            print("score=" + str(score) + " | C=" + str(c))

        if score > best_score:
            best_score = score
            best_c = c

    return best_c

import numpy as np


def discretize(y_pred):
    """Snap every prediction to one of the levels 0.0-4.0, in place.

    A value below 0.5 becomes 0.0, below 1.5 becomes 1.0, below 2.5 becomes
    2.0, below 3.5 becomes 3.0, and anything else becomes 4.0. The input
    container is overwritten element by element and then returned.
    """
    # Exclusive upper bound of levels 0..3; a value under none of them is level 4.
    upper_bounds = (0.5, 1.5, 2.5, 3.5)

    for pos in range(len(y_pred)):
        value = y_pred[pos]
        for level, bound in enumerate(upper_bounds):
            if value < bound:
                y_pred[pos] = float(level)
                break
        else:
            y_pred[pos] = 4.0

    return y_pred

from sklearn.ensemble import RandomForestRegressor
import pickle





class RandomForest():
    """Random-forest regressor wrapper whose predictions are passed through ``discretize``.

    The underlying estimator is either freshly constructed or unpickled from
    ``model_path``; after fitting it can optionally be pickled back to that path.
    """

    def __init__(self, max_depth=20, n_estimators=100, save_model=False, use_saved_model=False, model_path='./models/saved_models/rf.pickle'):
        self.model_path = model_path
        self.save_model = save_model

        if not use_saved_model:
            self.model = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators)
            return

        # NOTE: unpickling runs code stored in the file; only load trusted files.
        with open(self.model_path, 'rb') as model_file:
            self.model = pickle.load(model_file)

    def fit(self, X_train, y_train):
        """Fit the estimator and, when ``save_model`` is set, pickle it to ``model_path``."""
        self.model.fit(X_train, y_train)

        if not self.save_model:
            return

        with open(self.model_path, 'wb') as model_file:
            pickle.dump(self.model, model_file)

    def predict(self, X_test):
        """Return the estimator's predictions after ``discretize``."""
        raw_predictions = self.model.predict(X_test)
        return discretize(raw_predictions)

    def set_hyperparams(self, max_depth, n_estimators):
        """Replace the estimator with a new, unfitted one using the given hyperparameters."""
        self.model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
        
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout


import numpy as np
import tensorflow as tf



class MultilayerPerceptron():
    """Keras feed-forward network wrapper trained on one-hot encoded levels.

    ``fit`` builds a new network (64 -> 32 -> 5 units with dropout), trains it
    with an MSE loss on 5-class one-hot targets and optionally saves it.
    ``predict`` returns the index of the largest output per sample.
    """

    def __init__(self, input_dim=None, verbose=0, save_model=False, use_saved_model=False, model_path='./models/saved_models/mlp.h5'):
        self.model_path = model_path
        self.save_model = save_model

        # input_dim is only read by _make_model, i.e. when fit() is called.
        self.input_dim = input_dim
        self.verbose = verbose

        # Always define the attribute; it stays None until a model is loaded
        # here or built by fit().
        self.model = None
        if use_saved_model:
            self.model = load_model(model_path)

    def fit(self, X_train, y_train):
        """Build a new network, train it and, when ``save_model`` is set, save it to ``model_path``."""
        self.model = self._make_model()

        # One-hot encode the targets into 5 classes.
        y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=5)

        self.model.fit(
            X_train, y_train_cat,
            epochs=150,
            batch_size=64,
            verbose=self.verbose)

        if self.save_model:
            self.model.save(self.model_path)

    def predict(self, X_test):
        """Return the predicted class index for each row of ``X_test``."""
        y_pred_cat = self.model.predict(X_test)
        y_pred = np.argmax(y_pred_cat, axis=1)

        # argmax already yields integer class indices; discretize is kept so
        # the output goes through the same post-processing as the other
        # model wrappers.
        return discretize(y_pred)

    def _make_model(self):
        """Create and compile the network architecture."""
        # architecture
        model = Sequential()
        model.add(Dense(64, activation='relu', input_dim=self.input_dim))
        model.add(Dropout(0.2))
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(5, activation='linear'))

        # optimizer -- `learning_rate` replaces the deprecated `lr` alias,
        # which newer Keras releases no longer accept.
        adam = tf.keras.optimizers.Adam(learning_rate=0.001)

        model.compile(optimizer=adam,
              loss='mse',
              metrics=['mse'])

        return model
    
from xgboost import XGBRegressor
import pickle




class XGBoost():
    """XGBoost regressor wrapper whose predictions are passed through ``discretize``.

    The underlying estimator is either freshly constructed (squared-error
    objective) or unpickled from ``model_path``; after fitting it can
    optionally be pickled back to that path.
    """

    def __init__(self, max_depth=30, n_estimators=200, save_model=False, use_saved_model=False, model_path='./models/saved_models/xgboost.pickle'):
        self.model_path = model_path
        self.save_model = save_model

        if use_saved_model:
            # NOTE: unpickling runs code stored in the file; only load trusted files.
            with open(self.model_path, 'rb') as file:
                self.model = pickle.load(file)
        else:
            # The stray local alias from the chained assignment was dropped.
            self.model = XGBRegressor(max_depth=max_depth, n_estimators=n_estimators, objective="reg:squarederror")

    def fit(self, X_train, y_train):
        """Fit the estimator and, when ``save_model`` is set, pickle it to ``model_path``."""
        self.model.fit(X_train, y_train)

        if self.save_model:
            with open(self.model_path, 'wb') as handle:
                pickle.dump(self.model, handle)

    def predict(self, X_test):
        """Return the estimator's predictions after ``discretize``."""
        return discretize(self.model.predict(X_test))

    def set_hyperparams(self, max_depth, n_estimators):
        """Replace the estimator with a new, unfitted one using the given hyperparameters."""
        self.model = XGBRegressor(max_depth=max_depth, n_estimators=n_estimators, objective="reg:squarederror")
        
from sklearn.svm import SVR
import pickle




class SupportVectorMachine():
    """Support-vector regressor wrapper whose predictions are passed through ``discretize``.

    The underlying estimator is either freshly constructed or unpickled from
    ``model_path``; after fitting it can optionally be pickled back to that path.
    """

    def __init__(self, kernel='linear', C=10.0, save_model=False, use_saved_model=False, model_path='./models/saved_models/svm.pickle'):
        self.model_path = model_path
        self.save_model = save_model

        if not use_saved_model:
            self.model = SVR(kernel=kernel, C=C)
            return

        # NOTE: unpickling runs code stored in the file; only load trusted files.
        with open(self.model_path, 'rb') as model_file:
            self.model = pickle.load(model_file)

    def fit(self, X_train, y_train):
        """Fit the estimator and, when ``save_model`` is set, pickle it to ``model_path``."""
        self.model.fit(X_train, y_train)

        if not self.save_model:
            return

        with open(self.model_path, 'wb') as model_file:
            pickle.dump(self.model, model_file)

    def predict(self, X_test):
        """Return the estimator's predictions after ``discretize``."""
        raw_predictions = self.model.predict(X_test)
        return discretize(raw_predictions)

    def set_hyperparams(self, kernel, c):
        """Replace the estimator with a new, unfitted SVR using the given kernel and C."""
        self.model = SVR(C=c, kernel=kernel)

In [23]:
# Each wrapper is created with use_saved_model=True, so it loads an already
# trained model from the given path instead of building a new one; nothing is
# fitted in this cell. Predictions are made on the test feature table.
rf = RandomForest(use_saved_model=True, model_path='rf.pickle')
y_pred_rf = rf.predict(X_test)

xgboost = XGBoost(use_saved_model=True, model_path='xgboost.pickle')
y_pred_xgboost = xgboost.predict(X_test)

svm = SupportVectorMachine(use_saved_model=True, model_path='svm.pickle')
y_pred_svm = svm.predict(X_test)

# input_dim is only read when the network is built in fit(), so it has no
# effect on the loaded model here.
mlp = MultilayerPerceptron(input_dim=X_train.shape[1], use_saved_model=True, verbose=0, model_path='mlp.h5')
y_pred_mlp = mlp.predict(X_test)

In [24]:
# Paired bootstrap on the test set: MLP (system A) vs. XGBoost (system B).
p_value = bootstrap_significance_testing(
    y_true=y_test,
    y_predA=y_pred_mlp,
    y_predB=y_pred_xgboost,
    metric=metric,
    n=n,
)
print(f"Estimated p-value: {p_value}")

Estimated p-value: 0.1374


In [25]:
# Paired bootstrap on the test set: MLP (system A) vs. SVM (system B).
p_value = bootstrap_significance_testing(
    y_true=y_test,
    y_predA=y_pred_mlp,
    y_predB=y_pred_svm,
    metric=metric,
    n=n,
)
print(f"Estimated p-value: {p_value}")

Estimated p-value: 0.4633


In [26]:
# Paired bootstrap on the test set: random forest (system A) vs. XGBoost (system B).
p_value = bootstrap_significance_testing(
    y_true=y_test,
    y_predA=y_pred_rf,
    y_predB=y_pred_xgboost,
    metric=metric,
    n=n,
)
print(f"Estimated p-value: {p_value}")

Estimated p-value: 0.8933


In [27]:
# Paired bootstrap on the test set: random forest (system A) vs. SVM (system B).
p_value = bootstrap_significance_testing(
    y_true=y_test,
    y_predA=y_pred_rf,
    y_predB=y_pred_svm,
    metric=metric,
    n=n,
)
print(f"Estimated p-value: {p_value}")

Estimated p-value: 0.9817


In [28]:
# Paired bootstrap on the test set: MLP (system A) vs. random forest (system B).
p_value = bootstrap_significance_testing(
    y_true=y_test,
    y_predA=y_pred_mlp,
    y_predB=y_pred_rf,
    metric=metric,
    n=n,
)
print(f"Estimated p-value: {p_value}")

Estimated p-value: 0.0222


In [29]:
# Paired bootstrap on the test set: MLP (system A) vs. the Gunning-Fog score (system B).
# NOTE(review): gunning_fog() writes a "Gunning_fog" column into X_test itself,
# so re-running the model-prediction cell after this one would pass the models
# an extra feature column.
X_test = gunning_fog(X_test)
p_value = bootstrap_significance_testing(
    y_true=y_test,
    y_predA=y_pred_mlp,
    y_predB=X_test['Gunning_fog'],
    metric=metric,
    n=n,
)
print(f"Estimated p-value: {p_value}")

Estimated p-value: 0.0
