In [1]:
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

import pickle
import tensorflow as tf

In [41]:
def result(predictions):
    predictions = predictions.tolist()
    result_list=[]
    for pred in predictions:
        result = pred // 0.5 * 0.5
        if (pred - result) > 0.25:
            result += 0.5
        if result < 1.0:
            result = 1.0
        if result > 5.0:
            result = 5.0
        result_list.append(result)
    return result_list

# Accuracy score
def accuracy(Ypred, Ytrue):
    Ytrue = Ytrue.tolist()
    accurate = 0
    for i in range(len(Ytrue)):
        if Ytrue[i] == Ypred[i]:
            accurate += 1
    return accurate / len(Ytrue)

# approximate accurancy rate
def score(pred, test):
    test = test.tolist()
    correct = 0
    for i in range(len(test)):
        p = pred[i]
        t = test[i]
        if p < t+0.5 and p > t-0.5:
            correct += 1
    return correct / len(test)

# Total error / total number of points => by average what's the error for each point
def error_rate(Ypred, Ytrue):
    Ytrue=Ytrue.tolist()
    error=0
    for i in range(len(Ytrue)):
        error += abs(Ytrue[i] - Ypred[i])
    return error/len(Ytrue)

# Ensemble Learning

For target = vocabulary & target = cohesion:

linear regressor = lr1, lr2

k neighbours regressor = knn1, knn2

decision tree = dt1, dt2

neural network = nn1, nn2

In [37]:
lr1 = pickle.load(open("Models_sav\lr_vocab.sav", 'rb'))
lr2 = pickle.load(open("Models_sav\lr_cohesion.sav", 'rb'))
dt1 = pickle.load(open("Models_sav\Decision_Tree_vocab.sav", 'rb'))
dt2 = pickle.load(open("Models_sav\Decision_Tree_cohesion.sav", 'rb'))
knn1 = pickle.load(open("Models_sav\knn_vocab.sav", 'rb'))
knn2 = pickle.load(open("Models_sav\knn_cohesion.sav", 'rb'))

#nn1 = pickle.load(open("./neural_network_vocab.sav", 'rb'))
#nn2 = pickle.load(open("./neural_network.sav", 'rb'))
nn1 = tf.keras.models.load_model("model_deep_vocabulary")
nn2 = tf.keras.models.load_model("model_deep_cohesion")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## 1 Predict Vocabulary

In [8]:
df = pd.read_csv("Processed_Data.csv")
y = df['vocabulary']
X = df.iloc[:, 7:]
X = X.drop('corrected_text', axis = 1)

### 1.1 Voting

We use LinearRegression & DecisionTreeRegressor & KNeighboursRegressor to do a voting and check the accuracy and approximate accuracy.

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)
vr = VotingRegressor(
    estimators=[('linear regressor', lr1), ('decision tree', dt1), ('KNN', knn1)]
)

In [11]:
vr.fit(X_train, y_train)
y_pred_vr = vr.predict(X_test)
print(accuracy(result(y_pred_vr), y_test))
print(score(y_pred_vr, y_test))

Fitting 5 folds for each of 2842 candidates, totalling 14210 fits
0.3997955010224949
0.7198364008179959


### 1.2 AdaBoost

Besides, we build a AdaBoostRegressor, whose base estimator would be a DecisionTreeRegressor with optimal parameters got from the training of the single DecisionTreeRegressor dt1, to see the accuracy and approximate accuracy.

In [189]:
ar1 = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(criterion='friedman_mse', max_depth=4, min_samples_split=2)
)
ar1.fit(X_train, y_train)

In [190]:
y_pred_ar1 = ar1.predict(X_test)
print(accuracy(result(y_pred_ar1), y_test))
print(score(y_pred_ar1, y_test))

0.3936605316973415
0.7188139059304703


In [191]:
ar1.estimators_

[DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=446857737),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=301215708),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=742893256),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=119440548),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1507096625),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=65555849),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=393976674),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=779732698),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1435535032),
 Decision

In [192]:
ar1.estimator_weights_

array([1.49290476, 1.20815133, 0.94494894, 0.97130432, 0.91712142,
       0.87092545, 0.88733875, 0.68742175, 0.82637169, 0.73881704,
       0.92609676, 0.87947796, 0.78771169, 0.52967959, 0.59876907,
       0.6170221 , 0.81262245, 0.73103227, 0.96657774, 1.0788217 ,
       0.61318263, 0.4904077 , 0.90273235, 0.57022386, 1.03170716,
       0.85966721, 0.8399737 , 0.69625894, 0.60657307, 0.86715339,
       1.01955672, 0.75928882, 1.1335785 , 0.50616945, 0.78731618,
       0.60799294, 0.73090286, 0.84281938, 0.50301127, 0.83666541,
       0.77351769, 0.76219604, 0.90017847, 0.79975564, 0.59634408,
       1.13152158, 0.40614461, 0.80534803, 1.00067073, 0.93686305])

### 1.3 Gradient Boost

We also build a GradientBoostingRegressor, whose base estimator is a DecisionTreeRegressor, and see its accuracy and approximate accuracy.

Need to find the best parameters for GradientBoostingRegressor.

In [194]:
gbr1 = GradientBoostingRegressor(n_estimators=300)
parameters = {
    'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
    'criterion':['friedman_mse', 'squared_error', 'mse']
}
gs_gbr1 = GridSearchCV(gbr1, parameters)
gs_gbr1.fit(X_train, y_train)
print(gs_gbr1.best_params_)
print(gs_gbr1.best_score_)

{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'huber'}
0.36875762649157334


In [196]:
gbr1_adj = GradientBoostingRegressor(
    criterion = 'friedman_mse',
    learning_rate = 0.1,
    loss = 'huber',
    n_estimators=300
)
gbr1_adj.fit(X_train, y_train)
y_pred_gbr1_adj = gbr1_adj.predict(X_test)

print(accuracy(result(y_pred_gbr1_adj), y_test))
print(score(y_pred_gbr1_adj, y_test))

0.40081799591002043
0.7075664621676891


Build a default GradientBoostingRegressor and compare.

In [197]:
gbr1_1 = GradientBoostingRegressor(n_estimators=300)
gbr1_1.fit(X_train, y_train)
y_pred_gbr1_1 = gbr1_1.predict(X_test)
print(accuracy(result(y_pred_gbr1_1), y_test))
print(score(y_pred_gbr1_1, y_test))

0.4049079754601227
0.7177914110429447


The default regressor works better.

Look at its feature importance.

In [232]:
fi_gbr1_1 = pd.DataFrame({'feature name': gbr1_1.feature_names_in_, 'feature importance': gbr1_1.feature_importances_})
fi_gbr1_1 = fi_gbr1_1.sort_values(by='feature importance',ascending=False)
fi_gbr1_1

Unnamed: 0,feature name,feature importance
17,Incorrect_form_ratio,0.365954
23,number_of_diff_words,0.183722
26,coherence_score,0.038847
24,freq_diff_words,0.034135
31,freq_of_pronoun,0.022599
25,ttr,0.019631
27,lexrank_avg_min_diff,0.018827
0,number_of_words,0.018469
11,freq_of_wrong_words,0.017976
20,dale_chall_readability_score,0.017822


Extract only the features that have an feature importance value more than 0.01 and build a restricted GradientBoostingRegressor.

In [245]:
X_train_gbr1_1_res = X_train[fi_gbr1_1[fi_gbr1_1['feature importance'] > 0.01]['feature name'].values]
y_train_gbr1_1_res = y_train
X_test_gbr1_1_res = X_test[fi_gbr1_1[fi_gbr1_1['feature importance'] > 0.01]['feature name'].values]
y_test_gbr1_1_res = y_test

In [246]:
gbr1_1_res = GradientBoostingRegressor()
gbr1_1_res.fit(X_train_gbr1_1_res, y_train_gbr1_1_res)
y_pred_gbr1_1_res = gbr1_1_res.predict(X_test_gbr1_1_res)
print(accuracy(result(y_pred_gbr1_1_res), y_test_gbr1_1_res))
print(score(y_pred_gbr1_1_res, y_test_gbr1_1_res))

0.4049079754601227
0.7290388548057259


The approximate accuracy of the restricted regressor improves, as the accuracy remains unchanged. Thus, the restricted regressor performs better and it's retained.

### 1.4 Stacking

Last, we use stacking to combine LinearRegression, KNeighboursRegressor and DecisionTree and predict 'vocabulary'.

In [12]:
base_models1 = list()
base_models1.append(('lr', lr1))
base_models1.append(('knn', knn1))
base_models1.append(('dt', dt1))

meta_learner1 = LinearRegression()
sr1 = StackingRegressor(estimators=base_models1, final_estimator=meta_learner1, cv=5)

In [13]:
X_train_sr1 = pd.DataFrame([lr1.predict(X_train), knn1.predict(X_train), dt1.predict(X_train)]).T
X_train_sr1



Unnamed: 0,0,1,2
0,3.379225,3.222222,3.219136
1,3.441336,3.361111,3.313167
2,2.900822,3.194444,2.827621
3,3.119983,3.222222,3.142578
4,3.368893,3.111111,3.635385
...,...,...,...
2928,2.953055,2.944444,3.182432
2929,3.156671,3.388889,3.313167
2930,3.229754,3.083333,3.313167
2931,3.564251,3.500000,3.635385


In [14]:
sr1.fit(X_train_sr1, y_train)
X_test_sr1 = pd.DataFrame([lr1.predict(X_test), knn1.predict(X_test), dt1.predict(X_test)]).T
y_pred_sr1 = sr1.predict(X_test_sr1)
print(accuracy(result(y_pred_sr1), y_test))
print(score(y_pred_sr1, y_test))

Fitting 5 folds for each of 2842 candidates, totalling 14210 fits
Fitting 5 folds for each of 2842 candidates, totalling 14210 fits
Fitting 5 folds for each of 2842 candidates, totalling 14210 fits
Fitting 5 folds for each of 2842 candidates, totalling 14210 fits
Fitting 5 folds for each of 2842 candidates, totalling 14210 fits
Fitting 5 folds for each of 2842 candidates, totalling 14210 fits
0.4100204498977505
0.712678936605317




Among Voting, AdaBoosting, GradientBoosting, and Stacking, Stacking works the best for predicting vocabulary, in terms of the accuracy & approximate accuracy of its prediction.

## 2 Predict Cohesion with Prediction of Vocabulary

### 2.1 Voting

We use LinearRegression & DecisionTreeRegressor & KNeighboursRegressor to do a voting and check the accuracy and approximate accuracy.

In [38]:
y2 = df['cohesion']
X2 = df.iloc[:, 7:]
X2 = X2.drop('corrected_text', axis = 1)

X_features_vr2 = pd.DataFrame([lr1.predict(X2), knn1.predict(X2), dt1.predict(X2)]).T
y_features_vr2 = sr1.predict(X_features_vr2)
X2['vocabulary'] = y_features_vr2



In [39]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=42)
vr2 = VotingRegressor(
    estimators=[('linear regressor', lr2), ('decision tree', dt2), ('KNN', knn2)]
)

In [42]:
vr2.fit(X_train2, y_train2)
y_pred_vr2 = vr2.predict(X_test2)
print(accuracy(result(y_pred_vr2), y_test2))
print(score(y_pred_vr2, y_test2))
print(error_rate(result(y_pred_vr2), y_test2))

0.3292433537832311
0.6257668711656442
0.43098159509202455


### 2.2 AdaBoost

Besides, we build a AdaBoostRegressor, whose base estimator would be a DecisionTreeRegressor with optimal parameters got from the training of the single DecisionTreeRegressor dt1, to see the accuracy and approximate accuracy.

In [43]:
ar2 = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(criterion='friedman_mse', max_depth=3, min_samples_split=2)
)
ar2.fit(X_train2, y_train2)

In [44]:
y_pred_ar2 = ar2.predict(X_test2)
print(accuracy(result(y_pred_ar2), y_test2))
print(score(y_pred_ar2, y_test2))

0.3261758691206544
0.6247443762781186


In [46]:
ar2.estimators_

[DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                       random_state=1386855033),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                       random_state=768808940),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                       random_state=14216133),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                       random_state=305034183),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                       random_state=258400370),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                       random_state=1345506011),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                       random_state=1961507266),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                       random_state=1025583119),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                       random_state=187780172),
 Decisi

In [47]:
ar2.estimator_weights_

array([1.13148511, 1.15527328, 0.99949646, 0.72950173, 0.69212107,
       0.56420284, 0.30643586, 0.73259716, 0.52768525, 0.71164656,
       0.43807635, 0.49079162, 0.25275893, 0.24145393, 0.2908627 ,
       0.16899363, 0.30561844, 0.23841934, 0.41869392, 0.31035078,
       0.32828247, 0.54342179, 0.41188816, 0.30492591, 0.42249987,
       0.3301788 , 0.44621735, 0.2382099 , 0.5227858 , 0.40215741,
       0.29305219, 0.41584206, 0.27869237, 0.10790143, 0.56112589,
       0.10426728, 0.28257568, 0.48430785, 0.64890477, 0.42217596,
       0.40144659, 0.73577359, 0.40300788, 0.43063835, 0.52855413,
       0.34840228, 0.63295939, 0.10049314, 0.2813549 , 0.06279304])

### 2.3 Gradient Boosting

We also build a GradientBoostingRegressor, whose base estimator is a DecisionTreeRegressor, and see its accuracy and approximate accuracy.

Need to find the best parameters for GradientBoostingRegressor.

In [217]:
gbr2 = GradientBoostingRegressor(n_estimators=100)
parameters2 = {
    'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
    'criterion':['friedman_mse', 'squared_error', 'mse']
}
gs_gbr2 = GridSearchCV(gbr1, parameters2)
gs_gbr2.fit(X_train2, y_train2)
print(gs_gbr2.best_params_)
print(gs_gbr2.best_score_)

{'criterion': 'squared_error', 'learning_rate': 0.1, 'loss': 'absolute_error'}
0.31443808963024145


In [54]:
gbr2_adj = GradientBoostingRegressor(
    criterion = 'squared_error',
    learning_rate = 0.1,
    loss = 'absolute_error',
    n_estimators=100
)
gbr2_adj.fit(X_train2, y_train2)
y_pred_gbr2_adj = gbr2_adj.predict(X_test2)

print(accuracy(result(y_pred_gbr2_adj), y_test2))
print(score(y_pred_gbr2_adj, y_test2))

0.3220858895705521
0.5593047034764826


Build a default GradientBoostingRegressor and compare.

In [60]:
gbr2_1 = GradientBoostingRegressor(n_estimators=100)
gbr2_1.fit(X_train2, y_train2)
y_pred_gbr2_1 = gbr2_1.predict(X_test2)
print(accuracy(result(y_pred_gbr2_1), y_test2))
print(score(y_pred_gbr2_1, y_test2))
print(error_rate(result(y_pred_gbr2_1), y_test2))

0.3292433537832311
0.6349693251533742
0.4279141104294479


The default regressor works better.

Look at its feature importance.

In [57]:
fi_gbr2_1 = pd.DataFrame({'feature name': gbr2_1.feature_names_in_, 'feature importance': gbr2_1.feature_importances_})
fi_gbr2_1 = fi_gbr2_1.sort_values(by='feature importance',ascending=False)
fi_gbr2_1

Unnamed: 0,feature name,feature importance
35,vocabulary,0.658759
17,Incorrect_form_ratio,0.032724
3,punctuations,0.028944
25,ttr,0.023088
2,av_word_per_sen,0.020179
14,sentiment_negative,0.015232
10,sentence_complexity,0.013756
31,freq_of_pronoun,0.012911
22,mcalpine_eflaw,0.012702
29,freq_of_noun,0.011964


In [58]:
X_train_gbr2_1_res = X_train2[fi_gbr2_1[fi_gbr2_1['feature importance'] > 0.01]['feature name'].values]
y_train_gbr2_1_res = y_train2
X_test_gbr2_1_res = X_test2[fi_gbr2_1[fi_gbr2_1['feature importance'] > 0.01]['feature name'].values]
y_test_gbr2_1_res = y_test2

Extract only the features that have an feature importance value more than 0.01 and build a restricted GradientBoostingRegressor.

In [59]:
gbr2_1_res = GradientBoostingRegressor()
gbr2_1_res.fit(X_train_gbr2_1_res, y_train_gbr2_1_res)
y_pred_gbr2_1_res = gbr2_1_res.predict(X_test_gbr2_1_res)
print(accuracy(result(y_pred_gbr2_1_res), y_test_gbr2_1_res))
print(score(y_pred_gbr2_1_res, y_test_gbr2_1_res))
print(error_rate(result(y_pred_gbr2_1_res), y_test_gbr2_1_res))

0.33026584867075665
0.6400817995910021
0.4284253578732106


Both the accuracy and approximate accuracy of the restricted regressor improves, as the accuracy remains unchanged. Thus, the restricted regressor performs better and it's retained.

### 2.4 Stacking

Last, we use stacking to combine LinearRegression, KNeighboursRegressor and DecisionTree and predict 'cohesion'.

In [61]:
base_models2 = list()
base_models2.append(('lr', lr2))
base_models2.append(('knn', knn2))
base_models2.append(('dt', dt2))

meta_learner2 = LinearRegression()
sr2 = StackingRegressor(estimators=base_models2, final_estimator=meta_learner2, cv=5)

In [62]:
X_train_sr2 = pd.DataFrame([lr2.predict(X_train2), knn2.predict(X_train2), dt2.predict(X_train2)]).T
X_train_sr2



Unnamed: 0,0,1,2
0,3.298237,3.166667,3.178481
1,3.251153,3.194444,3.499154
2,2.971255,3.111111,2.744887
3,3.042594,3.000000,3.178481
4,3.070328,3.194444,3.005362
...,...,...,...
2928,2.813369,2.888889,2.744887
2929,3.208429,3.527778,3.178481
2930,3.207558,3.027778,3.178481
2931,3.491453,3.527778,3.499154


In [63]:
sr2.fit(X_train_sr2, y_train2)
X_test_sr2 = pd.DataFrame([lr2.predict(X_test2), knn2.predict(X_test2), dt2.predict(X_test2)]).T
y_pred_sr2 = sr2.predict(X_test_sr2)
print(accuracy(result(y_pred_sr2), y_test2))
print(score(y_pred_sr2, y_test2))
print(error_rate(result(y_pred_sr2), y_test2))

0.31901840490797545
0.6206543967280164
0.4396728016359918




Among Voting, AdaBoosting, GradientBoosting, and Stacking, GradientBoosting with restricted features whose feature importance is more than 0.01 works the best for predicting cohesion, considering the combination of accuracy & approximate accuracy & error rate of its prediction.