In [182]:
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

import pickle
import tensorflow as tf

In [183]:
def result(predictions):
    predictions = predictions.tolist()
    result_list=[]
    for pred in predictions:
        result = pred // 0.5 * 0.5
        if (pred - result) > 0.25:
            result += 0.5
        if result < 1.0:
            result = 1.0
        if result > 5.0:
            result = 5.0
        result_list.append(result)
    return result_list

# Accuracy score
def accuracy(Ypred, Ytrue):
    Ytrue = Ytrue.tolist()
    accurate = 0
    for i in range(len(Ytrue)):
        if Ytrue[i] == Ypred[i]:
            accurate += 1
    return accurate / len(Ytrue)

# approximate accurancy rate
def score(pred, test):
    test = test.tolist()
    correct = 0
    for i in range(len(test)):
        p = pred[i]
        t = test[i]
        if p < t+0.5 and p > t-0.5:
            correct += 1
    return correct / len(test)

# Ensemble Learning

For target = vocabulary & target = cohesion:

linear regressor = lr1, lr2

k neighbours regressor = knn1, knn2

decision tree = dt1, dt2

neural network = nn1, nn2

In [255]:
lr1 = pickle.load(open("lr_vocab.sav", 'rb'))
lr2 = pickle.load(open("lr_cohesion.sav", 'rb'))
dt1 = pickle.load(open("Decision_Tree_vocab.sav", 'rb'))
dt2 = pickle.load(open("Decision_Tree_cohesion.sav", 'rb'))
knn1 = pickle.load(open("knn_vocab.sav", 'rb'))
knn2 = pickle.load(open("knn_cohesion.sav", 'rb'))

#nn1 = pickle.load(open("./neural_network_vocab.sav", 'rb'))
#nn2 = pickle.load(open("./neural_network.sav", 'rb'))
nn1 = tf.keras.models.load_model("model_deep_vocabulary")
nn2 = tf.keras.models.load_model("model_deep_cohesion")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## 1 Predict Vocabulary

In [186]:
df = pd.read_csv("Processed_Data.csv")
y = df['vocabulary']
X = df.iloc[:, 7:]
X = X.drop('corrected_text', axis = 1)

### 1.1 Voting

linear regression:
exact accuracy = 0.3987730061349693
approximate accuracy = 

decision tree:
exact accuracy = 0.4061302681992337
approximate accuracy =  0.7049808429118773

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)
vr = VotingRegressor(
    estimators=[('linear regressor', lr1), ('decision tree', dt1), ('KNN', knn1)]
)

In [188]:
vr.fit(X_train, y_train)
y_pred_vr = vr.predict(X_test)
print(accuracy(result(y_pred_vr), y_test))
print(score(y_pred_vr, y_test))

0.401840490797546
0.7208588957055214


### 1.2 AdaBoost

In [189]:
ar1 = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(criterion='friedman_mse', max_depth=4, min_samples_split=2)
)
ar1.fit(X_train, y_train)

In [190]:
y_pred_ar1 = ar1.predict(X_test)
print(accuracy(result(y_pred_ar1), y_test))
print(score(y_pred_ar1, y_test))

0.3936605316973415
0.7188139059304703


In [191]:
ar1.estimators_

[DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=446857737),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=301215708),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=742893256),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=119440548),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1507096625),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=65555849),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=393976674),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=779732698),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1435535032),
 Decision

In [192]:
ar1.estimator_weights_

array([1.49290476, 1.20815133, 0.94494894, 0.97130432, 0.91712142,
       0.87092545, 0.88733875, 0.68742175, 0.82637169, 0.73881704,
       0.92609676, 0.87947796, 0.78771169, 0.52967959, 0.59876907,
       0.6170221 , 0.81262245, 0.73103227, 0.96657774, 1.0788217 ,
       0.61318263, 0.4904077 , 0.90273235, 0.57022386, 1.03170716,
       0.85966721, 0.8399737 , 0.69625894, 0.60657307, 0.86715339,
       1.01955672, 0.75928882, 1.1335785 , 0.50616945, 0.78731618,
       0.60799294, 0.73090286, 0.84281938, 0.50301127, 0.83666541,
       0.77351769, 0.76219604, 0.90017847, 0.79975564, 0.59634408,
       1.13152158, 0.40614461, 0.80534803, 1.00067073, 0.93686305])

### 1.3 Gradient Boost

Use the default decision tree in GradientBoostingRegressor.

Find best parameters for GradientBoostingRegressor.

In [194]:
gbr1 = GradientBoostingRegressor(n_estimators=300)
parameters = {
    'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
    'criterion':['friedman_mse', 'squared_error', 'mse']
}
gs_gbr1 = GridSearchCV(gbr1, parameters)
gs_gbr1.fit(X_train, y_train)
print(gs_gbr1.best_params_)
print(gs_gbr1.best_score_)

{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'huber'}
0.36875762649157334


In [196]:
gbr1_adj = GradientBoostingRegressor(
    criterion = 'friedman_mse',
    learning_rate = 0.1,
    loss = 'huber',
    n_estimators=300
)
gbr1_adj.fit(X_train, y_train)
y_pred_gbr1_adj = gbr1_adj.predict(X_test)

print(accuracy(result(y_pred_gbr1_adj), y_test))
print(score(y_pred_gbr1_adj, y_test))

0.40081799591002043
0.7075664621676891


In [197]:
gbr1_1 = GradientBoostingRegressor(n_estimators=300)
gbr1_1.fit(X_train, y_train)
y_pred_gbr1_1 = gbr1_1.predict(X_test)
print(accuracy(result(y_pred_gbr1_1), y_test))
print(score(y_pred_gbr1_1, y_test))

0.4049079754601227
0.7177914110429447


Compare the default regressor and the fine tuned one. The default regressor works better.

Look at the feature importance.

In [232]:
fi_gbr1_1 = pd.DataFrame({'feature name': gbr1_1.feature_names_in_, 'feature importance': gbr1_1.feature_importances_})
fi_gbr1_1 = fi_gbr1_1.sort_values(by='feature importance',ascending=False)
fi_gbr1_1

Unnamed: 0,feature name,feature importance
17,Incorrect_form_ratio,0.365954
23,number_of_diff_words,0.183722
26,coherence_score,0.038847
24,freq_diff_words,0.034135
31,freq_of_pronoun,0.022599
25,ttr,0.019631
27,lexrank_avg_min_diff,0.018827
0,number_of_words,0.018469
11,freq_of_wrong_words,0.017976
20,dale_chall_readability_score,0.017822


In [245]:
X_train_gbr1_1_res = X_train[fi_gbr1_1[fi_gbr1_1['feature importance'] > 0.01]['feature name'].values]
y_train_gbr1_1_res = y_train
X_test_gbr1_1_res = X_test[fi_gbr1_1[fi_gbr1_1['feature importance'] > 0.01]['feature name'].values]
y_test_gbr1_1_res = y_test

In [246]:
gbr1_1_res = GradientBoostingRegressor()
gbr1_1_res.fit(X_train_gbr1_1_res, y_train_gbr1_1_res)
y_pred_gbr1_1_res = gbr1_1_res.predict(X_test_gbr1_1_res)
print(accuracy(result(y_pred_gbr1_1_res), y_test_gbr1_1_res))
print(score(y_pred_gbr1_1_res, y_test_gbr1_1_res))

0.4049079754601227
0.7290388548057259


### 1.4 Stacking

In [201]:
base_models1 = list()
base_models1.append(('lr', lr1))
base_models1.append(('knn', knn1))
base_models1.append(('dt', dt1))

meta_learner1 = LinearRegression()
sr1 = StackingRegressor(estimators=base_models1, final_estimator=meta_learner1, cv=5)

In [202]:
X_train_sr1 = pd.DataFrame([lr1.predict(X_train), knn1.predict(X_train), dt1.predict(X_train)]).T
X_train_sr1



Unnamed: 0,0,1,2
0,3.376831,3.327586,3.219136
1,3.418306,3.396552,3.313167
2,2.880286,3.155172,2.827621
3,3.148041,3.206897,3.142578
4,3.373357,3.206897,3.635385
...,...,...,...
2928,2.894175,3.172414,3.182432
2929,3.099393,3.155172,3.313167
2930,3.162822,3.293103,3.313167
2931,3.561623,3.413793,3.635385


In [203]:
sr1.fit(X_train_sr1, y_train)
X_test_sr1 = pd.DataFrame([lr1.predict(X_test), knn1.predict(X_test), dt1.predict(X_test)]).T
y_pred_sr1 = sr1.predict(X_test_sr1)
print(accuracy(result(y_pred_sr1), y_test))
print(score(y_pred_sr1, y_test))

0.4263803680981595
0.7249488752556237




## 2 Predict Cohesion with Prediction of Vocabulary

### 2.1 Voting

linear regression:
exact accuracy = 0.33844580777096117
approximate accuracy = 

decision tree:
exact accuracy = 0.3282247765006386
approximate accuracy = 0.5900383141762452

In [256]:
y2 = df['cohesion']
X2 = df.iloc[:, 7:]
X2 = X2.drop('corrected_text', axis = 1)

X_features_vr2 = pd.DataFrame([lr1.predict(X2), knn1.predict(X2), dt1.predict(X2)]).T
y_features_vr2 = sr1.predict(X_features_vr2)
X2['vocabulary'] = y_features_vr2



In [257]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=42)
vr2 = VotingRegressor(
    estimators=[('linear regressor', lr2), ('decision tree', dt2), ('KNN', knn2)]
)

In [258]:
vr2.fit(X_train2, y_train2)
y_pred_vr2 = vr2.predict(X_test2)
print(accuracy(result(y_pred_vr2), y_test2))
print(score(y_pred_vr2, y_test2))

0.33640081799591004
0.6247443762781186


### 2.2 AdaBoost

In [207]:
ar2 = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(criterion='friedman_mse', max_depth=3, min_samples_split=2)
)
ar2.fit(X_train2, y_train2)

In [208]:
y_pred_ar2 = ar2.predict(X_test2)
print(accuracy(result(y_pred_ar2), y_test2))
print(score(y_pred_ar2, y_test2))

0.34049079754601225
0.6267893660531697


In [209]:
ar1.estimators_

[DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=446857737),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=301215708),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=742893256),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=119440548),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1507096625),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=65555849),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=393976674),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=779732698),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1435535032),
 Decision

In [210]:
ar1.estimator_weights_

array([1.49290476, 1.20815133, 0.94494894, 0.97130432, 0.91712142,
       0.87092545, 0.88733875, 0.68742175, 0.82637169, 0.73881704,
       0.92609676, 0.87947796, 0.78771169, 0.52967959, 0.59876907,
       0.6170221 , 0.81262245, 0.73103227, 0.96657774, 1.0788217 ,
       0.61318263, 0.4904077 , 0.90273235, 0.57022386, 1.03170716,
       0.85966721, 0.8399737 , 0.69625894, 0.60657307, 0.86715339,
       1.01955672, 0.75928882, 1.1335785 , 0.50616945, 0.78731618,
       0.60799294, 0.73090286, 0.84281938, 0.50301127, 0.83666541,
       0.77351769, 0.76219604, 0.90017847, 0.79975564, 0.59634408,
       1.13152158, 0.40614461, 0.80534803, 1.00067073, 0.93686305])

### 2.3 Gradient Boosting

In [217]:
gbr2 = GradientBoostingRegressor(n_estimators=100)
parameters2 = {
    'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
    'criterion':['friedman_mse', 'squared_error', 'mse']
}
gs_gbr2 = GridSearchCV(gbr1, parameters2)
gs_gbr2.fit(X_train2, y_train2)
print(gs_gbr2.best_params_)
print(gs_gbr2.best_score_)

{'criterion': 'squared_error', 'learning_rate': 0.1, 'loss': 'absolute_error'}
0.31443808963024145


In [218]:
gbr2_adj = GradientBoostingRegressor(
    criterion = 'squared_error',
    learning_rate = 0.1,
    loss = 'absolute_error',
    n_estimators=300
)
gbr2_adj.fit(X_train2, y_train2)
y_pred_gbr2_adj = gbr2_adj.predict(X_test2)

print(accuracy(result(y_pred_gbr2_adj), y_test2))
print(score(y_pred_gbr2_adj, y_test2))

0.3333333333333333
0.6298568507157464


In [220]:
gbr2_1 = GradientBoostingRegressor(n_estimators=300)
gbr2_1.fit(X_train2, y_train2)
y_pred_gbr2_1 = gbr2_1.predict(X_test2)
print(accuracy(result(y_pred_gbr2_1), y_test2))
print(score(y_pred_gbr2_1, y_test2))

0.3496932515337423
0.623721881390593


In [248]:
fi_gbr2_1 = pd.DataFrame({'feature name': gbr2_1.feature_names_in_, 'feature importance': gbr2_1.feature_importances_})
fi_gbr2_1 = fi_gbr2_1.sort_values(by='feature importance',ascending=False)
fi_gbr2_1

Unnamed: 0,feature name,feature importance
35,vocabulary,0.490194
17,Incorrect_form_ratio,0.031249
3,punctuations,0.029685
10,sentence_complexity,0.022745
1,stopwords_frequency,0.021553
28,lexrank_interquartile,0.021387
25,ttr,0.020861
29,freq_of_noun,0.020452
22,mcalpine_eflaw,0.019602
31,freq_of_pronoun,0.018946


In [253]:
X_train_gbr2_1_res = X_train2[fi_gbr2_1[fi_gbr2_1['feature importance'] > 0.01]['feature name'].values]
y_train_gbr2_1_res = y_train2
X_test_gbr2_1_res = X_test2[fi_gbr2_1[fi_gbr2_1['feature importance'] > 0.01]['feature name'].values]
y_test_gbr2_1_res = y_test2

In [254]:
gbr2_1_res = GradientBoostingRegressor()
gbr2_1_res.fit(X_train_gbr2_1_res, y_train_gbr2_1_res)
y_pred_gbr2_1_res = gbr2_1_res.predict(X_test_gbr2_1_res)
print(accuracy(result(y_pred_gbr2_1_res), y_test_gbr2_1_res))
print(score(y_pred_gbr2_1_res, y_test_gbr2_1_res))

0.3394683026584867
0.6319018404907976


### 2.4 Stacking

In [259]:
base_models2 = list()
base_models2.append(('lr', lr2))
base_models2.append(('knn', knn2))
base_models2.append(('dt', dt2))

meta_learner2 = LinearRegression()
sr2 = StackingRegressor(estimators=base_models2, final_estimator=meta_learner2, cv=5)

In [260]:
X_train_sr2 = pd.DataFrame([lr2.predict(X_train2), knn2.predict(X_train2), dt2.predict(X_train2)]).T
X_train_sr2



Unnamed: 0,0,1,2
0,3.276418,3.224138,3.178481
1,3.272916,3.241379,3.499154
2,2.901435,3.000000,2.744887
3,3.057268,3.155172,3.178481
4,3.064445,3.155172,3.148148
...,...,...,...
2928,2.869188,3.155172,3.178481
2929,3.158848,3.293103,3.178481
2930,3.197612,3.017241,3.178481
2931,3.480935,3.448276,3.499154


In [261]:
sr2.fit(X_train_sr2, y_train2)
X_test_sr2 = pd.DataFrame([lr2.predict(X_test2), knn2.predict(X_test2), dt2.predict(X_test2)]).T
y_pred_sr2 = sr2.predict(X_test_sr2)
print(accuracy(result(y_pred_sr2), y_test2))
print(score(y_pred_sr2, y_test2))

0.3333333333333333
0.6339468302658486


