In [128]:
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

import pickle
import tensorflow as tf

In [4]:
def result(predictions):
    predictions = predictions.tolist()
    result_list=[]
    for pred in predictions:
        result = pred // 0.5 * 0.5
        if (pred - result) > 0.25:
            result += 0.5
        if result < 1.0:
            result = 1.0
        if result > 5.0:
            result = 5.0
        result_list.append(result)
    return result_list

# Accuracy score
def accuracy(Ypred, Ytrue):
    Ytrue = Ytrue.tolist()
    accurate = 0
    for i in range(len(Ytrue)):
        if Ytrue[i] == Ypred[i]:
            accurate += 1
    return accurate / len(Ytrue)

# approximate accurancy rate
def score(pred, test):
    test = test.tolist()
    correct = 0
    for i in range(len(test)):
        p = pred[i]
        t = test[i]
        if p < t+0.5 and p > t-0.5:
            correct += 1
    return correct / len(test)

# Ensemble Learning

For target = vocabulary & target = cohesion:

linear regressor = lr1, lr2

k neighbours regressor = knn1, knn2

decision tree = dt1, dt2

neural network = nn1, nn2

In [154]:
lr1 = pickle.load(open("lr.sav", 'rb'))
lr2 = pickle.load(open("lr_second.sav", 'rb'))
dt1 = pickle.load(open("Decision_Tree_vocab.sav", 'rb'))
dt2 = pickle.load(open("Decision_Tree_cohesion.sav", 'rb'))
knn1 = pickle.load(open("knn_vocab.sav", 'rb'))
knn2 = pickle.load(open("knn_cohesion.sav", 'rb'))

#nn1 = pickle.load(open("./neural_network_vocab.sav", 'rb'))
#nn2 = pickle.load(open("./neural_network.sav", 'rb'))
nn1 = tf.keras.models.load_model("model_deep_vocabulary")
nn2 = tf.keras.models.load_model("model_deep_cohesion")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## 1 Predict Vocabulary

In [155]:
df = pd.read_csv("Processed_Data.csv")
y = df['vocabulary']
X = df.iloc[:, 7:]
X = X.drop('corrected_text', axis = 1)

### 1.1 Voting

linear regression:
exact accuracy = 0.3987730061349693
approximate accuracy = 

decision tree:
exact accuracy = 0.4061302681992337
approximate accuracy =  0.7049808429118773

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)
vr = VotingRegressor(
    estimators=[('linear regressor', lr1), ('decision tree', dt1), ('KNN', knn1)]
)

In [157]:
vr.fit(X_train, y_train)
y_pred_vr = vr.predict(X_test)
print(accuracy(result(y_pred_vr), y_test))
print(score(y_pred_vr, y_test))

0.401840490797546
0.7208588957055214


### 1.2 AdaBoost

In [170]:
ar1 = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(criterion='friedman_mse', max_depth=4, min_samples_split=2)
)
ar1.fit(X_train, y_train)

In [175]:
y_pred_ar1 = ar1.predict(X_test)
print(accuracy(result(y_pred_ar1), y_test))
print(score(y_pred_ar1, y_test))

0.4100204498977505
0.7137014314928425


In [172]:
ar1.estimators_

[DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=529175681),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=44697635),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=763985914),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1926139716),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1461068265),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1698457272),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=676729355),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1098255066),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1447539163),
 Decis

In [173]:
ar1.estimator_weights_

array([1.7823737 , 1.19053327, 0.99432258, 1.24729975, 0.64027864,
       1.36005862, 0.55533881, 1.01139172, 0.68841491, 0.73887161,
       1.01831409, 0.83423352, 1.05956824, 0.66331197, 0.66210282,
       0.74381795, 0.6048641 , 0.5427817 , 0.81701955, 0.62435331,
       0.91666127, 0.88684738, 0.91820161, 0.65662289, 0.63365547,
       0.72864977, 0.55690124, 0.84762456, 0.60164676, 1.20740062,
       0.62109291, 0.93066639, 0.9222574 , 0.79707249, 0.73026024,
       0.56734544, 0.98520078, 1.08263001, 0.71610782, 0.581803  ,
       0.91428308, 0.99513543, 0.58781487, 0.98442389, 0.86228317,
       0.78513723, 0.49925585, 0.57556409, 0.79380271, 0.68852784])

### 1.3 Gradient Boost

Use the default decision tree in GradientBoostingRegressor.

Find best parameters for GradientBoostingRegressor.

In [95]:
gbr1 = GradientBoostingRegressor(n_estimators=300)
parameters = {
    'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
    'criterion':['friedman_mse', 'squared_error', 'mse']
}
gs_gbr1 = GridSearchCV(gbr1, parameters)
print(gs_gbr1.best_params_)
print(gs_gbr1.best_score_)

60 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\12560\anaconda3\envs\BT4222\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\12560\anaconda3\envs\BT4222\lib\site-packages\sklearn\ensemble\_gb.py", line 577, in fit
    self._check_params()
  File "C:\Users\12560\anaconda3\envs\BT4222\lib\site-packages\sklearn\ensemble\_gb.py", line 270, in _check_params
    check_scalar(
  File "C:\Users\12560\anaconda3\envs\BT4222\lib\site-packages\sklearn\utils\validation.py", line 1480, in check_scalar
    raise ValueErr

{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'huber'}
0.369309112331468


In [100]:
gbr1_adj = GradientBoostingRegressor(
    criterion = 'friedman_mse',
    learning_rate = 0.1,
    loss = 'huber',
    n_estimators=300
)
gbr1_adj.fit(X_train, y_train)
y_pred_gbr1_adj = gbr1_adj.predict(X_test)

print(accuracy(result(y_pred_gbr1_adj), y_test))
print(score(y_pred_gbr1_adj, y_test))

0.39775051124744376
0.7085889570552147


In [106]:
gbr1_1 = GradientBoostingRegressor(n_estimators=300)
gbr1_1.fit(X_train, y_train)
y_pred_gbr1_1 = gbr1_1.predict(X_test)
print(accuracy(result(y_pred_gbr1_1), y_test))
print(score(y_pred_gbr1_1, y_test))

0.4059304703476483
0.7198364008179959


Compare the default regressor and the fine tuned one. The default regressor works better.

Look at the feature importance.

In [110]:
fi_gbr1_1 = pd.DataFrame([gbr1_1.feature_names_in_, gbr1_1.feature_importances_]).T
fi_gbr1_1 = fi_gbr1_1.sort_values(by=1,ascending=False)
fi_gbr1_1.head(10)

Unnamed: 0,0,1
17,Incorrect_form_ratio,0.365601
23,number_of_diff_words,0.183728
26,coherence_score,0.038382
24,freq_diff_words,0.034787
31,freq_of_pronoun,0.022882
25,ttr,0.019596
27,lexrank_avg_min_diff,0.018763
0,number_of_words,0.01869
11,freq_of_wrong_words,0.018025
20,dale_chall_readability_score,0.017734


In [119]:
X_train_gbr1_1_res = X_train[['Incorrect_form_ratio','number_of_diff_words']]
y_train_gbr1_1_res = y_train
X_test_gbr1_1_res = X_test[['Incorrect_form_ratio','number_of_diff_words']]
y_test_gbr1_1_res = y_test

In [120]:
gbr1_1_res = GradientBoostingRegressor()
gbr1_1_res.fit(X_train_gbr1_1_res, y_train_gbr1_1_res)
y_pred_gbr1_1_res = gbr1_1_res.predict(X_test_gbr1_1_res)
print(accuracy(result(y_pred_gbr1_1_res), y_test_gbr1_1_res))
print(score(y_pred_gbr1_1_res, y_test_gbr1_1_res))

0.40286298568507156
0.7249488752556237


### 1.4 Stacking

In [158]:
base_models1 = list()
base_models1.append(('lr', lr1))
base_models1.append(('knn', knn1))
base_models1.append(('dt', dt1))

meta_learner1 = LinearRegression()
sr1 = StackingRegressor(estimators=base_models1, final_estimator=meta_learner1, cv=5)

In [159]:
X_train_sr1 = pd.DataFrame([lr1.predict(X_train), knn1.predict(X_train), dt1.predict(X_train)]).T
X_train_sr1



Unnamed: 0,0,1,2
0,3.370360,3.327586,3.219136
1,3.409681,3.396552,3.313167
2,2.880778,3.155172,2.827621
3,3.172333,3.206897,3.142578
4,3.400355,3.206897,3.635385
...,...,...,...
2928,2.930220,3.172414,3.182432
2929,3.125897,3.155172,3.313167
2930,3.167199,3.293103,3.313167
2931,3.544529,3.413793,3.635385


In [160]:
sr1.fit(X_train_sr1, y_train)
X_test_sr1 = pd.DataFrame([lr1.predict(X_test), knn1.predict(X_test), dt1.predict(X_test)]).T
y_pred_sr1 = sr1.predict(X_test_sr1)
print(accuracy(result(y_pred_sr1), y_test))
print(score(y_pred_sr1, y_test))

0.4284253578732106
0.7280163599182005




## 2 Predict Cohesion with Prediction of Vocabulary

### 2.1 Voting

linear regression:
exact accuracy = 0.33844580777096117
approximate accuracy = 

decision tree:
exact accuracy = 0.3282247765006386
approximate accuracy = 0.5900383141762452

In [167]:
y2 = df['cohesion']
X2 = df.iloc[:, 7:]
X2 = X2.drop('corrected_text', axis = 1)

X_features_vr2 = pd.DataFrame([lr1.predict(X2), knn1.predict(X2), dt1.predict(X2)]).T
y_features_vr2 = sr1.predict(X_features_vr2)
X2['vocabulary'] = y_features_vr2



In [168]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=42)
vr2 = VotingRegressor(
    estimators=[('linear regressor', lr2), ('decision tree', dt2), ('KNN', knn2)]
)

In [169]:
vr2.fit(X_train2, y_train2)
y_pred_vr2 = vr2.predict(X_test2)
print(accuracy(result(y_pred_vr2), y_test2))
print(score(y_pred_vr2, y_test2))

0.3343558282208589
0.6298568507157464


### 2.2 AdaBoost

In [174]:
ar2 = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(criterion='friedman_mse', max_depth=3, min_samples_split=2)
)
ar2.fit(X_train2, y_train2)

In [177]:
y_pred_ar2 = ar2.predict(X_test2)
print(accuracy(result(y_pred_ar2), y_test2))
print(score(y_pred_ar2, y_test2))

0.33844580777096117
0.623721881390593


In [178]:
ar1.estimators_

[DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=529175681),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=44697635),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=763985914),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1926139716),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1461068265),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1698457272),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=676729355),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1098255066),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=4,
                       random_state=1447539163),
 Decis

In [179]:
ar1.estimator_weights_

array([1.7823737 , 1.19053327, 0.99432258, 1.24729975, 0.64027864,
       1.36005862, 0.55533881, 1.01139172, 0.68841491, 0.73887161,
       1.01831409, 0.83423352, 1.05956824, 0.66331197, 0.66210282,
       0.74381795, 0.6048641 , 0.5427817 , 0.81701955, 0.62435331,
       0.91666127, 0.88684738, 0.91820161, 0.65662289, 0.63365547,
       0.72864977, 0.55690124, 0.84762456, 0.60164676, 1.20740062,
       0.62109291, 0.93066639, 0.9222574 , 0.79707249, 0.73026024,
       0.56734544, 0.98520078, 1.08263001, 0.71610782, 0.581803  ,
       0.91428308, 0.99513543, 0.58781487, 0.98442389, 0.86228317,
       0.78513723, 0.49925585, 0.57556409, 0.79380271, 0.68852784])

### 2.3 Gradient Boosting

In [None]:
gbr2 = GradientBoostingRegressor(n_estimators=100)
parameters2 = {
    'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
    'criterion':['friedman_mse', 'squared_error', 'mse']
}
gs_gbr2 = GridSearchCV(gbr1, parameter2)
print(gs_gbr2.best_params_)
print(gs_gbr2.best_score_)

In [None]:
gbr2_adj = GradientBoostingRegressor(
    criterion = 'friedman_mse',
    learning_rate = 0.1,
    loss = 'huber',
    n_estimators=300
)
gbr2_adj.fit(X_train2, y_train2)
y_pred_gbr2_adj = gbr2_adj.predict(X_test2)

print(accuracy(result(y_pred_gbr2_adj), y_test2))
print(score(y_pred_gbr2_adj, y_test2))

In [None]:
gbr2_1 = GradientBoostingRegressor(n_estimators=100)
gbr2_1.fit(X_train2, y_train2)
y_pred_gbr2_1 = gbr2_1.predict(X_test2)
print(accuracy(result(y_pred_gbr2_1), y_test2))
print(score(y_pred_gbr2_1, y_test2))

In [None]:
fi_gbr2_1 = pd.DataFrame([gbr2_1.feature_names_in_, gbr2_1.feature_importances_]).T
fi_gbr2_1 = fi_gbr2_1.sort_values(by=1,ascending=False)
fi_gbr2_1.head(10)

In [None]:
X_train_gbr2_1_res = X_train2[['Incorrect_form_ratio','number_of_diff_words']]
y_train_gbr2_1_res = y_train2
X_test_gbr2_1_res = X_test2[['Incorrect_form_ratio','number_of_diff_words']]
y_test_gbr2_1_res = y_test2

In [None]:
gbr2_1_res = GradientBoostingRegressor()
gbr2_1_res.fit(X_train_gbr2_1_res, y_train_gbr2_1_res)
y_pred_gbr2_1_res = gbr2_1_res.predict(X_test_gbr2_1_res)
print(accuracy(result(y_pred_gbr2_1_res), y_test_gbr2_1_res))
print(score(y_pred_gbr2_1_res, y_test_gbr2_1_res))

### 2.4 Stacking

In [180]:
base_models2 = list()
base_models2.append(('lr', lr2))
base_models2.append(('knn', knn2))
base_models2.append(('dt', dt2))

meta_learner2 = LinearRegression()
sr2 = StackingRegressor(estimators=base_models2, final_estimator=meta_learner2, cv=5)

In [181]:
X_train_sr2 = pd.DataFrame([lr2.predict(X_train2), knn2.predict(X_train2), dt2.predict(X_train2)]).T
X_train_sr2

ValueError: X has 36 features, but KNeighborsRegressor is expecting 35 features as input.

In [None]:
sr2.fit(X_train_sr2, y_train2)
X_test_sr2 = pd.DataFrame([lr2.predict(X_test2), knn2.predict(X_test2), dt2.predict(X_test2)]).T
y_pred_sr2 = sr2.predict(X_test_sr2)
print(accuracy(result(y_pred_sr2), y_test2))
print(score(y_pred_sr2, y_test2))