In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge

In [54]:
def result(predictions):
    """Snap raw regression outputs onto the 0.5-step score grid, clamped to [1.0, 5.0].

    Each value is floored to a multiple of 0.5 and moved up one step when it
    lies more than 0.25 above that multiple, so a remainder of exactly 0.25
    stays on the lower step.

    Parameters
    ----------
    predictions : object exposing ``.tolist()``
        Raw predictions, e.g. the array returned by ``model.predict``.

    Returns
    -------
    list
        One snapped score per input value, in input order.
    """
    def snap(raw):
        # Floor to the nearest lower multiple of 0.5.
        snapped = (raw // 0.5) * 0.5
        # Move up a step only when strictly more than a quarter point above it.
        if (raw - snapped) > 0.25:
            snapped = snapped + 0.5
        # Clamp to the valid score range.
        if snapped < 1.0:
            return 1.0
        if snapped > 5.0:
            return 5.0
        return snapped

    return [snap(raw) for raw in predictions.tolist()]

# Accuracy score
def accuracy(Ypred, Ytrue):
    """Return the fraction of predictions that exactly equal the true score.

    Note the argument order: predictions first, ground truth second (the
    other metric helpers in this notebook take the ground truth first).

    Both inputs are converted to plain Python lists so values are paired by
    position. Without this, indexing a pandas Series prediction with ``[i]``
    would look values up by index label instead of by position.

    Parameters
    ----------
    Ypred : array-like
        Predicted scores (list, numpy array or pandas Series).
    Ytrue : array-like
        True scores (list, numpy array or pandas Series).

    Returns
    -------
    float
        Number of exact matches divided by ``len(Ytrue)``.

    Raises
    ------
    ZeroDivisionError
        If ``Ytrue`` is empty.
    IndexError
        If ``Ypred`` has fewer elements than ``Ytrue``.
    """
    true_values = np.asarray(Ytrue).tolist()
    pred_values = np.asarray(Ypred).tolist()
    hits = sum(1 for i, truth in enumerate(true_values) if truth == pred_values[i])
    return hits / len(true_values)

def accuracy_range(Ytrue, Ypred):
    """Return the fraction of predictions within 0.5 of the true score.

    Both inputs are converted to plain Python lists so values are paired by
    position. Without this, indexing a pandas Series prediction with ``[i]``
    would look values up by index label instead of by position.

    Parameters
    ----------
    Ytrue : array-like
        True scores (list, numpy array or pandas Series).
    Ypred : array-like
        Predicted scores (list, numpy array or pandas Series).

    Returns
    -------
    float
        Count of points with ``abs(true - pred) <= 0.5`` divided by
        ``len(Ytrue)``.

    Raises
    ------
    ZeroDivisionError
        If ``Ytrue`` is empty.
    IndexError
        If ``Ypred`` has fewer elements than ``Ytrue``.
    """
    true_values = np.asarray(Ytrue).tolist()
    pred_values = np.asarray(Ypred).tolist()
    within = sum(
        1 for i, truth in enumerate(true_values)
        if abs(truth - pred_values[i]) <= 0.5
    )
    return within / len(true_values)

# Mean absolute error: total absolute error divided by the number of points, i.e. the average error per point
def error_rate(Ytrue, Ypred):
    """Return the mean absolute difference between true and predicted scores.

    Both inputs are converted to plain Python lists so values are paired by
    position. Without this, indexing a pandas Series prediction with ``[i]``
    would look values up by index label instead of by position.

    Parameters
    ----------
    Ytrue : array-like
        True scores (list, numpy array or pandas Series).
    Ypred : array-like
        Predicted scores (list, numpy array or pandas Series).

    Returns
    -------
    float
        Sum of ``abs(true - pred)`` over all points divided by ``len(Ytrue)``.

    Raises
    ------
    ZeroDivisionError
        If ``Ytrue`` is empty.
    IndexError
        If ``Ypred`` has fewer elements than ``Ytrue``.
    """
    true_values = np.asarray(Ytrue).tolist()
    pred_values = np.asarray(Ypred).tolist()
    total_error = sum(
        abs(truth - pred_values[i]) for i, truth in enumerate(true_values)
    )
    return total_error / len(true_values)

### Use 35 features to predict cohesion

In [55]:
# Load the pre-processed data and drop every row that has a missing value.
df = pd.read_csv("Processed_Data.csv")
df = df.dropna(axis=0)
# y = df[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]
y_coh = df["cohesion"]
# Features are the columns from position 7 onward, minus the "corrected_text" column.
X = df.iloc[:, 7:].drop("corrected_text", axis=1)
# Replace placeholder / infinite values with 0. Assign the masked column back
# explicitly: calling .mask(..., inplace=True) on X["col"] mutates a temporary
# selection and does not update X when pandas Copy-on-Write is active.
X["text_standard"] = X["text_standard"].mask(X["text_standard"] == "-", 0)
X["verb_to_adv"] = X["verb_to_adv"].mask(np.isinf(X["verb_to_adv"]), 0)
# Hold out a quarter of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y_coh, test_size=1/4, random_state=42)

In [56]:
from sklearn.neighbors import KNeighborsRegressor

# Hyper-parameter grid for the KNN searches:
# 49 leaf sizes x 29 neighbour counts x 2 Minkowski powers.
# NOTE(review): per the scikit-learn docs, leaf_size tunes tree build/query
# speed and memory; it probably does not change predictions, so consider
# dropping it from the grid to cut the number of fits - confirm before changing.
leaf_size = list(range(1, 50))
n_neighbors = list(range(1, 30))
p = [1, 2]
hyperparameters = {
    "leaf_size": leaf_size,
    "n_neighbors": n_neighbors,
    "p": p,
}

# 5-fold cross-validated grid search on the cohesion target, using all cores.
knn = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=hyperparameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
knn.fit(X_train, y_train)

Fitting 5 folds for each of 2842 candidates, totalling 14210 fits


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29],
                         'p': [1, 2]},
             verbose=1)

In [57]:
# Display the estimator selected by the cohesion grid search.
knn.best_estimator_

KNeighborsRegressor(leaf_size=1, n_neighbors=28)

In [32]:
# Predict cohesion on the held-out rows, then snap the predictions to the score grid.
pred = knn.predict(X_test)
adj_pred = result(pred)

# accuracy_range is computed on the raw predictions; the other two metrics
# use the snapped ones.
coh_within_half = accuracy_range(y_test, pred)
coh_exact = accuracy(adj_pred, y_test)
coh_mean_error = error_rate(y_test, adj_pred)
print(
    "accuracy range: %s \n accuracy: %s \n error rate: %s \n"
    % (coh_within_half, coh_exact, coh_mean_error)
)

accuracy range: 0.6094069529652352 
 accuracy: 0.3343558282208589 
 error rate: 0.4437627811860941 



### Predict vocabulary first, then use it as a feature to predict cohesion

In [58]:
# Target for the intermediate model: the vocabulary score.
y_vocab = df["vocabulary"]
# Same test fraction and seed as the cohesion split above.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X,
    y_vocab,
    test_size=1/4,
    random_state=42,
)

In [59]:
# 5-fold cross-validated grid search for the vocabulary target, reusing the shared grid.
knn_vocab = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=hyperparameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
knn_vocab.fit(X_train1, y_train1)

Fitting 5 folds for each of 2842 candidates, totalling 14210 fits


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29],
                         'p': [1, 2]},
             verbose=1)

In [61]:
# Display the estimator selected by the vocabulary grid search (nothing is saved in this cell).
knn_vocab.best_estimator_


KNeighborsRegressor(leaf_size=1, n_neighbors=18)

In [62]:
### Save Model
import pickle
knn_vocab_sav = KNeighborsRegressor(leaf_size=1, n_neighbors=18).fit(X_train1, y_train1)
pickle.dump(knn_vocab_sav, open('Models_sav/knn_vocab.sav', 'wb'))


In [38]:
# Add the vocabulary model's prediction as an extra feature for the cohesion model.
# NOTE(review): knn_vocab was fit on X_train1, so its predictions for those
# rows are in-sample; out-of-fold predictions would avoid that - confirm
# whether this matters for the experiment.
new_feature = knn_vocab.predict(X)
# Give the predictions X's index. X keeps the row labels that survived
# dropna(), and column assignment aligns on labels, so a default 0..n-1
# index could attach predictions to the wrong rows or leave NaNs.
vocab = pd.Series(new_feature, index=X.index)
# Work on an explicit copy so the base feature matrix X is left untouched.
features = X.copy()
features["vocab"] = vocab
target = df["cohesion"]
X_train2, X_test2, y_train2, y_test2 = train_test_split(features, target, test_size=1/4, random_state=42)

In [39]:
# 5-fold cross-validated grid search for cohesion on the vocab-augmented features.
knn_coh = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=hyperparameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
knn_coh.fit(X_train2, y_train2)

Fitting 5 folds for each of 2842 candidates, totalling 14210 fits


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29],
                         'p': [1, 2]},
             verbose=1)

In [43]:
# Display the estimator selected by the grid search on the vocab-augmented features.
knn_coh.best_estimator_

KNeighborsRegressor(leaf_size=1, n_neighbors=28)

In [40]:
# Predict cohesion with the vocab-augmented model, then snap to the score grid.
pred_coh = knn_coh.predict(X_test2)
adj_pred_coh = result(pred_coh)

# accuracy_range is computed on the raw predictions; the other two metrics
# use the snapped ones.
stacked_within_half = accuracy_range(y_test2, pred_coh)
stacked_exact = accuracy(adj_pred_coh, y_test2)
stacked_mean_error = error_rate(y_test2, adj_pred_coh)
print(
    "accuracy range: %s \n accuracy: %s \n error rate: %s \n"
    % (stacked_within_half, stacked_exact, stacked_mean_error)
)

accuracy range: 0.6094069529652352 
 accuracy: 0.3343558282208589 
 error rate: 0.4437627811860941 



In [63]:
# Refit a standalone cohesion model on the vocab-augmented features.
# The cohesion grid search reported n_neighbors=28; the previous value of 18
# was the vocabulary model's setting.
knn_coh_sav = KNeighborsRegressor(leaf_size=1, n_neighbors=28).fit(X_train2, y_train2)
# Use a context manager so the file is flushed and closed even if pickling fails.
with open('Models_sav/knn_cohesion.sav', 'wb') as model_file:
    pickle.dump(knn_coh_sav, model_file)