In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge

In [16]:
def result(predictions):
    predictions = predictions.tolist()
    result_list=[]
    for pred in predictions:
        result = pred // 0.5 * 0.5
        if (pred - result) > 0.25:
            result += 0.5
        if result < 1.0:
            result = 1.0
        if result > 5.0:
            result = 5.0
        result_list.append(result)
    return result_list

# Accuracy score
def accuracy(Ypred, Ytrue):
    Ytrue = Ytrue.tolist()
    accurate = 0
    for i in range(len(Ytrue)):
        if Ytrue[i] == Ypred[i]:
            accurate += 1
    return accurate / len(Ytrue)

def accuracy_range(Ytrue, Ypred):
    Ytrue=Ytrue.tolist()
    accurate_range=0
    for i in range(len(Ytrue)):
        if abs(Ytrue[i] - Ypred[i])<=0.5:
            accurate_range+=1
    return accurate_range/len(Ytrue)

# Total error / total number of points => by average what's the error for each point
def error_rate(Ytrue, Ypred):
    Ytrue=Ytrue.tolist()
    error=0
    for i in range(len(Ytrue)):
        error += abs(Ytrue[i] - Ypred[i])
    return error/len(Ytrue)

In [39]:
df = pd.read_csv("/Users/lokki/Documents/GitHub/Text_Marker/Processed_Data.csv")
df = df.dropna(axis=0)
y_coh = df["cohesion"]
X = df.iloc[:, 7:]
X = X.drop("corrected_text", axis=1)
X["text_standard"].mask(df["text_standard"] == "-", 0, inplace=True)
X["verb_to_adv"].mask(np.isinf(df["verb_to_adv"]), 0, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y_coh, test_size=1/4, random_state=42)

In [43]:
from sklearn.linear_model import Ridge
params = {'alpha': [0.2, 0.5, 1, 2, 4, 10, 8, 12, 20, 30, 50]}
rr = GridSearchCV(Ridge(random_state=42), param_grid = params, scoring='r2', cv=10)
rr.fit(X_train, y_train)
rr.best_estimator_

Ridge(alpha=0.2, random_state=42)

In [44]:
pred = rr.predict(X_test)
adj_pred = result(pred)

print("accuracy range: %s \n accuracy: %s \n error rate: %s \n" 
    % (accuracy_range(y_test, pred), accuracy(adj_pred, y_test), error_rate(y_test, adj_pred)))

accuracy range: 0.6319018404907976 
 accuracy: 0.3149284253578732 
 error rate: 0.434560327198364 



### Predict the vocabulary first then predict the cohesion

In [45]:
y_vocab = df["vocabulary"]
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y_vocab, test_size=1/4, random_state=42)

In [46]:
params = {'alpha': [0.2, 0.5, 1, 2, 4, 10, 8, 12, 20, 30, 50]}
rr_vocab = GridSearchCV(Ridge(random_state=42), param_grid = params, scoring='r2', cv=10)
rr_vocab.fit(X_train1, y_train1)
rr_vocab.best_estimator_

Ridge(alpha=0.2, random_state=42)

In [53]:
# Save the model
import pickle
pickle.dump(rr_vocab, open('rr_vocab.sav', 'wb'))

In [47]:
pred_vocab = rr.predict(X_test1)
adj_pred_vocab = result(pred_vocab)

print("accuracy range: %s \n accuracy: %s \n error rate: %s \n" 
    % (accuracy_range(y_test1, pred_vocab), accuracy(adj_pred_vocab, y_test1), error_rate(y_test1, adj_pred_vocab)))

accuracy range: 0.7024539877300614 
 accuracy: 0.4059304703476483 
 error rate: 0.3696319018404908 



In [55]:
new_feature = rr_vocab.predict(X)
vocab = pd.Series(new_feature)
features = X.loc[:]
features["vocab"] = vocab
target = df["cohesion"]
X_train2, X_test2, y_train2, y_test2 = train_test_split(features, target, test_size=1/4, random_state=42)

In [56]:
params = {'alpha': [0.2, 0.5, 1, 2, 4, 10, 8, 12, 20, 30, 50]}
rr_coh = GridSearchCV(Ridge(random_state=42), param_grid = params, scoring='r2', cv=10)
rr_coh.fit(X_train2, y_train2)
rr_coh.best_estimator_

Ridge(alpha=0.5, random_state=42)

In [57]:
pred_coh = rr_coh.predict(X_test2)
adj_pred_coh = result(pred_coh)

print("accuracy range: %s \n accuracy: %s \n error rate: %s \n" 
    % (accuracy_range(y_test2, pred_coh), accuracy(adj_pred_coh, y_test2), error_rate(y_test2, adj_pred_coh)))

accuracy range: 0.6359918200408998 
 accuracy: 0.3169734151329243 
 error rate: 0.4340490797546012 



In [54]:
pickle.dump(rr_coh, open('rr_cohesion.sav', 'wb'))