In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge

In [2]:
# Load the pre-processed dataset and discard every row that has a missing value.
df = pd.read_csv("processed_data.csv")
df = df.dropna(axis=0)

# Multi-output alternative, kept for reference:
# y = df[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]
y = df["vocabulary"]

# Features are every column from position 7 onward, minus "corrected_text".
X = df.iloc[:, 7:].drop("corrected_text", axis=1)

# Replace placeholder / non-finite entries with 0.
# The result is assigned back to the column instead of calling
# `X[col].mask(..., inplace=True)`: the in-place form mutates a temporary
# Series, which leaves X unchanged when pandas Copy-on-Write is active
# (and raises a chained-assignment warning on pandas 2.x).
X["text_standard"] = X["text_standard"].mask(X["text_standard"] == "-", 0)
X["verb_to_adv"] = X["verb_to_adv"].mask(np.isinf(X["verb_to_adv"]), 0)
# NOTE(review): if "text_standard" was read as strings because of the "-"
# placeholder, it is still object dtype here -- confirm it converts cleanly
# to numeric before fitting.

# Fixed seed so the 75/25 split (and the scores below) are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/4, random_state=42
)

In [3]:
def result(predictions):
    """Snap each prediction to a 0.5 step and clamp it to the range [1.0, 5.0].

    A value is rounded down to the 0.5 step at or below it, then bumped up by
    0.5 only when it lies strictly more than 0.25 above that step.

    Args:
        predictions: Object exposing ``.tolist()`` that yields numeric values.

    Returns:
        list: One adjusted value per input prediction, in the same order.
    """
    def _snap(value):
        lower_step = value // 0.5 * 0.5
        # Strict ">" means an offset of exactly 0.25 stays on the lower step.
        snapped = lower_step + 0.5 if (value - lower_step) > 0.25 else lower_step
        if snapped < 1.0:
            return 1.0
        if snapped > 5.0:
            return 5.0
        return snapped

    return [_snap(value) for value in predictions.tolist()]

# Accuracy score
def accuracy(Ypred, Ytrue):
    """Return the fraction of positions where the prediction equals the label.

    Args:
        Ypred: Indexable collection of predictions, looked up by 0..n-1.
        Ytrue: Object exposing ``.tolist()`` that yields the true labels.

    Returns:
        float: Number of exact matches divided by the number of labels.
    """
    truth = Ytrue.tolist()
    total = len(truth)
    hits = sum(1 for idx in range(total) if truth[idx] == Ypred[idx])
    return hits / total

# Approximate accuracy rate
def score(pred, test):
    """Return the fraction of predictions strictly within 0.5 of their label.

    Args:
        pred: Indexable collection of predictions, looked up by 0..n-1.
        test: Object exposing ``.tolist()`` that yields the true labels.

    Returns:
        float: Count of positions with ``label - 0.5 < prediction < label + 0.5``
        divided by the number of labels.
    """
    labels = test.tolist()
    n_labels = len(labels)
    # Both bounds are exclusive: a prediction exactly 0.5 away does not count.
    within = sum(
        1
        for idx in range(n_labels)
        if labels[idx] - 0.5 < pred[idx] < labels[idx] + 0.5
    )
    return within / n_labels

In [4]:
from sklearn.neighbors import KNeighborsRegressor

# First-stage model: 5-nearest-neighbour regressor (Minkowski p=2) fitted on
# the training split with the vocabulary score as target.
knn = KNeighborsRegressor(n_neighbors=5, p=2).fit(X_train, y_train)
knn  # fit() hands back the estimator; keep it as the cell's displayed value

KNeighborsRegressor()

In [5]:
# Evaluate the vocabulary model on the held-out split.
pred = knn.predict(X_test)
# Snap the raw predictions to 0.5 steps, clamped to [1.0, 5.0].
adj_pred = result(pred)

# Only a cell's last expression is displayed, so the tolerance metric is
# printed explicitly; as a bare expression its value would be discarded.
print(f"within-0.5 rate (raw predictions): {score(pred, y_test)}")

# Exact-match rate of the snapped predictions (shown as the cell output).
accuracy(adj_pred, y_test)


0.3946830265848671

In [6]:
# Save the model
import pickle

# The context manager flushes and closes the file even if pickling fails;
# a bare open(...) passed straight to pickle.dump is never closed.
with open('knn_vocab.sav', 'wb') as model_file:
    pickle.dump(knn, model_file)

In [7]:
# Second-stage feature set: the original features plus the first-stage
# model's vocabulary prediction as an extra "vocab" column.
# NOTE(review): knn was fitted on X_train, which is a subset of X, so the
# predictions for those rows are in-sample -- confirm this leakage is acceptable.
new_feature = knn.predict(X)

# Give the predictions X's own index. A default RangeIndex would not match
# X's labels once dropna() has removed rows, and column assignment aligns by
# label, so values would land on the wrong rows or become NaN.
vocab = pd.Series(new_feature, index=X.index)

# Explicit copy so adding the column can never write through to X.
features = X.copy()
features["vocab"] = vocab

# NOTE: this rebinds `y` from the vocabulary target to the cohesion target.
target = y = df["cohesion"]

# Fixed seed so the 75/25 split is reproducible.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    features, target, test_size=1/4, random_state=42
)

In [8]:
# Second-stage model: 5-nearest-neighbour regressor on the vocab-augmented
# features, with the cohesion score as target.
new_knn = KNeighborsRegressor(n_neighbors=5, p=2).fit(X_train2, y_train2)
prediction2 = new_knn.predict(X_test2)

# Snap the raw predictions to the nearest 0.5 level before scoring.
adj_prediction2 = result(prediction2)

# NOTE(review): score() is applied to the snapped predictions here, whereas
# the vocabulary evaluation applies it to the raw predictions -- confirm
# which of the two is intended.
score(adj_prediction2, y_test2)

0.3149284253578732

In [None]:
# Persist the cohesion model. This must be new_knn (fitted on the cohesion
# target); dumping knn would store the vocabulary model under the cohesion
# file name. The context manager closes the file handle once the dump finishes.
with open('knn_cohesion.sav', 'wb') as model_file:
    pickle.dump(new_knn, model_file)