In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge

In [None]:
df = pd.read_csv("./Text_Marker/Processed_Data.csv")
df = df.dropna(axis=0)
# y = df[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]
y = df["vocabulary"]
X = df.iloc[:, 7:]
X = X.drop("corrected_text", axis=1)
X["text_standard"].mask(df["text_standard"] == "-", 0, inplace=True)
X["verb_to_adv"].mask(np.isinf(df["verb_to_adv"]), 0, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4)

In [3]:
def result(predictions):
    predictions = predictions.tolist()
    result_list=[]
    for pred in predictions:
        result = pred // 0.5 * 0.5
        if (pred - result) > 0.25:
            result += 0.5
        if result < 1.0:
            result = 1.0
        if result > 5.0:
            result = 5.0
        result_list.append(result)
    return result_list

# Accuracy score
def accuracy(Ypred, Ytrue):
    Ytrue = Ytrue.tolist()
    accurate = 0
    for i in range(len(Ytrue)):
        if Ytrue[i] == Ypred[i]:
            accurate += 1
    return accurate / len(Ytrue)

# approximate accurancy rate
def score(pred, test):
    test = test.tolist()
    correct = 0
    for i in range(len(test)):
        p = pred[i]
        t = test[i]
        if p < t+0.5 and p > t-0.5:
            correct += 1
    return correct / len(test)

In [15]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(leaf_size=1, n_neighbors=29, p=2)
knn.fit(X_train, y_train)

KNeighborsRegressor(leaf_size=1, n_neighbors=29)

In [16]:
pred = knn.predict(X_test)
adj_pred = result(pred)
score(pred, y_test)
accuracy(adj_pred, y_test)


0.44171779141104295

In [17]:
# Save the model
import pickle
pickle.dump(knn, open('knn_vocab.sav', 'wb'))

In [18]:
new_feature = knn.predict(X)
vocab = pd.Series(new_feature)
features = X.loc[:]
features["vocab"] = vocab
target = y = df["cohesion"]
X_train2, X_test2, y_train2, y_test2 = train_test_split(features, target, test_size=1/4)

In [21]:
new_knn = KNeighborsRegressor(leaf_size=1, n_neighbors=29)
new_knn.fit(X_train2, y_train2)
prediction2 = new_knn.predict(X_test2)
# make prediction to the nearest 0.5 level
adj_prediction2 = result(prediction2)
score(adj_prediction2, y_test2)

0.34662576687116564

In [20]:
pickle.dump(new_knn, open('knn_cohesion.sav', 'wb'))

### Already use the tuned hyperparameters in previous part

In [12]:
# Hyperparameters Tuning
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
tune_knn = GridSearchCV(KNeighborsRegressor(), hyperparameters, cv=3, n_jobs=-1, verbose=3).fit(X_train, y_train)

Fitting 3 folds for each of 2842 candidates, totalling 8526 fits
[CV 3/3] END ..leaf_size=1, n_neighbors=1, p=1;, score=-0.266 total time=   0.1s
[CV 2/3] END ..leaf_size=1, n_neighbors=1, p=1;, score=-0.371 total time=   0.1s
[CV 1/3] END ..leaf_size=1, n_neighbors=1, p=1;, score=-0.221 total time=   0.1s
[CV 3/3] END ..leaf_size=1, n_neighbors=1, p=2;, score=-0.300 total time=   0.1s
[CV 1/3] END ..leaf_size=1, n_neighbors=1, p=2;, score=-0.202 total time=   0.1s
[CV 1/3] END ...leaf_size=1, n_neighbors=2, p=2;, score=0.060 total time=   0.0s
[CV 2/3] END ..leaf_size=1, n_neighbors=1, p=2;, score=-0.334 total time=   0.1s
[CV 2/3] END ..leaf_size=1, n_neighbors=2, p=2;, score=-0.012 total time=   0.0s
[CV 3/3] END ...leaf_size=1, n_neighbors=2, p=1;, score=0.040 total time=   0.1s
[CV 3/3] END ...leaf_size=1, n_neighbors=2, p=2;, score=0.024 total time=   0.0s
[CV 1/3] END ...leaf_size=1, n_neighbors=3, p=1;, score=0.129 total time=   0.0s
[CV 2/3] END ...leaf_size=1, n_neighbors=3, 

In [14]:
tune_knn.best_estimator_

KNeighborsRegressor(leaf_size=1, n_neighbors=29)