In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge

In [11]:
df = pd.read_csv("/Users/lokki/Documents/GitHub/Text_Marker/Processed_Data.csv")
df = df.dropna(axis=0)
# y = df[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]
y = df["vocabulary"]
X = df.iloc[:, 7:]
X = X.drop("corrected_text", axis=1)
X["text_standard"].mask(df["text_standard"] == "-", 0, inplace=True)
X["verb_to_adv"].mask(np.isinf(df["verb_to_adv"]), 0, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4)

In [12]:
rr = Ridge(random_state=42)
def result(predictions):
    predictions = predictions.tolist()
    result_list=[]
    for pred in predictions:
        result = pred // 0.5 * 0.5
        if (pred - result) > 0.25:
            result += 0.5
        if result < 1.0:
            result = 1.0
        if result > 5.0:
            result = 5.0
        result_list.append(result)
    return result_list

# Accuracy score
def accuracy(Ypred, Ytrue):
    Ytrue = Ytrue.tolist()
    accurate = 0
    for i in range(len(Ytrue)):
        if Ytrue[i] == Ypred[i]:
            accurate += 1
    return accurate / len(Ytrue)

# approximate accurancy rate
def score(pred, test):
    test = test.tolist()
    correct = 0
    for i in range(len(test)):
        p = pred[i]
        t = test[i]
        if p < t+0.5 and p > t-0.5:
            correct += 1
    return correct / len(test)

In [58]:
from sklearn.linear_model import Ridge
rr = Ridge(random_state=42, alpha=10).fit(X_train, y_train)

In [59]:
pred = rr.predict(X_test)
adj_pred = result(pred)
score(pred, y_test)
accuracy(adj_pred, y_test)

0.4335378323108384

In [60]:
# Save the model
import pickle
pickle.dump(rr, open('rr_vocab.sav', 'wb'))

In [61]:
new_feature = rr.predict(X)
vocab = pd.Series(new_feature)
features = X.loc[:]
features["vocab"] = vocab
target = y = df["cohesion"]
X_train2, X_test2, y_train2, y_test2 = train_test_split(features, target, test_size=1/4)

In [68]:
new_knn = Ridge(random_state=42, alpha=0.2)
new_knn.fit(X_train2, y_train2)
prediction2 = new_knn.predict(X_test2)
# make prediction to the nearest 0.5 level
adj_prediction2 = result(prediction2)
score(adj_prediction2, y_test2)

0.3374233128834356

In [67]:
pickle.dump(rr, open('rr_cohesion.sav', 'wb'))

### Hyperparamters Tuning

In [56]:
params = {'alpha': [0.2, 0.5, 1, 2, 4, 10, 8, 12, 20, 30, 50]}
ridge_grid = GridSearchCV(Ridge(random_state=42), param_grid = params, scoring='accuracy', verbose=3)
ridge_grid.fit(X_train, y_train)


Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5] END ...........................alpha=0.2;, score=nan total time=   0.0s
[CV 2/5] END ...........................alpha=0.2;, score=nan total time=   0.0s
[CV 3/5] END ...........................alpha=0.2;, score=nan total time=   0.0s
[CV 4/5] END ...........................alpha=0.2;, score=nan total time=   0.0s
[CV 5/5] END ...........................alpha=0.2;, score=nan total time=   0.0s
[CV 1/5] END ...........................alpha=0.5;, score=nan total time=   0.0s
[CV 2/5] END ...........................alpha=0.5;, score=nan total time=   0.0s
[CV 3/5] END ...........................alpha=0.5;, score=nan total time=   0.0s
[CV 4/5] END ...........................alpha=0.5;, score=nan total time=   0.0s
[CV 5/5] END ...........................alpha=0.5;, score=nan total time=   0.0s
[CV 1/5] END .............................alpha=1;, score=nan total time=   0.0s
[CV 2/5] END .............................alpha=

Traceback (most recent call last):
  File "/opt/anaconda3/envs/py39/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/anaconda3/envs/py39/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/anaconda3/envs/py39/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/opt/anaconda3/envs/py39/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 211, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/opt/anaconda3/envs/py39/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 104, in _check_targets
    raise ValueError("{0} is not supported".format(y_type))
ValueError: continuous is not supported

Traceback (most recent call last):
  File "/opt/anaconda3/envs/p

[CV 1/5] END ............................alpha=30;, score=nan total time=   0.0s
[CV 2/5] END ............................alpha=30;, score=nan total time=   0.0s
[CV 3/5] END ............................alpha=30;, score=nan total time=   0.0s
[CV 4/5] END ............................alpha=30;, score=nan total time=   0.0s
[CV 5/5] END ............................alpha=30;, score=nan total time=   0.0s
[CV 1/5] END ............................alpha=50;, score=nan total time=   0.0s
[CV 2/5] END ............................alpha=50;, score=nan total time=   0.0s
[CV 3/5] END ............................alpha=50;, score=nan total time=   0.0s
[CV 4/5] END ............................alpha=50;, score=nan total time=   0.0s
[CV 5/5] END ............................alpha=50;, score=nan total time=   0.0s


Traceback (most recent call last):
  File "/opt/anaconda3/envs/py39/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/anaconda3/envs/py39/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/anaconda3/envs/py39/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/opt/anaconda3/envs/py39/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 211, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/opt/anaconda3/envs/py39/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 104, in _check_targets
    raise ValueError("{0} is not supported".format(y_type))
ValueError: continuous is not supported

Traceback (most recent call last):
  File "/opt/anaconda3/envs/p

GridSearchCV(estimator=Ridge(random_state=42),
             param_grid={'alpha': [0.2, 0.5, 1, 2, 4, 10, 8, 12, 20, 30, 50]},
             scoring='accuracy', verbose=3)

In [57]:
ridge_grid.best_estimator_

Ridge(alpha=0.2, random_state=42)