In [1]:
import pickle
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import sklearn

In [2]:
# Load the two pickled collections of feature vectors and turn each into an ndarray.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
with open('data/pickle/vectors_neg.pkl', 'rb') as neg_file:
    vecs_neg = np.array(pickle.load(neg_file))

with open('data/pickle/vectors_pos.pkl', 'rb') as pos_file:
    vecs_pos = np.array(pickle.load(pos_file))

In [3]:
# Stack both sets of vectors row-wise, then standardize every column to
# zero mean / unit variance. `mean` and `std` are kept as globals because
# the test vectors are later transformed with the same statistics.
# NOTE(review): the statistics are computed on the full data set, so the
# cross-validation folds below are scaled with information from their own
# held-out rows.
x = np.concatenate((vecs_neg, vecs_pos), axis = 0)
mean = np.mean(x, axis=0)
std = np.std(x, axis=0)
# A constant column has std == 0 and would turn into NaN/inf after the
# division; scale such columns by 1 instead (they become all zeros).
std = np.where(std == 0, 1.0, std)
x = (x - mean) / std

In [4]:
# Sanity check: dimensions of the standardized feature matrix.
x.shape

(200000, 768)

In [5]:
# Build the label vector in the same order the rows of x were stacked:
# 0.0 for every row coming from vecs_neg, 1.0 for every row from vecs_pos.
neg_labels = np.zeros(len(vecs_neg))
pos_labels = np.ones(len(vecs_pos))
y = np.concatenate((neg_labels, pos_labels))

In [6]:
# Sanity check: length of the label vector.
y.shape

(200000,)

In [7]:
# The raw vectors are no longer referenced once x and y exist; drop the names
# so the arrays can be garbage-collected.
del vecs_neg, vecs_pos

In [8]:
# Sanity check: the overall mean of the standardized matrix.
np.mean(x)

-3.632742255158897e-18

In [9]:
def train_and_pred(x_train, y_train, x_test, alpha=None):
    """Fit a ridge classifier on the training data and predict labels for x_test.

    Parameters
    ----------
    x_train : array-like
        Training feature matrix passed to ``RidgeClassifier.fit``.
    y_train : array-like
        Training labels passed to ``RidgeClassifier.fit``.
    x_test : array-like
        Feature matrix to predict on.
    alpha : float, optional
        Regularization strength for ``RidgeClassifier``. When omitted, the
        notebook-level global ``ridge_param`` is used, which is how existing
        callers select the value.

    Returns
    -------
    The result of ``classifier.predict(x_test)``.
    """
    if alpha is None:
        # Legacy path: the hyper-parameter is supplied through a global that
        # must have been assigned before this function is called.
        alpha = ridge_param

    classifier = RidgeClassifier(alpha=alpha)
    classifier.fit(x_train, y_train)

    return classifier.predict(x_test)

def crossValidate(x, y, fold, random_state=42):
    """Run shuffled k-fold cross-validation with ``train_and_pred`` and report accuracy.

    Prints the accuracy of every fold followed by the accuracy pooled over
    all held-out predictions (total correct / total samples).

    Parameters
    ----------
    x : ndarray
        Feature matrix, indexed as ``x[rows, :]``.
    y : ndarray
        Label vector with one entry per row of ``x``.
    fold : int
        Number of splits handed to ``KFold``.
    random_state : int or None, optional
        Seed for the fold shuffle. A fixed seed makes repeated calls use the
        same folds, so scores obtained with different hyper-parameters are
        directly comparable. Pass ``None`` for an unseeded shuffle.

    Returns
    -------
    float
        The pooled accuracy that is printed on the last line.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not contain the same number of samples.
    """
    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same number of samples "
            "(got {} and {})".format(len(x), len(y))
        )

    correct_total = 0
    kf = KFold(n_splits=fold, shuffle=True, random_state=random_state)

    for train_index, test_index in kf.split(y):
        train_x = x[train_index, :]
        train_y = y[train_index]
        test_x = x[test_index, :]
        test_y = y[test_index]

        pred = train_and_pred(train_x, train_y, test_x)

        # Count the hits once and derive the per-fold ratio from that count.
        fold_correct = accuracy_score(test_y, pred, normalize=False)
        correct_total += fold_correct
        print(fold_correct / len(test_y))

    overall_accuracy = correct_total / len(y)
    print("avg fold accuracy : ", overall_accuracy)
    return overall_accuracy



In [10]:
# Try several ridge regularization strengths with 5-fold cross-validation.
# train_and_pred reads the global `ridge_param`, so it is rebound before
# every run.
ridge_param = 0.0
candidate_alphas = [0.1, 0.2, 1.0, 1.5, 2.0, 2.5, 5.0]
for alpha in candidate_alphas:
    ridge_param = alpha
    print(
        "=====================================",
        "alpha = " + str(ridge_param),
        sep="\n",
    )
    crossValidate(x, y, 5)

alpha = 0.1
0.821525
0.81865
0.818975
0.8215
0.82335
avg fold accuracy :  0.8208
alpha = 0.2
0.818125
0.820625
0.8226
0.821125
0.821875
avg fold accuracy :  0.82087
alpha = 1.0
0.822275
0.82035
0.82175
0.818925
0.8218
avg fold accuracy :  0.82102
alpha = 1.5
0.820825
0.82135
0.820075
0.8205
0.822025
avg fold accuracy :  0.820955
alpha = 2.0
0.823225
0.8235
0.819575
0.817975
0.820225
avg fold accuracy :  0.8209
alpha = 2.5
0.8227
0.8221
0.822175
0.821475
0.818525
avg fold accuracy :  0.821395
alpha = 5.0
0.822225
0.81955
0.81915
0.8219
0.8203
avg fold accuracy :  0.820625


In [11]:
def publishResults(test_pred, file_name):
    """Write predictions to ``data/<file_name>`` as a two-column CSV.

    The file starts with the header ``Id,Prediction``; every following row
    pairs a running id (starting at 1) with the matching entry of
    ``test_pred``.
    """
    with open('data/' + file_name, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            f"{row_id!s},{label!s}\n"
            for row_id, label in enumerate(test_pred, start=1)
        )

In [12]:
# Refit on the complete training set with the alpha that had the highest
# pooled accuracy in the sweep above, then predict the held-out test vectors.
ridge_param = 2.5

# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
with open('data/pickle/test_data.pkl', 'rb') as f:
    test_vecs = pickle.load(f)

# Apply the same standardization (training mean/std) that was applied to x.
test_vecs = (np.array(test_vecs) - mean) / std
test_pred = train_and_pred(x, y, test_vecs)

In [13]:
test_pred = [-1 if x <= 0.0 else 1 for x in test_pred]
publishResults(test_pred, "bert_ridge.csv")