In [1]:
import pickle
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import sklearn

In [2]:
# Read the pickled vectors for both classes and convert each payload to a
# NumPy array.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point these paths at files you trust.
with open('data/pickle/vectors_neg.pkl', 'rb') as neg_file, \
        open('data/pickle/vectors_pos.pkl', 'rb') as pos_file:
    vecs_neg = np.array(pickle.load(neg_file))
    vecs_pos = np.array(pickle.load(pos_file))

In [3]:
# Stack the negative vectors first, then the positive ones, into a single
# feature matrix (one row per sample).
x = np.concatenate((vecs_neg, vecs_pos), axis=0)

# Per-feature statistics. `mean` and `std` are kept as module-level names
# because the test vectors are normalized with the same values later on.
mean = np.mean(x, axis=0)
std = np.std(x, axis=0)

# A feature that is constant across all samples has zero standard deviation;
# dividing by it would fill that column with NaN/inf. Use 1 instead so such
# columns are simply centered (they become all zeros).
std = np.where(std == 0, 1.0, std)

x = (x - mean) / std

In [4]:
# Sanity check: display the dimensions of the standardized feature matrix.
x.shape

(200000, 768)

In [5]:
# Labels follow the row order of the feature matrix: 0.0 for every negative
# sample, then 1.0 for every positive sample.
n_neg = len(vecs_neg)
y = np.zeros(n_neg + len(vecs_pos))
y[n_neg:] = 1.0

In [6]:
# Sanity check: the label vector should have one entry per row of x.
y.shape

(200000,)

In [7]:
# x and y were built from these arrays and nothing further down in the
# notebook refers to them; drop the names so the memory can be reclaimed.
del vecs_neg, vecs_pos

In [8]:
# After standardization the overall mean should be (numerically) zero.
x.mean()

-3.632742255158897e-18

In [9]:
def train_and_pred(x_train, y_train, x_test, n_estimators=1000, random_state=None):
    """Fit a random forest on the training data and predict labels for ``x_test``.

    Parameters
    ----------
    x_train
        Training features, passed unchanged to ``classifier.fit``.
    y_train
        Training labels, passed unchanged to ``classifier.fit``.
    x_test
        Features to predict, passed unchanged to ``classifier.predict``.
    n_estimators : int, default 1000
        Number of trees in the forest. The default matches the value that
        was previously hard-coded.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestClassifier``. ``None`` keeps the
        original unseeded behaviour; pass an int for reproducible results.

    Returns
    -------
    The value returned by ``classifier.predict(x_test)``.
    """
    classifier = RandomForestClassifier(
        n_jobs=-1,  # train and predict on all available cores
        verbose=True,  # emit joblib progress messages
        n_estimators=n_estimators,
        random_state=random_state,
    )
    classifier.fit(x_train, y_train)

    return classifier.predict(x_test)

def crossValidate(x, y, fold, random_state=None):
    """Run shuffled k-fold cross-validation and report accuracy.

    For every fold the model is trained via ``train_and_pred`` on the
    training split and evaluated on the held-out split. The accuracy of
    each fold is printed, followed by the pooled accuracy (total number of
    correct predictions divided by ``len(y)``).

    Parameters
    ----------
    x
        Feature matrix; indexed as ``x[indices, :]``, so it must support
        two-dimensional NumPy-style indexing.
    y
        Label vector with one entry per row of ``x``; indexed as
        ``y[indices]``.
    fold : int
        Number of folds, forwarded to ``KFold(n_splits=...)``.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the original unseeded
        behaviour; pass an int to get the same splits on every run.

    Returns
    -------
    float
        Pooled accuracy over all folds (the same value that is printed as
        "avg fold accuracy").
    """
    correct = 0
    kf = KFold(n_splits=fold, shuffle=True, random_state=random_state)

    for train_index, test_index in kf.split(y):
        test_y = y[test_index]
        pred = train_and_pred(x[train_index, :], y[train_index], x[test_index, :])

        # Count the hits once; the fold accuracy is derived from the count.
        fold_correct = accuracy_score(test_y, pred, normalize=False)
        correct += fold_correct
        print(fold_correct / len(test_y))

    # Pooled accuracy: equals the mean of the per-fold accuracies only when
    # all folds have the same size.
    overall = correct / len(y)
    print("avg fold accuracy : ", overall)
    return overall



In [10]:
# Estimate accuracy with 5-fold cross-validation over the full training set.
crossValidate(x, y, 5)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.6min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.6s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.0s
[Parallel(n_jobs=32)]: Done 1000 out of 1000 | elapsed:    1.4s finished


0.8097


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.6min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.4s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    0.9s
[Parallel(n_jobs=32)]: Done 1000 out of 1000 | elapsed:    1.2s finished


0.80915


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.6min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.6s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    0.9s
[Parallel(n_jobs=32)]: Done 1000 out of 1000 | elapsed:    1.3s finished


0.810675


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.6min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.5s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    0.9s
[Parallel(n_jobs=32)]: Done 1000 out of 1000 | elapsed:    1.1s finished


0.81145


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.6min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.6s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    1.1s


0.806975
avg fold accuracy :  0.80959


[Parallel(n_jobs=32)]: Done 1000 out of 1000 | elapsed:    1.5s finished


In [11]:
def publishResults(test_pred, file_name):
    """Write predictions to ``data/<file_name>`` as a two-column CSV.

    The file starts with the header ``Id,Prediction`` followed by one line
    per prediction. Ids are 1-based and follow the order of ``test_pred``.

    Parameters
    ----------
    test_pred
        Iterable of predictions; each one is written using its string form.
    file_name : str
        Name of the output file, appended to the ``data/`` prefix. The
        ``data`` directory must already exist.
    """
    with open('data/' + file_name, "w") as f:
        f.write("Id,Prediction\n")
        # enumerate(start=1) yields the 1-based ids directly and does not
        # require test_pred to have a length.
        for index, pred in enumerate(test_pred, start=1):
            f.write(f"{index},{pred}\n")

In [12]:
# Load the test vectors and scale them with the mean/std computed on the
# training data, so both sets share the same feature scaling.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point this path at a file you trust.
with open('data/pickle/test_data.pkl', 'rb') as f:
    test_vecs = (np.array(pickle.load(f)) - mean) / std

# Train on the complete training set and predict the test labels.
test_pred = train_and_pred(x, y, test_vecs)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  4.8min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 1000 out of 1000 | elapsed:    0.3s finished


In [13]:
# The training labels were built as 0.0 (negative) / 1.0 (positive); remap the
# predictions to -1 / 1 before writing them out.
# (presumably the format the submission expects; confirm against the task)
test_pred = [-1 if label <= 0.0 else 1 for label in test_pred]
publishResults(test_pred, "bert_rft.csv")