# OvR

In [1]:
import pandas as pd
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.metrics import label_ranking_average_precision_score,make_scorer
import random
from sklearn.neighbors import KNeighborsClassifier

# Load data

In [2]:
# Load the previously transformed feature and label matrices.
# Columns are named 0..N-1 through `names`, so every row of each file is read as data.
NUM_FEATURES = 5000
NUM_CLASSES = 3993

feature_columns = range(NUM_FEATURES)
label_columns = range(NUM_CLASSES)

# Training split.
x_train = pd.read_csv("../data/expanded/train_features.csv", names=feature_columns)
y_train = pd.read_csv("../data/expanded/train_labels.csv", names=label_columns)

# Validation (dev) split.
x_val = pd.read_csv("../data/expanded/dev_features.csv", names=feature_columns)
y_val = pd.read_csv("../data/expanded/dev_labels.csv", names=label_columns)

# Model 1

Best model in this notebook; it achieves a label ranking average precision of around 0.3 on the validation set.

In [None]:
# Training takes around 30 min on the author's machine: the one-vs-rest scheme fits one
# separate binary LinearSVC per label column of y_train (NUM_CLASSES = 3993), so it is not efficient.
# NOTE(review): tol=np.exp(-4) is about 0.018, not 1e-4 — confirm the looser tolerance is intended.
base_svm = LinearSVC(tol=np.exp(-4), random_state=0, dual=False)
model = OneVsRestClassifier(base_svm, n_jobs=-1)
model.fit(x_train, y_train)

In [None]:
# Predict the validation labels with the fitted one-vs-rest model.
y_pred = model.predict(x_val)

In [None]:
# Label ranking average precision of the predictions against the validation labels.
# NOTE(review): y_pred comes from model.predict rather than continuous scores such as
# model.decision_function(x_val); a ranking metric may be better served by scores — TODO confirm.
score = label_ranking_average_precision_score(y_val,y_pred)
score

In [None]:
# Persist the Model 1 validation predictions as comma-separated text.
np.savetxt("../public_data/svm.csv", y_pred, delimiter=",")

# Model 2 : Pegasos

In [None]:

def pegasos_fast(x_train, y_train, max_epoch, lam, watch_list=None, grad_checking=False, tfidf=False, min_loss=0.01):
    """
    Description
    ==========
    Pegasos algorithm (stochastic sub-gradient descent on the hinge loss) for a
    linear SVM. The weight vector is stored as ``s * w`` so that the
    regularization shrink at each step only updates the scalar ``s`` instead of
    rescaling every coordinate of ``w``.

    Samples are visited in a freshly shuffled order every epoch, using the
    global ``random`` module state (call ``random.seed`` first for
    reproducible runs).

    Input
    =====
    x_train: 2-D numpy array
    one sample per row, indexed as x_train[i, :]

    y_train: 1-D array-like
    one label per sample; the hinge-loss update expects labels in {-1, +1}

    max_epoch: int
    number of full passes over the data (stopping condition)

    lam: float
    regularization parameter

    watch_list, grad_checking, tfidf, min_loss:
    unused; kept only so existing callers keep working

    Output
    ======
    weights: 1-D numpy array of length x_train.shape[1]
    """

    #Initialization
    n_features = x_train.shape[1]
    w = np.zeros(n_features)
    epoch = 0
    # t starts at 1 and is incremented before use, so the first shrink factor
    # (1 - 1/t) is 0.5 rather than 0.
    t = 1.
    samples = list(range(len(x_train)))
    s = 1.

    #Loop
    while epoch < max_epoch:

        random.shuffle(samples)
        # Iterate over the shuffled indices; looping over range(len(x_train))
        # here would ignore the shuffle and visit samples in a fixed order.
        for i in samples:
            t += 1
            lr = 1 / (t * lam)
            # Margin under the current weights (s * w), before this step's shrink.
            margin = y_train[i] * s * np.dot(w, x_train[i, :])

            # Regularization shrink, applied to the scale only.
            s = s * (1 - lr * lam)
            if s == 0:
                # Scale collapsed: the true weights are all zero, so restart the
                # representation from (s=1, w=0).
                s = 1
                w = np.zeros(n_features)

            if margin < 1:
                # Hinge-loss sub-gradient step, divided by s because w is stored unscaled.
                w = w + x_train[i, :] * ((1 / s) * lr * y_train[i])

        epoch += 1

    return w * s

In [None]:
# Binary problem for a single label (column 3), remapped from {0, 1} to {-1, +1}
# as the hinge-loss update in pegasos_fast expects.
# np.where builds a new array: y_train.values[:, 3] can be a view into the DataFrame's
# data, so an in-place `y_single[y_single == 0] = -1` could silently rewrite y_train
# for every later cell (and would not be idempotent on re-run).
label_column = y_train.values[:, 3]
y_single = np.where(label_column == 0, -1, label_column)

x_single = x_train.values

In [None]:
# Profile one Pegasos run (4 epochs, lam=0.1) on the single-label problem.
%prun pegasos_fast(x_single,y_single,4,0.1)

In [None]:
def pegasos_loss(x,y,weight):
    """Return the fraction of rows of ``x`` whose ``svm_predict`` output differs from ``y``."""
    n_samples = len(x)
    mistakes = sum(
        1
        for idx in range(n_samples)
        if svm_predict(x[idx, :], weight) != y[idx]
    )
    return mistakes / n_samples

In [None]:
def svm_predict(review_X, weight):
    """Classify one sample: 1 if its dot product with ``weight`` is strictly positive, else -1."""
    decision_value = np.dot(review_X, weight)
    return 1 if decision_value > 0 else -1

# Training Pegasos - multiclass

In [None]:
# Plain numpy arrays for the Pegasos loops. The commented-out slice restricts
# training to the first 1000 label columns.
y_train_temp = y_train.values#[:,:1000]
x_train_temp = x_train.values

In [None]:
# One weight column per label: w[:, i] holds the Pegasos solution for label column i.
w = np.zeros((x_train.shape[1],y_train.values.shape[1]))

for i in range(y_train_temp.shape[1]):
    if i % 50==0:
        print(i)
    # pegasos_fast's hinge-loss update needs labels in {-1, +1}. With raw 0 labels the
    # margin is always 0 and the step is scaled by 0, so negatives never move the weights.
    y_signed = np.where(y_train_temp[:,i] > 0, 1, -1)
    w[:,i] = pegasos_fast(x_train_temp, y_signed, 5, 0.1)

In [None]:
def multi_class_predict(x,w):
    """Return the matrix product of ``x`` and ``w`` (one score per sample/label pair when
    ``x`` is samples-by-features and ``w`` is features-by-labels)."""
    return np.matmul(x, w)

In [None]:
# Raw linear scores for every validation sample and label (not thresholded).
y_pred = multi_class_predict(x_val.values,w)

In [None]:
# Label ranking average precision of the Pegasos scores; y_val is cut to the first
# y_train.values.shape[1] label columns before scoring.
score = label_ranking_average_precision_score(y_val.values[:,:y_train.values.shape[1]],y_pred)
score

In [None]:
# Number of label columns in the training set.
y_train.values.shape[1]

# RF

Memory usage explodes on a local machine. Andrew ran it in the lab, and it did not perform well.

In [None]:
# Random forest limited to max_samples=5000 and a single job (n_jobs=1).
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_samples = 5000,n_jobs=1)

In [None]:
# Fit the forest on the full multi-label training set.
rf.fit(x_train,y_train)

In [None]:
# Display the forest's predictions for the validation features (the result is not stored).
rf.predict(x_val)

# KNN

Not that good; the best result is around 0.2 on validation.

In [None]:
# Baseline KNN: 10 neighbours with the Hamming distance.
neigh = KNeighborsClassifier(n_neighbors=10, metric = 'hamming')

In [None]:
# Fit the KNN model on the multi-label training set.
neigh.fit(x_train,y_train)

In [None]:
# Predict the validation labels with the fitted KNN model (rebinds y_pred).
y_pred = neigh.predict(x_val)

In [None]:
# Sweep over neighbour counts and distance metrics: fit a KNN model for each
# combination, score it on the validation set, and collect the scores in
# pred_scores in loop order (neighbour count outer, metric inner).
# NOTE(review): 'kulsinski' may no longer be accepted by recent SciPy/scikit-learn
# releases — confirm against the installed versions.
neigbhours = [3]
metrics = ['jaccard','matching','dice','kulsinski','rogerstanimoto','russellrao']
pred_scores = []

for n in neigbhours:
    for metric in metrics:
        print(metric)
        # neigh, y_pred and score are rebound on every iteration, so after the
        # loop they hold the values from the last combination.
        neigh = KNeighborsClassifier(n_neighbors=n, metric = metric)
        neigh.fit(x_train,y_train)
        y_pred = neigh.predict(x_val)
        score = label_ranking_average_precision_score(y_val,y_pred)
        print(score)
        pred_scores.append(score)

In [None]:
# Scores from the sweep, in the order the combinations were tried.
pred_scores

In [None]:
# Same display of pred_scores again.
pred_scores

# Best is 3 neighbours with the jaccard metric

In [None]:
# KNN with 3 neighbours and the Jaccard distance.
neigh = KNeighborsClassifier(n_neighbors=3, metric = 'jaccard')

In [None]:
# Fit on the training set and predict the validation labels (rebinds y_pred).
neigh.fit(x_train,y_train)
y_pred = neigh.predict(x_val)

In [None]:
# Label ranking average precision of the KNN predictions on the validation set.
score = label_ranking_average_precision_score(y_val,y_pred)
score

In [None]:
# Persist the KNN validation predictions as comma-separated text.
np.savetxt("../public_data/knn.csv", y_pred, delimiter=",")

In [None]:
# Reload the saved predictions as a DataFrame with columns named 0..NUM_CLASSES-1
# (rebinds y_pred from an array to a DataFrame).
y_pred = pd.read_csv("../public_data/knn.csv", names=range(NUM_CLASSES))

In [None]:
# Overwrite the same file through DataFrame.to_csv, whose defaults add a header row
# and an index column.
# NOTE(review): after this, re-running the read_csv cell will parse the header and
# index as data — confirm the overwrite is intended.
y_pred.to_csv("../public_data/knn.csv")

# xgboost

In [4]:
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

In [5]:
# One-vs-rest wrapper around an XGBClassifier with default settings.
clf_multilabel = OneVsRestClassifier(XGBClassifier())

In [None]:
# Fit the one-vs-rest XGBoost model on the multi-label training set.
clf_multilabel.fit(x_train,y_train)