# OvR

This notebook tries to create a baseline for the task. Noteworthy points:

-Using LRAP loss metric in cross-validation in Model 2. 

-Best score is around 0.3 in Model 2. It seems low but I feel this task is quite difficult so maybe very high scores are difficult.

-Balancing cross-validation using n_jobs and pre_dispatch in order to preserve maximum CPU usage but prevent memory from blowing up.

In [54]:
import pandas as pd
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.metrics import label_ranking_average_precision_score,make_scorer

# Pre-process

In [26]:
#Load the previously transformed data.
NUM_FEATURES = 5000
NUM_CLASSES = 3993
features = pd.read_csv("../data/expanded/train_features.csv", names=range(NUM_FEATURES))
labels = pd.read_csv("../data/expanded/train_labels.csv", names=range(NUM_CLASSES))

In [105]:
#Split into test and train
X_train,y_train = features.iloc[:10000,:],labels.iloc[:10000,:]
X_test,y_test = features.iloc[1000:,:],labels.iloc[1000:,:]

In [106]:
#Shuffle data randomly
X_train,y_train = shuffle(X_train,y_train,random_state = 5)

In [10]:
# Centering the data (by just subtracting the mean)
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)

In [11]:
# Arbitarily chosen (for now). THIS IS A HYPER-PARAMETER!
#n_components = 40

#pca = PCA(n_components=n_components)
#X_train = pca.fit_transform(X_train)
#X_train = pd.DataFrame(X_train)

#Apply equivalent transformation to test data.
#X_test = scaler.transform(X_test)
#X_test = pca.transform(X_test)

# Model 1

In [42]:
#Training takes around 30min on my machine. 5000 classes  = 5000 seperate models in one-vs-rest scheme so not too efficient.
model = OneVsRestClassifier(LinearSVC(loss = 'hinge',tol = np.exp(-3), random_state=0),n_jobs = -1).fit(X_train,y_train)

In [43]:
y_pred = model.predict(X_test.iloc[:100,:])
#np.nonzero(y_pred)

In [45]:
score = label_ranking_average_precision_score(y_test.iloc[:100,:], y_pred)
score

0.11932993611836587

In [46]:
np.nonzero(y_pred[0,:])


(array([   3,    8,  172,  891, 1812], dtype=int64),)

In [47]:
np.nonzero(y_test.iloc[0,:].values)

(array([  3,   8, 658, 891, 902], dtype=int64),)

# Model 2

In [93]:
#Training takes around 30min on my machine. 5000 classes  = 5000 seperate models in one-vs-rest scheme so not too efficient.

model = OneVsRestClassifier(LinearSVC(tol = np.exp(-3), random_state=0),n_jobs = -1).fit(X_train,y_train)

In [110]:
#classifier = OneVsRestClassifier(SVC(tol = np.exp(-3), random_state=0))
model = OneVsRestClassifier(LinearSVC(tol = np.exp(-3), random_state=0),n_jobs =-1)
custom_scorer = make_scorer(label_ranking_average_precision_score)

parameters = {
    "estimator__loss":['hinge','squared_hinge'],
    "estimator__C": [0.001,0.01,0.1,1]}
     

grid = GridSearchCV(model, param_grid=parameters,
                             scoring=custom_scorer,cv=2,n_jobs = -1,pre_dispatch = 1)
            

grid.fit(X_train,y_train)

GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=OneVsRestClassifier(estimator=LinearSVC(C=1.0,
                                                               class_weight=None,
                                                               dual=True,
                                                               fit_intercept=True,
                                                               intercept_scaling=1,
                                                               loss='squared_hinge',
                                                               max_iter=1000,
                                                               multi_class='ovr',
                                                               penalty='l2',
                                                               random_state=0,
                                                               tol=0.049787068367863944,
                                                               ve

In [114]:
grid.cv_results_

{'mean_fit_time': array([255.69439769, 254.15666366, 247.67789364, 240.46170056,
        240.99707687, 240.7784766 , 241.14050972, 241.06294298]),
 'std_fit_time': array([3.15262604, 1.25932622, 8.22004986, 1.80250061, 2.17292058,
        1.54014885, 1.3164922 , 0.59134769]),
 'mean_score_time': array([146.48480499, 146.62636364, 142.61903417, 135.33090889,
        134.67668939, 135.19280529, 135.29620957, 134.75686789]),
 'std_score_time': array([0.84751546, 0.7098552 , 8.3423568 , 0.53756869, 1.42698932,
        0.47986221, 0.78837252, 0.01386571]),
 'param_estimator__C': masked_array(data=[0.001, 0.001, 0.01, 0.01, 0.1, 0.1, 1, 1],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_estimator__loss': masked_array(data=['hinge', 'squared_hinge', 'hinge', 'squared_hinge',
                    'hinge', 'squared_hinge', 'hinge', 'squared_hinge'],
              mask=[False, False, False, False, False, Fal

In [115]:
grid.best_params_

{'estimator__C': 0.01, 'estimator__loss': 'hinge'}

In [116]:
grid.best_score_

0.23875351847880935