In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
X_train = pd.read_csv("X_train.csv")
X_train = X_train.iloc[:, 1:]
X_test = pd.read_csv("X_test.csv")
X_test = X_test.iloc[:, 1:]
Y_train = pd.read_csv("y_train.csv")
Y_test = pd.read_csv("y_test.csv")
Y_train = Y_train['Rating as Factor'].astype('category') #factorize trainset
Y_test = Y_test['Rating as Factor'].astype('category')   #factorize testset

In [3]:
#Get predictions
def SVM_poly(X_train,Y_train):
    '''
    X_train: Training Set of X values
    Y_train: Training Set of Y values(factorized)
    '''
    
    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()), 
                     ('svm_poly', SVC(kernel='poly', random_state=0, max_iter=100000))])
    # Define parameter grid
    param_grid = {'svm_poly__C': [1000], 
                  'svm_poly__degree': [4],
                  'svm_poly__gamma': [0.05],
                  'svm_poly__coef0':[0.6]}  #Larger gridsearch yielded 0.6 to be the best coef0 with this combination. As it does not greatly change the cv accuracy(<1%) we don't include it in this grid search to lower the computing time.

    # Run grid search
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid = grid.fit(X_train, Y_train)
    return(grid)


In [4]:
#######Polynomial Kernel Function#######
print(datetime.datetime.now()) #computation time
poly = SVM_poly(X_train,Y_train)
print('Best CV accuracy: {:.2f}'.format(poly.best_score_))
print('Test score:       {:.2f}'.format(poly.score(X_test, Y_test)))
print('Best parameters: {}'.format(poly.best_params_))
print(datetime.datetime.now()) #20min

# Predict classes
y_pred = poly.predict(X_test)

# Manual confusion matrix as pandas DataFrame
confm = pd.DataFrame({'Predicted': y_pred,
                      'True': Y_test})
print('Polynomial Kernel Function yields the following confusion matrix:')
print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted')) 


2020-04-09 13:40:39.919023




Best CV accuracy: 0.95
Test score:       0.96
Best parameters: {'svm_poly__C': 1000, 'svm_poly__coef0': 0.6, 'svm_poly__degree': 4, 'svm_poly__gamma': 0.05}
2020-04-09 13:44:09.188225
Polynomial Kernel Function yields the following confusion matrix:
Predicted     0      1      2     3       4      5     6      7      8     9   \
True                                                                            
0          716.0    6.0    NaN   NaN     1.0    NaN   NaN    6.0    1.0   NaN   
1            2.0  378.0    7.0   NaN     NaN    NaN   1.0    1.0    NaN   NaN   
2            NaN    5.0  192.0   1.0     NaN    NaN   NaN    NaN    NaN   NaN   
3            NaN    NaN    NaN  59.0     NaN    NaN   NaN    NaN    NaN   NaN   
4            2.0    3.0    NaN   NaN  1042.0   12.0   NaN    2.0   27.0   NaN   
5            NaN    1.0    NaN   NaN    22.0  609.0   NaN    NaN    3.0   NaN   
6            NaN    NaN    3.0   NaN     NaN    NaN  95.0    NaN    NaN   NaN   
7            8.0    N

ROC AUC only works when translating the problem into binary using a OneVsAll approach which we don't do in our methods. Therefore we cannot compute ROC AUC curves.

In [5]:
from sklearn import metrics
print('Accuracy:', metrics.accuracy_score(Y_test, y_pred))
print('Error Rate:', 1-metrics.accuracy_score(Y_test, y_pred))
print('Confusion Matrix:')
confm = pd.DataFrame({'Predicted': y_pred,'True': Y_test})
print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted'))

Accuracy: 0.9598775926555594
Error Rate: 0.040122407344440614
Confusion Matrix:
Predicted     0      1      2     3       4      5     6      7      8     9   \
True                                                                            
0          716.0    6.0    NaN   NaN     1.0    NaN   NaN    6.0    1.0   NaN   
1            2.0  378.0    7.0   NaN     NaN    NaN   1.0    1.0    NaN   NaN   
2            NaN    5.0  192.0   1.0     NaN    NaN   NaN    NaN    NaN   NaN   
3            NaN    NaN    NaN  59.0     NaN    NaN   NaN    NaN    NaN   NaN   
4            2.0    3.0    NaN   NaN  1042.0   12.0   NaN    2.0   27.0   NaN   
5            NaN    1.0    NaN   NaN    22.0  609.0   NaN    NaN    3.0   NaN   
6            NaN    NaN    3.0   NaN     NaN    NaN  95.0    NaN    NaN   NaN   
7            8.0    NaN    NaN   NaN     4.0    NaN   NaN  754.0    9.0   NaN   
8            1.0    1.0    NaN   NaN    11.0    8.0   NaN   16.0  899.0   NaN   
9            NaN    NaN    Na

Print the precision, recall, f1-score, accuracy for each class. Support stands for number of samples for each class in Y_test. Macro avg is unweighted average between all rating groups. Weighted avg is weighted average between all rating groups. (weighted by support)

In [6]:
print(metrics.classification_report(Y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.982     0.979     0.981       731
           1      0.959     0.972     0.966       389
           2      0.950     0.970     0.960       198
           3      0.983     1.000     0.992        59
           4      0.960     0.955     0.958      1091
           5      0.953     0.949     0.951       642
           6      0.969     0.969     0.969        98
           7      0.967     0.972     0.969       776
           8      0.956     0.960     0.958       936
           9      1.000     1.000     1.000        40
          10      0.769     1.000     0.870        10
          11      1.000     0.931     0.964        29
          12      0.912     0.961     0.936        76
          13      0.934     0.930     0.932       214
          14      0.948     0.939     0.944       330
          15      0.948     0.918     0.933       219
          16      0.977     0.955     0.966        44

    accuracy              

Despite not being very visually appealing, the following package shows all possible Performance Metrics one might want to use in the multiclass classification case.

In [7]:
from pycm import *
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
Y_test_cm = np.array(Y_test)
cm = ConfusionMatrix(actual_vector=Y_test_cm, predict_vector=y_pred) # Create CM From Data
print(cm)


Predict    0          1          2          3          4          5          6          7          8          9          10         11         12         13         14         15         16         
Actual
0          716        6          0          0          1          0          0          6          1          0          0          0          0          1          0          0          0          

1          2          378        7          0          0          0          1          1          0          0          0          0          0          0          0          0          0          

2          0          5          192        1          0          0          0          0          0          0          0          0          0          0          0          0          0          

3          0          0          0          59         0          0          0          0          0          0          0          0          0          0          0          0          0      

If confusion matrix has too many zeros (sparse matrix) you can set `sparse` flag to True in printing functions otherwise by using save_csv method to save the confusion matrix in csv format you'll have better demonstration.
