In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
X_train = pd.read_csv("X_train.csv")
X_train = X_train.iloc[:, 1:]
X_test = pd.read_csv("X_test.csv")
X_test = X_test.iloc[:, 1:]
Y_train = pd.read_csv("y_train.csv")
Y_test = pd.read_csv("y_test.csv")
Y_train = Y_train['Rating as Factor'].astype('category') #factorize trainset
Y_test = Y_test['Rating as Factor'].astype('category')   #factorize testset

In [3]:
def LogReg(X_train, Y_train):
    '''
    X_train: Training Set of X values
    Y_train: Training Set of Y values(factorized)
    '''
    lor = LogisticRegression(max_iter=100, tol=0.001,random_state=1, n_jobs=-1,solver='saga',warm_start=True)

    pipe = Pipeline([('scaler', StandardScaler()), 
                     ('logreg', lor)])

    param_grid = {'logreg__penalty': ['elasticnet'], #elastic nets combines l1&l2
                  'logreg__C':[6,6.5,7,7.5,8],
                  'logreg__l1_ratio':[0,0.05,0.1,0.15,0.2,1]} #if 0, or 1 then l2 or l1 would be best. If between then the combination of both

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid = grid.fit(X_train,Y_train)
    
    return(grid)

In [4]:
import datetime
print(datetime.datetime.now())
grid = LogReg(X_train,Y_train)
print('Best parameters:', grid.best_params_) #best parameters are C=7 & ratio=0 -> l2 penalty function
print('Best CV accuracy:', grid.best_score_)
print('Test score:', grid.score(X_test,Y_test)) #30%
print(datetime.datetime.now()) #took my computer 5minutes

2020-04-02 19:16:12.846335
Best parameters: {'logreg__C': 7, 'logreg__l1_ratio': 0, 'logreg__penalty': 'elasticnet'}
Best CV accuracy: 0.3027588539829543
Test score: 0.31009860591635496
2020-04-02 19:22:40.639718




In [3]:
def LogReg(X_train, Y_train):
    '''
    X_train: Training Set of X values
    Y_train: Training Set of Y values(factorized)
    '''
    lor = LogisticRegression(max_iter=100, tol=0.001,random_state=1, n_jobs=-1,solver='saga',warm_start=True)

    pipe = Pipeline([('scaler', StandardScaler()), 
                     ('logreg', lor)])

    param_grid = {'logreg__penalty': ['elasticnet'], #elastic nets combines l1&l2
                  'logreg__C':[6.5,7,7.5],
                  'logreg__l1_ratio':[0,0.1,0.2,1]} #if 0, or 1 then l2 or l1 would be best. If between then the combination of both

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid = grid.fit(X_train,Y_train)
    
    return(grid)

In [4]:
import datetime
print(datetime.datetime.now())
grid = LogReg(X_train,Y_train)
print('Best parameters:', grid.best_params_) #best parameters are C=7 & ratio=0 -> l2 penalty function
print('Best CV accuracy:', grid.best_score_)
print('Test score:', grid.score(X_test,Y_test)) #30%
print(datetime.datetime.now()) #took my computer 5minutes

2020-04-03 09:20:26.180019
Best parameters: {'logreg__C': 7, 'logreg__l1_ratio': 0, 'logreg__penalty': 'elasticnet'}
Best CV accuracy: 0.3103246674943647
Test score: 0.3151989119347161
2020-04-03 09:22:48.130018




In [7]:
# Predict classes
y_pred = grid.predict(X_test)

# Manual confusion matrix as pandas DataFrame
confm = pd.DataFrame({'Predicted': y_pred,
                      'True': Y_test})
print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted')) 

Predicted     0      1     2     3      4      5     6      7      8    11  \
True                                                                         
0          241.0   35.0   9.0   NaN  216.0   12.0   NaN   71.0  143.0  NaN   
1           60.0  127.0  23.0   NaN   68.0   17.0   4.0   15.0   72.0  NaN   
2           84.0   20.0  24.0   NaN   20.0    2.0   2.0    NaN   45.0  NaN   
3           19.0   16.0   4.0  16.0    NaN    NaN   3.0    NaN    1.0  NaN   
4           94.0   50.0   3.0   1.0  631.0   41.0   1.0   44.0  182.0  NaN   
5           65.0   25.0   8.0   2.0  278.0  110.0   4.0   16.0  100.0  NaN   
6           10.0   11.0   9.0   5.0   20.0    NaN  21.0    7.0   15.0  NaN   
7           93.0   25.0   5.0   NaN  282.0    6.0   2.0  211.0  148.0  NaN   
8           72.0   47.0   6.0   2.0  389.0   34.0   NaN   76.0  285.0  NaN   
9           23.0    NaN   NaN   NaN    NaN    NaN   NaN   10.0    7.0  NaN   
10           NaN    NaN   NaN   NaN    3.0    NaN   NaN    NaN  