In [1]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Feature matrices: everything except the first CSV column
# (presumably a saved row index -- TODO confirm against the files).
X_train = pd.read_csv("X_train.csv").iloc[:, 1:]
X_test = pd.read_csv("X_test.csv").iloc[:, 1:]

# Targets: only the 'Rating as Factor' column, converted to a categorical Series.
Y_train = pd.read_csv("y_train.csv")['Rating as Factor'].astype('category')
Y_test = pd.read_csv("y_test.csv")['Rating as Factor'].astype('category')

In [3]:
def LogReg(X_train, Y_train, param_grid=None, cv=5, max_iter=100):
    '''
    Grid-search an elastic-net logistic regression on standardized features.

    The features are passed through a StandardScaler and then a
    LogisticRegression (saga solver) inside one Pipeline, and that pipeline
    is tuned with GridSearchCV.

    X_train: Training Set of X values
    Y_train: Training Set of Y values(factorized)
    param_grid: optional GridSearchCV parameter grid for the pipeline; keys
        use the step prefix 'logreg__'. When None, the default grid below
        (elastic-net penalty, C in 6..8, l1_ratio in 0..0.2 plus 1) is used.
    cv: cross-validation setting forwarded to GridSearchCV (default 5).
    max_iter: iteration cap forwarded to LogisticRegression (default 100).

    Returns the fitted GridSearchCV object.
    '''
    if param_grid is None:
        param_grid = {
            'logreg__penalty': ['elasticnet'],  # elastic net combines l1 & l2
            'logreg__C': [6, 6.5, 7, 7.5, 8],
            # l1_ratio 0 means pure l2 and 1 means pure l1; values in
            # between use a mix of both penalties.
            'logreg__l1_ratio': [0, 0.05, 0.1, 0.15, 0.2, 1],
        }

    # NOTE(review): warm_start probably has no effect during the search,
    # since GridSearchCV fits fresh clones of the estimator -- confirm.
    lor = LogisticRegression(
        max_iter=max_iter,
        tol=0.001,
        random_state=1,
        n_jobs=-1,
        solver='saga',
        warm_start=True,
    )

    # The scaler is a pipeline step, so it is fitted together with the
    # classifier on whatever data each fit call receives.
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('logreg', lor)])

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv, n_jobs=-1)
    grid.fit(X_train, Y_train)

    return grid

In [4]:
# Wall-clock timestamps are printed before and after the search so the
# run time can be read off the cell output.
print(datetime.datetime.now())
grid = LogReg(X_train, Y_train)
print('Best parameters:', grid.best_params_)  # recorded run: C=7, l1_ratio=0 -> pure l2 penalty
print('Best CV accuracy:', grid.best_score_)
print('Test score:', grid.score(X_test, Y_test))  # recorded run: ~0.31
print(datetime.datetime.now())  # recorded run: about 6.5 minutes between the two timestamps

2020-04-02 19:16:12.846335
Best parameters: {'logreg__C': 7, 'logreg__l1_ratio': 0, 'logreg__penalty': 'elasticnet'}
Best CV accuracy: 0.3027588539829543
Test score: 0.31009860591635496
2020-04-02 19:22:40.639718


