In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.preprocessing as skl_pre
import sklearn.linear_model as skl_lm
import sklearn.discriminant_analysis as skl_da
import sklearn.neighbors as skl_nb
import sklearn.model_selection as skl_ms

from Preprocessing import X, Y, n_fold, cv, random_split 
trainX, trainY, testX, testY = random_split(0.8)
help(plt.plot)

In [None]:
n_fold = 10 #number of splits in KFold
Cvals = np.linspace(0.00001, 100, 1000) #Test C-values
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']

best_Cvals_lst = [] #list with best C values for each solver


for solver in solvers:
    error = [] #list with error for specific solver
    for C in Cvals:
        model = skl_lm.LogisticRegression(solver=solver,C=C)
        fold_error = [] #error list for each C value
        cv = skl_ms.KFold(n_splits=n_fold, random_state=1, shuffle=True)
        for train_index, val_index in cv.split(X):
            #Doing logisitc regression
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            Y_train, Y_val = Y.iloc[train_index], Y.iloc[val_index]
            model.fit(X_train, Y_train)
            prediction = model.predict(X_val)
            fold_error.append(np.mean(prediction != Y_val))
        error.append(np.mean(fold_error))
    index_best_C = np.argmin(error) #find in index for C value for minimum error

    best_Cvals_lst.append((solver,Cvals[index_best_C])) 

    plt.plot(Cvals, error, label = f"{solver}") #plot a curve for each solver

#print C value that gives minimum error for each solver
for solver, best_C in best_Cvals_lst:
    print(f"Best C value for {solver}: {best_C}")

plt.title('Validation error vs. Invers of regularization strength C')
plt.xlabel('Invers of regularization strength C')
plt.ylabel('Validation error')
plt.legend()
plt.show()


Following code calculate the validation error for each solver and with C=82.8828845945946

In [None]:
n_fold = 10 #number of folds
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'] #Diffrent solves to test
validation_errors = [] #list that will contain validation error for the solvers


for solver in solvers:
    val_error = [] #list with validation error for specific solver
    model = skl_lm.LogisticRegression(solver=solver, C=82.8828845945946,max_iter=10000) #model with specific solver
    cv = skl_ms.KFold(n_splits=n_fold, random_state=1, shuffle=True)
    #Train the model and calculate validation error
    for train_index, val_index in cv.split(X): 
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        Y_train, Y_val = Y.iloc[train_index], Y.iloc[val_index]
        model.fit(X_train, Y_train)
        prediction = model.predict(X_val)
        val_error.append(np.mean(prediction != Y_val))
    mean_val_error = np.mean(val_error) #Get mean validation error for specific solver
    validation_errors.append((solver, mean_val_error))

#Print validation error for each solver
for solver, error in validation_errors:
    print(f"Validation error for {solver}: {error}")
#Shows that liblinear is the best solver, for C=82.8828845945946 and else default settings.

In [None]:
#Print validation error for liblinear solver with best C
print(f"Validation error for liblinear solver with best C: {validation_errors[1][1]}")

Validation error for liblinear solver with best C: 0.8525


In [114]:
finalModel = model = skl_lm.LogisticRegression(solver=solver,C=82.8828845945946)
finalModel.fit(trainX, trainY)
# Test the final model using the test set
print('Accuracy of final logistic regression model: ', finalModel.score(testX, testY))
print('Confusion matrix:')
pd.crosstab(testY, finalModel.predict(testX), rownames=['True'], colnames=['Predicted'], margins=False)

Accuracy of final logistic regression model:  0.815625
Confusion matrix:




Predicted,high_bike_demand,low_bike_demand
True,Unnamed: 1_level_1,Unnamed: 2_level_1
high_bike_demand,18,49
low_bike_demand,10,243
