# Multiclass Logistic Regression Classifier

In [30]:
# Libraries
import numpy as np
import pandas as pd
import cv2
import os
import matplotlib.pyplot as plt

In [31]:
# Get directories & labels
train_dir = "C:/Users/bodin/School/MATH 5680/Projec/tumor_images/Training"
test_dir = "C:/Users/bodin/School/MATH 5680/Projec/tumor_images/Testing"

labels = ["glioma_tumor","meningioma_tumor","no_tumor","pituitary_tumor"]

In [32]:
# Create list of data
train_img_list = []
test_img_list = []

test_label_list = []
train_label_list = []

# Training dataset
for label in labels:
    for img_file in os.listdir(train_dir + '/' + label):
        img = cv2.imread(train_dir + '/' + label + '/' + img_file)
        img = cv2.resize(img, (200,200))
        train_img_list.append(img)
        train_label_list.append(label)

# Testing dataset
for label in labels:
    for img_file in os.listdir(test_dir + '/' + label):
        img = cv2.imread(test_dir + '/' + label + '/' + img_file)
        img = cv2.resize(img, (200,200))
        test_img_list.append(img)
        test_label_list.append(label)

In [33]:
# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

train_label_list = label_encoder.fit_transform(train_label_list)
test_label_list = label_encoder.fit_transform(test_label_list)

# Convert images to int64 array
train_img_list = np.array(train_img_list).astype(np.int64)
test_img_list = np.array(test_img_list).astype(np.int64)

In [34]:
# Value Counts for each type of tumor
"""
Pituitary = 3
Glioma = 0
Meningioma = 1
None = 2
"""
pd.Series(train_label_list).value_counts()

3    827
0    826
1    822
2    395
dtype: int64

In [35]:
# Convert images from (200, 200, 3) to column vector (120000)
X_train = train_img_list.reshape(len(train_img_list), -1)
X_test = test_img_list.reshape(len(test_img_list), -1)
Y_train = train_label_list
Y_test = test_label_list

In [36]:
# Standardize values between 0-1
X_train = X_train / 255
X_test = X_test / 255

In [37]:
# Find best model using cross validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore convergence warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

"""
Log. Reg. parameters to alter:

penalty: Specify the norm of the penalty.
{‘l1’, ‘l2’, ‘elasticnet’, None}, default=’l2’

tol: Tolerance for stopping criteria
float, default=1e-4

C: Inverse of regularization strength
float, default=1.0

multi_class: {‘auto’, ‘ovr’, ‘multinomial’}, default=’auto’
"""
# Paramters to test
param_grid = {'penalty': ['l1', 'l2', 'elasticnet', 'None'], 
              'tol': [1e-4, 1e-6, 1e-2], 
              'multi_class': ['ovr', 'multinomial']}

# Build model and search for best params
lr = LogisticRegression()
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, Y_train)

# Print the best parameters and corresponding accuracy score
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy Score: {:.4f}".format(grid_search.best_score_))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ............multi_class=ovr, penalty=l1, tol=0.0001; total time=   3.0s
[CV] END ............multi_class=ovr, penalty=l1, tol=0.0001; total time=   0.9s
[CV] END ............multi_class=ovr, penalty=l1, tol=0.0001; total time=   0.4s
[CV] END ............multi_class=ovr, penalty=l1, tol=0.0001; total time=   0.4s
[CV] END ............multi_class=ovr, penalty=l1, tol=0.0001; total time=   0.4s
[CV] END .............multi_class=ovr, penalty=l1, tol=1e-06; total time=   0.4s
[CV] END .............multi_class=ovr, penalty=l1, tol=1e-06; total time=   0.4s
[CV] END .............multi_class=ovr, penalty=l1, tol=1e-06; total time=   0.4s
[CV] END .............multi_class=ovr, penalty=l1, tol=1e-06; total time=   0.4s
[CV] END .............multi_class=ovr, penalty=l1, tol=1e-06; total time=   0.4s
[CV] END ..............multi_class=ovr, penalty=l1, tol=0.01; total time=   0.4s
[CV] END ..............multi_class=ovr, penalty

[CV] END multi_class=multinomial, penalty=elasticnet, tol=0.01; total time=   0.3s
[CV] END multi_class=multinomial, penalty=elasticnet, tol=0.01; total time=   0.3s
[CV] END multi_class=multinomial, penalty=elasticnet, tol=0.01; total time=   0.3s
[CV] END multi_class=multinomial, penalty=elasticnet, tol=0.01; total time=   0.3s
[CV] END multi_class=multinomial, penalty=elasticnet, tol=0.01; total time=   0.3s
[CV] END ..multi_class=multinomial, penalty=None, tol=0.0001; total time=   0.3s
[CV] END ..multi_class=multinomial, penalty=None, tol=0.0001; total time=   0.3s
[CV] END ..multi_class=multinomial, penalty=None, tol=0.0001; total time=   0.3s
[CV] END ..multi_class=multinomial, penalty=None, tol=0.0001; total time=   0.3s
[CV] END ..multi_class=multinomial, penalty=None, tol=0.0001; total time=   0.3s
[CV] END ...multi_class=multinomial, penalty=None, tol=1e-06; total time=   0.3s
[CV] END ...multi_class=multinomial, penalty=None, tol=1e-06; total time=   0.3s
[CV] END ...multi_

90 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\bodin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\bodin\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\bodin\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

----------------------------

Best Parameters:  {'multi_class': 'ovr', 'penalty': 'l2', 'tol': 0.0001}
Best Accuracy Score: 0.6087


In [43]:
lr = LogisticRegression(multi_class = 'ovr', 
                        penalty = 'l2', 
                        tol = 0.0001, 
                        max_iter = 100)
lr.fit(X_train, Y_train)

In [46]:
# Evaluation
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)

y_pred = lr.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {accuracy}\n')
print(f"Confusion Matrix: \n", confusion_matrix(Y_test, y_pred), "\n")
print(classification_report(Y_test, y_pred))

Accuracy: 0.7385786802030457

Confusion Matrix: 
 [[ 26  21  44   9]
 [  1 111   3   0]
 [  0   0 105   0]
 [  0  14  11  49]] 

              precision    recall  f1-score   support

           0       0.96      0.26      0.41       100
           1       0.76      0.97      0.85       115
           2       0.64      1.00      0.78       105
           3       0.84      0.66      0.74        74

    accuracy                           0.74       394
   macro avg       0.80      0.72      0.70       394
weighted avg       0.80      0.74      0.70       394

