# Assingment - Logistic regression
Mikko Kettunen<br>
Last edited: 05.03.2020<br>
Mathematics and Methods in Machine Learning and Neural Networks<br>

# Background
The aim of this notebook is to produce a probability of a patient having vertebral abnormality based on the patients numerical quantities obtained by radiographic measurements.
<br>
<br>

# Data and preprocessing

In [118]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import tensorflow as tf

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# read dataset
df = pd.read_csv('column_2C.dat', sep=' ', names=['pelvic incidence', 'pelvic tilt', 'lumbar lordosis angle', 
                                                  'sacral slope', 'pelvic radius', 'grade of spondylolisthesis',
                                                 'Normal (NO) and Abnormal (AB)'])
# Shuffle data
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,pelvic incidence,pelvic tilt,lumbar lordosis angle,sacral slope,pelvic radius,grade of spondylolisthesis,Normal (NO) and Abnormal (AB)
0,51.62,15.97,35.00,35.66,129.39,1.01,NO
1,47.32,8.57,35.56,38.75,120.58,1.63,NO
2,44.55,21.93,26.79,22.62,111.07,2.65,AB
3,48.03,3.97,58.34,44.06,125.35,35.00,AB
4,91.47,24.51,84.62,66.96,117.31,52.62,AB
...,...,...,...,...,...,...,...
305,84.59,30.36,65.48,54.22,108.01,25.12,AB
306,43.20,19.66,35.00,23.54,124.85,-2.92,AB
307,50.83,9.06,56.30,41.76,79.00,23.04,AB
308,39.36,7.01,37.00,32.35,117.82,1.90,NO


In [119]:
# re-encode 'Normal (NO) and Abnormal' (AB) column
df['Normal (NO) and Abnormal (AB)'].replace(['AB','NO'], [1,0], inplace=True)
df

Unnamed: 0,pelvic incidence,pelvic tilt,lumbar lordosis angle,sacral slope,pelvic radius,grade of spondylolisthesis,Normal (NO) and Abnormal (AB)
0,51.62,15.97,35.00,35.66,129.39,1.01,0
1,47.32,8.57,35.56,38.75,120.58,1.63,0
2,44.55,21.93,26.79,22.62,111.07,2.65,1
3,48.03,3.97,58.34,44.06,125.35,35.00,1
4,91.47,24.51,84.62,66.96,117.31,52.62,1
...,...,...,...,...,...,...,...
305,84.59,30.36,65.48,54.22,108.01,25.12,1
306,43.20,19.66,35.00,23.54,124.85,-2.92,1
307,50.83,9.06,56.30,41.76,79.00,23.04,1
308,39.36,7.01,37.00,32.35,117.82,1.90,0


In [120]:
# split into explanatory and response variables 
X = df.iloc[:,:6]
Y = df.iloc[:,6]

# Divide data into training and testing sets on a 80/20 division
X, X2, Y, Y2 = train_test_split(X, Y, test_size=0.20, shuffle=False)

# Building the model and computing predictions

In [121]:
# build and fit model
reg = LogisticRegression()
reg.fit(X,Y)

# compute predicted values from training set
Y_pred = reg.predict(X)

# Results

In [122]:
# Print confusion matrix
cm = confusion_matrix(Y, Y_pred)
print("Confusion matrix:\n",cm)

# Print accuracy
accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0])
print("Accuracy calculated from the training set = %.3f" % (accuracy))

# Print classification report
print(classification_report(Y, Y_pred, target_names=['no', 'yes']))

Confusion matrix:
 [[ 56  19]
 [ 23 150]]
Accuracy calculated from the training set = 0.831
              precision    recall  f1-score   support

          no       0.71      0.75      0.73        75
         yes       0.89      0.87      0.88       173

    accuracy                           0.83       248
   macro avg       0.80      0.81      0.80       248
weighted avg       0.83      0.83      0.83       248



As can bee seen from the results above, the model predicted the right outcome with an accuracy of 83.1%.
<br>
<br>

In [123]:
# retrieve estimated probabilities (from training set)
prob = reg.predict_proba(X)
prob

array([[6.17207990e-01, 3.82792010e-01],
       [7.62285166e-01, 2.37714834e-01],
       [2.16299755e-01, 7.83700245e-01],
       [8.54162288e-02, 9.14583771e-01],
       [1.81913322e-03, 9.98180867e-01],
       [1.84434183e-01, 8.15565817e-01],
       [8.63001722e-01, 1.36998278e-01],
       [1.70382231e-02, 9.82961777e-01],
       [3.28626000e-01, 6.71374000e-01],
       [2.91945187e-01, 7.08054813e-01],
       [7.80720492e-01, 2.19279508e-01],
       [1.07936157e-03, 9.98920638e-01],
       [6.10850395e-01, 3.89149605e-01],
       [2.38668471e-01, 7.61331529e-01],
       [2.56296191e-01, 7.43703809e-01],
       [5.31036697e-01, 4.68963303e-01],
       [2.96957523e-02, 9.70304248e-01],
       [5.61285903e-01, 4.38714097e-01],
       [9.32328746e-01, 6.76712538e-02],
       [5.48596095e-01, 4.51403905e-01],
       [7.23599165e-01, 2.76400835e-01],
       [5.70087897e-01, 4.29912103e-01],
       [4.45269998e-01, 5.54730002e-01],
       [3.34718026e-06, 9.99996653e-01],
       [7.316917

# Validation

In [124]:
# cross-validate
# number of folds
k = 10
scores = cross_val_score(estimator=reg,
                        X=X2,
                        y=Y2,
                        scoring="accuracy",
                        cv=k)
print("Accuracies from %d individual folds:" % k)
print(scores)
print("Accuracy calculated using %d-fold cross validation = %.3f" % (k, scores.mean()))

Accuracies from 10 individual folds:
[0.85714286 1.         0.57142857 0.71428571 0.85714286 0.83333333
 1.         0.8        1.         1.        ]
Accuracy calculated using 10-fold cross validation = 0.863


As can be seen from the results above, the model predicted the right outcome (using validation data) with an accuracy of 86.3%.
<br>
<br>

# Making a dataframe to show the probability of a patient having vertebral abnormality

In [125]:
# Change numpy array to dataframe to make it compatible for the pd.concat function
dataset = pd.DataFrame({'Column1': prob[:, 0], 'Column2': prob[:, 1]})
print(dataset)

      Column1   Column2
0    0.617208  0.382792
1    0.762285  0.237715
2    0.216300  0.783700
3    0.085416  0.914584
4    0.001819  0.998181
..        ...       ...
243  0.096985  0.903015
244  0.191127  0.808873
245  0.193853  0.806147
246  0.054076  0.945924
247  0.000061  0.999939

[248 rows x 2 columns]


In [126]:
# merge observed, predicted values and probabilities 
# of the person having abnormality or not
df2 = pd.concat([Y, pd.Series(Y_pred), dataset['Column1'], dataset['Column2']], axis=1)
df2.columns=['observed','predicted', 'Probability NO', 'Probability AB']

# Rename the index header
df2.index.names = ['Patient number']

print(df2)

                observed  predicted  Probability NO  Probability AB
Patient number                                                     
0                      0          0        0.617208        0.382792
1                      0          0        0.762285        0.237715
2                      1          1        0.216300        0.783700
3                      1          1        0.085416        0.914584
4                      1          1        0.001819        0.998181
...                  ...        ...             ...             ...
243                    1          1        0.096985        0.903015
244                    1          1        0.191127        0.808873
245                    1          1        0.193853        0.806147
246                    1          1        0.054076        0.945924
247                    1          1        0.000061        0.999939

[248 rows x 4 columns]
