# Logistic Regression Exercises

## Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import preprocessing
from sklearn.datasets import load_digits
from sklearn.neighbors import KNeighborsClassifier

## Problem 1: Binary Classification
## Running Logistic Regression on the Heart Dataset
- The dataset file is uploaded with this notebook as 'D6_Heart_Dataset_2.csv'. This dataset contains numeric data only.
- Source: https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset?resource=download
- This heart disease dataset is used for binary classification. 
- The dataset contains 304 observations.
- It has 13 features and 1 class label with 0 and 1 values. 
- The attributes are discussed below.
1. Age in years
2. Gender (1 = male; 0 = female)
3. Cp chest pain type:  
   Value 1: typical angina,
   Value 2: atypical angina,
   Value 3: non-anginal pain,
   Value 4: asymptomatic
4. (trestbps) resting blood pressure (in mm Hg on admission to the hospital)
5. (chol) serum cholesterol in mg/dl
6. (fbs) (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. (restecg) resting electrocardiographic results
8. (thalach) maximum heart rate achieved
9. (exang) exercise induced angina (1 = yes; 0 = no)
10. (oldpeak) ST depression induced by exercise relative to rest
11. (slope) the slope of the peak exercise ST segment
12. (ca) number of major vessels (0-3) colored by fluoroscopy
13. (thal) 3 = normal; 6 = fixed defect; 7 = reversible defect
14. target 1 or 0 (num) (the predicted attribute) diagnosis of heart disease (angiographic disease status):
    Value 0: < 50% diameter narrowing,
    Value 1: > 50% diameter narrowing,


In [None]:
# Read the heart-disease CSV into a DataFrame and echo it as the cell output.
dataset = pd.read_csv(filepath_or_buffer='D6_Heart_Dataset_2.csv')
dataset

In [None]:
# Separate the label column from the predictor columns
target = dataset['target']
features = dataset.drop(columns=['target'])
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

### Model 1: Running Logistic Regression with standard settings
- Regularization is applied by default in LogisticRegression.

In [None]:
%%time

#Creating logistic regression object
#logistic_regression_model1 = LogisticRegression()
logistic_regression_model1 = LogisticRegression(max_iter=1000)

#Use the following statement if you want to try another solver like 'liblinear', otherwise default solver is 'lbfgs'.
#logistic_regression_model1 = LogisticRegression(solver="liblinear", random_state=0)

# Train model
model1 = logistic_regression_model1.fit(X_train, Y_train)

In [None]:
# Predict on the held-out rows and report the standard binary metrics
Y_pred_model1 = model1.predict(X_test)
print("The accuracy is ", metrics.accuracy_score(Y_test, Y_pred_model1) * 100, "%", sep="")
print(confusion_matrix(Y_test, Y_pred_model1))
for _label, _scorer in (
    ('Precision: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('F1 score: ', metrics.f1_score),
):
    print(_label, _scorer(Y_test, Y_pred_model1))

### Model 2: Running Logistic Regression with scaling (standardization)
- Both test and train sets need to be scaled.

In [None]:
#Let us see values in X_train
# (these are the raw training features produced by train_test_split, before any scaling)
X_train

In [None]:
# Learn per-feature scaling statistics from the training rows only,
# then apply that scaling to the same rows.
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(X_train)
X_train_standardized = pd.DataFrame(standard_scaler.transform(X_train))  # transform returns an array
X_train_standardized

In [None]:
#Standardizing test data
# Reuse the scaler that was already fitted on the training data. Calling
# fit_transform here would re-estimate the statistics from the test rows, so
# train and test would be scaled differently and information about the test
# set would leak into preprocessing.
X_test_standardized = pd.DataFrame(standard_scaler.transform(X_test))
X_test_standardized

In [None]:
%%time
#Creating logistic regression object
logistic_regression_model2 = LogisticRegression() #converged faster, no need to increase max_iter

#Training the model
model2 = logistic_regression_model2.fit(X_train_standardized, Y_train)

Notice the reduction in wall time with scaled data.

In [None]:
# Predict on the standardized test rows and report the binary metrics
Y_pred_model2 = model2.predict(X_test_standardized)
print("The accuracy is ", metrics.accuracy_score(Y_test, Y_pred_model2) * 100, "%", sep="")
print(confusion_matrix(Y_test, Y_pred_model2))
for _label, _scorer in (
    ('Precision: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('F1 score: ', metrics.f1_score),
):
    print(_label, _scorer(Y_test, Y_pred_model2))

In [None]:
# The main diagonal of the confusion matrix holds the correctly
# classified counts for each class.
d = confusion_matrix(Y_test, Y_pred_model2)
np.diagonal(d)

### Model 3: Running Logistic Regression with scaling (Normalization)
- Both test and train sets need to be scaled.

In [None]:
# Learn per-feature min/max from the training rows only,
# then rescale those rows with the fitted scaler.
normal_scaler = preprocessing.MinMaxScaler()
normal_scaler.fit(X_train)
X_train_normalized = pd.DataFrame(normal_scaler.transform(X_train))  # transform returns a normalized array
X_train_normalized

In [None]:
#Normalizing the test data
# Reuse the min/max learned from the training data. Calling fit_transform
# here would re-fit the scaler on the test rows, so train and test would be
# mapped with different ranges and test information would leak into
# preprocessing.
X_test_normalized = pd.DataFrame(normal_scaler.transform(X_test))  # returns normalized array
X_test_normalized

In [None]:
%%time
#Creating logistic regression object
logistic_regression_model3 = LogisticRegression() #converged faster, no need to increase max_iter

#Training the model
model3 = logistic_regression_model3.fit(X_train_normalized, Y_train)

In [None]:
# Predict on the normalized test rows and report the binary metrics
Y_pred_model3 = model3.predict(X_test_normalized)
print("The accuracy is ", metrics.accuracy_score(Y_test, Y_pred_model3) * 100, "%", sep="")
print(confusion_matrix(Y_test, Y_pred_model3))
for _label, _scorer in (
    ('Precision: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('F1 score: ', metrics.f1_score),
):
    print(_label, _scorer(Y_test, Y_pred_model3))

### Model 4: Running K-Nearest Neighbour (a non-parametric algorithm)

In [None]:
# Fit K-Nearest Neighbours with default settings on the raw training rows,
# then evaluate on the raw test rows.
model4 = KNeighborsClassifier().fit(X_train, Y_train)  # fit() returns the estimator itself
Y_pred_model4 = pd.DataFrame(data=model4.predict(X_test))
print("The accuracy is ", metrics.accuracy_score(Y_test, Y_pred_model4) * 100, "%", sep="")
print(confusion_matrix(Y_test, Y_pred_model4))
for _label, _scorer in (
    ('Precision: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('F1 score: ', metrics.f1_score),
):
    print(_label, _scorer(Y_test, Y_pred_model4))

## Problem 2: Multiclass Classification
## Running Logistic Regression on number dataset from sklearn
- This dataset is one of the toy datasets of sklearn module.
- Contains 1797 training examples.
- Each training example is an 8x8 image of a hand-written digit.
- Total classes: 10 (1 for each digit from 0 to 9)
- Samples per class: 180 (approx)
- Dimensions: 64
- For more information visit: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html

### Loading and exploring the number dataset from sklearn

In [None]:
#Loading dataset from sklearn
# (the returned object exposes .data, .target and .DESCR, which the following cells use)
digit_dataset=load_digits()

In [None]:
# List the attributes available on the dataset object, followed by its
# built-in description text.
print(dir(digit_dataset), digit_dataset.DESCR, sep="\n")

In [None]:
#Displaying features (the feature matrix, one row per sample)
digit_dataset.data

In [None]:
#Displaying target (the class label of each sample)
digit_dataset.target

In [None]:
# Take a peek at the first sample: the flat feature row is reshaped back
# into an 8x8 grid so it can be drawn as an image.
some_digit = digit_dataset.data[0]
plt.imshow(np.reshape(some_digit, (8, 8)))
plt.axis("off")
plt.show()

In [None]:
#Looks like a 0, let's check the corresponding target value
# (index 0 is the same sample that was plotted from digit_dataset.data[0])
digit_dataset.target[0]

In [None]:
# Hold out 20% of the digit samples for testing; the fixed seed keeps the
# split reproducible.
X2_train, X2_test, Y2_train, Y2_test = train_test_split(
    digit_dataset.data,
    digit_dataset.target,
    test_size=0.2,
    random_state=0,
)

### Model 5: Running Logistic Regression
- The model in Problem 2 is number 5 just to avoid name clashes with models of problem 1.

In [None]:
%%time
#Creating logistic regression object
#logistic_regression_model5 = LogisticRegression()
logistic_regression_model5 = LogisticRegression(max_iter=10000)
#Training the model
model5 = logistic_regression_model5.fit(X2_train, Y2_train)

In [None]:
# Predict the digit classes and report accuracy and the confusion matrix
Y2_pred_model5 = model5.predict(X2_test)
print("The accuracy is ", metrics.accuracy_score(Y2_test, Y2_pred_model5) * 100, "%", sep="")
c_matrix = confusion_matrix(Y2_test, Y2_pred_model5)
print(c_matrix)

# Report each metric per class (average=None) and under the three
# averaging schemes, with a blank line between metric groups.
_metric_table = (
    ('Precision: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('F1 score: ', metrics.f1_score),
)
_averaging_schemes = (
    ('', None),
    ('Micro ', 'micro'),
    ('Macro ', 'macro'),
    ('Weighted ', 'weighted'),
)
for _position, (_label, _scorer) in enumerate(_metric_table):
    if _position > 0:
        print()
    for _prefix, _average in _averaging_schemes:
        print(_prefix + _label, _scorer(Y2_test, Y2_pred_model5, average=_average))


## Problem 3: Multiclass Multioutput Classification

### Creating two labels (outputs) on same data
- Label 1: For large digits, i.e. 7 or greater (7, 8, 9)
- Label 2: For odd numbers

In [None]:
# Derive two boolean labels from the training digits:
# 'large' (digit is 7 or more) and 'odd' (digit is odd).
y2_train_large = np.array(Y2_train >= 7)
y2_train_odd = np.array(Y2_train % 2 == 1)
y2_multilabel_train = pd.DataFrame({'large': y2_train_large, 'odd': y2_train_odd})
y2_multilabel_train

In [None]:
# Derive the same two boolean labels for the test digits:
# 'large' (digit is 7 or more) and 'odd' (digit is odd).
y2_test_large = np.array(Y2_test >= 7)
y2_test_odd = np.array(Y2_test % 2 == 1)
y2_multilabel_test = pd.DataFrame({'large': y2_test_large, 'odd': y2_test_odd})
y2_multilabel_test

### Model 6: Running K-Nearest Neighbours on multiclass output

In [None]:
# Fit one K-Nearest Neighbours model on both label columns at once, then
# predict both labels for the test rows and name the resulting columns.
model6 = KNeighborsClassifier().fit(X2_train, y2_multilabel_train)  # fit() returns the estimator itself
Y2_pred_model6 = pd.DataFrame(data=model6.predict(X2_test), columns=['large', 'odd'])

In [None]:
# Confusion matrix and binary metrics for the 'large' output of model 6
c_matrix_large = confusion_matrix(y2_multilabel_test['large'], Y2_pred_model6['large'])
print(c_matrix_large)
for _label, _scorer in (
    ('Precision: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('F1 score: ', metrics.f1_score),
):
    print(_label, _scorer(y2_multilabel_test['large'], Y2_pred_model6['large']))

In [None]:
# Confusion matrix and binary metrics for the 'odd' output of model 6
c_matrix_odd = confusion_matrix(y2_multilabel_test['odd'], Y2_pred_model6['odd'])
print(c_matrix_odd)
for _label, _scorer in (
    ('Precision: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('F1 score: ', metrics.f1_score),
):
    print(_label, _scorer(y2_multilabel_test['odd'], Y2_pred_model6['odd']))

### Model 7: Running Logistic Regression on multiclass output
- Unlike K-Nearest Neighbours, logistic regression does not support multioutput classification directly.
- We will have to run it separately on each output.

In [None]:
#Running Logistic Regression separately on each output ('large' and 'odd')

# fit() returns the estimator itself, so fitting a single LogisticRegression
# object twice would leave model7_large and model7_odd pointing at the same
# model (the one trained last, on 'odd'). Each output therefore gets its own
# estimator, created with identical settings.
logistic_regression_model7 = LogisticRegression(random_state=0, max_iter=10000)
logistic_regression_model7_odd = LogisticRegression(random_state=0, max_iter=10000)
#Training one model per output
model7_large = logistic_regression_model7.fit(X2_train, y2_multilabel_train['large'])
model7_odd = logistic_regression_model7_odd.fit(X2_train, y2_multilabel_train['odd'])

In [None]:
# Predict with model7_large and score against the 'large' test labels
Y2_pred_model7 = model7_large.predict(X2_test)
print("The accuracy is ", metrics.accuracy_score(y2_multilabel_test['large'], Y2_pred_model7) * 100, "%", sep="")
c_matrix = confusion_matrix(y2_multilabel_test['large'], Y2_pred_model7)
print(c_matrix)
for _label, _scorer in (
    ('Precision: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('F1 score: ', metrics.f1_score),
):
    print(_label, _scorer(y2_multilabel_test['large'], Y2_pred_model7))

In [None]:
# Predict with model7_odd and score against the 'odd' test labels
Y2_pred_model7 = model7_odd.predict(X2_test)
print("The accuracy is ", metrics.accuracy_score(y2_multilabel_test['odd'], Y2_pred_model7) * 100, "%", sep="")
c_matrix = confusion_matrix(y2_multilabel_test['odd'], Y2_pred_model7)
print(c_matrix)
for _label, _scorer in (
    ('Precision: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('F1 score: ', metrics.f1_score),
):
    print(_label, _scorer(y2_multilabel_test['odd'], Y2_pred_model7))