# Logistic Regression Implementation

In [14]:
from sklearn.datasets import load_iris

In [15]:
dataset = load_iris()

In [16]:
dataset

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [17]:
print(dataset.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

In [18]:
dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [19]:
import pandas as pd
import numpy as np

In [20]:
df = pd.DataFrame(dataset.data, columns = dataset.feature_names)

In [21]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [22]:
df['target'] = dataset.target

In [23]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [26]:
df.shape

(150, 5)

In [24]:
# Binary classification, so we remove the target = 2 rows now.

df1 = df[df['target'] != 2]

In [27]:
df1.shape

(100, 5)

# Separate independent and dependent features

In [28]:
x = df1.iloc[:, :-1]
y = df1.iloc[:, -1]

# Train test split

In [31]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
classifier = LogisticRegression()

In [33]:
classifier.fit(x_train, y_train)

# Prediction

In [34]:
y_pred= classifier.predict(x_test)

In [43]:
classifier.predict_proba(x_test)    # gives probablities of both the classes.

array([[0.00144397, 0.99855603],
       [0.01745215, 0.98254785],
       [0.00359633, 0.99640367],
       [0.96426787, 0.03573213],
       [0.93474767, 0.06525233],
       [0.96669519, 0.03330481],
       [0.99197521, 0.00802479],
       [0.03413528, 0.96586472],
       [0.97021318, 0.02978682],
       [0.97500261, 0.02499739],
       [0.94874463, 0.05125537],
       [0.95431635, 0.04568365],
       [0.00502564, 0.99497436],
       [0.98309085, 0.01690915],
       [0.01045247, 0.98954753],
       [0.97787998, 0.02212002],
       [0.00251112, 0.99748888],
       [0.00154214, 0.99845786],
       [0.96994373, 0.03005627],
       [0.95517443, 0.04482557],
       [0.00835383, 0.99164617],
       [0.02377754, 0.97622246],
       [0.95342874, 0.04657126],
       [0.98084186, 0.01915814],
       [0.0288064 , 0.9711936 ],
       [0.97695936, 0.02304064],
       [0.97877889, 0.02122111],
       [0.01705948, 0.98294052],
       [0.96439212, 0.03560788],
       [0.0018038 , 0.9981962 ],
       [0.

In [36]:
x_test.shape

(33, 4)

In [38]:
y_pred    # for 33 x_test datapoints

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1])

# Confusion matrix, accuracy score classification report

In [40]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [42]:
print("Confusion matrix", confusion_matrix(y_test, y_pred))

print("Accuracy score", accuracy_score(y_test, y_pred))

print("Classification report", classification_report(y_test, y_pred))

Confusion matrix [[19  0]
 [ 0 14]]
Accuracy score 1.0
Classification report               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        14

    accuracy                           1.00        33
   macro avg       1.00      1.00      1.00        33
weighted avg       1.00      1.00      1.00        33



# Hyperparamter tuning

# 1.) GridSearchCV

In [44]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [45]:
parameters= {'penalty': ('l1', 'l2', 'elasticnet', None), 'C': [1,10,20]}

In [46]:
clf = GridSearchCV(classifier, param_grid = parameters, cv = 5)

# Splitting of train data into Validation data

In [47]:
clf.fit(x_train, y_train)

In [48]:
clf.best_params_

{'C': 1, 'penalty': 'l2'}

In [49]:
classifier = LogisticRegression(C=1, penalty = 'l2')

In [50]:
classifier.fit(x_train, y_train)

# Prediction after Hyperparameter tuning

In [52]:
# Now repeat the same steps as before to check Performance metric sof the ML model
y_pred = classifier.predict(x_test)

print("Confusion matrix", confusion_matrix(y_test, y_pred))

print("Accuracy score", accuracy_score(y_test, y_pred))

print("Classification report", classification_report(y_test, y_pred))

Confusion matrix [[19  0]
 [ 0 14]]
Accuracy score 1.0
Classification report               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        14

    accuracy                           1.00        33
   macro avg       1.00      1.00      1.00        33
weighted avg       1.00      1.00      1.00        33



# 2.) RandomizedSearchCV

In [53]:
from sklearn.model_selection import RandomizedSearchCV

In [54]:
random_clf = RandomizedSearchCV(LogisticRegression(), param_distributions= parameters, cv= 5)

In [57]:
random_clf.fit(x_train, y_train)

In [58]:
random_clf.best_params_   # by default, n_iter = 10

{'penalty': 'l2', 'C': 20}

In [61]:
random_clf = LogisticRegression(penalty = 'l2' , C= 20)

In [62]:
random_clf.fit(x_train, y_train)

# Perforamance metrics after Hyperparameter tuning

In [64]:
# Now repeat the same steps as before to check Performance metric sof the ML model
y_pred = random_clf.predict(x_test)

print("Confusion matrix", confusion_matrix(y_test, y_pred))

print("Accuracy score", accuracy_score(y_test, y_pred))

print("Classification report", classification_report(y_test, y_pred))

Confusion matrix [[19  0]
 [ 0 14]]
Accuracy score 1.0
Classification report               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        14

    accuracy                           1.00        33
   macro avg       1.00      1.00      1.00        33
weighted avg       1.00      1.00      1.00        33

