# Import the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import the dataset

In [2]:
from sklearn.datasets import make_classification

In [4]:
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

In [5]:
X[:5]

array([[ 0.96479937, -0.06644898,  0.98676805, -0.35807945,  0.99726557,
         1.18189004, -1.61567885, -1.2101605 , -0.62807677,  1.22727382],
       [-0.91651053, -0.56639459, -1.00861409,  0.83161679, -1.17696211,
         1.82054391,  1.75237485, -0.98453405,  0.36389642,  0.20947008],
       [-0.10948373, -0.43277388, -0.4576493 ,  0.79381847, -0.26864575,
        -1.83635978,  1.23908594, -0.2463834 , -1.05814521, -0.29737608],
       [ 1.75041163,  2.02360622,  1.68815935,  0.00679984, -1.60766103,
         0.18474058, -2.61942676, -0.35744542, -1.47312719, -0.19003904],
       [-0.22472606, -0.71130323, -0.22077758,  0.11712422,  1.53606118,
         0.59753771,  0.34864462, -0.93915557,  0.17591477,  0.23622365]])

In [6]:
y[:5]

array([0, 1, 1, 0, 1])

In [12]:
df= pd.DataFrame(X)
df[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.964799,-0.066449,0.986768,-0.358079,0.997266,1.18189,-1.615679,-1.210161,-0.628077,1.227274
1,-0.916511,-0.566395,-1.008614,0.831617,-1.176962,1.820544,1.752375,-0.984534,0.363896,0.20947
2,-0.109484,-0.432774,-0.457649,0.793818,-0.268646,-1.83636,1.239086,-0.246383,-1.058145,-0.297376
3,1.750412,2.023606,1.688159,0.0068,-1.607661,0.184741,-2.619427,-0.357445,-1.473127,-0.190039
4,-0.224726,-0.711303,-0.220778,0.117124,1.536061,0.597538,0.348645,-0.939156,0.175915,0.236224


****************

#### Since we create the dataset using make_classifications. we dont need to  scale the data. the data is already scaled

# Do the train test split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1011)

In [15]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((700, 10), (300, 10), (700,), (300,))

# Create the logistic regression model

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
logistic = LogisticRegression()

In [18]:
logistic.fit(X_train, y_train)

In [26]:
y_pred = logistic.predict(X_test)
y_pred[:10]

array([1, 1, 1, 0, 1, 0, 0, 0, 0, 0])

#### Get the predictions using probability

In [29]:
y_pred_proba = logistic.predict_proba(X_test)
y_pred_proba[:5]

array([[0.24754401, 0.75245599],
       [0.02295846, 0.97704154],
       [0.20830732, 0.79169268],
       [0.97192971, 0.02807029],
       [0.27291635, 0.72708365]])

# Check the performance

In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [31]:
score = accuracy_score(y_test, y_pred)
score

0.8766666666666667

In [33]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[133,   8],
       [ 29, 130]], dtype=int64)

In [34]:
class_report = classification_report(y_test, y_pred)
print(class_report)

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       141
           1       0.94      0.82      0.88       159

    accuracy                           0.88       300
   macro avg       0.88      0.88      0.88       300
weighted avg       0.89      0.88      0.88       300



# Hyper parameter tuning

In [36]:
model = LogisticRegression()
penalty = ['l1', 'l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

In [38]:
params = dict(penalty=penalty, C=c_values, solver=solver)
params

{'penalty': ['l1', 'l2', 'elasticnet'],
 'C': [100, 10, 1.0, 0.1, 0.01],
 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

# GridSearchCV

In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [43]:
cv = StratifiedKFold()

In [44]:
grid = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=cv, n_jobs=-1)

In [45]:
grid

In [47]:
grid.fit(X_train, y_train)

200 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\envs\mlenv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\envs\mlenv\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\user\anaconda3\envs\mlenv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\user\anaconda3\envs\mlenv\lib\site-packages\sklearn\line

In [48]:
grid.best_params_

{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

In [49]:
grid.best_score_

0.8657142857142859

In [50]:
y_pred_grid = grid.predict(X_test)
y_pred_grid

array([1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0])

In [52]:
y_pred_grid_proba = grid.predict_proba(X_test)
y_pred_grid_proba[:5]

array([[0.33952693, 0.66047307],
       [0.03439003, 0.96560997],
       [0.23122021, 0.76877979],
       [0.95982926, 0.04017074],
       [0.31002469, 0.68997531]])

In [53]:
score_grid = accuracy_score(y_test, y_pred_grid)
score_grid

0.8733333333333333

In [54]:
cm_grid = confusion_matrix(y_test, y_pred_grid)
cm_grid

array([[134,   7],
       [ 31, 128]], dtype=int64)

In [55]:
class_report_grid = classification_report(y_test, y_pred_grid)
print(class_report_grid)

              precision    recall  f1-score   support

           0       0.81      0.95      0.88       141
           1       0.95      0.81      0.87       159

    accuracy                           0.87       300
   macro avg       0.88      0.88      0.87       300
weighted avg       0.88      0.87      0.87       300



*************

# Randomized SearchedCV

##### Using this is much more efficient when we have large number of parameters compare to gridsearchcv

In [56]:
from sklearn.model_selection import RandomizedSearchCV

In [58]:
model = LogisticRegression()

In [59]:
model = LogisticRegression()
penalty = ['l1', 'l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


params = dict(penalty=penalty, C=c_values, solver=solver)
params

{'penalty': ['l1', 'l2', 'elasticnet'],
 'C': [100, 10, 1.0, 0.1, 0.01],
 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

In [60]:
random_cv = RandomizedSearchCV(estimator=model, param_distributions=params, cv=5, scoring='accuracy')

In [61]:
random_cv.fit(X_train, y_train)

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\envs\mlenv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\envs\mlenv\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\user\anaconda3\envs\mlenv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\user\anaconda3\envs\mlenv\lib\site-packages\sklearn\linear

In [62]:
random_cv.best_estimator_

In [63]:
random_cv.best_score_

0.8657142857142859

In [64]:
random_cv.best_params_

{'solver': 'liblinear', 'penalty': 'l1', 'C': 0.1}

In [65]:
y_pred_random_cv = random_cv.predict(X_test)
y_pred_random_cv[:5]

array([1, 1, 1, 0, 1])

In [66]:
score_random_cv = accuracy_score(y_test, y_pred_random_cv)
score_random_cv

0.8733333333333333

In [67]:
cm_random_cv = confusion_matrix(y_test, y_pred_random_cv)
cm_random_cv

array([[134,   7],
       [ 31, 128]], dtype=int64)

In [68]:
class_report_random_cv = classification_report(y_test, y_pred_random_cv)
print(class_report_random_cv)

              precision    recall  f1-score   support

           0       0.81      0.95      0.88       141
           1       0.95      0.81      0.87       159

    accuracy                           0.87       300
   macro avg       0.88      0.88      0.87       300
weighted avg       0.88      0.87      0.87       300

