In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# create database

In [4]:
x, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=42)

In [6]:
model = LogisticRegression()
model

In [7]:
model.fit(x_train, y_train)

In [8]:
y_pred = model.predict(x_test)

In [9]:
y_pred

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0])

In [10]:
# prabability of data where it will fall into. 0 or 1?
# model.predict_proba(x_test)

In [11]:
accuracy_score(y_test, y_pred)

0.8466666666666667

In [12]:
# 118 : true positive
# 17 : false positive (actual value : 0 , predicted value : 1)
# 29: false negative (actual value : 1, prediected value : 0)
# 136 : true negative
confusion_matrix(y_test, y_pred)

array([[118,  17],
       [ 29, 136]])

In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.87      0.84       135
           1       0.89      0.82      0.86       165

    accuracy                           0.85       300
   macro avg       0.85      0.85      0.85       300
weighted avg       0.85      0.85      0.85       300



### Hyperparameter tuning and cross validation

In [14]:
model

In [15]:
panelty = ['l1', 'l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

In [16]:
params = dict(penalty=panelty, C = c_values, solver=solver)

In [17]:
### Grid Search CV : finding best parameter that will fit for this dataset

In [18]:
cv = StratifiedKFold()
grid = GridSearchCV(estimator=model, param_grid=params, scoring="accuracy", cv=cv, n_jobs=-1)

In [19]:
grid

In [20]:
grid.fit(x_train, y_train)

200 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "H:\Python-all-in-one\ML-algorithams\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "H:\Python-all-in-one\ML-algorithams\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "H:\Python-all-in-one\ML-algorithams\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  

In [21]:
 grid.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}

In [22]:
grid.best_score_

np.float64(0.8785714285714287)

In [23]:
grid.predict_proba(x_test)

array([[0.68919936, 0.31080064],
       [0.13120589, 0.86879411],
       [0.59219753, 0.40780247],
       [0.24421564, 0.75578436],
       [0.85077215, 0.14922785],
       [0.15781112, 0.84218888],
       [0.87984452, 0.12015548],
       [0.8057668 , 0.1942332 ],
       [0.70073824, 0.29926176],
       [0.6010906 , 0.3989094 ],
       [0.75046283, 0.24953717],
       [0.39596704, 0.60403296],
       [0.53488983, 0.46511017],
       [0.34628876, 0.65371124],
       [0.70819558, 0.29180442],
       [0.77681965, 0.22318035],
       [0.12900281, 0.87099719],
       [0.42259876, 0.57740124],
       [0.39644103, 0.60355897],
       [0.25870421, 0.74129579],
       [0.54289206, 0.45710794],
       [0.83780716, 0.16219284],
       [0.33377606, 0.66622394],
       [0.69112152, 0.30887848],
       [0.74404043, 0.25595957],
       [0.10776162, 0.89223838],
       [0.77942447, 0.22057553],
       [0.7155422 , 0.2844578 ],
       [0.50599365, 0.49400635],
       [0.41706997, 0.58293003],
       [0.

In [24]:
y_pred = grid.predict(x_test)

In [25]:
accuracy_score(y_test, y_pred)

0.8533333333333334

In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.92      0.85       135
           1       0.92      0.80      0.86       165

    accuracy                           0.85       300
   macro avg       0.86      0.86      0.85       300
weighted avg       0.86      0.85      0.85       300



In [27]:
confusion_matrix(y_test, y_pred)

array([[124,  11],
       [ 33, 132]])

#### Randomized Search CV

In [29]:
randomcv = RandomizedSearchCV(estimator=model, param_distributions=params, cv=5, scoring='accuracy')

In [30]:
randomcv.fit(x_train, y_train)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "H:\Python-all-in-one\ML-algorithams\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "H:\Python-all-in-one\ML-algorithams\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "H:\Python-all-in-one\ML-algorithams\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty is elas

In [32]:
randomcv.best_score_

np.float64(0.8785714285714287)

In [33]:
randomcv.best_params_

{'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.01}

In [34]:
y_pred = randomcv.predict(x_test)

In [35]:
accuracy_score(y_test, y_pred)

0.8533333333333334