### Logistic Regression For Binary Classification

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
## Create some dataset
## This library will create the data set without the need of standarization

from sklearn.datasets import  make_classification
X,y = make_classification(n_samples=1000, n_features=10, n_classes=2,random_state=42)

In [None]:
pd.DataFrame(X).head()

In [None]:
y

In [None]:
## Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [None]:
## Model traning

from sklearn.linear_model import LogisticRegression
logistic_reg = LogisticRegression()
logistic_reg.fit(X_train, y_train)
y_pred = logistic_reg.predict(X_test)



In [None]:
## Perfomance metrics

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

score = accuracy_score(y_test, y_pred)
print(score)

cm = confusion_matrix(y_test, y_pred)
print(cm)

report = classification_report(y_test, y_pred)
print(report)

#### Hyperparamerter Tunning and Cross Validataion

In [None]:
model = LogisticRegression(max_iter=1000)
penalty = ['l1', 'l2', 'elasticnet', 'none']
c_value = [100, 10, 1.0, 0.1, 0.01]
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# Create the parameter grid
param_grid = [
    {'penalty': ['l2'], 'C': c_value, 'solver': ['newton-cg', 'lbfgs', 'sag']},
    {'penalty': ['l1'], 'C': c_value, 'solver': ['liblinear', 'saga']},
    {'penalty': ['elasticnet'], 'C': c_value, 'solver': ['saga'], 'l1_ratio': [0.5]},
    {'penalty': [None], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']}
]

print(param_grid)


In [None]:
## Grid Search CV

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedGroupKFold
cv = StratifiedGroupKFold()
grid = GridSearchCV(estimator=model, param_grid= param_grid, scoring='accuracy', cv=cv, n_jobs=-1)


In [None]:
grid.fit(X_train, y_train)

In [None]:
grid.best_params_

In [None]:
y_pred = grid.predict(X_test)

score = accuracy_score(y_test, y_pred)
print(score)

cm = confusion_matrix(y_test, y_pred)
print(cm)

report = classification_report(y_test, y_pred)
print(report)

In [None]:
### Hyper paratmer tunning with RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV
randomCV = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=5, scoring='accuracy')

randomCV.fit(X_train, y_train)

In [None]:
randomCV.best_params_

In [None]:
y_pred = randomCV.predict(X_test)

score = accuracy_score(y_test, y_pred)
print(score)

cm = confusion_matrix(y_test, y_pred)
print(cm)

report = classification_report(y_test, y_pred)
print(report)

### Logistic Regression For Multiclass Classification

In [None]:
from sklearn.datasets import  make_classification
X,y = make_classification(n_samples=1000, n_features=10, n_informative=3, n_classes=3,random_state=42)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)    

In [None]:
logistic_reg = LogisticRegression(multi_class='ovr')
logistic_reg.fit(X_train, y_train)
y_pred = logistic_reg.predict(X_test)

score = accuracy_score(y_test, y_pred)
print(score)

cm = confusion_matrix(y_test, y_pred)
print(cm)

report = classification_report(y_test, y_pred)
print(report)

### Logistic Regression for Imbalanced Dataset

In [None]:
## Generate and plot a synthetic imbalance classification dataset

from collections import Counter
from sklearn.datasets import make_classification

In [None]:
## Imbalanced dataset

X, y = make_classification(n_samples=10000, n_features=2, n_clusters_per_class=1, n_redundant=0, weights=[0.99], random_state=10)

In [None]:
X

In [None]:
Counter(y)

In [None]:
import seaborn as sns
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y)

In [None]:
from sklearn.model_selection import train_test_split   
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [27]:
## Hyperparameter tunning
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
penalty = ['l1', 'l2', 'elasticnet', 'none']
c_value = [100, 10, 1.0, 0.1, 0.01]
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
class_weight= [{0:w,1:y} for w in [1,10,50,100] for y in [1,10,50,100]]

param_grid = dict(penalty=penalty, C=c_value, solver=solver, class_weight=class_weight)

In [28]:
param_grid

{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
 'C': [100, 10, 1.0, 0.1, 0.01],
 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
 'class_weight': [{0: 1, 1: 1},
  {0: 1, 1: 10},
  {0: 1, 1: 50},
  {0: 1, 1: 100},
  {0: 10, 1: 1},
  {0: 10, 1: 10},
  {0: 10, 1: 50},
  {0: 10, 1: 100},
  {0: 50, 1: 1},
  {0: 50, 1: 10},
  {0: 50, 1: 50},
  {0: 50, 1: 100},
  {0: 100, 1: 1},
  {0: 100, 1: 10},
  {0: 100, 1: 50},
  {0: 100, 1: 100}]}

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

cv= StratifiedKFold()
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=cv, n_jobs=-1)

In [30]:
grid.fit(X_train, y_train)

5200 fits failed out of a total of 8000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/murtuzasaifee/Documents/Personal/Codes/python_ml_nlp/venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/murtuzasaifee/Documents/Personal/Codes/python_ml_nlp/venv/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/murtuzasaifee/Documents/Personal/Codes/python_ml_nlp/venv/lib/python3.12/site-packages/sklearn/linear_m

In [31]:
grid.best_params_

{'C': 100,
 'class_weight': {0: 50, 1: 100},
 'penalty': 'l1',
 'solver': 'liblinear'}

In [32]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = grid.predict(X_test)

score = accuracy_score(y_test, y_pred)
print(score)

cm = confusion_matrix(y_test, y_pred)
print(cm)

report = classification_report(y_test, y_pred)
print(report)

0.9896
[[2465    0]
 [  26    9]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2465
           1       1.00      0.26      0.41        35

    accuracy                           0.99      2500
   macro avg       0.99      0.63      0.70      2500
weighted avg       0.99      0.99      0.99      2500



## Logistic Regression with ROC curve and ROC AUC score

In [33]:
## roc curve and auc

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score


In [34]:
## genearate 2 class dataset

X,y = make_classification(n_samples=1000, n_classes=2, random_state=1)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [None]:
## we have created a dummy model with default 0 as output
dummy_model_prob = [0 for _ in range(len(y_test))]
dummy_model_prob


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [38]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [39]:
## prediction based on probability
model_prob = model.predict_proba(X_test)

In [40]:
model_prob

array([[6.74190873e-04, 9.99325809e-01],
       [9.19840444e-01, 8.01595557e-02],
       [5.43753925e-01, 4.56246075e-01],
       [1.94320087e-01, 8.05679913e-01],
       [8.96955241e-01, 1.03044759e-01],
       [5.38990241e-01, 4.61009759e-01],
       [1.71352090e-01, 8.28647910e-01],
       [2.90543366e-01, 7.09456634e-01],
       [1.02301813e-01, 8.97698187e-01],
       [1.23538942e-01, 8.76461058e-01],
       [9.99667210e-01, 3.32790214e-04],
       [1.76332587e-01, 8.23667413e-01],
       [8.26464125e-01, 1.73535875e-01],
       [4.77087470e-01, 5.22912530e-01],
       [3.12717643e-01, 6.87282357e-01],
       [1.73398482e-01, 8.26601518e-01],
       [5.54531818e-02, 9.44546818e-01],
       [3.53323048e-01, 6.46676952e-01],
       [8.99702877e-01, 1.00297123e-01],
       [8.02792608e-01, 1.97207392e-01],
       [9.90033818e-01, 9.96618235e-03],
       [5.10780820e-01, 4.89219180e-01],
       [9.88330747e-02, 9.01166925e-01],
       [1.25107454e-03, 9.98748925e-01],
       [8.800607

In [41]:
## Lets only focus on the positive outcome
model_prob = model_prob[:,1]

In [42]:
model_prob

array([9.99325809e-01, 8.01595557e-02, 4.56246075e-01, 8.05679913e-01,
       1.03044759e-01, 4.61009759e-01, 8.28647910e-01, 7.09456634e-01,
       8.97698187e-01, 8.76461058e-01, 3.32790214e-04, 8.23667413e-01,
       1.73535875e-01, 5.22912530e-01, 6.87282357e-01, 8.26601518e-01,
       9.44546818e-01, 6.46676952e-01, 1.00297123e-01, 1.97207392e-01,
       9.96618235e-03, 4.89219180e-01, 9.01166925e-01, 9.98748925e-01,
       1.19939253e-01, 9.65172294e-03, 6.25724165e-01, 9.77606909e-01,
       3.56187875e-02, 1.86042701e-02, 3.57901844e-01, 2.61845696e-02,
       2.32160980e-01, 9.67711850e-01, 5.22332907e-01, 6.53656208e-01,
       1.53772964e-02, 2.56121980e-02, 7.88766865e-01, 2.31590934e-02,
       9.43088035e-01, 8.52350849e-01, 1.33521065e-02, 4.00524634e-04,
       5.30303930e-02, 5.26483686e-02, 9.98623014e-01, 6.50234894e-01,
       9.07343469e-01, 4.30780844e-02, 2.09623419e-02, 2.30408484e-02,
       9.46586259e-02, 5.14534062e-02, 5.68941486e-01, 9.90312178e-01,
      

In [43]:
## lets calculate the scrore

dummy_model_auc = roc_auc_score(y_test, dummy_model_prob)
model_auc = roc_auc_score(y_test, model_prob)
print(dummy_model_auc)
print(model_auc)

0.5
0.9044392939917989
