In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
x, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    random_state=42
)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [4]:
lr = LogisticRegression()
lr.fit(x_train,y_train)

y_pred = lr.predict(x_test)

cr = classification_report(y_test,y_pred)

print(cr)

              precision    recall  f1-score   support

           0       0.73      0.62      0.67       106
           1       0.63      0.73      0.68        94

    accuracy                           0.68       200
   macro avg       0.68      0.68      0.67       200
weighted avg       0.68      0.68      0.67       200



In [7]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5,shuffle=True,random_state=42)
score =[]

for train_index,test_index in kf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    
    lr.fit(x_train,y_train)
    score.append(lr.score(x_test,y_test))
print(score)

[0.675, 0.715, 0.72, 0.645, 0.72]


In [11]:
from sklearn.model_selection import cross_val_score

score_log = cross_val_score(LogisticRegression(),x,y,cv=5)
score_log

array([0.71 , 0.69 , 0.655, 0.685, 0.7  ])

In [12]:
# Now Decision tree
from sklearn.tree import DecisionTreeClassifier

score_tree = cross_val_score(DecisionTreeClassifier(),x,y,cv=5)
score_tree

array([0.81 , 0.73 , 0.825, 0.8  , 0.82 ])

In [13]:
avg_score_log = np.average(score_log)
avg_score_tree = np.average(score_tree)

print(avg_score_log,avg_score_tree)

0.688 0.7969999999999999


In [14]:
# Randome forest

from sklearn.ensemble import RandomForestClassifier
score_rf = cross_val_score(RandomForestClassifier(),x,y,cv=5)
np.average(score_rf)

np.float64(0.8880000000000001)

In [15]:
score_rf = cross_val_score(RandomForestClassifier(n_estimators=20),x,y,cv=5)
np.average(score_rf)

np.float64(0.8780000000000001)

In [16]:
score_rf = cross_val_score(RandomForestClassifier(n_estimators=30),x,y,cv=5)
np.average(score_rf)

np.float64(0.876)

In [17]:
from sklearn.model_selection import cross_validate

cross_validate(DecisionTreeClassifier(),x,y,cv=5,scoring=['accuracy','roc_auc'])

{'fit_time': array([0.00540066, 0.00408912, 0.00714707, 0.00656509, 0.00492382]),
 'score_time': array([0.00203824, 0.00097489, 0.0031321 , 0.00163293, 0.0008409 ]),
 'test_accuracy': array([0.81 , 0.705, 0.815, 0.8  , 0.82 ]),
 'test_roc_auc': array([0.81 , 0.705, 0.815, 0.8  , 0.82 ])}

## Stratified K fold

In [18]:
x, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    weights=[0.9,0.1],
    random_state=42
)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [19]:
from collections import Counter
Counter(y)

Counter({np.int64(0): 897, np.int64(1): 103})

In [21]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5,shuffle=True,random_state=42)

for train_index,test_index in kf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    
    lr.fit(x_train,y_train)
    print(Counter(y_test))
    print(lr.score(x_test,y_test))

Counter({np.int64(0): 177, np.int64(1): 23})
0.89
Counter({np.int64(0): 179, np.int64(1): 21})
0.88
Counter({np.int64(0): 183, np.int64(1): 17})
0.915
Counter({np.int64(0): 181, np.int64(1): 19})
0.91
Counter({np.int64(0): 177, np.int64(1): 23})
0.89


In [22]:
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

for train_index,test_index in kf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    
    lr.fit(x_train,y_train)
    print(Counter(y_test))
    print(lr.score(x_test,y_test))

Counter({np.int64(0): 180, np.int64(1): 20})
0.915
Counter({np.int64(0): 180, np.int64(1): 20})
0.91
Counter({np.int64(0): 179, np.int64(1): 21})
0.895
Counter({np.int64(0): 179, np.int64(1): 21})
0.895
Counter({np.int64(0): 179, np.int64(1): 21})
0.895


In [23]:
cross_validate(DecisionTreeClassifier(),x,y,cv=5,scoring=['accuracy','roc_auc'])

{'fit_time': array([0.00627899, 0.00606704, 0.01259327, 0.00683022, 0.00498271]),
 'score_time': array([0.00120187, 0.00100112, 0.00168705, 0.00093198, 0.00085139]),
 'test_accuracy': array([0.895, 0.845, 0.865, 0.875, 0.92 ]),
 'test_roc_auc': array([0.65277778, 0.55833333, 0.6934025 , 0.74102155, 0.72412876])}

In [24]:
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
cross_validate(DecisionTreeClassifier(),x,y,cv=skf,scoring=['accuracy','roc_auc'])

{'fit_time': array([0.00567293, 0.00511312, 0.00889969, 0.00729084, 0.00482607]),
 'score_time': array([0.00108385, 0.00093102, 0.00285316, 0.00082111, 0.00087881]),
 'test_accuracy': array([0.91 , 0.89 , 0.91 , 0.845, 0.915]),
 'test_roc_auc': array([0.70555556, 0.71666667, 0.71854217, 0.59816441, 0.70031923])}