## K fold cross-validation

In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()

In [2]:
X, y = iris.data, iris.target

In [3]:
# Define the number of folds (K)
k = 5

# Initialize a KFold object
kf = KFold(n_splits=k, shuffle=True, random_state=42)
clf = LogisticRegression()

In [4]:
# Initialize a list to store the accuracies
accuracies = []

# Perform K-fold cross-validation
for train_index, test_index in kf.split(X):
    # Split the dataset into train and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the classifier
    clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = clf.predict(X_test)

In [5]:
scores = cross_val_score(clf, X, y, cv = kf)

In [6]:
# Calculate the average accuracy across all folds
accuracy = accuracy_score(y_test, y_pred)
print("Test accuracy:", accuracy, "validation accuracy", scores.mean())

Test accuracy: 0.9666666666666667 validation accuracy 0.9733333333333334


## LOO

In [7]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split

X, y = iris.data, iris.target

# Initialize the Leave-One-Out Cross-Validator
loo = LeaveOneOut()

# Initialize a classifier (e.g., Logistic Regression)
classifier = LogisticRegression()

In [8]:
# Initialize lists to store predictions and true labels
predictions = []
true_labels = []

# Perform LOOCV
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the classifier on the training set
    classifier.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = classifier.predict(X_test)
  

In [9]:
scores = cross_val_score(classifier, X, y, cv = loo)

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print("Test accuracy:", accuracy, "validation accuracy", scores.mean())

Test accuracy: 1.0 validation accuracy 0.9666666666666667


## Optimalization

In [11]:
X = iris.data
y = iris.target

In [12]:
cls = LogisticRegression(max_iter = 10000)
C = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2]

In [13]:
scores = []

for choice in C:
  cls.set_params(C=choice)
  cls.fit(X, y)
  scores.append(cls.score(X, y))

In [14]:
print(scores) 

[0.9666666666666667, 0.9666666666666667, 0.9733333333333334, 0.9733333333333334, 0.98, 0.98, 0.9866666666666667, 0.9866666666666667]


## Tune Hyperparameters

#### Hyperparameters are the variables that the user specify usually while building the Machine Learning model k in knn cls
#### Parameters are found by training the model, hyperparameters are set by the data scientist before training

In [15]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.8, random_state=20)
rf = RandomForestClassifier()

In [17]:
# grid search cv
grid_space={'max_depth':[3,5,10,None],
              'n_estimators':[10,100,200],
              'max_features':[1,3,5,7],
              'min_samples_leaf':[1,2,3],
              'min_samples_split':[1,2,3]
           }

In [18]:
grid = GridSearchCV(rf,param_grid=grid_space,cv=3,scoring='accuracy')
model_grid = grid.fit(X_train,y_train)

In [19]:
# grid search results
print('Best grid search hyperparameters are: '+str(model_grid.best_params_))
print('Best grid search score is: '+str(model_grid.best_score_))

Best grid search hyperparameters are: {'max_depth': 5, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}
Best grid search score is: 1.0


### random search cv

In [20]:
rs_space={'max_depth':list(np.arange(10, 100, step=10)) + [None],
              'n_estimators':np.arange(10, 500, step=50),
              'max_features':randint(1,7),
              'criterion':['gini','entropy'],
              'min_samples_leaf':randint(1,4),
              'min_samples_split':np.arange(2, 10, step=2)
          }

In [21]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(rf, rs_space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=3)
model_random = rf_random.fit(X,y)

In [22]:
# random random search results
print('Best random search hyperparameters are: '+str(model_random.best_params_))
print('Best random search score is: '+str(model_random.best_score_))

Best random search hyperparameters are: {'criterion': 'entropy', 'max_depth': 80, 'max_features': 2, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 10}
Best random search score is: 0.9733333333333333
