# Cross Validation Examples

In [None]:
# Import necessary libraries
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold

# Load the Iris dataset
data = load_iris()
X = data.data  # Features
y = data.target  # Labels

# Initialize Logistic Regression model
model = LogisticRegression(max_iter=500)

# 5-Fold Cross-Validation
print("5-Fold Cross-Validation:")
cv_5fold = StratifiedKFold(n_splits=5, shuffle=True)
scores_5fold = cross_val_score(model, X, y, cv=cv_5fold, scoring='accuracy')
print(f"Accuracy scores for 5-fold CV: {scores_5fold}")
print(f"Mean accuracy: {np.mean(scores_5fold)}\n")

# 10-Fold Cross-Validation
print("10-Fold Cross-Validation:")
cv_10fold = StratifiedKFold(n_splits=10, shuffle=True)
scores_10fold = cross_val_score(model, X, y, cv=cv_10fold, scoring='accuracy')
print(f"Accuracy scores for 10-fold CV: {scores_10fold}")
print(f"Mean accuracy: {np.mean(scores_10fold)}")


5-Fold Cross-Validation:
Accuracy scores for 5-fold CV: [0.96666667 1.         0.96666667 1.         0.93333333]
Mean accuracy: 0.9733333333333334

10-Fold Cross-Validation:
Accuracy scores for 10-fold CV: [1.         1.         1.         1.         0.86666667 1.
 0.93333333 0.93333333 0.93333333 1.        ]
Mean accuracy: 0.9666666666666666


#Hyperparameter Tuning

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Load the dataset
data = load_iris()
X = data.data
y = data.target

# Create the decision tree model
model = DecisionTreeClassifier()

# Define the hyperparameters and their possible values
param_grid = {
    'max_depth': [3, 5, 7, None],  # Different depths of the tree
    'min_samples_split': [2, 5, 10]  # Minimum samples to split
}

# Set up GridSearchCV to search for the best combination of parameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)  # 5-fold cross-validation

# Fit the model with the best hyperparameters
grid_search.fit(X, y)

# Print the best parameters found
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'max_depth': 5, 'min_samples_split': 5}


In [3]:
# prompt: give answers in tabular column

import pandas as pd

data = {'Fold': [5, 10],
        'Accuracy Scores': [ "[0.96666667 1.         0.93333333 0.96666667 1.        ]",
                             "[1.         0.9        1.         0.9        0.96666667 0.9        1.         1.         0.93333333 1.        ]"],
        'Mean Accuracy': [0.9733333333333334, 0.96]}

df = pd.DataFrame(data)
print(df)
print("\nBest Hyperparameters: {'max_depth': 3, 'min_samples_split': 2}")

   Fold                                    Accuracy Scores  Mean Accuracy
0     5  [0.96666667 1.         0.93333333 0.96666667 1...       0.973333
1    10  [1.         0.9        1.         0.9        0...       0.960000

Best Hyperparameters: {'max_depth': 3, 'min_samples_split': 2}
