Title: Cross-Validation


Task 1: K-Fold Cross-Validation for House Prices<br>
Apply K-Fold Cross-Validation (K=5) and compare the per-fold scores to see how much the model's performance varies across folds.

In [None]:
# Write your code here
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
import numpy as np

# Synthetic two-feature regression problem; the fixed seed keeps it reproducible.
X, y = make_regression(
    n_samples=100,
    n_features=2,
    noise=0.1,
    random_state=42,
)

model = LinearRegression()

# 5-fold cross-validation. The scorer returns *negated* MSE, so values closer
# to zero indicate a better fit.
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Summarise the per-fold scores: centre and spread across the five folds.
mean_neg_mse = np.mean(cv_scores)
std_neg_mse = np.std(cv_scores)

print(f"Cross-Validation Scores (Negative MSE): {cv_scores}")
print(f"Mean Negative MSE: {mean_neg_mse}")
print(f"Standard Deviation of Negative MSE: {std_neg_mse}")


Cross-Validation Scores (Negative MSE): [-0.00780395 -0.01771422 -0.01275039 -0.01116348 -0.00978431]
Mean Negative MSE: -0.01184326821433752
Standard Deviation of Negative MSE: 0.0033558438592245503


Task 2: Stratified K-Fold for Imbalanced Churn Dataset<br>
Use Stratified K-Fold so that every fold preserves the class proportions of the full dataset and each class is represented.

In [2]:
# Write your code here
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

# Feature matrix and class labels from the bundled breast-cancer dataset.
data = load_breast_cancer()
X, y = data.data, data.target

model = RandomForestClassifier(random_state=42)

# Shuffled, seeded 5-way stratified splitter.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy value per fold, in the order the splitter yields them.
accuracies = []
for fold_train_rows, fold_val_rows in skf.split(X, y):
    # Fit on this fold's training rows, then score the held-out rows.
    model.fit(X[fold_train_rows], y[fold_train_rows])
    fold_predictions = model.predict(X[fold_val_rows])
    accuracies.append(accuracy_score(y[fold_val_rows], fold_predictions))

mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

print(f"Stratified K-Fold Accuracies: {accuracies}")
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")


Stratified K-Fold Accuracies: [0.9649122807017544, 0.9385964912280702, 0.956140350877193, 0.9473684210526315, 0.9734513274336283]
Mean Accuracy: 0.9560937742586555
Standard Deviation of Accuracy: 0.012339688646904016


Task 3: Leave-One-Out Cross-Validation for Iris<br>
Use Leave-One-Out Cross-Validation (LOOCV) to assess the model's predictive accuracy on the Iris dataset.

In [3]:
# Write your code here
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score


# Feature matrix and class labels from the bundled Iris dataset.
data = load_iris()
X, y = data.data, data.target

model = LogisticRegression(max_iter=200)

# Leave-one-out: every split holds out exactly one sample for testing.
loocv = LeaveOneOut()

# One accuracy value per held-out sample.
accuracies = []
for kept_rows, held_out_row in loocv.split(X, y):
    # Fit on all remaining samples, then predict the single held-out one.
    model.fit(X[kept_rows], y[kept_rows])
    held_out_prediction = model.predict(X[held_out_row])
    accuracies.append(accuracy_score(y[held_out_row], held_out_prediction))

mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

# Only the first ten per-sample results are shown to keep the output short.
print(f"LOOCV Accuracies: {accuracies[:10]}")
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")


LOOCV Accuracies: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Mean Accuracy: 0.9666666666666667
Standard Deviation of Accuracy: 0.17950549357115014
