In [2]:
# Implement k-fold cross-validation with cross_val_score() to validate model
# performance and assess how well the model generalizes. For example:

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split

# Load the Titanic dataset
df = pd.read_csv("titanic.csv")

# Preprocessing: drop the columns that are not used as features, then drop
# the rows that have no value for 'Embarked'.
df = df.drop(['Cabin', 'PassengerId', 'Name', 'Ticket'], axis=1)
df = df.dropna(subset=['Embarked'])

# Encode each categorical column with its OWN LabelEncoder and keep every
# fitted encoder in `encoders`. Re-fitting one shared encoder would discard
# the 'Sex' mapping as soon as 'Embarked' is fitted, making it impossible to
# map the integer codes back to the original labels later.
# After the loop `encoder` is the encoder fitted on 'Embarked'.
encoders = {}
for column in ['Sex', 'Embarked']:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    encoders[column] = encoder

# Splitting into features (X) and target (y)
# NOTE: missing 'Age' values are deliberately still present in df and X;
# they are imputed below, after the split. Use X_train / X_test downstream.
X = df.drop('Survived', axis=1)
y = df['Survived']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing ages AFTER the split, using the median of the training rows
# only. Computing the median on the full dataset would leak information from
# the test rows into the training features and bias the test score.
# (Within cross-validation the folds still share this training-set median;
# wrapping an imputer and the classifier in a Pipeline would re-fit it per fold.)
train_age_median = X_train['Age'].median()
X_train = X_train.fillna({'Age': train_age_median})
X_test = X_test.fillna({'Age': train_age_median})

# Create a Decision Tree Classifier instance (seeded for reproducibility)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Perform k-fold cross-validation (k=5) on the training data only
cv_scores = cross_val_score(dt_classifier, X_train, y_train, cv=5)

# Display the per-fold scores and their mean
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))



# Use grid search (GridSearchCV) or random search (RandomizedSearchCV) to
# systematically explore a range of hyperparameters (e.g., max_depth,
# min_samples_leaf, criterion) and identify the combination that gives the
# best model performance. For example:

from sklearn.model_selection import GridSearchCV

# Hyperparameter search space: every combination of these values is tried
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7, 10],
    min_samples_leaf=[1, 2, 4, 6],
)

# Fresh, seeded decision tree used as the base estimator for the search
dt_classifier = DecisionTreeClassifier(random_state=42)

# Exhaustive search over param_grid, scoring each candidate with 5-fold
# cross-validation on the training data
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

# Score the best estimator found by the search on the held-out test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)

# Report the best parameters and both scores
for label, value in [
    ("Best Hyperparameters:", best_params),
    ("Best Cross-Validation Score:", best_score),
    ("Test Set Score:", test_score),
]:
    print(label, value)

Cross-Validation Scores: [0.70629371 0.84507042 0.8028169  0.79577465 0.76760563]
Mean CV Score: 0.7835122623855019
Best Hyperparameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1}
Best Cross-Validation Score: 0.8185757904067763
Test Set Score: 0.8202247191011236
