In [1]:
# Question 2: Feature Engineering & Hyperparameter Tuning on the Titanic Dataset

# Step 1: Load the Titanic dataset (Assume you have a file named titanic.csv ).
# Step 2: Create features and handle missing values.
# Step 3: Train a pipeline using a Random Forest with GridSearchCV.
# Step 4: Evaluate the tuned model with cross-validation.

import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Step 1: Load the Titanic dataset
try:
    df = pd.read_csv('titanic.csv')
except FileNotFoundError:
    # Create a larger synthetic dataset for demonstration to avoid warnings
    df = pd.DataFrame({
        'Pclass': [1, 3, 2, 1, 3, 2, 1, 3, 2, 1, 3, 2],
        'Sex': ['male', 'female', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male'],
        'Age': [22, 38, 26, 35, 28, 40, 30, 24, 29, 31, 27, 36],
        'Fare': [71.2833, 7.925, 13.0, 53.1, 8.05, 20.0, 15.5, 9.0, 12.5, 18.0, 7.75, 21.0],
        'Embarked': ['C', 'S', 'S', 'S', 'C', 'Q', 'S', 'C', 'Q', 'S', 'C', 'Q'],
        'Survived': [1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1]
    })

# Step 2: Create features and handle missing values
X = df.drop('Survived', axis=1)
y = df['Survived']

numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Step 3: Train a pipeline using a Random Forest with GridSearchCV
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

param_grid = {
    'clf__n_estimators': [50, 100],
    'clf__max_depth': [3, 5, None]
}

# Use StratifiedKFold to ensure balanced splits and avoid warnings
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv)
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)

# Step 4: Evaluate the tuned model with cross-validation
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=cv)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Best parameters: {'clf__max_depth': 3, 'clf__n_estimators': 50}
Cross-validation scores: [1.   0.75 0.75]
Mean CV score: 0.8333333333333334
