# In [1]:
# Question 2: Feature Engineering & Hyperparameter Tuning on the Titanic Dataset

# Step 1: Load the Titanic dataset (assumes a file named titanic.csv in the current directory).
# Step 2: Create features and handle missing values.
# Step 3: Train a pipeline using a Random Forest with GridSearchCV.
# Step 4: Evaluate the tuned model with cross-validation.
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Titanic survival pipeline: load the CSV, preprocess, tune a random forest
# with a grid search, then report the cross-validated accuracy.
#
# The file is opened directly and a missing file is handled via the exception
# (EAFP) instead of scanning the directory listing first: this avoids listing
# the whole directory, does not depend on an exact-case name match, and leaves
# no gap between the existence check and the read.
try:
    df = pd.read_csv('titanic.csv')
except FileNotFoundError:
    print("File 'titanic.csv' not found in current directory.")
else:
    # Keep only the modelling columns plus the target, and drop rows with no
    # embarkation port. A non-inplace dropna is used so the result is a fresh
    # frame rather than an in-place mutation of a derived selection.
    df = df[
        ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Survived']
    ].dropna(subset=['Embarked'])

    X = df.drop('Survived', axis=1)
    y = df['Survived']

    # Pclass is deliberately routed through the categorical branch (one-hot
    # encoded) rather than being treated as a number.
    num_features = ['Age', 'SibSp', 'Parch', 'Fare']
    cat_features = ['Pclass', 'Sex', 'Embarked']

    # Numeric columns: fill missing values with the column median.
    num_transformer = SimpleImputer(strategy='median')
    # Categorical columns: fill missing values with the most frequent value,
    # then one-hot encode; handle_unknown='ignore' keeps transform from
    # failing on categories that were not present when the encoder was fit.
    cat_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

    # Preprocessing lives inside the pipeline so that it is re-fit on the
    # training portion of every cross-validation split.
    clf = Pipeline([
        ('preprocess', preprocessor),
        ('model', RandomForestClassifier(random_state=42))
    ])

    # The 'model__' prefix targets parameters of the 'model' pipeline step.
    param_grid = {
        'model__n_estimators': [50, 100],
        'model__max_depth': [5, 10]
    }

    grid = GridSearchCV(clf, param_grid, cv=5)
    grid.fit(X, y)

    # NOTE(review): this re-scores the selected configuration on the same
    # data that was used to select it, so the reported accuracy may be
    # optimistic. A nested scheme such as cross_val_score(grid, X, y, cv=5)
    # would avoid that -- confirm which estimate is intended.
    scores = cross_val_score(grid.best_estimator_, X, y, cv=5)
    print(f"Best Params: {grid.best_params_}")
    print(f"Cross-Validated Accuracy: {scores.mean():.4f}")


# Output: File 'titanic.csv' not found in current directory.
