In [1]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 1. Load the data
#    pandas reads the CSV directly from the remote URL into a DataFrame.
url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv"
spaceship = pd.read_csv(url)
print("Original data shape:", spaceship.shape)

# 2. Data Cleaning: keep only the rows in which every column has a value
#    (axis=0 / how="any" are the dropna defaults, spelled out for clarity).
spaceship = spaceship.dropna(axis=0, how="any")
print("Data shape after dropping missing values:", spaceship.shape)

# 3. Feature Engineering
#    - 'Cabin': reduce each value to the deck letter, i.e. the text before the
#      first '/'. Non-string values are passed through untouched.
spaceship['Cabin'] = spaceship['Cabin'].map(
    lambda cabin: cabin.partition('/')[0] if isinstance(cabin, str) else cabin
)
#    - 'PassengerId' and 'Name' are not used as predictors, so remove them.
spaceship = spaceship.drop(['PassengerId', 'Name'], axis=1)

# 4. Separate target and features
#    - Features: everything except 'Transported'; non-numerical columns are
#      one-hot encoded, dropping the first level of each to avoid redundancy.
X = pd.get_dummies(spaceship.drop('Transported', axis=1), drop_first=True)
#    - Target: the 'Transported' column.
y = spaceship['Transported']

# 5. Train Test Split: 80% training, 20% testing
#    The split happens BEFORE scaling so that the scaler never sees the
#    held-out rows. Fitting the scaler on the full matrix would leak test-set
#    statistics (mean / std) into the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Feature Scaling: fit StandardScaler on the training rows only, then apply
#    the same training-derived transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training statistics. Kept only so that
# any code still referring to X_scaled keeps working.
X_scaled = scaler.transform(X)

# 7. Baseline Model: RandomForestClassifier with default parameters
#    (only the seed is fixed so the run is repeatable). fit() returns the
#    estimator itself, so construction and training are chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out test rows.
y_pred = rf.predict(X_test)
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("\nBaseline RandomForestClassifier Accuracy:", baseline_accuracy)

# 8. Hyperparameter Tuning using Grid Search
# Candidate values for each RandomForestClassifier hyperparameter.
# 3 * 4 * 3 * 3 = 108 combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search scored by accuracy; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# The search only ever sees the training data.
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("\nBest hyperparameters found:", grid_search.best_params_, sep="\n")
print("\nBest cross-validation score:", grid_search.best_score_, sep="\n")

# 9. Evaluate the Tuned Model on the Test Set
#    best_estimator_ is the model built with the best hyperparameters found
#    by the grid search; it is scored on the same held-out rows as the baseline.
best_rf = grid_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test)
tuned_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_tuned)
print("\nTuned RandomForestClassifier Accuracy on Test Set:", tuned_accuracy)

Original data shape: (8693, 14)
Data shape after dropping missing values: (6606, 14)

Baseline RandomForestClassifier Accuracy: 0.8063540090771558

Best hyperparameters found:
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

Best cross-validation score:
0.7950431466987758

Tuned RandomForestClassifier Accuracy on Test Set: 0.8124054462934948
