In [1]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

# 1. Load the data
#    The CSV is fetched over the network each time this cell runs.
url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv"
spaceship = pd.read_csv(url)
print("Original data shape:", spaceship.shape)

# 2. Data Cleaning
#    Keep only the rows in which every column is populated.
spaceship = spaceship.dropna(axis=0, how='any')
print("Data shape after dropping missing values:", spaceship.shape)

# 3. Feature Engineering
#    Reduce 'Cabin' to the text before the first '/' (the deck letter).
#    Non-string values are passed through untouched.
deck = spaceship['Cabin'].map(
    lambda cabin: cabin.split('/')[0] if isinstance(cabin, str) else cabin
)

#    Swap in the reduced 'Cabin' and discard the identifier-like columns
#    ('PassengerId', 'Name') that are not used as predictors.
spaceship = spaceship.assign(Cabin=deck).drop(columns=['PassengerId', 'Name'])

#    Target vector and feature matrix; every non-numerical feature column is
#    one-hot encoded, dropping the first level of each.
y = spaceship['Transported']
X = pd.get_dummies(spaceship.drop(columns=['Transported']), drop_first=True)

# 4. Train Test Split (80% training, 20% testing)
#    The split happens BEFORE scaling: fitting the scaler on all rows would let
#    the test rows influence the mean/std applied to the training features
#    (data leakage) and make the reported test accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Feature Scaling
#    Learn the scaling statistics from the training rows only, then apply the
#    same (already fitted) transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

#    Whole feature matrix scaled with the train-only statistics; kept so the
#    name `X_scaled` remains defined for any later code that refers to it.
X_scaled = scaler.transform(X)

# 6. Model Selection - Ensemble Methods

def _fit_and_score(model, message):
    """Fit ``model`` on the training split and report its test accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``X_train`` / ``y_train``.
    message : str
        Text printed in front of the accuracy value.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` where ``predictions`` is the output of
        ``model.predict(X_test)`` and ``accuracy`` is
        ``accuracy_score(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(message, accuracy)
    return predictions, accuracy


# 6.1. Bagging using BaggingClassifier
#      Only n_estimators and the seed are set; all other options stay at the
#      library defaults (per the scikit-learn docs, bootstrap sampling, i.e.
#      bagging rather than pasting).
bagging = BaggingClassifier(n_estimators=50, random_state=42)
y_pred_bag, accuracy_bag = _fit_and_score(bagging, "BaggingClassifier Accuracy:")

# 6.2. Random Forests using RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf, accuracy_rf = _fit_and_score(rf, "RandomForestClassifier Accuracy:")

# 6.3. Gradient Boosting using GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=42)
y_pred_gb, accuracy_gb = _fit_and_score(gb, "GradientBoostingClassifier Accuracy:")

# 6.4. Adaptive Boosting using AdaBoostClassifier
ada = AdaBoostClassifier(random_state=42)
y_pred_ada, accuracy_ada = _fit_and_score(ada, "AdaBoostClassifier Accuracy:")

# 7. Summary of results
#    One line per model, in the order the models were trained.
print("\nSummary of Ensemble Model Accuracies:")
for heading, value in (
    ("Bagging:         ", accuracy_bag),
    ("Random Forest:   ", accuracy_rf),
    ("Gradient Boosting:", accuracy_gb),
    ("Adaptive Boosting:", accuracy_ada),
):
    print(heading, value)


Original data shape: (8693, 14)
Data shape after dropping missing values: (6606, 14)
BaggingClassifier Accuracy: 0.8071104387291982
RandomForestClassifier Accuracy: 0.8063540090771558
GradientBoostingClassifier Accuracy: 0.8071104387291982
AdaBoostClassifier Accuracy: 0.791981845688351

Summary of Ensemble Model Accuracies:
Bagging:          0.8071104387291982
Random Forest:    0.8063540090771558
Gradient Boosting: 0.8071104387291982
Adaptive Boosting: 0.791981845688351


