In [None]:
# Spaceship Titanic - Kaggle Competition
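In this notebook, we preprocess the passenger data, train RandomForest, XGBoost, and LightGBM classifiers, compare their accuracy on a hold-out validation split, and write the best model's test-set predictions to a submission CSV.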

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Read the training and test CSVs into DataFrames.
train_path = '/mnt/data/spaceship_titanic_dataset/train.csv'
test_path = '/mnt/data/spaceship_titanic_dataset/test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Preprocessing
def split_compound_columns(frame):
    """Split the compound `Cabin` and `PassengerId` columns of `frame` in place.

    `Cabin` is split on '/' into `Deck`, `Num` and `Side`; `PassengerId` is
    split on '_' into `Group` and `Number`. The new columns are added to the
    same DataFrame, which is also returned for convenience.
    """
    cabin_parts = frame['Cabin'].str.split('/', expand=True)
    frame[['Deck', 'Num', 'Side']] = cabin_parts
    id_parts = frame['PassengerId'].str.split('_', expand=True)
    frame[['Group', 'Number']] = id_parts
    return frame


# Apply the identical feature extraction to both datasets.
for dataset in (train_df, test_df):
    split_compound_columns(dataset)

In [None]:
# Columns used as model inputs (raw columns plus the Deck/Side parts of Cabin).
features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Deck', 'Side',
]
# Design matrix and target taken from the training frame.
X, y = train_df[features], train_df['Transported']

In [None]:
# Columns routed to the numeric preprocessing branch.
numeric_features = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
# Columns routed to the categorical preprocessing branch.
categorical_features = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
    'Deck',
    'Side',
]

In [None]:
# Numeric branch: fill missing values with the column median, then apply
# StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' lets the encoder accept categories at
# transform time that were not present when it was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group through its own sub-pipeline and combine the results.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [None]:
# Define models
# Candidate classifiers to compare, keyed by the name used in reports.
# Every estimator gets the same fixed seed so repeated runs are comparable.
# `use_label_encoder` is intentionally not passed to XGBClassifier: XGBoost
# deprecated and later removed that option, and passing it on current releases
# only produces a warning.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
}

In [None]:
# Train and evaluate models
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# ratio of the target the same in the training and validation splits, so the
# reported accuracies are not skewed by an unbalanced split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Validation accuracy per model name.
results = {}
for name, model in models.items():
    # Identical preprocessing for every candidate; only the classifier changes.
    # The shared `preprocessor` object is re-fitted by each pipeline.fit call.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    results[name] = acc
    print(f'Model: {name}')
    print(f'Accuracy: {acc:.4f}')
    print(classification_report(y_val, y_pred))
    print('-' * 50)

In [None]:
# Select best model based on accuracy
# The name with the highest validation accuracy wins; on a tie the first one
# encountered in `results` is kept.
best_model_name = max(results, key=lambda candidate: results[candidate])
best_model = models[best_model_name]

# Rebuild the full pipeline around the winner and refit it on every training
# row (training + validation) before predicting on the test set.
final_steps = [
    ('preprocessor', preprocessor),
    ('classifier', best_model),
]
best_pipeline = Pipeline(steps=final_steps)
best_pipeline.fit(X, y)

In [None]:
# Generate predictions for test data
X_test = test_df[features]

# Cast predictions to bool before writing them out. Depending on which
# classifier won, predict() may return 0/1 integers rather than booleans
# (XGBoost does this), which would put 0/1 in the CSV instead of True/False.
# NOTE(review): assumes the `Transported` target in train.csv is boolean - confirm.
test_df['Transported'] = best_pipeline.predict(X_test).astype(bool)

# Keep only the identifier and the predicted label for the submission file.
submission = test_df[['PassengerId', 'Transported']]
submission.to_csv('/mnt/data/spaceship_titanic_submission.csv', index=False)

In [None]:
# Report which model was selected and confirm the submission file was written.
print(
    f"Best Model Selected: {best_model_name}",
    "Submission file saved as spaceship_titanic_submission.csv",
    sep="\n",
)