In [1]:
!pip install lightgbm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import lightgbm as lgb  
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report




In [None]:
# Read the training and test tables from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Keep the test identifiers aside so they can be attached to the predictions later.
test_passenger_ids = test_df.loc[:, "PassengerId"].copy()

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

In [None]:
# ==================== PREPROCESSING FUNCTION ====================
def preprocess_data(df, is_train=True):
    """Turn a raw passenger table into a numeric feature table.

    Works on a copy, so the caller's frame is never modified. Steps, in order:

    1. split ``Cabin`` ("deck/num/side") into ``deck``, ``num`` and ``side``;
    2. drop ``PassengerId``, ``Name`` and ``Cabin``;
    3. fill numeric gaps with the column median, categorical gaps with the
       column mode, and encode ``CryoSleep`` / ``VIP`` as 0/1 integers;
    4. fill ``HomePlanet`` with the most common planet on the same deck;
    5. replace the five spending columns by their row sum, ``services``;
    6. if ``is_train`` is true, clip ``Age`` and ``services`` to the
       ``Q1 - 1.5*IQR`` / ``Q3 + 1.5*IQR`` fences (rows are never removed);
    7. one-hot encode ``HomePlanet``, ``Destination``, ``deck`` and ``side``.

    Every statistic (median, mode, quantiles, dummy levels) is computed from
    ``df`` itself, so two frames processed separately can be imputed with
    different values and can end up with different dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. ``Cabin``, ``PassengerId``, ``Name``, ``HomePlanet``,
        ``Destination`` and the five spending columns must be present;
        ``Age`` is required when ``is_train`` is true. ``CryoSleep`` and
        ``VIP`` are handled only if present. Any other column (for example
        a target column) is passed through untouched.
    is_train : bool, default True
        When true, ``Age`` and ``services`` are clipped to their IQR fences.
        When false the clipping step is skipped.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered and encoded columns.
    """
    df = df.copy()

    # Cabin is encoded as "deck/num/side". Reindexing guarantees three parts
    # even when no row carries all of them, so the assignments below cannot
    # fail on a column-count mismatch.
    cabin_parts = (
        df["Cabin"].str.split("/", n=2, expand=True).reindex(columns=range(3))
    )
    for position, part_name in enumerate(("deck", "num", "side")):
        df[part_name] = cabin_parts[position]

    df = df.drop(columns=["PassengerId", "Name", "Cabin"])

    # Cabin numbers arrive as text; anything unparsable becomes NaN.
    df["num"] = pd.to_numeric(df["num"], errors="coerce")

    service_cols = ["ShoppingMall", "RoomService", "FoodCourt", "Spa", "VRDeck"]

    # Numeric gaps -> column median.
    for col in ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "num"]:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    # Categorical gaps -> most frequent value (left as-is if the column is all-missing).
    for col in ("Destination", "deck", "side"):
        if col in df.columns:
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # Boolean flags are object columns while they still contain gaps. Filling them
    # directly relies on pandas silently downcasting object -> bool, which is
    # deprecated; going through the nullable "boolean" dtype avoids that and
    # yields plain 0/1 integers in one step.
    for col in ("CryoSleep", "VIP"):
        if col in df.columns:
            modes = df[col].mode()
            flags = df[col].astype("boolean")
            if not modes.empty:
                flags = flags.fillna(bool(modes.iloc[0]))
            df[col] = flags.astype(int)

    # HomePlanet gaps -> most common planet among passengers on the same deck,
    # falling back to 'Earth' when a deck has no known planet at all.
    if "HomePlanet" in df.columns and "deck" in df.columns:

        def _fill_with_deck_mode(planets):
            modes = planets.mode()
            return planets.fillna(modes.iloc[0] if not modes.empty else "Earth")

        df["HomePlanet"] = df.groupby("deck")["HomePlanet"].transform(_fill_with_deck_mode)

    # Collapse the individual spending columns into one total.
    df["services"] = df[service_cols].sum(axis=1)
    df = df.drop(columns=service_cols)

    # Clip (winsorize) extreme values; applied only when is_train is true.
    if is_train:
        for col in ("Age", "services"):
            q1, q3 = df[col].quantile([0.25, 0.75])
            iqr = q3 - q1
            df[col] = df[col].clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

    # NOTE(review): drop_first removes the first level found in *this* frame, so
    # frames with different level sets drop different baselines -- confirm the
    # caller's column alignment is sufficient for that case.
    df = pd.get_dummies(
        data=df,
        columns=["HomePlanet", "Destination", "deck", "side"],
        drop_first=True,
        dtype=int,
    )

    return df

In [None]:
# Run both raw tables through the same function; only the training call enables
# the is_train branch.
stitanic = preprocess_data(df=train_df, is_train=True)
test_processed = preprocess_data(df=test_df, is_train=False)

# Split the label off the training features.
X = stitanic.drop(columns="Transported")
y = stitanic["Transported"].astype(int)

# Give the test frame exactly the training feature columns, in the same order:
# columns absent from test are created as all-zero, columns unknown to train are dropped.
test_processed = test_processed.reindex(columns=X.columns, fill_value=0)

print(f"\nTrain features shape: {X.shape}")
print(f"Test features shape: {test_processed.shape}")


Train features shape: (8693, 17)
Test features shape: (4277, 17)


  df[col] = df[col].fillna(mode_val.iloc[0])
  df[col] = df[col].fillna(mode_val.iloc[0])


In [None]:
# First split keeps 70% of the rows for training; the held-back 30% is then cut
# in half to give a validation set and an internal test set. Both splits use
# the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, random_state=101, test_size=0.3
)
X_val, X_test_internal, y_val, y_test_internal = train_test_split(
    X_temp, y_temp, random_state=101, test_size=0.5
)

print(f"\nTrain set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Internal test set: {X_test_internal.shape}")


Train set: (6085, 17)
Validation set: (1304, 17)
Internal test set: (1304, 17)


In [None]:
# Three candidate classifiers, each with a fixed seed. They are built up front
# and then fitted and scored on the validation split one after another.
lgb_model = lgb.LGBMClassifier(
    n_estimators=200, learning_rate=0.05, max_depth=7, random_state=42
)
rf_model = RandomForestClassifier(
    n_estimators=200, max_depth=10, random_state=42, n_jobs=-1
)
ada_model = AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=42)

# --- LightGBM (the validation split is also handed to fit as eval_set) ---
print("\n1. LightGBM")
lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
y_val_pred = lgb_model.predict(X_val)
print("Validation Accuracy: {:.4f}".format(accuracy_score(y_val, y_val_pred)))

# --- Random Forest ---
print("\n2. Random Forest")
rf_model.fit(X_train, y_train)
y_val_pred_rf = rf_model.predict(X_val)
print("Validation Accuracy: {:.4f}".format(accuracy_score(y_val, y_val_pred_rf)))

# --- AdaBoost ---
print("\n3. AdaBoost")
ada_model.fit(X_train, y_train)
y_val_pred_ada = ada_model.predict(X_val)
print("Validation Accuracy: {:.4f}".format(accuracy_score(y_val, y_val_pred_ada)))


In [None]:
# ---------------------------------------------------------------------------
# Final model
# ---------------------------------------------------------------------------
# A fresh LightGBM classifier (same hyper-parameters as the candidate above) is
# refitted on the training and validation rows together, then checked once on
# the internal test split that no model has seen.
# NOTE(review): LightGBM is hard-coded here rather than selected from the
# validation scores printed by the previous cell -- confirm that is intended.
print("\n" + 50 * "=")
print("TRAINING FINAL MODEL ON FULL DATA")
print(50 * "=")

X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

final_model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
)
final_model.fit(X_full, y_full)

y_test_pred = final_model.predict(X_test_internal)
print("Internal Test Accuracy: {:.4f}".format(accuracy_score(y_test_internal, y_test_pred)))

# ---------------------------------------------------------------------------
# Submission file
# ---------------------------------------------------------------------------
print("\n" + 50 * "=")
print("GENERATING SUBMISSION FILE")
print(50 * "=")

# Predict on the aligned test features and cast the 0/1 labels to booleans.
test_predictions = final_model.predict(test_processed)
test_predictions_bool = test_predictions.astype(bool)

# Pair every saved passenger id with its prediction.
submission = pd.DataFrame(
    {"PassengerId": test_passenger_ids, "Transported": test_predictions_bool}
)

submission.to_csv("submission.csv", index=False)
print("\n✓ Submission file 'submission.csv' created successfully!")
print("✓ Shape:", submission.shape)
print("\nFirst few predictions:")
print(submission.head(10))

# Quick sanity check of the predicted class balance.
print("\nPrediction distribution:")
print(submission["Transported"].value_counts())
print(
    "Percentage Transported: {:.2f}%".format(
        submission["Transported"].sum() / len(submission) * 100
    )
)


1. LightGBM
[LightGBM] [Info] Number of positive: 3069, number of negative: 3016
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000772 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 600
[LightGBM] [Info] Number of data points in the train set: 6085, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504355 -> initscore=0.017420
[LightGBM] [Info] Start training from score 0.017420
Validation Accuracy: 0.7630

2. Random Forest
Validation Accuracy: 0.7707

3. AdaBoost
Validation Accuracy: 0.7546

TRAINING FINAL MODEL ON FULL DATA
[LightGBM] [Info] Number of positive: 3712, number of negative: 3677
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wi