In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pd.set_option('future.no_silent_downcasting', True)

df_train = pd.read_csv(r"C:\Users\jackm\Downloads\spaceship-titanic\train.csv")
df_test = pd.read_csv(r"C:\Users\jackm\Downloads\spaceship-titanic\test.csv")

# --------------------------
# 2. Boolean columns
# --------------------------
booleans = ['CryoSleep', 'VIP']

for col in booleans:
    for df in [df_train, df_test]:
        # Add missingness flag
        df[col + '_was_missing'] = df[col].isna().astype(int)
        # Convert to 0/1, fill NaN with 0
        df[col] = df[col].map({"True": 1, "False": 0})
        df[col] = df[col].fillna(0).astype(int)

# --------------------------
# 3. Numeric columns
# --------------------------
numeric_cols = df_train.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    for df in [df_train, df_test]:
        df[col + '_was_missing'] = df[col].isna().astype(int)
    # Fill missing using training median
    df_train[col] = df_train[col].fillna(df_train[col].median())
    df_test[col] = df_test[col].fillna(df_train[col].median())

# 4. Categorical columns
categorical_cols = df_train.select_dtypes(include=['object', 'category']).columns

categorical_cols = [c for c in categorical_cols if c not in ["PassengerId", "Name"]]

# Split by cardinality
high_card_thresh = 5
low_card_cols = [c for c in categorical_cols if df_train[c].nunique() <= high_card_thresh]
high_card_cols = [c for c in categorical_cols if df_train[c].nunique() > high_card_thresh]

# Fill missing values
for col in low_card_cols + high_card_cols:
    df_train[col] = df_train[col].fillna("Unknown")
    df_test[col] = df_test[col].fillna("Unknown")

# Label encode high-cardinality columns
le_dict = {}
for col in high_card_cols:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col].astype(str))
    # Only transform known labels; unseen labels will be assigned -1
    df_test[col] = df_test[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
    le_dict[col] = le

# One-hot encode low-cardinality columns
df_train = pd.get_dummies(df_train, columns=low_card_cols, drop_first=True)
df_test = pd.get_dummies(df_test, columns=low_card_cols, drop_first=True)

# Align test columns with train
df_test = df_test.reindex(columns=df_train.columns.drop('Transported'), fill_value=0)

# 5. Split features and target
y_train = df_train['Transported'].map({True:1, False:0}).astype(int)
X_train = df_train.drop(['Transported','Name'], axis=1)
X_test = df_test.drop('Name', axis=1)  # Keep PassengerId for submission if needed

In [2]:
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    max_features=0.3,
    n_jobs=-1,
    random_state=42
)

rf.fit(X_train, y_train)

In [3]:
y_pred_train = rf.predict(X_train)
print("Training accuracy:", accuracy_score(y_train, y_pred_train))

Training accuracy: 0.8415966869895318


In [4]:
y_pred_test = rf.predict(X_test)

# Make sure predictions are boolean
y_pred_test = y_pred_test.astype(bool)

# Create submission DataFrame
submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],  # match exactly the test IDs
    "Transported": y_pred_test
})

# Save to CSV
submission.to_csv(r"C:\Users\jackm\Downloads\spaceship-titanic\my_submission4.csv", index=False)

In [5]:
import xgboost as xgb

X_train_model = X_train.drop('PassengerId', axis=1)
X_test_model = X_test.drop('PassengerId', axis=1)


# --------------------------
# 1. Prepare DMatrix for XGBoost
# --------------------------
# XGBoost can take pandas directly, but DMatrix is optimized
dtrain = xgb.DMatrix(X_train_model, label=y_train)
dtest = xgb.DMatrix(X_test_model)  # no label since it's the competition/test set

# --------------------------
# 2. Set parameters
# --------------------------
params = {
    "objective": "binary:logistic",  # binary classification
    "eval_metric": "logloss",        # could also use "error" for accuracy
    "max_depth": 8,                 # adjust for complexity
    "eta": 0.07,                      # learning rate
    "subsample": 0.8,                # row sampling for regularization
    "colsample_bytree": 0.8,         # feature sampling for regularization
    "seed": 42
}

# --------------------------
# 3. Train the model with early stopping
# --------------------------
evals = [(dtrain, 'train')]
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=evals,
    early_stopping_rounds=20,
    verbose_eval=20
)

# --------------------------
# 4. Make predictions
# --------------------------
y_pred_prob = xgb_model.predict(dtest)
y_pred_test = (y_pred_prob > 0.5).astype(bool)  # threshold 0.5

# --------------------------
# 5. Prepare submission
# --------------------------
submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Transported": y_pred_test
})

submission.to_csv(r"C:\Users\jackm\Downloads\spaceship-titanic\my_submission7.csv", index=False)


[0]	train-logloss:0.66142
[20]	train-logloss:0.41523
[40]	train-logloss:0.35482
[60]	train-logloss:0.32819
[80]	train-logloss:0.31033
[100]	train-logloss:0.29689
[120]	train-logloss:0.28374
[140]	train-logloss:0.27152
[160]	train-logloss:0.26010
[180]	train-logloss:0.24900
[200]	train-logloss:0.24000
[220]	train-logloss:0.23049
[240]	train-logloss:0.22257
[260]	train-logloss:0.21551
[280]	train-logloss:0.20977
[300]	train-logloss:0.20286
[320]	train-logloss:0.19616
[340]	train-logloss:0.19104
[360]	train-logloss:0.18621
[380]	train-logloss:0.18132
[400]	train-logloss:0.17649
[420]	train-logloss:0.17220
[440]	train-logloss:0.16743
[460]	train-logloss:0.16353
[480]	train-logloss:0.15954
[499]	train-logloss:0.15612
