In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
train_df = pd.read_csv('train.csv')

# --- Row-wise feature engineering (uses no statistics learned from the data) ---
train_df['NameLength'] = train_df['Name'].str.len()
# Reduce the cabin string to its first and last characters. A missing cabin
# becomes the string 'nan' after astype(str) and therefore the code 'nn'.
train_df['Cabin'] = train_df['Cabin'].astype(str)
train_df['Cabin'] = train_df['Cabin'].str[0] + train_df['Cabin'].str[-1]
train_df['total spending'] = train_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
train_df['VIP'] = train_df['VIP'].fillna(False).astype(int)
train_df['CryoSleep'] = train_df['CryoSleep'].fillna(False).astype(int)

# Targets
targets = train_df['Transported']

# --- Pick the hold-out rows BEFORE learning anything from the labels ---
# The category rankings below are derived from the target, and the imputation
# medians are derived from the feature values. Computing either on the full
# frame would leak hold-out information into X_test, so the split is made on
# row positions first (same test_size / stratify / random_state as before).
row_positions = np.arange(len(train_df))
fit_pos, holdout_pos = train_test_split(
    row_positions, test_size=0.15, stratify=targets, random_state=1
)
fit_df = train_df.iloc[fit_pos]

# Needed to calculate the share of positives used to rank categorical features
transported_true = fit_df[fit_df['Transported'] == True]

# Rank HomePlanet categories by their share of True targets (rank 0 = highest)
counts = fit_df['HomePlanet'].value_counts()
transported_counts = transported_true.groupby('HomePlanet').size()
percentages = transported_counts / counts
homeplanet_map = percentages.sort_values(ascending=False)
planet_map = {planet: rank for rank, planet in enumerate(homeplanet_map.index)}
# Missing/unseen planets get -1, the same code the test-set cell assigns them,
# instead of staying NaN and being median-imputed on the training side only.
train_df['HomePlanet'] = train_df['HomePlanet'].map(planet_map).fillna(-1).astype(int)

# Rank Destination categories by their share of True targets (rank 0 = highest)
Destcounts = fit_df['Destination'].value_counts()
transDest_counts = transported_true.groupby('Destination').size()
percent_vals = transDest_counts / Destcounts
dest_map = percent_vals.sort_values(ascending=False)
d_map = {planet: rank for rank, planet in enumerate(dest_map.index)}
train_df['Destination'] = train_df['Destination'].map(d_map).fillna(-1).astype(int)

# Rank Cabin categories by their number of True targets.
# This is not divided by the group sizes: per the author's previous notebook
# the groups differ in size by less than 1%.
cabin_counts = transported_true.groupby('Cabin').size()
sorted_cabins = cabin_counts.sort_values(ascending=False).index
cabin_mapping = {cabin: rank for rank, cabin in enumerate(sorted_cabins)}
# Cabin codes with no transported passenger among the fit rows map to -1.
train_df['Cabin_ordinal'] = train_df['Cabin'].map(cabin_mapping).fillna(-1).astype(int)

# Keep only the model features
filtered_X = train_df[['HomePlanet', 'CryoSleep', 'Age','VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','NameLength', 'total spending', 'Cabin_ordinal']]
# Impute with medians learned from the fit rows only, then apply to every row.
fit_medians = filtered_X.iloc[fit_pos].median()
filtered_X = filtered_X.fillna(fit_medians)

X_train, X_test = filtered_X.iloc[fit_pos], filtered_X.iloc[holdout_pos]
y_train, y_test = targets.iloc[fit_pos], targets.iloc[holdout_pos]



In [None]:
from sklearn.pipeline import make_pipeline, clone
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import numpy as np

# ---- Base pipelines ----
# Every model is wrapped with a StandardScaler so that all candidates receive
# identically preprocessed inputs.
pipe_knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=7, weights='distance')
)
pipe_nb = make_pipeline(
    StandardScaler(),
    GaussianNB()
)
pipe_mlp = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=20)
)

pipe_ext = make_pipeline(
    StandardScaler(),
    ExtraTreesClassifier(
        n_estimators=400,
        max_depth=12,
        min_samples_split=5,
        min_samples_leaf=3,
        max_features='sqrt',
        class_weight='balanced',
        random_state=20,
        n_jobs=-1
    )
)

pipe_rf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(
        random_state=20,
        n_estimators=400,
        max_depth=12,
        min_samples_split=5,
        min_samples_leaf=3,
        max_features='sqrt',
        class_weight='balanced',
        n_jobs=-1
    )
)

# Earlier XGB configuration, kept for reference:
# random_state = 20,max_depth = 5,reg_lambda = 10,subsample=1.0,min_child_weight = 5,learning_rate = 0.2,n_estimators =100,reg_alpha=0.1, colsample_bytree = 0.8
pipe_xgb = make_pipeline(
    StandardScaler(),
    XGBClassifier(
        random_state=20,
        n_estimators=200,
        max_depth=5,
        min_child_weight = 5,
        reg_lambda = 10,
        learning_rate=0.02,
        subsample=0.8,
        reg_alpha=0.1,
        colsample_bytree=0.8,
        eval_metric='logloss',
        n_jobs=-1
    )
)

pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=20, max_iter=500)
)

pipe_svc = make_pipeline(
    StandardScaler(),
    # probability=True is required so the SVC can take part in soft voting
    SVC(probability=True, random_state=20)
)

models = {
    "RandomForest": pipe_rf,
    "XGB": pipe_xgb,
    "LogisticRegression": pipe_lr,
    "SVC": pipe_svc,
    "KNN": pipe_knn,
    "NaiveBayes": pipe_nb,
    "MLP": pipe_mlp,
    "ExtraTrees": pipe_ext,
}


# ---- Cross-validate every pipeline on the training split ----
# cv_results[name] holds the per-fold accuracies and the fitted fold models.
cv_results = {name: {"scores": [], "models": []} for name in models}

kfold = StratifiedKFold(n_splits=15, shuffle=True, random_state=20)

for name, pipeline in models.items():
    for fold, (train_idx, test_idx) in enumerate(kfold.split(X_train, y_train)):
        # clone() gives a fresh, unfitted copy so folds never share state
        model = clone(pipeline)
        model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        score = model.score(X_train.iloc[test_idx], y_train.iloc[test_idx])

        cv_results[name]["scores"].append(score)
        cv_results[name]["models"].append(model)

    mean_acc = np.mean(cv_results[name]["scores"])
    std_acc = np.std(cv_results[name]["scores"])
    print(f"{name}: {mean_acc:.3f} +/- {std_acc:.3f}")

# ---- Soft-voting ensemble trained on the whole training split ----
# VotingClassifier.fit() fits its own clones of the estimators it is given,
# so fitting each pipeline beforehand would only repeat the same work and be
# thrown away. The unfitted templates are therefore passed in directly.
estimatormodels = [
    ('rf', pipe_rf),
    ('xgb', pipe_xgb),
    ('lr', pipe_lr),
    ('svc', pipe_svc),
    ('knn', pipe_knn),
    ('nb', pipe_nb),
    ('mlp', pipe_mlp),
    ('ext', pipe_ext)
]


voting_clf = VotingClassifier(
    estimators=estimatormodels,
    voting='soft'  # soft voting uses predicted probabilities
)

voting_clf.fit(X_train, y_train)

# Expose the fitted ensemble members under the usual final_* names.
# NOTE: the ensemble fits its members on label-encoded targets, so calling
# predict() on one of these directly yields encoded classes (0/1); use
# voting_clf.predict() when the original labels are needed.
final_rf   = voting_clf.named_estimators_['rf']
final_xgb  = voting_clf.named_estimators_['xgb']
final_lr   = voting_clf.named_estimators_['lr']
final_svc  = voting_clf.named_estimators_['svc']
final_knn  = voting_clf.named_estimators_['knn']
final_nb   = voting_clf.named_estimators_['nb']
final_mlp  = voting_clf.named_estimators_['mlp']
final_ext  = voting_clf.named_estimators_['ext']

print("Voting classifier accuracy:", voting_clf.score(X_test, y_test))


In [None]:
from sklearn.metrics import accuracy_score

# Local import: only this cell needs out-of-fold prediction helpers.
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Hyperparameters shared by the three forests; they differ only in
# class_weight and random_state.
rf_shared_params = dict(
    n_estimators=400,
    max_depth=12,
    min_samples_split=5,
    min_samples_leaf=3,
    max_features='sqrt',
    n_jobs=-1,
)

# High recall (favors True)
rf_high_recall = RandomForestClassifier(
    class_weight={0: 1, 1: 2},  # more weight on True class
    random_state=20,
    **rf_shared_params
)

# High precision (favors False)
rf_high_precision = RandomForestClassifier(
    class_weight={0: 2, 1: 1},  # more weight on False class
    random_state=21,
    **rf_shared_params
)

# Neutral
rf_neutral = RandomForestClassifier(
    class_weight='balanced',
    random_state=22,
    **rf_shared_params
)

# Fit biased models on the whole training split; these are the models used
# for every prediction on unseen data.
rf_high_recall.fit(X_train, y_train)
rf_high_precision.fit(X_train, y_train)
rf_neutral.fit(X_train, y_train)

pred_recall = rf_high_recall.predict(X_test)
pred_prec = rf_high_precision.predict(X_test)
pred_neut = rf_neutral.predict(X_test)

print(f"High Recall RF accuracy: {accuracy_score(y_test, pred_recall):.4f}")
print(f"High Precision RF accuracy: {accuracy_score(y_test, pred_prec):.4f}")
print(f"Neutral RF accuracy: {accuracy_score(y_test, pred_neut):.4f}")

# Meta-features for training the meta-model must be OUT-OF-FOLD: a forest's
# probabilities on the rows it was fitted on are far more confident than its
# probabilities on unseen rows, so in-sample meta-features would teach the
# meta-model to trust the base models more than it should at prediction time.
# cross_val_predict works on clones, so the fitted forests above are untouched.
meta_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=20)
oof_recall = cross_val_predict(rf_high_recall, X_train, y_train, cv=meta_cv, method='predict_proba')[:, 1]
oof_prec   = cross_val_predict(rf_high_precision, X_train, y_train, cv=meta_cv, method='predict_proba')[:, 1]
oof_neut   = cross_val_predict(rf_neutral, X_train, y_train, cv=meta_cv, method='predict_proba')[:, 1]

meta_X = np.column_stack([oof_recall, oof_prec, oof_neut])

# Fit meta-model
meta_model = LogisticRegression()
meta_model.fit(meta_X, y_train)

# Hold-out meta-features come from the fully fitted forests (column 1 = P(True))
proba_recall = rf_high_recall.predict_proba(X_test)[:, 1]
proba_prec   = rf_high_precision.predict_proba(X_test)[:, 1]
proba_neut   = rf_neutral.predict_proba(X_test)[:, 1]

meta_X_test = np.column_stack([proba_recall, proba_prec, proba_neut])
final_preds = meta_model.predict(meta_X_test)

# Print meta-model accuracy
print(f"Meta-model stacking accuracy: {accuracy_score(y_test, final_preds):.4f}")



In [None]:
test_df = pd.read_csv('test.csv')

# Feature engineering on test data — apply same steps as train_df
test_df['NameLength'] = test_df['Name'].str.len()
# A missing cabin becomes the string 'nan' and therefore the code 'nn'
test_df['Cabin'] = test_df['Cabin'].astype(str)
test_df['Cabin'] = test_df['Cabin'].str[0] + test_df['Cabin'].str[-1]
test_df['total spending'] = test_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
test_df['VIP'] = test_df['VIP'].fillna(False).astype(int)
test_df['CryoSleep'] = test_df['CryoSleep'].fillna(False).astype(int)

# Reuse the category rankings learned on the training data; categories that
# are missing or were not seen there are encoded as -1.
test_df['HomePlanet'] = test_df['HomePlanet'].map(planet_map).fillna(-1).astype(int)
test_df['Destination'] = test_df['Destination'].map(d_map).fillna(-1).astype(int)
test_df['Cabin_ordinal'] = test_df['Cabin'].map(cabin_mapping).fillna(-1).astype(int)

# Select features & fill NA.
# Column names and order are taken from X_train so the frame handed to the
# models always matches what they were fitted on.
feature_columns = list(X_train.columns)
filtered_test_X = test_df[feature_columns]
# Impute with the medians of the data the models were fitted on rather than
# the test set's own medians, so missing values are filled the same way at
# training and at prediction time.
filtered_test_X = filtered_test_X.fillna(X_train.median())

# Base-model probabilities of the True class become the meta-features
proba_recall = rf_high_recall.predict_proba(filtered_test_X)[:, 1]
proba_prec = rf_high_precision.predict_proba(filtered_test_X)[:, 1]
proba_neut = rf_neutral.predict_proba(filtered_test_X)[:, 1]

meta_X_test = np.column_stack([proba_recall, proba_prec, proba_neut])

# Final predictions
final_preds = meta_model.predict(meta_X_test)

# Build submission DataFrame
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': final_preds.astype(bool)
})

# Save to CSV
submission.to_csv('tug_war_model.csv', index=False)
