In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline


# Read train.csv (path is relative to the current working directory) into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer='train.csv')

In [5]:
def filter_data(data, train=True, preprocessor=None):
    """Engineer features from a raw frame and run them through a preprocessor.

    The input is copied, so the caller's frame is never modified. On the copy:

    * ``NameLength`` is added as the character length of ``Name``.
    * ``Cabin`` is split on ``'/'`` into ``Deck``, ``Num`` and ``Side``.
    * ``PassengerId``, ``Name`` and ``Cabin`` are dropped.
    * ``CryoSleep`` and ``VIP`` become 0/1 integers, with missing values
      counted as 0.
    * ``Num`` is coerced to numeric (unparseable values become NaN).

    ``HomePlanet``, ``Destination``, ``Side`` and ``Deck`` are treated as
    categorical (most-frequent imputation + one-hot encoding); every other
    remaining column is treated as numeric (median imputation + standard
    scaling).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows. Must contain the columns named above, plus ``Transported``
        when ``train`` is true.
    train : bool, default True
        When true, ``Transported`` is split off as the target and the return
        value is a 3-tuple. When false, only the feature matrix is returned.
    preprocessor : fitted transformer or None, default None
        If given, it is only used to ``transform`` the engineered frame.
        If ``None``, a new ``ColumnTransformer`` is built and fitted on this
        data.

    Returns
    -------
    tuple or matrix
        ``(X_processed, target, preprocessor)`` when ``train`` is true,
        otherwise just ``X_processed``. ``X_processed`` is whatever the
        preprocessor's ``fit_transform`` / ``transform`` returns.

    Warns
    -----
    UserWarning
        If ``train`` is false and no ``preprocessor`` is supplied: a new
        transformer is fitted on the inference data and then discarded, so the
        output is unlikely to line up with what a trained model expects.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    import warnings

    frame = data.copy()
    frame['NameLength'] = frame['Name'].str.len()
    frame[['Deck', 'Num', 'Side']] = frame['Cabin'].str.split('/', expand=True)
    frame = frame.drop(columns=['PassengerId', 'Name', 'Cabin'])

    target = None
    if train:
        target = frame['Transported']
        frame = frame.drop(columns=['Transported'])

    categorical_features = ['HomePlanet', 'Destination', 'Side', 'Deck']
    numeric_features = [col for col in frame.columns if col not in categorical_features]

    for col in ['CryoSleep', 'VIP']:
        # Go through the nullable boolean dtype before filling: calling
        # fillna(False) directly on an object column of True/False/NaN emits
        # pandas' silent-downcasting FutureWarning. The resulting 0/1 integers
        # are the same.
        frame[col] = frame[col].astype('boolean').fillna(False).astype(int)

    frame['Num'] = pd.to_numeric(frame['Num'], errors='coerce')

    if preprocessor is None:
        if not train:
            warnings.warn(
                "filter_data(train=False) was called without a fitted "
                "preprocessor; a new one is being fitted on this data and is "
                "not returned, so the output columns may not match what a "
                "trained model expects.",
                UserWarning,
                stacklevel=2,
            )

        numeric_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer([
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

        X_processed = preprocessor.fit_transform(frame)
    else:
        # Reuse the caller's already-fitted preprocessor (no refitting).
        X_processed = preprocessor.transform(frame)

    if train:
        return X_processed, target, preprocessor
    return X_processed

In [7]:
def test(y_pred, y_test):
    """Print binary-classification metrics for ``y_pred`` against ``y_test``.

    ``y_pred`` is converted to a NumPy array, flattened if it has more than
    one dimension, and, unless its dtype is already ``int`` / ``np.int64``,
    binarised with a ``>= 0.5`` threshold. The unique values and dtype of both
    label arrays are printed, followed by accuracy, precision, recall, F1 and
    the confusion matrix. Nothing is returned.
    """
    predicted = np.array(y_pred)

    # Flatten column-shaped output such as (n_samples, 1) to (n_samples,).
    if predicted.ndim > 1:
        predicted = predicted.ravel()

    # Arrays that are not already int/int64 (e.g. float scores, booleans) are
    # turned into 0/1 labels by thresholding at 0.5.
    already_integer = predicted.dtype == int or predicted.dtype == np.int64
    if not already_integer:
        predicted = (predicted >= 0.5).astype(int)

    actual = np.array(y_test)
    for labels in (actual, predicted):
        print(np.unique(labels), labels.dtype)

    # Evaluate every score before printing any of them.
    acc, prec, rec, f1 = (
        metric(actual, predicted)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print(f"\nAccuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(actual, predicted))

In [10]:
# Build the model-ready matrix from the full training frame; the returned
# preprocessor is fitted on every row of train_df.
# NOTE(review): because fitting happens before the split below, the hold-out
# rows contribute to the imputer/scaler statistics. For a stricter estimate,
# fit on the training rows only and pass `preprocessor=` for the hold-out rows.
processed_data, target, preprocessor = filter_data(train_df, train=True)

# shuffle=False preserves row order, so the final 10% of rows form the hold-out
# set. random_state only governs shuffling, so it is inert here; it is kept so
# the split stays seeded if shuffling is turned on later.
X_train, X_test, y_train, y_test = train_test_split(processed_data, target, test_size=0.1, random_state=42, shuffle=False)
model = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
test(y_pred, y_test)

[False  True] bool
[0 1] int64

Accuracy: 0.7851
Precision: 0.8017
Recall: 0.7167
F1 Score: 0.7568
Confusion Matrix:
 [[392  72]
 [115 291]]
