# <center> **Kaggle’s Spaceship Titanic Competition**
# <center> **Machine Learning**

In [None]:
import importlib
import os
import time
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

import functions
importlib.reload(functions)

# **Loading Data**

In [None]:
# Folder that holds the CSV files. Set the TITANIC_DATA_DIR environment variable
# to run the notebook on another machine; when it is unset, the original
# author's local folder is used.
DATA_DIR = Path(
    os.environ.get("TITANIC_DATA_DIR", r"C:\Users\Dell\Documents\AI\Titanic\Data")
)

# Frame containing both train and test passengers; it is split back by
# PassengerId in the "Split Data Back to Train and Test" section.
data = pd.read_csv(DATA_DIR / "data.csv", index_col=False)

# train.csv and test.csv are read from the nested "Data" sub-folder of DATA_DIR.
train = pd.read_csv(DATA_DIR / "Data" / "train.csv", index_col=False)
test = pd.read_csv(DATA_DIR / "Data" / "test.csv", index_col=False)

# Notebook-wide configuration: seed passed to every seeded call, and the
# name of the label column.
random_state = 101
target = 'Transported'

# **Machine Learning**

## **Split Data Back to Train and Test**

In [None]:
# Recover the train / test partitions of `data`: keep the rows whose PassengerId
# appears in the train.csv / test.csv frames loaded earlier, as independent copies.
train_ids = train['PassengerId'].values
train = data.loc[data['PassengerId'].isin(train_ids)].copy()

test_ids = test['PassengerId'].values
test = data.loc[data['PassengerId'].isin(test_ids)].copy()

## **Drop Unneeded Features**

In [None]:
# Columns the notebook treats as unneeded for modelling; removed from both frames.
unneeded_cols = ['PassengerId', 'Group', 'CabinNumber']

train = train.drop(columns=unneeded_cols)
test = test.drop(columns=unneeded_cols)

## **Log Transform**

The logarithm transform is used to decrease skew in distributions, especially with large outliers. It can make it easier for algorithms to 'learn' the correct relationships. We will apply it to the expenditure features as these are heavily skewed by outliers.

In [None]:
# Expenditure columns that receive the log transform.
columns_to_transform = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'TotalSpent',
]

# functions.log_transform returns the frame to keep, so both names are rebound
# on every pass (train first, then test, one column at a time).
for expense_col in columns_to_transform:
    train, test = (
        functions.log_transform(train, expense_col),
        functions.log_transform(test, expense_col),
    )

## **Column Separation**

In [None]:
# Only feature columns are classified; the target is excluded by name up front.
# This works whatever dtype the target has (the previous list.remove() call
# raised ValueError unless the target was object/bool, and a numeric target
# would have ended up in numerical_cols).
feature_cols = [cname for cname in train.columns if cname != target]

# Columns scaled by the numerical branch of the preprocessor.
numerical_cols = [
    cname for cname in feature_cols
    if train[cname].dtype in ['int64', 'float64']
]

# Columns one-hot encoded by the categorical branch of the preprocessor.
categorical_cols = [
    cname for cname in feature_cols
    if train[cname].dtype in ["object", "bool"]
]

## **Train Test Split**

In [None]:
# Label vector (cast to bool) and feature matrix (everything except the label).
y = train[target].astype(bool)
X = train.drop(columns=target)

# Shuffle features and labels together with the notebook seed.
X, y = shuffle(X, y, random_state=random_state)

# 80/20 split, stratified on the label so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

## **Pre-Processing**

In [None]:
# Numerical branch: standardise each column.
numerical_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: dense one-hot encoding; categories not seen during fit
# are ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ]
)

# Route each column group to its branch; columns in neither list pass through unchanged.
column_branches = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(
    transformers=column_branches,
    remainder='passthrough',
)

Logistic Regression: Unlike linear regression, which uses least squares, this model uses maximum likelihood estimation to fit a sigmoid curve to the target variable. The sigmoid/logistic curve is commonly used when the target in question is binary.

K-Nearest Neighbors (KNN): KNN works by selecting the majority class of the k nearest neighbours, where the metric used is usually Euclidean distance. It is a simple and effective algorithm but can be sensitive to many factors, e.g. the value of k, the preprocessing applied to the data and the metric used.

Random Forest (RF): RF is a reliable ensemble of decision trees, which can be used for regression or classification problems. Here, the individual trees are built via bagging (i.e. aggregation of bootstraps, which are simply multiple training datasets created by sampling with replacement), and each split considers only a random subset of the features. The resulting diverse forest of uncorrelated trees exhibits reduced variance; it is therefore more robust to changes in the data and carries its prediction accuracy over to new data. It works well with both continuous and categorical data.

Extreme Gradient Boosting (XGBoost): XGBoost is similar to RF in that it is made up of an ensemble of decision trees. The difference arises in how those trees are derived: instead of growing independent trees on bootstrap samples, XGBoost builds them sequentially via gradient boosting, fitting each new tree to reduce the error of the current ensemble when optimising its objective function. It often produces the best results but is relatively slow compared to other gradient boosting algorithms.

Light Gradient Boosting Machine (LGBM): LGBM works essentially the same as XGBoost but with a lighter boosting technique (histogram-based split finding and leaf-wise tree growth). It usually produces similar results to XGBoost but is significantly faster.

Categorical Boosting (CatBoost): CatBoost is an open source algorithm based on gradient boosted decision trees. It supports numerical, categorical and text features. It works well with heterogeneous data and even relatively small data. Informally, it tries to take the best of both worlds from XGBoost and LGBM.

In [None]:
def _with_preprocessing(step_name, estimator):
    """Return a Pipeline that applies the shared `preprocessor`, then `estimator`.

    Parameters
    ----------
    step_name : str
        Name of the estimator step; it is the prefix used to address the
        estimator's hyper-parameters (``'<step_name>__<param>'``).
    estimator : estimator object
        Unfitted classifier placed after the preprocessing step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two-step pipeline: ``'preprocessor'`` followed by ``step_name``.
    """
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        (step_name, estimator),
    ])


# Every estimator that accepts a seed receives the notebook-wide `random_state`,
# including XGBoost (previously the only one left unseeded).
# KNeighborsClassifier has no such parameter.
lg_model = LogisticRegression(random_state=random_state, max_iter=5000)
lg_pipeline = _with_preprocessing('lg', lg_model)

knn_model = KNeighborsClassifier()
knn_pipeline = _with_preprocessing('knn', knn_model)

rf_model = RandomForestClassifier(random_state=random_state)
rf_pipeline = _with_preprocessing('random_forest', rf_model)

xgb_model = XGBClassifier(random_state=random_state)
xgb_pipeline = _with_preprocessing('xgb', xgb_model)

# verbose=-1 matches the LightGBM models built in the grid-search section.
lgbm_model = LGBMClassifier(random_state=random_state, verbose=-1)
lgbm_pipeline = _with_preprocessing('lgbm', lgbm_model)

catboost_model = CatBoostClassifier(random_state=random_state, verbose=0)
catboost_pipeline = _with_preprocessing('catboost', catboost_model)


# Display name -> pipeline, in the order the models are reported.
pipelines = {
    "Logistic Regression": lg_pipeline,
    "KNN": knn_pipeline,
    "Random Forest": rf_pipeline,
    "XGBoost": xgb_pipeline,
    "LightGBM": lgbm_pipeline,
    "CatBoost": catboost_pipeline,
}


# Baseline comparison: 10-fold cross-validated score of each pipeline.
# Only the training split is used, so X_test / y_test stay unseen until the
# tuned models are scored on them in the grid-search section.
for name, pipeline in pipelines.items():
    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()
    scores = cross_val_score(pipeline, X_train, y_train, cv=10)
    end_time = time.perf_counter()
    elapsed_time = (end_time - start_time) / 60  # seconds -> minutes

    print(f"{name}: {scores.mean():.2f} ({elapsed_time:.2f} minutes)")

## **Grid Search CV**

## **XGB**

In [22]:
# Pipeline under tuning: shared preprocessing followed by an XGBoost classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=random_state)),
])

# Candidate values (4 x 3 x 3 = 36 combinations). The 'classifier__' prefix
# routes each entry to the 'classifier' step of the pipeline.
param_grid = [
    {
        'classifier__n_estimators': [50, 100, 150, 200],
        'classifier__max_depth': [4, 8, 12],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
    }
]

# Exhaustive search scored by 5-fold cross-validated accuracy on the training split.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

# Accuracy of the selected configuration on the hold-out split.
test_score = grid_search.score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_score))

Best parameters found:  {'classifier__learning_rate': 0.2, 'classifier__max_depth': 4, 'classifier__n_estimators': 200}
Best cross-validation accuracy: 0.81
Test set accuracy: 0.79


In [24]:
# XGBoost with fixed hyper-parameters (the values reported by the grid search above).
tuned_xgb = XGBClassifier(
    random_state=random_state,
    n_estimators=200,
    max_depth=4,
    learning_rate=0.2,
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', tuned_xgb),
])

# Mean 5-fold accuracy on the training split.
fold_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
cross_val_accuracy = fold_scores.mean()
print("Best cross-validation accuracy: {:.2f}".format(cross_val_accuracy))

# Fit on the full training split, then score on the hold-out split.
test_score = pipeline.fit(X_train, y_train).score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_score))

Best cross-validation accuracy: 0.81
Test set accuracy: 0.79


## **LGBM**

In [23]:
# Pipeline under tuning: shared preprocessing followed by a LightGBM classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(random_state=random_state, verbose=-1)),
])

# Candidate values (4 x 3 x 3 = 36 combinations). The 'classifier__' prefix
# routes each entry to the 'classifier' step of the pipeline.
param_grid = [
    {
        'classifier__n_estimators': [50, 100, 150, 200],
        'classifier__max_depth': [4, 8, 12],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
    }
]

# Exhaustive search scored by 5-fold cross-validated accuracy on the training split.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

# Accuracy of the selected configuration on the hold-out split.
test_score = grid_search.score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_score))

Best parameters found:  {'classifier__learning_rate': 0.1, 'classifier__max_depth': 4, 'classifier__n_estimators': 150}
Best cross-validation accuracy: 0.81
Test set accuracy: 0.79


In [25]:
# LightGBM with fixed hyper-parameters (the values reported by the grid search above).
tuned_lgbm = LGBMClassifier(
    random_state=random_state,
    learning_rate=0.1,
    max_depth=4,
    n_estimators=150,
    verbose=-1,
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', tuned_lgbm),
])

# Mean 5-fold accuracy on the training split.
fold_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
cross_val_accuracy = fold_scores.mean()
print("Best cross-validation accuracy: {:.2f}".format(cross_val_accuracy))

# Fit on the full training split, then score on the hold-out split.
test_score = pipeline.fit(X_train, y_train).score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_score))

Best cross-validation accuracy: 0.81
Test set accuracy: 0.79


## **CatBoost**

In [27]:
# Pipeline under tuning: shared preprocessing followed by a CatBoost classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(random_state=random_state, verbose=0)),
])

# Candidate values (4 x 3 x 3 = 36 combinations). The 'classifier__' prefix
# routes each entry to the 'classifier' step of the pipeline.
param_grid = [
    {
        'classifier__n_estimators': [50, 100, 150, 200],
        'classifier__max_depth': [4, 8, 12],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
    }
]

# Exhaustive search scored by 5-fold cross-validated accuracy on the training split.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

# Accuracy of the selected configuration on the hold-out split.
test_score = grid_search.score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_score))

Best parameters found:  {'classifier__learning_rate': 0.1, 'classifier__max_depth': 8, 'classifier__n_estimators': 150}
Best cross-validation accuracy: 0.81
Test set accuracy: 0.78


In [28]:
# CatBoost with fixed hyper-parameters (the values reported by the grid search above).
tuned_catboost = CatBoostClassifier(
    random_state=random_state,
    n_estimators=150,
    max_depth=8,
    learning_rate=0.1,
    verbose=0,
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', tuned_catboost),
])

# Mean 5-fold accuracy on the training split.
fold_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
cross_val_accuracy = fold_scores.mean()
print("Best cross-validation accuracy: {:.2f}".format(cross_val_accuracy))

# Fit on the full training split, then score on the hold-out split.
test_score = pipeline.fit(X_train, y_train).score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_score))

Best cross-validation accuracy: 0.81
Test set accuracy: 0.78
