# <center> **Spaceship Titanic** </center>

# **Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Local helper module (provides `log_transform`, used further down).
import functions
import importlib
# Reload so that edits to the helper module are picked up without restarting the kernel.
importlib.reload(functions)

import warnings
import time

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

# NOTE(review): this cell repeats the first import cell line for line; keeping a
# single import cell would stop the two from drifting apart.
# Local helper module (provides `log_transform`, used further down).
import functions
import importlib
# Reload so that edits to the helper module are picked up without restarting the kernel.
importlib.reload(functions)

import warnings
import time

# **Data Overview and Preprocessing**

In [11]:
# Root folder of the project data. This is the only machine-specific value in the
# cell: change this one line to run the notebook elsewhere.
DATA_DIR = r"C:\Users\Dell\Documents\AI\Titanic\Data"

# `data.csv` is the frame the rest of the notebook models on (presumably the
# already-engineered train+test rows - confirm against the notebook that wrote it).
# `index_col=False` tells pandas not to use the first column as the index.
data = pd.read_csv(rf"{DATA_DIR}\data.csv", index_col=False)

# Original split files; further down only their PassengerId columns are used, to
# separate `data` back into train and test rows.
# NOTE(review): these live in a nested `Data\Data` folder while data.csv sits one
# level up - confirm that layout is intended.
train = pd.read_csv(rf"{DATA_DIR}\Data\train.csv", index_col=False)
test = pd.read_csv(rf"{DATA_DIR}\Data\test.csv", index_col=False)

# Seed shared by every stochastic step, and the name of the label column.
random_state = 101
target = 'Transported'

# **Machine Learning**

## **Split Data Back to Train and Test**

In [12]:
# Boolean masks over `data`: which rows carry a PassengerId that appears in the
# originally loaded train / test files.
in_train = data['PassengerId'].isin(train['PassengerId'].values)
in_test = data['PassengerId'].isin(test['PassengerId'].values)

# Rebind train/test to independent copies of the matching `data` rows.
train = data.loc[in_train].copy()
test = data.loc[in_test].copy()

# **Drop Unneeded Features**

In [13]:
# Columns removed from both frames before modelling; listed once so that train and
# test cannot get out of sync.
columns_to_drop = ['PassengerId', 'Group', 'CabinNumber']

# Non-mutating drop: each name is rebound to a new frame instead of being edited in
# place. Re-running this cell after it has succeeded still raises a KeyError,
# because the columns are already gone.
train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)

# **Log Transform**

A logarithmic transform reduces the skew of a distribution, especially one dominated by a few large outliers, which can make it easier for algorithms to 'learn' the correct relationships. We apply it to the expenditure features (including the total spent), as these are heavily skewed by outliers.

In [14]:
# Expenditure features that get log-transformed.
columns_to_transform = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'TotalSpent',
]

# Apply the helper one column at a time, to train first and then to test.
# NOTE(review): both frames are overwritten with the helper's result, so running
# this cell twice presumably applies the transform twice - confirm in functions.py.
for spend_col in columns_to_transform:
    train, test = [
        functions.log_transform(frame, spend_col) for frame in (train, test)
    ]

## **Column Separation**

### **Numerical**

In [15]:
# Feature columns stored as int64/float64. The target is skipped explicitly so that
# a numerically encoded label can never end up among the model inputs (it is
# dropped from X, so the ColumnTransformer would fail to find it).
numerical_cols = [
    cname
    for cname in train.columns
    if cname != target and train[cname].dtype in ['int64', 'float64']
]

### **Categorical**

In [16]:
# Feature columns stored as object/bool. The target is filtered out inside the
# comprehension rather than with list.remove(), which raises ValueError whenever
# the target's dtype is not object/bool.
categorical_cols = [
    cname
    for cname in train.columns
    if cname != target and train[cname].dtype in ["object", "bool"]
]

## **Train Test Split**

In [17]:
# X: every column except the target; y: the target cast to bool.
# Both are shuffled together once with the shared seed, so their rows stay aligned.
X, y = shuffle(
    train.drop(columns=target),
    train[target].astype(bool),
    random_state=random_state,
)

# Stratified 80/20 hold-out split of the shuffled data, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

# **Modeling**

In [18]:
# Numeric features are standardised.
numerical_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical features are one-hot encoded into a dense array; categories not seen
# during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group to its transformer; columns in neither list are passed
# through unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='passthrough',
)


def build_pipeline(step_name, estimator):
    """Return a Pipeline that runs the shared `preprocessor`, then `estimator`.

    Parameters
    ----------
    step_name : str
        Name given to the final (estimator) step of the pipeline.
    estimator : estimator object
        Unfitted classifier placed after the preprocessing step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two-step pipeline: ('preprocessor', preprocessor), (step_name, estimator).
    """
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        (step_name, estimator),
    ])


# Candidate classifiers. Every model that accepts a seed gets the shared
# `random_state`, so repeated runs are comparable. All pipelines reference the same
# `preprocessor` object.
lg_model = LogisticRegression(random_state=random_state, max_iter=5000)
lg_pipeline = build_pipeline('lg', lg_model)

knn_model = KNeighborsClassifier()
knn_pipeline = build_pipeline('knn', knn_model)

# Disabled candidate, kept for reference.
# svm_model = SVC(random_state=random_state, probability=True)
# svm_pipeline = build_pipeline('svm', svm_model)

rf_model = RandomForestClassifier(random_state=random_state)
rf_pipeline = build_pipeline('random_forest', rf_model)

# Seeded like the other models (it was previously the only seedable one without it).
xgb_model = XGBClassifier(random_state=random_state)
xgb_pipeline = build_pipeline('xgb', xgb_model)

lgbm_model = LGBMClassifier(random_state=random_state, verbose=0)
lgbm_pipeline = build_pipeline('lgbm', lgbm_model)

# Disabled candidate, kept for reference.
# catboost_model = CatBoostClassifier(random_state=random_state, verbose=0)
# catboost_pipeline = build_pipeline('catboost', catboost_model)

naive_bayes_model = GaussianNB()
naive_bayes_pipeline = build_pipeline('naive_bayes', naive_bayes_model)


# Display name -> pipeline, in the order they are evaluated.
pipelines = {
    "Logistic Regression": lg_pipeline,
    "KNN": knn_pipeline,
    # "SVM": svm_pipeline,
    "Random Forest": rf_pipeline,
    "XGBoost": xgb_pipeline,
    "LightGBM": lgbm_pipeline,
    # "CatBoost": catboost_pipeline,
    "Naive Bayes": naive_bayes_pipeline
}


# 10-fold cross-validation of every candidate pipeline. Prints the mean fold score
# (the estimator's default scorer, i.e. accuracy for these classifiers) and how
# long the whole CV run took.
# NOTE(review): the folds are drawn from the full X/y, which still contain the rows
# split off as X_test/y_test - confirm the hold-out set is not meant to stay unseen.
for name, pipeline in pipelines.items():
    # perf_counter is a monotonic clock meant for measuring intervals; unlike
    # time.time() it is not affected by system clock adjustments.
    start_time = time.perf_counter()
    scores = cross_val_score(pipeline, X, y, cv=10)
    end_time = time.perf_counter()
    elapsed_time = (end_time - start_time) / 60  # seconds -> minutes

    print(f"{name}: {scores.mean():.2f} ({elapsed_time:.2f} minutes)")

Logistic Regression: 0.77 (0.67 minutes)
KNN: 0.75 (0.20 minutes)
Random Forest: 0.79 (2.12 minutes)
XGBoost: 0.80 (0.78 minutes)
LightGBM: 0.80 (0.14 minutes)
Naive Bayes: 0.53 (0.17 minutes)
