# 0. Importing Libraries & Data

In [None]:
# Data Manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Misc
import warnings

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")


This dataset can be found at https://www.kaggle.com/competitions/spaceship-titanic 
***
#### Data Fields:
__PassengerId__ <br>
A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.<br>
__HomePlanet__ <br>
The planet the passenger departed from, typically their planet of permanent residence.<br>
__CryoSleep__ <br>
Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.<br>
__Cabin__ <br>
The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.<br>
__Destination__ <br>
The planet the passenger will be debarking to.<br>
__Age__ <br>
The age of the passenger.<br>
__VIP__ <br>
Whether the passenger has paid for special VIP service during the voyage.<br>
__RoomService, FoodCourt, ShoppingMall, Spa, VRDeck__ <br>
Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.<br>
__Name__ <br>
The first and last names of the passenger.<br>
__Transported__ <br>
Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.<br>

In [None]:
# Load the training CSV into `titanic_data`.
# The commented-out calls below point at the same file name in other
# locations (Linux home, Windows OneDrive, network share); only the last,
# uncommented call is executed.
# titanic_data = pd.read_csv("/home/mmmarinov/ProjectPortfolio/1. Titanic_SpaceShip - Binary Classification/Titanic_SpaceShip_Train_Data.csv")

#titanic_data = pd.read_csv(r"C:\Users\N179960\OneDrive - Munich Re\Martin Stuff\Personal\ProjectPortfolio\1. Data Science\1. Titanic_SpaceShip - Binary Classification\Titanic_SpaceShip_Train_Data.csv")

#titanic_data = pd.read_csv(r"\\192.168.68.200\ProjectPortfolio\1. Data Science\1. Titanic_SpaceShip - Binary Classification\Titanic_SpaceShip_Train_Data.csv")

titanic_data = pd.read_csv("/Users/martinmarinov/ProjectPortfolio/ProjectPortfolio/1. Data Science/1. Titanic_SpaceShip - Binary Classification/Titanic_SpaceShip_Train_Data.csv")


# 1. Investigate the Data

## 1.1 High-level understanding
This segment is to get a general sense of what kind of information is held

In [None]:
titanic_data.head()


In [None]:
titanic_data.info()

# Looking at what data types the dataset holds: which need to be changed to float, and which will need to be One-Hot Encoded


In [None]:
titanic_data.describe(include='all')

# Viewing the mean, std, min, max for numerical values and unique for categorical values
# Looking for an initial understanding of the spread and common values found in the dataset


In [None]:
# Summing a boolean column counts the True rows (True == 1, False == 0);
# count() is the number of non-null rows, so the remainder is the False rows.
true_count = titanic_data["Transported"].sum()
false_count = titanic_data["Transported"].count() - true_count

# Inputs for the pie chart
labels = ('True', 'False')
sizes = [true_count, false_count]
colors = ['lightcoral', 'lightskyblue']
explode = (0.1, 0)  # pull the first slice (True) out for emphasis


def autopct_format(pct):
    """Return a slice label: percentage with one decimal, then the absolute count.

    `pct` is the slice's share in percent as supplied by `plt.pie`; the count
    is recovered from the module-level `sizes` list.
    """
    absolute = int(round(pct * sum(sizes) / 100))
    return f"{pct:.1f}%\n({absolute:d})"


# Draw the chart
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct=autopct_format,
    shadow=True,
    startangle=140,
)
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.title("Transported Ratio")
plt.show()

# This data tells me that there isn't a class imbalance 
# and therefore we do not need to worry about undersampling, SMOTE, or class weight approaches

In [None]:
titanic_data.isna().sum()

# Each column has missing information. Since the data is both numerical and categorical, I'll likely need to use a mix of techniques to populate the values


Numerical: <br>
- PassengerId (if separated) 
- Age (#)
- RoomService ($)
- FoodCourt ($)
- ShoppingMall ($)
- Spa ($)
- VRDeck ($)

Categorical: <br>
- HomePlanet (Unique: 3)
- CryoSleep (T/F)
- Cabin (Deck and Side)
- Destination (Unique: 3)
- VIP (T/F)
- Name (Mostly Unique)

Target Variable: <br>
- Transported (T/F)

## 1.2 Explore the Data

In [None]:
# Make a copy so that the original remains untouched
titanic_data_eda = titanic_data.copy()


In [None]:
# Cabin has the form deck/num/side; split it to look for trends by deck level
# or by side of the ship.
titanic_data_eda[['Deck', 'Cabin_num', 'Side']] = (
    titanic_data_eda['Cabin'].str.split(pat='/', expand=True)
)

# PassengerId has the form gggg_pp; split it to expose the travel group.
titanic_data_eda[['Pass_group', 'Pass_id']] = (
    titanic_data_eda['PassengerId'].str.split(pat='_', expand=True)
)


In [None]:
# One-hot encode Deck and Side so their relation to the target variable
# (Transported) can be inspected in the correlation analysis.
titanic_data_eda_dummies = pd.get_dummies(
    titanic_data_eda, columns=['Deck', 'Side'])

# Planets and destinations are treated as ordinal (closer/farther from one
# another), so they are encoded as category codes rather than one-hot columns.
# The same logic could apply to Deck, but since we do not know for sure which
# deck is closer or farther, alphabetical order alone is not a safe ordering.
#
# `cat.codes` encodes missing values as -1. Left as-is, -1 would act as a fake
# "lowest" level in the correlation, so those entries are mapped back to NaN.
for column in ['HomePlanet', 'Destination']:
    codes = titanic_data_eda_dummies[column].astype('category').cat.codes
    titanic_data_eda_dummies[column] = codes.astype('float64').where(codes >= 0)


In [None]:
titanic_data_eda_dummies


In [None]:
# Drop identifier-like columns; they are unlikely to be valuable predictors.
titanic_data_eda_dropped = titanic_data_eda_dummies.drop(
    columns=['PassengerId', 'Cabin', 'Name', 'Cabin_num', 'Pass_id'])
titanic_data_eda_dropped


In [None]:
# Cast the remaining categorical fields to float so every column can be
# included in the numerical analysis below.
titanic_data_eda_dropped = titanic_data_eda_dropped.astype(
    dict.fromkeys(['CryoSleep', 'VIP', 'Transported', 'Pass_group'], 'float64'))


In [None]:
titanic_data_eda_dropped.info()


In [None]:
plt.figure(figsize=(24, 8))
sns.set_theme(style="white")
corr = titanic_data_eda_dropped.corr()
heatmap = sns.heatmap(corr, annot=True, cmap="Blues", fmt='.1g')

# The interesting relation to note is Cryosleep. Which looks to be at 0.5 in relation with Transported without any data cleaning.
# This looks like it is going to be a strong predictor and will need special attention when populating nulls.
# The next highest/lowest is +-0.2 which mainly comes from the different spend categories like RoomService or Spa/VRDeck.
# Since their relation is negative I should see that spending more decreased the chance of being transported


In [None]:
columns = ['CryoSleep', 'Age', 'VIP', 'RoomService',
           'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
sns.pairplot(titanic_data_eda_dropped, vars=columns, hue='Transported')
plt.show()


"""
Blue = Transported

Things to note:
- Someone in CryoSleep does not spend any money. I can use this to populate missing CryoSleep and the Expense columns
- The ratio of those who were transported vs not is apparent for CryoSleep and Age. Showing that these can be valuable features for prediction
- Looking at the spending, people who spent more money at the Spa/VRdeck/RoomService were less likely to be transported. While the opposite is true for Foodcourt and Shopping Mall
    - This ties back to the heatmap and provides a better look into the spread of the data points
- There is an age minmium to be a VIP. This can be used to fill in null values for Age or vise versa. We know that a 5 year old can not be a VIP
- There is an Age minimum to spending money as well. If VIP is null or False, then we can use moeny spent as a group to identify the mean value to fill in.
"""


In [None]:
# This proves my first obeservation with the relation to CryoSleep & Expenses
titanic_data_eda_dropped[['CryoSleep', 'RoomService', 'FoodCourt',
                          'Spa', 'ShoppingMall', 'VRDeck']].groupby('CryoSleep').sum()


In [None]:
# The youngest VIP is 18 Year Old
titanic_data_eda_dropped[['VIP', 'Age']].groupby('VIP').min()


In [None]:
# The youngest to spend any money is 13 Years Old
titanic_data_eda_dropped[['RoomService', 'FoodCourt', 'Spa', 'ShoppingMall', 'VRDeck', 'Age']]\
    .groupby('Age')\
    .sum()\
    .head(20)


In [None]:
titanic_data_eda_dropped.columns


In [None]:
columns = ['HomePlanet', 'Destination', 'Pass_group', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D',
           'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Side_P', 'Side_S']
sns.pairplot(titanic_data_eda_dropped, vars=columns, hue='Transported')
plt.show()

"""
Blue = Transported

Things to note:
- HomePlanet has some ability to predict the target variable. This ties back well to the heatmap. 
- Deck Side as expected is redudent. Keeping 1 would be fine. 
- Deck Level do not show much valuable information. It would be best to test whether the model fits better with or without this information.
"""


In [None]:
pass_group_counts = titanic_data_eda_dropped['Pass_group'].value_counts()
pass_group_counts_greater_than_one = pass_group_counts[pass_group_counts > 1]
pass_group_counts_greater_than_one


In [None]:
# Check to see if I can use the passenger group to fill in null values
titanic_data_eda_dropped[titanic_data_eda_dropped['Pass_group'].isin(pass_group_counts_greater_than_one.index) &
                         (titanic_data_eda_dropped['VIP'].isna())]


# (titanic_data_eda_dropped['VIP'].isna())


# 1.3 Clean the Data

In [None]:
# Discard cleaning frames left over from a previous run of this notebook.
# A bare `del` raises NameError on a fresh kernel (the names are not defined
# anywhere earlier), so remove them only if they exist.
for _stale_name in ('titanic_data_clean', 'titanic_data_clean_complete'):
    globals().pop(_stale_name, None)


In [None]:
# Copying the data once more to start fresh and separate out the EDA
titanic_data_clean = titanic_data.copy()


In [None]:
titanic_data_clean.info()


What fields to use: <br>
>PassengerId  | Exclude  <br>
HomePlanet   | Include <br>
CryoSleep    | Include  <br>
Cabin        | T/E Deck, Keep 1 Side  <br> 
Destination  | Include  <br>
Age          | Include  <br>
VIP          | Include  <br>
RoomService  | Include  <br>
FoodCourt    | Include  <br>
ShoppingMall | Include  <br>
Spa          | Include  <br>
VRDeck       | Include  <br>
Name         | Exclude  <br>

What I know regarding populating missing values
- Those who are in CryoSleep could not have spent any money, therefore if CryoSleep = True then the null expense values are 0
    - If CryoSleep is null, then sum the expenses and if it is 0 then set the value to True
    - If CryoSleep is False and an expense is null, then taking the mean of that column will likely be sufficient
- If Age is null, then I can use VIP and whether they spent money or not as a way to identify.
    - People under the age of 18 look like they can't be VIP based on the PairPlot. So if a null Age is VIP then we can use the mean of the VIP group to fill in the null value. 
    - Same goes for money spent. If there is any money spent, then we know they are at least 13 years or older meaning we can derive the mean age from that group. 
- I can use the passenger group to fill in missing home planet or Deck Side values, otherwise take the most frequent

In [None]:
titanic_data_clean.isna().sum()


### Populating NaNs using Domain Logic & Categorical Imputer

In [None]:
df = titanic_data.copy()

In [None]:
# CryoSleep & Expenses
# The five amenity bills that make up a passenger's spend
expense_columns = ['RoomService', 'FoodCourt',
                    'ShoppingMall', 'Spa', 'VRDeck']
# Total spend per passenger (missing bills are skipped by sum)
df['Expenses'] = df[expense_columns].sum(axis=1)

# Fill ONLY the missing CryoSleep values: zero total spend -> True, any spend
# -> False. Recorded values are kept; the previous version replaced every
# known value (including True) with False.
cryo_missing = df['CryoSleep'].isna()
df['CryoSleep'] = df['CryoSleep'].mask(
    cryo_missing, df['Expenses'] == 0).astype('bool')

# Fill missing bills: a passenger in CryoSleep cannot have spent money, so
# their gaps become 0; everyone else's gaps take the column mean (computed
# once, after the zeros have been filled in).
for column in expense_columns:
    spend = df[column].mask(df[column].isna() & df['CryoSleep'], 0)
    df[column] = spend.fillna(spend.mean()).astype('float64')

In [None]:
df[df['CryoSleep']==True][expense_columns].isna().sum()

In [None]:
# Age & VIP
# The youngest VIP in the data is 18, so a missing VIP flag on a passenger
# younger than 18 is set to False.
vip_missing = df['VIP'].isna()
df['VIP'] = df['VIP'].mask(vip_missing & (df['Age'] < 18), False)

# Remaining gaps take the most frequent value BEFORE casting to bool:
# casting a column that still holds NaN would silently turn every NaN into
# True (i.e. mark unknown passengers as VIP).
# Assumes at least one VIP value is present, otherwise mode() is empty.
df['VIP'] = df['VIP'].fillna(df['VIP'].mode().iloc[0]).astype('bool')

# Mean age of each VIP group, used to fill the missing ages of that group
VIP_true = df[df['VIP'] == True].Age.mean()
VIP_false = df[df['VIP'] == False].Age.mean()

df['Age'] = df['Age'].fillna(
    df['VIP'].map({True: VIP_true, False: VIP_false})).astype('float64')

In [None]:
df[['Age','VIP']].isna().sum()

In [None]:
print(f"Mean Age for VIP's: {VIP_true:.1f} Yrs \nMean Age for Non-VIP's: {VIP_false:.1f} Yrs")


In [None]:
# Categorical Imputing
# SimpleImputer is imported here because the notebook's scikit-learn import
# cell only appears later (section 2.1); without this the cell raises
# NameError when the notebook is run top to bottom.
from sklearn.impute import SimpleImputer

categorical_imputer = SimpleImputer(strategy='most_frequent')

categorical_columns = ['HomePlanet', 'Destination','Cabin','VIP']

# Replace missing values in each categorical column with its most frequent value
df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])


In [None]:
# Break the composite identifiers into their parts:
# Cabin -> deck/num/side, PassengerId -> group_number.
df[['Deck', 'Cabin_num', 'Side']] = df['Cabin'].str.split(pat='/', expand=True)
df[['Pass_group', 'Pass_id']] = df['PassengerId'].str.split(pat='_', expand=True)

# The group id is extracted as text; make it numeric.
df['Pass_group'] = pd.to_numeric(df['Pass_group'])

# One-hot encode deck and side.
df = pd.get_dummies(df, columns=['Deck', 'Side'])

# Encode home planet and destination as category codes.
df[['HomePlanet', 'Destination']] = df[['HomePlanet', 'Destination']].apply(
    lambda values: values.astype('category').cat.codes)

# Drop identifiers and the redundant second side indicator (Side_P is kept).
df = df.drop(columns=['PassengerId', 'Cabin', 'Name',
                      'Cabin_num', 'Pass_id', 'Side_S'])

In [None]:
df.head()

In [None]:
df.isna().sum()

# 2. Data Pipeline & Modelling

## 2.1 Identify Best Model

In [None]:
# Data Manipulation
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


In [None]:
titanic_data = pd.read_csv("/workspaces/ProjectPortfolio/1. Data Science/1. Titanic_SpaceShip - Binary Classification/Titanic_SpaceShip_Train_Data.csv")

In [None]:
def CryoExpenseImputer(df):
    """Fill missing CryoSleep flags and missing amenity bills.

    Adds an ``Expenses`` column (row-wise sum of the five amenity bills,
    ignoring missing ones), then:

    * CryoSleep: recorded values are kept; a missing value becomes True when
      the passenger's total spend is 0 and False otherwise. The column is
      returned as float64 (1.0 / 0.0). The previous implementation replaced
      every recorded value with False.
    * Each bill: a missing value becomes 0 for passengers in CryoSleep, and
      the column mean (taken after those zeros are filled) for everyone else.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``CryoSleep`` and the five expense columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    expense_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    df = df.copy()  # do not mutate the caller's frame
    df['Expenses'] = df[expense_columns].sum(axis=1)

    # Only the gaps are inferred from spend; known values pass through.
    known = df['CryoSleep'].notna()
    df['CryoSleep'] = df['CryoSleep'].where(known, df['Expenses'] == 0).astype('float64')

    asleep = df['CryoSleep'] == 1
    for column in expense_columns:
        spend = df[column].mask(df[column].isna() & asleep, 0)
        # Mean is computed once per column instead of once per row.
        df[column] = spend.fillna(spend.mean()).astype('float64')
    return df

def VIPAgeImputer(df):
    """Fill missing VIP flags and missing ages.

    * VIP: a missing value on a passenger younger than 18 becomes False;
      remaining gaps take the most frequent value (smallest on ties, matching
      ``SimpleImputer(strategy='most_frequent')``). Returned as float64
      (1.0 / 0.0).
    * Age: a missing value becomes the mean age of the passenger's VIP group.

    The most-frequent fill is done with ``Series.mode`` so the column is
    assigned a 1-D result rather than the ``(n, 1)`` array returned by
    ``SimpleImputer.fit_transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``VIP`` and ``Age``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()  # do not mutate the caller's frame

    vip = df['VIP'].astype('float64')
    vip = vip.mask(vip.isna() & (df['Age'] < 18), 0.0)
    modes = vip.mode()
    if not modes.empty:  # empty only when every VIP value is missing
        vip = vip.fillna(modes.iloc[0])
    df['VIP'] = vip

    is_vip = df['VIP'] == 1
    is_regular = df['VIP'] == 0
    VIP_true = df.loc[is_vip, 'Age'].mean()
    VIP_false = df.loc[is_regular, 'Age'].mean()

    age = df['Age'].astype('float64')
    age = age.mask(age.isna() & is_vip, VIP_true)
    age = age.mask(age.isna() & is_regular, VIP_false)
    df['Age'] = age
    return df

def HomeDestImputer(df):
    """Fill and encode ``HomePlanet`` and ``Destination``.

    For each of the two columns, missing values are replaced by the most
    frequent value and the result is then converted to numeric codes with an
    ``OrdinalEncoder``. The columns are overwritten on the frame that was
    passed in, and that same frame is returned.
    """
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OrdinalEncoder

    mode_filler = SimpleImputer(strategy='most_frequent')
    ordinal = OrdinalEncoder()

    for name in ['HomePlanet', 'Destination']:
        # Step 1: most-frequent fill (sklearn expects a 2-D column)
        raw = df[name].array.reshape(-1, 1)
        df[name] = mode_filler.fit_transform(raw).ravel()

        # Step 2: replace the labels with their ordinal codes
        filled = df[name].array.reshape(-1, 1)
        df[name] = ordinal.fit_transform(filled)

    return df

def CatagoryTransform(df):
    """Split ``Cabin`` into deck/number/side and one-hot encode deck and side.

    Missing cabins are first replaced by the most frequent cabin. ``Cabin`` is
    then split on ``/`` into ``Deck``, ``Cabin_num`` and ``Side``; ``Deck`` and
    ``Side`` are replaced by one-hot columns (a two-valued column is reduced to
    a single indicator via ``drop='if_binary'``).

    The encoded columns are built on the input's own index, so the returned
    frame keeps the caller's row labels and stays aligned with a separately
    held target. For a default RangeIndex the result is the same as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder

    df = df.copy()  # do not add helper columns to the caller's frame

    si = SimpleImputer(strategy='most_frequent')
    imputed_data = si.fit_transform(df['Cabin'].array.reshape(-1, 1))
    df['Cabin'] = imputed_data.ravel()
    df[['Deck', 'Cabin_num', 'Side']] = df['Cabin'].str.split('/', expand=True)

    categorical_columns = ['Deck', 'Side']

    encoder = OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False)
    df_encoded_columns = encoder.fit_transform(df[categorical_columns])

    # Reuse df.index so concat aligns row-for-row without resetting the index.
    df_encoded = pd.DataFrame(
        df_encoded_columns,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=df.index,
    )

    return pd.concat([df.drop(categorical_columns, axis=1), df_encoded], axis='columns')

def DropColumns(df):
    """Return ``df`` without the PassengerId, Cabin, Name and Cabin_num columns."""
    unused = ['PassengerId', 'Cabin', 'Name', 'Cabin_num']
    return df.drop(columns=unused)

In [None]:
# Candidate models for the grid search. Each entry holds a display name, an
# unfitted estimator and the hyper-parameter grid to search for it.
# The stochastic estimators are seeded with the same value as the
# train/test split (42) so repeated runs select the same model and scores.
classifiers = [
    {
        'name': 'Logistic Regression',
        'classifier': LogisticRegression(solver='lbfgs', max_iter=10000),
        'params': {
            'C': [0.1, 1.0, 10.0],
        },
    },
    {
        'name': 'Random Forest',
        'classifier': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 10, 20],
        },
    },
    {
        'name': 'Gradient Boosting',
        'classifier': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.1, 0.01],
        },
    },
    {
        'name': 'XGBoost',
        'classifier': xgb.XGBClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.1, 0.01],
        },
    },
]

In [None]:
# Separate the target column from the feature columns.
y = titanic_data['Transported']
X = titanic_data.drop(columns=['Transported'])

# Preprocessing steps, applied in this order; each one takes a DataFrame and
# returns the transformed DataFrame.
function_list = [
    CryoExpenseImputer,
    VIPAgeImputer,
    HomeDestImputer,
    CatagoryTransform,
    DropColumns,
]
for function in function_list:
    X = function(X)

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Grid-search every candidate on the training split and keep the one with the
# highest mean cross-validated accuracy.
best_model = None
best_model_name = None
best_params = None
best_accuracy = 0.0

for classifier_info in classifiers:
    classifier = classifier_info['classifier']
    params = classifier_info['params']

    grid_search = GridSearchCV(classifier, param_grid=params, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    if grid_search.best_score_ > best_accuracy:
        best_accuracy = grid_search.best_score_
        # best_estimator_ has already been refit on all of X_train
        best_model = grid_search.best_estimator_
        best_model_name = classifier_info['name']
        best_params = grid_search.best_params_

# Score the held-out split BEFORE any further fitting. Previously the model
# was refit on the full dataset (which contains X_test) first, so the
# reported test accuracy was measured on rows the model had been trained on.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Best Model: {best_model_name}")
print(f"Best Parameters: {best_params}")
# best_accuracy is the mean 5-fold CV score, not accuracy on the training rows
print(f"Cross-Validation Accuracy: {best_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# With evaluation done, train the final model on the entire dataset.
best_model.fit(X, y)