In [65]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_regression
import xgboost as xgb
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,r2_score,classification_report,confusion_matrix

In [114]:
train_dataset = pd.read_csv("/Users/julius/Personal/Kaggle_competition/spaceship-titanic/Space_Titanic_Kaggle_Comp/Dataset/train.csv")
test_dataset = pd.read_csv("/Users/julius/Personal/Kaggle_competition/spaceship-titanic/Space_Titanic_Kaggle_Comp/Dataset/test.csv")

In [91]:
train_dataset

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [92]:
#checking for null value
train_dataset.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

# Data Preprocessing

In [93]:
train_dataset.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [94]:
train_dataset = train_dataset.drop(columns=['Name','PassengerId'])

In [96]:
train_dataset

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


In [115]:
# Handling Cabin

train_dataset[['Deck','CabinNum','Side']] = train_dataset["Cabin"].str.split("/",expand=True)
test_dataset[['Deck','CabinNum','Side']] = test_dataset["Cabin"].str.split("/",expand=True)


In [116]:
categorical_value = ['HomePlanet','CryoSleep','Destination',"VIP",'Deck','Side']
numerical_value = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','CabinNum']

# Handling categorical Value
train_dataset[categorical_value] = train_dataset[categorical_value].fillna("Unknown")
test_dataset[categorical_value]= test_dataset[categorical_value].fillna('Unknown')

# Handling numberical value
train_dataset[numerical_value] = train_dataset[numerical_value].fillna(value=0)
test_dataset[numerical_value] = test_dataset[numerical_value].fillna(value=0)

In [117]:
train_dataset = train_dataset.drop(columns=['Cabin'])
test_dataset = test_dataset.drop(columns=['Cabin'])

In [118]:
train_dataset

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,CabinNum,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,A,98,P
8689,9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,G,1499,S
8690,9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,G,1500,S
8691,9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,E,608,S


In [119]:
#checking for null value
train_dataset.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
Deck              0
CabinNum          0
Side              0
dtype: int64

In [120]:
train_dataset = train_dataset.replace(np.nan,0)

In [121]:
train_dataset.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
Deck             object
CabinNum         object
Side             object
dtype: object

In [122]:
#feature encoding

#encoding Categorical
encode_cols = ['HomePlanet','CryoSleep','Destination','VIP','Deck','CabinNum','Side']

train_dataset[encode_cols] = train_dataset[encode_cols].astype(str)
test_dataset[encode_cols]=  test_dataset[encode_cols].astype(str)

encoder = OneHotEncoder(
    handle_unknown='ignore',sparse_output=False
)

encoder.fit(train_dataset[encode_cols])



In [123]:
train_df_encoded = encoder.transform(train_dataset[encode_cols])
test_df_encoded = encoder.transform(test_dataset[encode_cols])

In [124]:
train_df_encoded = pd.DataFrame(train_df_encoded, columns=encoder.get_feature_names_out(encode_cols))
test_df_encoded = pd.DataFrame(test_df_encoded, columns=encoder.get_feature_names_out(encode_cols))

In [125]:
#combining encodedd dataframe to the original dataframe
train_df_combine = pd.concat([train_dataset.drop(encode_cols,axis=1),train_df_encoded],axis=1)
test_df_combine = pd.concat([test_dataset.drop(encode_cols,axis=1),test_df_encoded],axis=1)

In [126]:
test_df_combine

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,HomePlanet_Earth,HomePlanet_Europa,...,CabinNum_993,CabinNum_994,CabinNum_995,CabinNum_996,CabinNum_997,CabinNum_998,CabinNum_999,Side_P,Side_S,Side_Unknown
0,0013_01,27.0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0018_01,19.0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0019_01,31.0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0021_01,38.0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0023_01,20.0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,34.0,0.0,0.0,0.0,0.0,0.0,Jeron Peter,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4273,9269_01,42.0,0.0,847.0,17.0,10.0,144.0,Matty Scheron,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4274,9271_01,0.0,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4275,9273_01,0.0,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [127]:
train_df_combine

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,HomePlanet_Earth,...,CabinNum_993,CabinNum_994,CabinNum_995,CabinNum_996,CabinNum_997,CabinNum_998,CabinNum_999,Side_P,Side_S,Side_Unknown
0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,41.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8689,9278_01,18.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8690,9279_01,26.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8691,9280_01,32.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [154]:
# preparing model 

X = train_df_combine.drop(columns=['Transported','PassengerId','Name','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck'])
y = train_df_combine['Transported']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [155]:
X_train

Unnamed: 0,Age,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Unknown,CryoSleep_False,CryoSleep_True,CryoSleep_Unknown,Destination_55 Cancri e,Destination_PSO J318.5-22,...,CabinNum_993,CabinNum_994,CabinNum_995,CabinNum_996,CabinNum_997,CabinNum_998,CabinNum_999,Side_P,Side_S,Side_Unknown
2333,28.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2589,17.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8302,28.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8177,20.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
500,36.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,18.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5191,50.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5390,22.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
860,34.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [156]:
#XGBoost model

model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    early_stopping_rounds=10,
    random_state=42    
)

#train model

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)

val_preds = model.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, val_preds):.4f}")

[0]	validation_0-logloss:0.66437
[1]	validation_0-logloss:0.64043
[2]	validation_0-logloss:0.63631
[3]	validation_0-logloss:0.61684
[4]	validation_0-logloss:0.60105
[5]	validation_0-logloss:0.58745
[6]	validation_0-logloss:0.57624
[7]	validation_0-logloss:0.56720
[8]	validation_0-logloss:0.55884
[9]	validation_0-logloss:0.55165
[10]	validation_0-logloss:0.54495
[11]	validation_0-logloss:0.53969
[12]	validation_0-logloss:0.53502
[13]	validation_0-logloss:0.53079
[14]	validation_0-logloss:0.52741
[15]	validation_0-logloss:0.52473
[16]	validation_0-logloss:0.52165
[17]	validation_0-logloss:0.51970
[18]	validation_0-logloss:0.51744
[19]	validation_0-logloss:0.51540
[20]	validation_0-logloss:0.51414
[21]	validation_0-logloss:0.51329
[22]	validation_0-logloss:0.51190
[23]	validation_0-logloss:0.51134
[24]	validation_0-logloss:0.51056
[25]	validation_0-logloss:0.50963
[26]	validation_0-logloss:0.50851
[27]	validation_0-logloss:0.50805
[28]	validation_0-logloss:0.50783
[29]	validation_0-loglos

In [163]:
X_test = test_df_combine[X_train.columns]  
test_preds = model.predict(X_test)

In [159]:
feature_cols = X_train.columns

In [160]:
X_test = test_df_combine[feature_cols]

In [166]:
submission = pd.DataFrame({
    'PassengerId': test_df_combine['PassengerId'],
    'Transported': test_preds.astype(bool)  # This adds the column
})

In [171]:
submission.to_csv('/Users/julius/Personal/Kaggle_competition/spaceship-titanic/Space_Titanic_Kaggle_Comp/Dataset/submission.csv', index=False)