# Spaceship Titanic

## Import everything

In [52]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

# Read the train and test CSV files from the current working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

# Separate the target column from the remaining training columns.
y = train_data["Transported"]
X = train_data.drop(columns=["Transported"])
# The test frame is used as-is; X_test is the same object as test_data (no copy).
X_test = test_data


## Basic EDA

In [53]:
# Partition the training rows by the value of the "Transported" target.
transported = train_data.loc[train_data["Transported"].eq(True)]
not_transported = train_data.loc[train_data["Transported"].eq(False)]

# Optional checks, currently disabled:
#
# Summary statistics of each group
# print(transported.describe())
# print(not_transported.describe())
#
# Locate the rows in which any column is null
# rows = np.where(pd.isnull(X))
# X.iloc[rows]
#
# Names of all columns that contain null values
# null_col_names = X.columns[X.isnull().any()]
# null_col_names

# Show the first rows of the feature frame as the cell output.
X.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


## Dealing with missing values

In [54]:
from sklearn.impute import SimpleImputer

# Fill every missing value with the most frequent value of its column.
# The imputer is fitted on the training features only and then reused on the
# test features, so both frames are filled with the same per-column values.
imputer1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# The imputer returns a bare array. Wrapping it in a new DataFrame without
# further care loses the row labels and, for a frame that mixes text and
# numbers, leaves every column with dtype `object`. Rebuild each frame on its
# original index and cast each column back to the dtype it had before
# imputation so that numeric columns stay numeric.
imp_X = pd.DataFrame(
    imputer1.fit_transform(X), columns=X.columns, index=X.index
).astype(X.dtypes.to_dict())
imp_X_test = pd.DataFrame(
    imputer1.transform(test_data), columns=test_data.columns, index=test_data.index
).astype(test_data.dtypes.to_dict())

## Split the Cabin column into Deck, Num, and Side

In [55]:
# "Cabin" values are split on "/" into at most three parts, which become the
# columns "Deck", "Num" and "Side".
cabin_part_names = {0: "Deck", 1: "Num", 2: "Side"}

# Training features: split, name the parts, append them, drop the source column.
cabin = (
    imp_X["Cabin"]
    .str.split("/", n=2, expand=True)
    .rename(columns=cabin_part_names)
)
split_X = pd.concat([imp_X, cabin], axis=1).drop(columns=["Cabin"])

# Test features: exactly the same transformation.
cabin_test = (
    imp_X_test["Cabin"]
    .str.split("/", n=2, expand=True)
    .rename(columns=cabin_part_names)
)
split_X_test = pd.concat([imp_X_test, cabin_test], axis=1).drop(columns=["Cabin"])


## Basic logistic regression model

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

# Columns used by the model. "Destination" is currently left out; to include it,
# add it to both `features` and `categorical_features`.
features = ["HomePlanet", "CryoSleep", "Age", "VIP", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Deck", "Num", "Side"]
# Subset of `features` that is one-hot encoded; the rest is passed through as numbers.
categorical_features = ["HomePlanet", "CryoSleep", "VIP", "Deck", "Side"]

y_lg = y
# "Num" is produced by a string split of "Cabin", so it arrives as text.
# Convert it explicitly instead of relying on the estimator to coerce it.
X_lg = split_X[features].assign(Num=lambda frame: pd.to_numeric(frame["Num"]))
X_lg_test = split_X_test[features].assign(Num=lambda frame: pd.to_numeric(frame["Num"]))

# One-hot encode the categorical features; the encoder is fitted on the
# training rows only and reused for the test rows.
one_hot = OneHotEncoder()
one_hot_X = one_hot.fit_transform(X_lg[categorical_features])
one_hot_X_test = one_hot.transform(X_lg_test[categorical_features])
encoded_names = one_hot.get_feature_names_out()

# Replace the raw categorical columns with their encoded counterparts.
# The encoder output is positional, so the encoded frame is given the index of
# the frame it was computed from; the column-wise concat then lines rows up
# correctly even if that index is not the default 0..n-1 range.
X_lg = pd.concat(
    [
        X_lg.drop(columns=categorical_features),
        pd.DataFrame(one_hot_X.toarray(), columns=encoded_names, index=X_lg.index),
    ],
    axis=1,
)
X_lg_test = pd.concat(
    [
        X_lg_test.drop(columns=categorical_features),
        pd.DataFrame(one_hot_X_test.toarray(), columns=encoded_names, index=X_lg_test.index),
    ],
    axis=1,
)

# Fit the classifier; max_iter is raised well above the default iteration limit.
model = LogisticRegression(max_iter=10000)
model.fit(X_lg, y_lg)
predictions = model.predict(X_lg_test)

# Write the submission file: one prediction per test passenger.
# NOTE(review): the file name says "rfc1" although the model is a logistic
# regression; confirm the intended name before changing it.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Transported': predictions})
output.to_csv('rfc1_submission.csv', index=False)

## Cross validation to select the best imputer

In [57]:
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


# Split "Cabin" on "/" into the columns "Deck", "Num" and "Side". This cell
# starts again from the raw (un-imputed) frames X and X_test.
cabin = (
    X["Cabin"]
    .str.split("/", n=2, expand=True)
    .rename(columns={0: "Deck", 1: "Num", 2: "Side"})
)
split_X = pd.concat([X, cabin], axis=1).drop(columns=["Cabin"])

cabin_test = (
    X_test["Cabin"]
    .str.split("/", n=2, expand=True)
    .rename(columns={0: "Deck", 1: "Num", 2: "Side"})
)
split_X_test = pd.concat([X_test, cabin_test], axis=1).drop(columns=["Cabin"])

# Columns used by the model. "Destination" is currently left out; to include it,
# add it to both `features` and `categorical_features`.
features = ["HomePlanet", "CryoSleep", "Age", "VIP", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Deck", "Num", "Side"]
categorical_features = ["HomePlanet", "CryoSleep", "VIP", "Deck", "Side"]
# Not referenced below; kept for the disabled scaling step.
numerical_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Num"]

y_lg = y
# "Num" is produced by a string split, so it arrives as text (missing where
# "Cabin" was missing). Convert it explicitly instead of relying on the
# imputer/estimator to coerce it.
X_lg = split_X[features].assign(Num=lambda frame: pd.to_numeric(frame["Num"]))
X_lg_test = split_X_test[features].assign(Num=lambda frame: pd.to_numeric(frame["Num"]))

# One-hot encode the categorical features; the encoder is fitted on the
# training rows only and reused for the test rows.
# NOTE(review): the categorical columns have not been imputed at this point, so
# missing values reach the encoder; how they are encoded depends on the
# installed scikit-learn version. Confirm this is intended.
one_hot = OneHotEncoder()
one_hot_X = one_hot.fit_transform(X_lg[categorical_features])
one_hot_X_test = one_hot.transform(X_lg_test[categorical_features])
encoded_names = one_hot.get_feature_names_out()

# Replace the raw categorical columns with their encoded counterparts. The
# encoder output is positional, so the encoded frame is given the index of the
# frame it was computed from before the column-wise concat.
X_lg = pd.concat(
    [
        X_lg.drop(columns=categorical_features),
        pd.DataFrame(one_hot_X.toarray(), columns=encoded_names, index=X_lg.index),
    ],
    axis=1,
)
X_lg_test = pd.concat(
    [
        X_lg_test.drop(columns=categorical_features),
        pd.DataFrame(one_hot_X_test.toarray(), columns=encoded_names, index=X_lg_test.index),
    ],
    axis=1,
)

# Optional scaling step, currently disabled:
# scaler = StandardScaler()
# X_lg = pd.DataFrame(scaler.fit_transform(X_lg), columns=X_lg.columns)
# X_lg_test = pd.DataFrame(scaler.transform(X_lg_test), columns=X_lg_test.columns)

# Fill the remaining missing values. The imputer is fitted on the training rows
# and reused for the test rows; row labels are carried over to the results.
# Alternative imputer that was tried:
# imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
imputer = KNNImputer(n_neighbors=20)
X_lg = pd.DataFrame(imputer.fit_transform(X_lg), columns=X_lg.columns, index=X_lg.index)
X_lg_test = pd.DataFrame(imputer.transform(X_lg_test), columns=X_lg_test.columns, index=X_lg_test.index)

model = LogisticRegression(max_iter=10000)

# Cross-validated accuracy, currently disabled.
# NOTE(review): if re-enabled as written, the imputer above has already been
# fitted on all training rows, so each fold sees information from its own
# validation rows; fit the imputer inside a Pipeline for an unbiased score.
# score = cross_val_score(model, X_lg, y_lg, cv=5)
# print(score.mean())

model.fit(X_lg, y_lg)
predictions = model.predict(X_lg_test)

# Write the submission file: one prediction per test passenger.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Transported': predictions})
output.to_csv('lg_submission_KNN.csv', index=False)

