<a href="https://colab.research.google.com/github/filsto/spaceship_titanic/blob/main/spaceship_titanic_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [71]:
import tensorflow as tf
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

**PART A: DATA**

A.1 load data

A.2 define target

A.3 explore data

A.4 Data extraction

A.5 Imputation of missing data

A.6 get_dummies

A.7 Normalization

In [3]:
# Part A.1 - load data

from google.colab import drive
drive.mount('/content/drive')
from google.colab import files

Mounted at /content/drive


In [101]:
train_set = pd.read_csv('/content/drive/MyDrive/DATA/spaceship_titanic/train.csv', sep=",")
test_set = pd.read_csv('/content/drive/MyDrive/DATA/spaceship_titanic/test.csv', sep=",")

In [117]:
# Part A.2 - define target

y = train_set['Transported']
X = train_set.drop(['Transported', 'Name', 'PassengerId'], axis=1)
id = test_set['PassengerId']
X_test = test_set.drop([ 'Name', 'PassengerId'], axis=1)

In [50]:
# Part A.3 - explore data

X.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


In [51]:
X.isnull().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
dtype: int64

In [52]:
X.dtypes

HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
dtype: object

In [119]:
# Part A.4 - Data extraction

def SplitCabin(data):
  data['Deck']=data['Cabin'].str.split("/", n=2,expand=True)[0]
  data['Side']=data['Cabin'].str.split("/", n=2, expand=True)[2]
  data.pop('Cabin')
  return data

X = SplitCabin(X)
X_test = SplitCabin(X_test)

In [126]:
len(X_test)

4277

In [121]:
def Cost(data):
  data['TotalCost']=data['RoomService']+data['FoodCourt']+data['ShoppingMall']+data['Spa']
  return data

X = Cost(X)
X_test = Cost(X_test)

In [123]:
# Partie A.5 - Imputation of missing data
# numerical: median, categorical: most represented
# By the hell, I got the intrinsic feeling that I should use unsupervised machine learning to impute missing data from available data
# Next time ?

def missingData(data):

  numeric_data =  [column for column in data.select_dtypes('float64')]
  categoric_data = [column for column in data.select_dtypes('object')]

  for col in numeric_data:
    data[col].fillna(data[col].median(), inplace=True)

  for col in categoric_data:
    data[col].fillna(data[col].value_counts().index[0], inplace=True) 
  
  return data

X = missingData(X)
X_test = missingData(X_test)


In [77]:
X.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalCost,HomePlanet_Earth,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,0,-1.957965,0,-0.333609,-0.281203,-0.284053,-0.270866,-0.263235,-0.492436,0,...,0,1,0,0,0,0,0,0,1,0
1,0,-2.030903,0,-0.333359,-0.281199,-0.283983,-0.270433,-0.263201,-0.492302,1,...,0,0,0,0,0,1,0,0,0,1
2,0,-1.865576,1,-0.33351,-0.279798,-0.284053,-0.265565,-0.263197,-0.490442,0,...,1,0,0,0,0,0,0,0,0,1
3,0,-1.98714,0,-0.333609,-0.280699,-0.283015,-0.268238,-0.263085,-0.491474,0,...,1,0,0,0,0,0,0,0,0,1
4,0,-2.069804,0,-0.332914,-0.281175,-0.283631,-0.27042,-0.263234,-0.492226,1,...,0,0,0,0,0,1,0,0,0,1


In [125]:
# Part A.6 - get dummies

def categ(data):
  data=pd.get_dummies(data)
  data['CryoSleep'] = data['CryoSleep']*1
  data['VIP'] = data['VIP']*1
  return data

X = categ(X)
X_test = categ(X_test)


In [127]:
# Part A.7 - Normalization
# standardscaler

scaler = StandardScaler()

def stdscale(data,fitorno):
  liste=['Age','RoomService','FoodCourt','ShoppingMall', 'Spa', 'VRDeck','TotalCost']
  if fitorno == 1:
    data[liste] = scaler.fit_transform(data[liste])
  else:
    data[liste] = scaler.transform(data[liste])


  return data

X = stdscale(X, 1)
X_test = stdscale(X_test, 0)

PART B:

THE MODEL

we gonna use a gradient boosting classification model, 
because that's what I wanna use

next time we'll give a try with random forest

In [128]:
X_train, X_valid, y_train, y_valid= train_test_split (X, y, random_state=42)

In [129]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

model = XGBRegressor(n_estimators=1000, learning_rate=0.1)

model.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_valid, y_valid)], verbose=True)
pred=model.predict(X_valid)

print('mean_absolute_error : ', str(mean_absolute_error(pred, y_valid)))


[0]	validation_0-rmse:0.48459
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:0.471233
[2]	validation_0-rmse:0.459959
[3]	validation_0-rmse:0.450155
[4]	validation_0-rmse:0.442291
[5]	validation_0-rmse:0.435037
[6]	validation_0-rmse:0.429151
[7]	validation_0-rmse:0.424378
[8]	validation_0-rmse:0.419753
[9]	validation_0-rmse:0.415848
[10]	validation_0-rmse:0.412238
[11]	validation_0-rmse:0.409336
[12]	validation_0-rmse:0.405419
[13]	validation_0-rmse:0.402412
[14]	validation_0-rmse:0.399699
[15]	validation_0-rmse:0.3974
[16]	validation_0-rmse:0.395974
[17]	validation_0-rmse:0.394182
[18]	validation_0-rmse:0.392603
[19]	validation_0-rmse:0.3911
[20]	validation_0-rmse:0.390452
[21]	validation_0-rmse:0.389125
[22]	validation_0-rmse:0.388434
[23]	validation_0-rmse:0.387392
[24]	validation_0-rmse:0.386868
[25]	validation_0-rmse:0.386005
[26]	validation_0-rmse:0.385331
[27]	validation_0-rmse:0.384798
[28]	validation_0-rmse:0.384183
[29]	validation_0-rmse

**PART C**

THE PREDICTION

In [130]:
answer = model.predict(X_test)

In [131]:
ans= (answer > 0.5)
  

In [135]:
d = { 'PassengerId' : id, 'Transported': ans}
df=pd.DataFrame (data=d)

df.to_csv('/content/drive/MyDrive/DATA/spaceship_titanic/Submission.csv', encoding='utf-8', index=False)

In [133]:
df


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,True
