In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import matplotlib.pyplot as plt

#### Load dataset

In [2]:
train_data = pd.read_csv(os.path.join(os.getcwd(), "data", "train.csv"))
test_data = pd.read_csv(os.path.join(os.getcwd(), "data", "test.csv"))
test_ids = test_data["PassengerId"]

In [3]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
train_data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
def clean(data):
    """
    Description: This function will do some data preprocessing, such as replace missing values in the inputed dataframe.

    Args:
        data (pd.DataFrame) : A pandas dataframe containing information about passengers.
    Returns:
        data (pd.Dataframe) : Returns the cleaned data frame.
    """

    data = data.drop(["PassengerId", "Cabin", "Name"], axis=1) # Drop columns that will likely not help our prediction

    cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Age"] # All these columns have some missing values and they are numerical
    for col in cols:
        data[col].fillna(data[col].median(), inplace=True) # This will replace the null values with the median value of that column

    cols = ["HomePlanet", "Destination"]
    for col in cols:
        data[col].fillna("Unkown", inplace=True) # We will just fill the missing data values with the string "Unknown"
    
    cols = ["CryoSleep", "VIP"]
    for col in cols:
        data[col].fillna(value=bool(False), inplace=True)
    
    # data.dropna(axis=0, how='any', thresh=None, inplace=True) # Drop any row that has missing values
    
    return data

#### Inspect/clean the data

In [7]:
train_data = clean(train_data)
test_data = clean(test_data)

In [8]:
train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [9]:
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0


In [10]:
train_data.isnull().sum()

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64

In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   bool   
 2   Destination   8693 non-null   object 
 3   Age           8693 non-null   float64
 4   VIP           8693 non-null   bool   
 5   RoomService   8693 non-null   float64
 6   FoodCourt     8693 non-null   float64
 7   ShoppingMall  8693 non-null   float64
 8   Spa           8693 non-null   float64
 9   VRDeck        8693 non-null   float64
 10  Transported   8693 non-null   bool   
dtypes: bool(3), float64(6), object(2)
memory usage: 568.9+ KB


In [12]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   object 
 1   CryoSleep     4277 non-null   bool   
 2   Destination   4277 non-null   object 
 3   Age           4277 non-null   float64
 4   VIP           4277 non-null   bool   
 5   RoomService   4277 non-null   float64
 6   FoodCourt     4277 non-null   float64
 7   ShoppingMall  4277 non-null   float64
 8   Spa           4277 non-null   float64
 9   VRDeck        4277 non-null   float64
dtypes: bool(2), float64(6), object(2)
memory usage: 275.8+ KB


In [13]:
type(train_data["CryoSleep"][0])

numpy.bool_

In [14]:
# Now we need to encode the String labels into numbers, so True/False will be 1/0
cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]

label_encoder = preprocessing.LabelEncoder()

for col in cols:
    train_data[col] = label_encoder.fit_transform(train_data[col])
    test_data[col] = label_encoder.fit_transform(test_data[col]) # We want to use the same trasnsformation from the training set
    print(f"{col}, {label_encoder.classes_}")

# train_data["Transported"] = label_encoder.fit_transform(train_data["Transported"])

HomePlanet, ['Earth' 'Europa' 'Mars' 'Unkown']
CryoSleep, [False  True]
Destination, ['55 Cancri e' 'PSO J318.5-22' 'TRAPPIST-1e' 'Unkown']
VIP, [False  True]


In [15]:
train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,False
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,True
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,True


The data is all numerical so now we can go into training our model.

#### Select/Train/Validate model

In [16]:
y = train_data["Transported"]
x = train_data.drop("Transported", axis=1)

X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

In [17]:
logistic_model = LogisticRegression(random_state=0, max_iter=10000).fit(X_train, y_train)

In [18]:
predictions = logistic_model.predict(X_val)

In [19]:
print(f"Logistic regression validation Accuracy Score: {accuracy_score(y_val, predictions):0.4f}")

Logistic regression validation Accuracy Score: 0.7740


#### Predict Test Set

In [20]:
submission_preds = logistic_model.predict(test_data)

In [21]:
submission_df = pd.DataFrame({"PassengerId": test_ids.values,
                              "Transported": submission_preds,
                            })

submission_df.to_csv("Submission.csv", index=False)

#### Let's try grid search to see if we can make it better

In [23]:
random_forest_classifier = RandomForestClassifier(random_state=42)
xgb_classfier = XGBClassifier(random_state=42)

In [24]:
search_space = {
                "random_forest_classifier" : {
                    "n_estimators": [25, 50, 100, 200, 250],
                    "max_depth": [2, 4, 8, 16],
                    "min_samples_split": [2, 4, 8, 16, 32]
                },
                "xgboost_classifier" : {
                    "n_estimators": [25, 50, 100, 200, 250],
                    "max_depth": [2, 4, 8, 16],
                    "gamma": [0.01, 0.1], # Minimum amount of loss (info gain) required to keep splitting
                    "learning_rate": [0.001, 0.01, 0.1, 1]
                }
        }

In [31]:
grid_search_forest = GridSearchCV(estimator=random_forest_classifier,
                                  param_grid=search_space["random_forest_classifier"],
                                  scoring=["neg_log_loss", "f1"],
                                  refit="neg_log_loss",
                                  cv=5, # k-fold cross validation
                                  verbose=4)

In [32]:
grid_search_forest.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END max_depth=2, min_samples_split=2, n_estimators=25; f1: (test=0.711) neg_log_loss: (test=-0.546) total time=   0.0s
[CV 2/5] END max_depth=2, min_samples_split=2, n_estimators=25; f1: (test=0.687) neg_log_loss: (test=-0.554) total time=   0.0s
[CV 3/5] END max_depth=2, min_samples_split=2, n_estimators=25; f1: (test=0.700) neg_log_loss: (test=-0.549) total time=   0.0s
[CV 4/5] END max_depth=2, min_samples_split=2, n_estimators=25; f1: (test=0.697) neg_log_loss: (test=-0.549) total time=   0.0s
[CV 5/5] END max_depth=2, min_samples_split=2, n_estimators=25; f1: (test=0.721) neg_log_loss: (test=-0.531) total time=   0.0s
[CV 1/5] END max_depth=2, min_samples_split=2, n_estimators=50; f1: (test=0.714) neg_log_loss: (test=-0.547) total time=   0.0s
[CV 2/5] END max_depth=2, min_samples_split=2, n_estimators=50; f1: (test=0.694) neg_log_loss: (test=-0.554) total time=   0.0s
[CV 3/5] END max_depth=2, min_samples_spl

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [2, 4, 8, 16],
                         'min_samples_split': [2, 4, 8, 16, 32],
                         'n_estimators': [25, 50, 100, 200, 250]},
             refit='neg_log_loss', scoring=['neg_log_loss', 'f1'], verbose=4)

In [33]:
grid_search_xgb = GridSearchCV(estimator=xgb_classfier,
                                  param_grid=search_space["xgboost_classifier"],
                                  scoring=["neg_log_loss", "f1"],
                                  refit="neg_log_loss",
                                  cv=5, # k-fold cross validation
                                  verbose=4)

In [34]:
grid_search_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 160 candidates, totalling 800 fits
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=2, n_estimators=25; f1: (test=0.676) neg_log_loss: (test=-0.687) total time=   0.0s
[CV 2/5] END gamma=0.01, learning_rate=0.001, max_depth=2, n_estimators=25; f1: (test=0.656) neg_log_loss: (test=-0.687) total time=   0.0s
[CV 3/5] END gamma=0.01, learning_rate=0.001, max_depth=2, n_estimators=25; f1: (test=0.654) neg_log_loss: (test=-0.687) total time=   0.0s
[CV 4/5] END gamma=0.01, learning_rate=0.001, max_depth=2, n_estimators=25; f1: (test=0.669) neg_log_loss: (test=-0.687) total time=   0.0s
[CV 5/5] END gamma=0.01, learning_rate=0.001, max_depth=2, n_estimators=25; f1: (test=0.679) neg_log_loss: (test=-0.687) total time=   0.0s
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=2, n_estimators=50; f1: (test=0.676) neg_log_loss: (test=-0.680) total time=   0.0s
[CV 2/5] END gamma=0.01, learning_rate=0.001, max_depth=2, n_estimators=50; f1: (test=0.656) neg_

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV 4/5] END gamma=0.01, learning_rate=1, max_depth=4, n_estimators=200; f1: (test=0.778) neg_log_loss: (test=nan) total time=   0.2s
[CV 5/5] END gamma=0.01, learning_rate=1, max_depth=4, n_estimators=200; f1: (test=0.778) neg_log_loss: (test=-0.625) total time=   0.2s
[CV 1/5] END gamma=0.01, learning_rate=1, max_depth=4, n_estimators=250; f1: (test=0.773) neg_log_loss: (test=-0.635) total time=   0.3s


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV 2/5] END gamma=0.01, learning_rate=1, max_depth=4, n_estimators=250; f1: (test=0.778) neg_log_loss: (test=nan) total time=   0.3s
[CV 3/5] END gamma=0.01, learning_rate=1, max_depth=4, n_estimators=250; f1: (test=0.769) neg_log_loss: (test=-0.717) total time=   0.3s


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV 4/5] END gamma=0.01, learning_rate=1, max_depth=4, n_estimators=250; f1: (test=0.782) neg_log_loss: (test=nan) total time=   0.3s
[CV 5/5] END gamma=0.01, learning_rate=1, max_depth=4, n_estimators=250; f1: (test=0.784) neg_log_loss: (test=-0.645) total time=   0.3s
[CV 1/5] END gamma=0.01, learning_rate=1, max_depth=8, n_estimators=25; f1: (test=0.764) neg_log_loss: (test=-0.563) total time=   0.0s
[CV 2/5] END gamma=0.01, learning_rate=1, max_depth=8, n_estimators=25; f1: (test=0.782) neg_log_loss: (test=-0.549) total time=   0.0s
[CV 3/5] END gamma=0.01, learning_rate=1, max_depth=8, n_estimators=25; f1: (test=0.776) neg_log_loss: (test=-0.579) total time=   0.0s
[CV 4/5] END gamma=0.01, learning_rate=1, max_depth=8, n_estimators=25; f1: (test=0.777) neg_log_loss: (test=-0.578) total time=   0.0s
[CV 5/5] END gamma=0.01, learning_rate=1, max_depth=8, n_estimators=25; f1: (test=0.780) neg_log_loss: (test=-0.567) total time=   0.0s
[CV 1/5] END gamma=0.01, learning_rate=1, max_dep

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV 1/5] END gamma=0.01, learning_rate=1, max_depth=8, n_estimators=250; f1: (test=0.774) neg_log_loss: (test=nan) total time=   0.5s
[CV 2/5] END gamma=0.01, learning_rate=1, max_depth=8, n_estimators=250; f1: (test=0.785) neg_log_loss: (test=-0.756) total time=   0.5s
[CV 3/5] END gamma=0.01, learning_rate=1, max_depth=8, n_estimators=250; f1: (test=0.777) neg_log_loss: (test=-0.847) total time=   0.5s
[CV 4/5] END gamma=0.01, learning_rate=1, max_depth=8, n_estimators=250; f1: (test=0.779) neg_log_loss: (test=-0.774) total time=   0.5s
[CV 5/5] END gamma=0.01, learning_rate=1, max_depth=8, n_estimators=250; f1: (test=0.776) neg_log_loss: (test=-0.739) total time=   0.5s
[CV 1/5] END gamma=0.01, learning_rate=1, max_depth=16, n_estimators=25; f1: (test=0.776) neg_log_loss: (test=-0.611) total time=   0.0s
[CV 2/5] END gamma=0.01, learning_rate=1, max_depth=16, n_estimators=25; f1: (test=0.786) neg_log_loss: (test=-0.600) total time=   0.0s
[CV 3/5] END gamma=0.01, learning_rate=1, ma

 -0.67586369 -0.66021584 -0.63280608 -0.62060042 -0.68279393 -0.67294472
 -0.65460153 -0.6227123  -0.60869305 -0.68259324 -0.67256703 -0.65394011
 -0.62207131 -0.60825835 -0.63961913 -0.60412319 -0.56000782 -0.51247691
 -0.49982065 -0.62034386 -0.57150819 -0.51312602 -0.46373324 -0.45249543
 -0.60843969 -0.55380228 -0.49214245 -0.44890485 -0.44187062 -0.60799435
 -0.55433163 -0.49566018 -0.45829318 -0.45484066 -0.49824909 -0.46616137
 -0.44513935 -0.43566444 -0.43484409 -0.45110469 -0.43637327 -0.43186059
 -0.43570522 -0.4378903  -0.44249531 -0.43817236 -0.44477291 -0.45892146
 -0.46708158 -0.45417027 -0.46272178 -0.47971985 -0.51109912 -0.52424826
 -0.44240877 -0.45017908 -0.46635027 -0.48594522 -0.49448587 -0.48356531
 -0.51244121 -0.55514957         nan         nan -0.56703098 -0.62207818
 -0.69070277 -0.75910946         nan -0.6175009  -0.66611371 -0.72331554
 -0.76839161 -0.77082184 -0.68671049 -0.68053477 -0.66900234 -0.64870724
 -0.63977799 -0.68428399 -0.67586369 -0.66021584 -0

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                         

In [35]:
print(f"Model\tBest Score\tBest Paramaters\nForest\t{grid_search_forest.best_score_:0.4f}\t{grid_search_forest.best_params_}\n \
      XGBoost\t{grid_search_xgb.best_score_:0.4f}\t{grid_search_xgb.best_params_}\n")

Model	Best Score	Best Paramaters
Forest	-0.4387	{'max_depth': 8, 'min_samples_split': 4, 'n_estimators': 200}
       XGBoost	-0.6867	{'gamma': 0.01, 'learning_rate': 0.001, 'max_depth': 2, 'n_estimators': 25}



In [38]:
xgb_classfier_gs = XGBClassifier(random_state=42, gamma=0.01, learning_rate=0.001, max_depth=2, n_estimators=25).fit(X_train, y_train, 
                                                                                                                     early_stopping_rounds=10,
                                                                                                                     eval_set=[(X_val, y_val)])

[0]	validation_0-logloss:0.69287
[1]	validation_0-logloss:0.69259
[2]	validation_0-logloss:0.69232
[3]	validation_0-logloss:0.69204
[4]	validation_0-logloss:0.69176
[5]	validation_0-logloss:0.69149
[6]	validation_0-logloss:0.69121
[7]	validation_0-logloss:0.69094
[8]	validation_0-logloss:0.69067
[9]	validation_0-logloss:0.69039
[10]	validation_0-logloss:0.69012
[11]	validation_0-logloss:0.68985
[12]	validation_0-logloss:0.68958
[13]	validation_0-logloss:0.68931
[14]	validation_0-logloss:0.68904
[15]	validation_0-logloss:0.68877
[16]	validation_0-logloss:0.68850
[17]	validation_0-logloss:0.68823
[18]	validation_0-logloss:0.68796
[19]	validation_0-logloss:0.68769
[20]	validation_0-logloss:0.68743
[21]	validation_0-logloss:0.68716
[22]	validation_0-logloss:0.68690
[23]	validation_0-logloss:0.68663
[24]	validation_0-logloss:0.68637




In [39]:
print(accuracy_score(y_val, xgb_classfier_gs.predict(X_val)))

0.7222541690626797


In [40]:
random_forest_classifier_gs = RandomForestClassifier(random_state=42,
                                                     max_depth=8, 
                                                     min_samples_split=4, 
                                                     n_estimators=200).fit(X_train, y_train)

In [41]:
print(accuracy_score(y_val, random_forest_classifier_gs.predict(X_val)))

0.780333525014376


In [43]:
submission_preds = random_forest_classifier_gs.predict(test_data)

submission_df = pd.DataFrame({"PassengerId": test_ids.values,
                              "Transported": submission_preds,
                            })

submission_df.to_csv("Submission_forest.csv", index=False)