### Kaggle note for this TPS
"The dataset is used for this competition is synthetic but based on a real dataset (in this case, the actual Titanic data!) and generated using a CTGAN. The statistical properties of this dataset are very similar to the original Titanic dataset, but there's no way to "cheat" by using public labels for predictions. How well does your model perform on truly unseen data?"

Idea: Start with my existing model used for the Titanic data set.

In [94]:
# load input data

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

train_df = pd.read_csv("input/train.csv")

submit_df = pd.read_csv("input/test.csv")

In [95]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [96]:
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [97]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   PassengerId  100000 non-null  int64  
 1   Survived     100000 non-null  int64  
 2   Pclass       100000 non-null  int64  
 3   Name         100000 non-null  object 
 4   Sex          100000 non-null  object 
 5   Age          96708 non-null   float64
 6   SibSp        100000 non-null  int64  
 7   Parch        100000 non-null  int64  
 8   Ticket       95377 non-null   object 
 9   Fare         99866 non-null   float64
 10  Cabin        32134 non-null   object 
 11  Embarked     99750 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 9.2+ MB


# 1. Data Transformation and Pre-Processing

First: transform and then drop rows with missing values (simple version, see how it goes).

Later: See if we can improve the model by inference and filling missing data.

In [98]:
# TRAINING DATA

import re

# Rename class
train_df["Pclass"].replace(1, "Upper", inplace=True)
train_df["Pclass"].replace(2, "Middle", inplace=True)
train_df["Pclass"].replace(3, "Lower", inplace=True)

# Replace missing age with median -> age inference method?
train_df["Age"].fillna(np.nanmedian(train_df["Age"]), inplace=True)

# Create Age Bands (thinner age bands didn't improve the model)
bins = [0,10,20,30,40,50,60,70,100]
train_df["Age_bin"] = pd.cut(train_df['Age'], bins)

# Replace Fare==0 with median
# train_df["Fare"].replace(0, np.median(train_df["Fare"]), inplace=True)

# Create Fare Bands -> did not improve the model
# bins = [0,50,100,200,500,1000]
# train_df["Fare_bin"] = pd.cut(train_df['Fare'], bins)

# With family 
train_df["with_family"] = (train_df["SibSp"] + train_df["Parch"])>0

# Replace NA for embarked with "S"
train_df["Embarked"].fillna("S", inplace=True)

# Replace NA for Cabin with "Unknown"
train_df["Cabin"].fillna("Unknown", inplace=True)

# Extract Name Title
#train_df["Title"] = train_df.Name.apply(lambda x: x.split(",")[1].split(".")[0].strip())
#title_list = ["Mr", "Miss", "Mrs", "Master", "Dr", "Rev", "Col"]
#train_df.loc[~train_df["Title"].isin(title_list), "Title"] = "NA"

# Extract deck from Cabin
for i in range(0, len(train_df)):
    train_df.at[i, "Deck"] = " ".join(re.findall("[a-zA-Z]+", train_df.at[i, "Cabin"]))

train_df["Deck"].replace("B B", "B", inplace=True)
train_df["Deck"].replace("B B B", "B", inplace=True)
train_df["Deck"].replace("B B B B", "B", inplace=True)
train_df["Deck"].replace("C C", "C", inplace=True)
train_df["Deck"].replace("D D", "D", inplace=True)
train_df["Deck"].replace("C C C", "C", inplace=True)
train_df["Deck"].replace("F G", "F", inplace=True)
train_df["Deck"].replace("F E", "E", inplace=True)
train_df["Deck"].replace("T", "Unknown", inplace=True)

In [99]:
train_df = train_df.dropna()

In [100]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95249 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  95249 non-null  int64   
 1   Survived     95249 non-null  int64   
 2   Pclass       95249 non-null  object  
 3   Name         95249 non-null  object  
 4   Sex          95249 non-null  object  
 5   Age          95249 non-null  float64 
 6   SibSp        95249 non-null  int64   
 7   Parch        95249 non-null  int64   
 8   Ticket       95249 non-null  object  
 9   Fare         95249 non-null  float64 
 10  Cabin        95249 non-null  object  
 11  Embarked     95249 non-null  object  
 12  Age_bin      95249 non-null  category
 13  with_family  95249 non-null  bool    
 14  Deck         95249 non-null  object  
dtypes: bool(1), category(1), float64(2), int64(4), object(7)
memory usage: 10.4+ MB


In [101]:
# SUBMIT DATA

# Rename class
submit_df["Pclass"].replace(1, "Upper", inplace=True)
submit_df["Pclass"].replace(2, "Middle", inplace=True)
submit_df["Pclass"].replace(3, "Lower", inplace=True)

# Replace missing age with median
submit_df["Age"].fillna(np.nanmedian(submit_df["Age"]), inplace=True)

# Create Age Bands
bins = [0,10,20,30,40,50,60,70,100]
submit_df["Age_bin"] = pd.cut(submit_df['Age'], bins)

# Replace missing fare with median
submit_df["Fare"].fillna(np.nanmedian(submit_df["Fare"]), inplace=True)

# Replace Fare==0 with median
# test_df["Fare"].replace(0, np.median(test_df["Fare"]), inplace=True)

# Create Fare Bands -> did not improve the model
# bins = [0,50,100,200,500,1000]
# test_df["Fare_bin"] = pd.cut(test_df['Fare'], bins)

# Replace NA for embarked with "S"
submit_df["Embarked"].fillna("S", inplace=True)

# Replace NA for Cabin with "Unknown"
submit_df["Cabin"].fillna("Unknown", inplace=True)

# With family
submit_df["with_family"] = (submit_df["SibSp"] + submit_df["Parch"])>0

# Extract Name Title
#submit_df["Title"] = submit_df.Name.apply(lambda x: x.split(",")[1].split(".")[0].strip())
#title_list = ["Mr", "Miss", "Mrs", "Master", "Dr", "Rev", "Col"]
#submit_df.loc[~submit_df["Title"].isin(title_list), "Title"] = "NA"

# Extract deck from Cabin
for i in range(0, len(submit_df)):
    submit_df.at[i, "Deck"] = " ".join(re.findall("[a-zA-Z]+", submit_df.at[i, "Cabin"]))

submit_df["Deck"].replace("B B", "B", inplace=True)
submit_df["Deck"].replace("B B B", "B", inplace=True)
submit_df["Deck"].replace("B B B B", "B", inplace=True)
submit_df["Deck"].replace("C C", "C", inplace=True)
submit_df["Deck"].replace("E E", "E", inplace=True)
submit_df["Deck"].replace("D D", "D", inplace=True)
submit_df["Deck"].replace("C C C", "C", inplace=True)
submit_df["Deck"].replace("F G", "F", inplace=True)
submit_df["Deck"].replace("F E", "E", inplace=True)
submit_df["Deck"].replace("T", "Unknown", inplace=True)

In [104]:
submit_df["Ticket"]=submit_df["Ticket"].fillna("Unknown")

In [105]:
submit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   PassengerId  100000 non-null  int64   
 1   Pclass       100000 non-null  object  
 2   Name         100000 non-null  object  
 3   Sex          100000 non-null  object  
 4   Age          100000 non-null  float64 
 5   SibSp        100000 non-null  int64   
 6   Parch        100000 non-null  int64   
 7   Ticket       100000 non-null  object  
 8   Fare         100000 non-null  float64 
 9   Cabin        100000 non-null  object  
 10  Embarked     100000 non-null  object  
 11  Age_bin      100000 non-null  category
 12  with_family  100000 non-null  bool    
 13  Deck         100000 non-null  object  
dtypes: bool(1), category(1), float64(2), int64(3), object(7)
memory usage: 9.3+ MB


In [106]:
# Pre-processing: Standardization and One-Hot Encoder

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

train_df_pre = train_df.drop(columns=["Name", "Ticket", "Cabin"])
submit_df_pre = submit_df.drop(columns=[ "Name", "Ticket", "Cabin"])

#train_df_pre = train_df.drop(columns=["Age", "Fare", "Name", "Ticket", "Cabin"])
#test_df_pre = test_df.drop(columns=["Age", "Fare", "Name", "Ticket", "Cabin"])

num_attribs = ["Age", "SibSp", "Parch", "Fare"]
#num_attribs = ["SibSp", "Parch",]
cat_attribs = ["Pclass", "Embarked", "Deck", "Sex", "with_family", "Age_bin"]#, "Fare_bin"]

col_transformer = ColumnTransformer([
    ("num", StandardScaler(), num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
    ],
    remainder="passthrough")

# Fit transform TRAIN
train_array_transformed = col_transformer.fit_transform(train_df_pre)

# Convert numpy.ndarray to pd.DataFrame
train_df_transformed = pd.DataFrame(data=train_array_transformed)
#train_df_transformed = pd.DataFrame(data=train_array_transformed.toarray())

# Rename columns
column_names = num_attribs + list(col_transformer.named_transformers_['cat'].get_feature_names()) + ["PassengerId"] + ["Survived"]
train_df_transformed.columns = column_names


# Fit transform SUBMIT
submit_array_transformed = col_transformer.fit_transform(submit_df_pre)

# Convert numpy.ndarray to pd.DataFrame
submit_df_transformed = pd.DataFrame(data=submit_array_transformed)
#test_df_transformed = pd.DataFrame(data=test_array_transformed.toarray())

# Rename columns
column_names = num_attribs + list(col_transformer.named_transformers_['cat'].get_feature_names()) + ["PassengerId"]
submit_df_transformed.columns = column_names

In [107]:
# check columns
print(train_df_transformed.columns)
print(submit_df_transformed.columns)

Index(['Age', 'SibSp', 'Parch', 'Fare', 'x0_Lower', 'x0_Middle', 'x0_Upper',
       'x1_C', 'x1_Q', 'x1_S', 'x2_A', 'x2_B', 'x2_C', 'x2_D', 'x2_E', 'x2_F',
       'x2_G', 'x2_Unknown', 'x3_female', 'x3_male', 'x4_False', 'x4_True',
       'x5_(0, 10]', 'x5_(10, 20]', 'x5_(20, 30]', 'x5_(30, 40]',
       'x5_(40, 50]', 'x5_(50, 60]', 'x5_(60, 70]', 'x5_(70, 100]',
       'PassengerId', 'Survived'],
      dtype='object')
Index(['Age', 'SibSp', 'Parch', 'Fare', 'x0_Lower', 'x0_Middle', 'x0_Upper',
       'x1_C', 'x1_Q', 'x1_S', 'x2_A', 'x2_B', 'x2_C', 'x2_D', 'x2_E', 'x2_F',
       'x2_G', 'x2_Unknown', 'x3_female', 'x3_male', 'x4_False', 'x4_True',
       'x5_(0, 10]', 'x5_(10, 20]', 'x5_(20, 30]', 'x5_(30, 40]',
       'x5_(40, 50]', 'x5_(50, 60]', 'x5_(60, 70]', 'x5_(70, 100]',
       'PassengerId'],
      dtype='object')


# 2. Model Building and Testing

### 1. Random Forest without Train-Test-Split

In [119]:
X = train_df_transformed.drop(columns=["Survived", "PassengerId"])
y = train_df_transformed[["Survived"]]

In [120]:
# Random Forest simple

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Random forrest classifier
rf = RandomForestClassifier()
rf.fit(X, np.ravel(y))

RandomForestClassifier()

In [121]:
print("train score: " + str(rf.score(X_train, y_train)))
print("test score: " + str(rf.score(X_test, y_test)))

train score: 0.994435612082671
test score: 0.9957305336832896


In [None]:
# Random Forest tuned

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Random forrest classifier
rf = RandomForestClassifier(verbose=1)

# Find optimal parameter settings using Randomized Search
param_grid =  {'n_estimators': [200,300,400,600,800,1000], 
               'bootstrap': [True,False],
               'max_depth': [3,5,10,20,50,None],
               'max_features': ['auto'],
               'min_samples_leaf': [1,2,4,6,10],
               'min_samples_split': [2,5,10,20]}

rnd_clf = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=300, cv=3, verbose=5, n_jobs=-1)

best_rnd_clf = rnd_clf.fit(X, np.ravel(y))

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [112]:
best_rnd_clf.best_estimator_

RandomForestClassifier(max_depth=10, min_samples_leaf=6, min_samples_split=5,
                       n_estimators=300, verbose=1)

In [113]:
print("train score: " + str(best_rnd_clf.score(X_train, y_train)))
print("test score: " + str(best_rnd_clf.score(X_test, y_test)))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


train score: 0.7915829258781534
test score: 0.7899912510936133


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.6s finished


In [114]:
# Export for submit
import datetime as datetime
export_df = pd.DataFrame()
export_df["PassengerId"] = submit_df_transformed["PassengerId"].astype(int)
export_df["Survived"] = best_rnd_clf.predict(submit_df_transformed.drop(columns=["PassengerId"])).astype(int)
now = datetime.datetime.now()
name_add = "date_"+str(now.year)+"-"+str(now.month)+"-"+str(now.day)+"_time_"+str(now.hour)+"-"+str(now.minute)
export_df.to_csv(f"output/random_forest_tuned_{name_add}.csv", index=False)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    2.1s finished


### 2. Random Forest with Train-Test-Split

In [108]:
from sklearn.model_selection import train_test_split

X = train_df_transformed.drop(columns=["Survived", "PassengerId"])
y = train_df_transformed[["Survived"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99, shuffle=True)

In [109]:
# Random Forest simple

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Random forrest classifier
rf = RandomForestClassifier()
rf.fit(X_train, np.ravel(y_train))

RandomForestClassifier()

In [110]:
print("train score: " + str(rf.score(X_train, y_train)))
print("test score: " + str(rf.score(X_test, y_test)))

train score: 0.9956204817470078
test score: 0.735573053368329


In [111]:
# Random Forest tuned

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Random forrest classifier
rf = RandomForestClassifier(verbose=1)

# Find optimal parameter settings using Randomized Search
param_grid =  {'n_estimators': [200,300,400,600,800,1000], 
               'bootstrap': [True,False],
               'max_depth': [3,5,10,20,50,None],
               'max_features': ['auto'],
               'min_samples_leaf': [1,2,4,6,10],
               'min_samples_split': [2,5,10,20]}

rnd_clf = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=300, cv=3, verbose=5, n_jobs=-1)

best_rnd_clf = rnd_clf.fit(X_train, np.ravel(y_train))

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    7.8s finished


In [112]:
best_rnd_clf.best_estimator_

RandomForestClassifier(max_depth=10, min_samples_leaf=6, min_samples_split=5,
                       n_estimators=300, verbose=1)

In [113]:
print("train score: " + str(best_rnd_clf.score(X_train, y_train)))
print("test score: " + str(best_rnd_clf.score(X_test, y_test)))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


train score: 0.7915829258781534
test score: 0.7899912510936133


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.6s finished


In [114]:
# Export for submit
import datetime as datetime
export_df = pd.DataFrame()
export_df["PassengerId"] = submit_df_transformed["PassengerId"].astype(int)
export_df["Survived"] = best_rnd_clf.predict(submit_df_transformed.drop(columns=["PassengerId"])).astype(int)
now = datetime.datetime.now()
name_add = "date_"+str(now.year)+"-"+str(now.month)+"-"+str(now.day)+"_time_"+str(now.hour)+"-"+str(now.minute)
export_df.to_csv(f"output/random_forest_tuned_{name_add}.csv", index=False)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    2.1s finished
