In [2]:
import pandas as pd

df_train = pd.read_csv('data/kaggle_titanic/train.csv')
df_test = pd.read_csv('data/kaggle_titanic/test.csv')

In [None]:
## Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
import numpy as np

# DROP passemgerID, name, ticket, cabin
df_train_drop = df_train.drop(['PassengerId','Name', 'Ticket', 'Cabin'], axis=1)

# ONEHOTENCODE embarked
df_train_drop['Embarked'] = df_train_drop['Embarked'].fillna('S')
df_embarked = df_train_drop[['Embarked']]
embarked_encoder = OneHotEncoder()
df_embarked_encoded = pd.DataFrame(data=embarked_encoder.fit_transform(df_embarked).toarray(),
                                   columns=embarked_encoder.get_feature_names_out(), 
                                   index=df_train_drop.index)
df_train_drop = df_train_drop.drop(['Embarked'], axis=1)
df_train_drop[embarked_encoder.get_feature_names_out()] = df_embarked_encoded


# ONEHOTENCODE pclass
df_pclass = df_train_drop[['Pclass']]
pclass_encoder = OneHotEncoder()
df_pclass_encoded = pd.DataFrame(data=pclass_encoder.fit_transform(df_pclass).toarray(),
                                   columns=pclass_encoder.get_feature_names_out(), 
                                   index=df_train_drop.index)
df_train_drop = df_train_drop.drop(['Pclass'], axis=1)
df_train_drop[pclass_encoder.get_feature_names_out()] = df_pclass_encoded

# REPLACE sex
df_train_drop['Sex'] = df_train_drop['Sex'].replace({'male':0, 'female':1})

# IMPUTE age
age_imputer = KNNImputer(n_neighbors=5)
df_train_drop['Age'] = age_imputer.fit_transform(df_train_drop[['Age']]).round(1)

# SCALE age
mean_age = df_train_drop['Age'].mean()**0.5
std_age = df_train_drop['Age'].std()**0.5
sqrt_transformer = FunctionTransformer(func = lambda x: (x**0.5-mean_age)/std_age, inverse_func=lambda x: (x*std_age+mean_age)**2)
df_train_drop[['Age']] = sqrt_transformer.fit_transform(df_train_drop[['Age']]).round(1)

# # SCALE SibSp, Parch, Fare
# stand_transformer = StandardScaler()
# df_train_drop[['SibSp']] = stand_transformer.fit_transform(df_train_drop[['SibSp']])
# df_train_drop[['Parch']] = stand_transformer.fit_transform(df_train_drop[['Parch']])
# df_train_drop[['Fare']] = stand_transformer.fit_transform(df_train_drop[['Fare']]).round(3)



#ToDo:
# - scale SibSp, Parch, Fare
# - scale age on another way
# - use pipeline
# - combine SibSp and Parch to family size
# - combine other features


  df_train_drop['Sex'] = df_train_drop['Sex'].replace({'male':0, 'female':1})


In [24]:
## Model
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score

X = df_train_drop.drop(['Survived'], axis=1)
y = df_train_drop['Survived']

dict_models = {'model_name':[], 'model':[], 'mean':[], 'std':[]}

model = LogisticRegression(max_iter=1000)
scores = cross_val_score(model, X, y, cv=20, scoring='accuracy')
print(f'LogisticRegression: MEAN = {scores.mean().round(2)}, STD = {scores.std().round(2)}')
dict_models['model_name'].append('LogisticRegression')
dict_models['model'].append(model)
dict_models['mean'].append(scores.mean())
dict_models['std'].append(scores.std())

# model_MLP = MLPClassifier(hidden_layer_sizes=(100,100), max_iter=1000, random_state=42, learning_rate_init=0.0001)
# scores = cross_val_score(model_MLP, X, y, cv=20, scoring='accuracy')
# print(f'MLPClassifier: MEAN = {scores.mean().round(2)}, STD = {scores.std().round(2)}')
# df_models = df_models.add({'model_name':'MLPClassifier', 'model': model_MLP, 'mean':scores.mean(), 'std':scores.std()})

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
scores = cross_val_score(model, X, y, cv=20, scoring='accuracy')
print(f'RandomForestClassifier: MEAN = {scores.mean().round(2)}, STD = {scores.std().round(2)}')
dict_models['model_name'].append('RandomForestClassifier')
dict_models['model'].append(model)
dict_models['mean'].append(scores.mean())
dict_models['std'].append(scores.std())

model = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
scores = cross_val_score(model, X, y, cv=20, scoring='accuracy')
print(f'GradientBoostingClassifier: MEAN = {scores.mean().round(2)}, STD = {scores.std().round(2)}')
dict_models['model_name'].append('GradientBoostingClassifier')
dict_models['model'].append(model)
dict_models['mean'].append(scores.mean())
dict_models['std'].append(scores.std())

model = AdaBoostClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, X, y, cv=20, scoring='accuracy')
print(f'GradientBoostingClassifier: MEAN = {scores.mean().round(2)}, STD = {scores.std().round(2)}')
dict_models['model_name'].append('AdaBoostClassifier')
dict_models['model'].append(model)
dict_models['mean'].append(scores.mean())
dict_models['std'].append(scores.std())

df_models = pd.DataFrame(dict_models)

LogisticRegression: MEAN = 0.81, STD = 0.04
RandomForestClassifier: MEAN = 0.82, STD = 0.05
GradientBoostingClassifier: MEAN = 0.82, STD = 0.06
GradientBoostingClassifier: MEAN = 0.8, STD = 0.05


In [25]:
## Choose and optimize
df_models.sort_values(by='mean', ascending=False, inplace=True)
print(df_models)
best_model = df_models['model_name'][0]
print(f'Best model: {best_model}')

                   model_name  \
1      RandomForestClassifier   
2  GradientBoostingClassifier   
0          LogisticRegression   
3          AdaBoostClassifier   

                                               model      mean       std  
1  RandomForestClassifier(max_depth=5, random_sta...  0.824848  0.050195  
2  GradientBoostingClassifier(max_depth=5, random...  0.822677  0.060789  
0                  LogisticRegression(max_iter=1000)  0.807020  0.043130  
3  AdaBoostClassifier(n_estimators=100, random_st...  0.803586  0.046666  
Best model: LogisticRegression
