#### Imports

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split

from catboost import Pool, CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

#### Helper functions and reading the CSV files

In [2]:
def get_scores(X, ncv = 10, y = None):
    """Cross-validate three baseline classifiers on ``X`` and summarise the scores.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to ``cross_val_score``.
    ncv : int, default 10
        Forwarded as the ``cv`` argument of ``cross_val_score``.
    y : array-like, optional
        Labels to score against. When omitted, the notebook-level ``target``
        is used, so existing ``get_scores(X, ncv)`` calls behave as before.

    Returns
    -------
    tuple
        ``(scores, overall)`` where ``scores`` maps ``"forest"``, ``"svm"``
        and ``"xgb"`` to each model's mean cross-validation score and
        ``overall`` is the mean of those three values.

    Side effects
    ------------
    Prints the elapsed wall-clock time through ``timer``.
    """
    if y is None:
        # Backward-compatible fallback to the global label vector.
        y = target

    st = timer()
    # Insertion order of this dict fixes the order of the keys in the result.
    models = {
        "forest": RandomForestClassifier(max_depth = 7),
        "svm": SVC(),
        "xgb": XGBClassifier(max_depth = 7),
    }
    res0 = {
        name: cross_val_score(model, X, y, cv=ncv).mean()
        for name, model in models.items()
    }
    timer(st)
    return res0, np.array(list(res0.values())).mean()

def timer(st = None):
    """Tiny stopwatch helper.

    Called without an argument (or with a falsy one) it returns the current
    ``datetime``. Called with a previously returned start time it prints the
    time elapsed since then as hours, minutes and seconds, and returns ``None``.
    """
    if st:
        elapsed = (datetime.now() - st).total_seconds()
        # divmod on a float yields floats, hence values such as "0.0 hours".
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        print(f"{hours} hours, {minutes} minutes and {round(seconds, 2)} seconds!")
        return None
    return datetime.now()

In [3]:
# Load both CSV files and stack them so feature engineering is applied to
# train and test rows in one pass.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# NOTE: concat keeps each file's own row labels, so labels repeat in the
# stacked frame; later cells separate the parts positionally via split_index.
titanic = pd.concat([train, test], axis=0)

Id = test.PassengerId        # identifiers of the test rows
target = train.Survived      # labels, available for the train rows only
split_index = len(train)     # first `split_index` rows of `titanic` are train
# Fixed seed: a randomly drawn seed changes on every run and makes results
# impossible to reproduce after a kernel restart.
seed = 42
ncv = 30                     # number of cross-validation folds used below

In [4]:
# Remove the PassengerId column from the stacked frame (the test identifiers
# are already held in `Id`). Rebinding with errors="ignore" instead of an
# in-place drop keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError.
titanic = titanic.drop(columns=["PassengerId"], errors="ignore")

#### Features for test

In [14]:
# Extra engineered features, kept in their own frame next to `titanic`.
# Side effect: missing Cabin values in `titanic` itself are replaced by the
# "NAN" sentinel string, which the cabin features below rely on.
titanic.Cabin = titanic.Cabin.fillna("NAN")

dop_features = pd.DataFrame()
# Family size = parents/children + siblings/spouses + the passenger.
dop_features["Family"] = titanic.Parch + titanic.SibSp + 1
# 1 when the passenger travels with no relatives, else 0.
dop_features["IsAlong"] = dop_features.Family.eq(1).astype("int64")
# First character of the cabin string ("N" for the "NAN" sentinel).
dop_features["Cabin_First"] = titanic.Cabin.str[0]
# 1 when a cabin was recorded, 0 for the sentinel.
dop_features["IsCabin"] = titanic.Cabin.ne("NAN").astype("int64")
# First character of the ticket string; assumes Ticket has no missing values.
dop_features["Ticket_First"] = titanic.Ticket.str[0]

In [15]:
# Show the engineered feature frame so the new columns can be checked by eye.
dop_features

Unnamed: 0,Family,IsAlong,Cabin_First,IsCabin,Ticket_First
0,2,0,N,0,A
1,2,0,C,1,P
2,1,1,N,0,S
3,2,0,C,1,1
4,1,1,N,0,3
...,...,...,...,...,...
413,1,1,N,0,A
414,1,1,C,1,P
415,1,1,N,0,S
416,1,1,N,0,3


In [7]:
# NOTE(review): the output saved under this cell has no Ticket_First column and
# its execution count (7) is lower than that of the cell building dop_features
# (14), so the saved output appears to be stale. Re-run to refresh it.
dop_features

Unnamed: 0,Family,IsAlong,Cabin_First,IsCabin
0,2,0,N,0
1,2,0,C,1
2,1,1,N,0
3,2,0,C,1
4,1,1,N,0
...,...,...,...,...
413,1,1,N,0
414,1,1,C,1
415,1,1,N,0
416,1,1,N,0


#### 0 solution

In [8]:
# Solution 0: use only the columns left after dropping text, fare, age,
# embarkation and the label; keep the training rows, one-hot encode Sex,
# min-max scale and cross-validate.
df0_dropped = ["Name", "Ticket", "Cabin", "Embarked", "Fare", "Age", "Survived"]
df0 = titanic.drop(columns=df0_dropped).iloc[:split_index]
df0_dummies = pd.get_dummies(df0, columns=["Sex"], drop_first=True)
X0 = MinMaxScaler().fit_transform(df0_dummies.to_numpy())
get_scores(X0, ncv)

0.0 hours, 0.0 minutes and 6.66 seconds!


({'forest': 0.7800766283524905,
  'svm': 0.8079693486590038,
  'xgb': 0.7743678160919542},
 0.7874712643678161)

#### 1 solution

In [9]:
# Solution 1: keep Age, Fare and Embarked and impute their missing values.
# The fill values are computed over the stacked train+test frame.
df1 = titanic.drop(columns=["Survived", "Cabin", "Ticket", "Name"])

# Categorical column: fill with the most frequent value.
most_common_port = df1["Embarked"].mode()[0]
df1["Embarked"] = df1["Embarked"].fillna(most_common_port)

# Numeric columns: fill with the column median.
for numeric_col in ("Age", "Fare"):
    df1[numeric_col] = df1[numeric_col].fillna(df1[numeric_col].median())

In [10]:
# Keep the training rows, one-hot encode the categorical columns, min-max
# scale the result and cross-validate the three baseline models.
df1_train = df1.iloc[:split_index]
df1_dummies = pd.get_dummies(df1_train, columns=["Sex", "Embarked"], drop_first=True)
X1 = MinMaxScaler().fit_transform(df1_dummies.to_numpy())
get_scores(X1, ncv)

0.0 hours, 0.0 minutes and 7.3 seconds!


({'forest': 0.828199233716475,
  'svm': 0.8112643678160919,
  'xgb': 0.8148275862068967},
 0.8180970625798212)

#### 2 solution