In [3]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier

from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, MinMaxScaler

pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)


In [7]:
path = '../input/titanic/train.csv'
df = pd.read_csv(path) # titanic_fullsample
print(df.head())
print(df.shape)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
(8

In [8]:
#%% Encode categorical variables ###

df.drop(columns=['Name', 'Ticket', 'Cabin'],inplace=True)
df.loc[df.Age.isnull(),'Age'] = df.Age.median()

df.loc[df.SibSp>2,'SibSp']=3
df.loc[df.Parch>2,'Parch']=3

df_uniques = pd.DataFrame([[i, len(df[i].unique())] for i in df.columns], columns=['Variable', 'Unique Values']).set_index('Variable')
df_uniques

binary_variables = list(df_uniques[df_uniques['Unique Values'] == 2].index)
categorical_variables = list(df_uniques[(6 >= df_uniques['Unique Values']) & (df_uniques['Unique Values'] > 2)].index)
numeric_variables = list(set(df.columns) - set(categorical_variables) - set(binary_variables))

lb = LabelBinarizer()
binary_variables.remove('Survived')

for column in binary_variables:
    df[column] = lb.fit_transform(df[column])

df = pd.get_dummies(df, columns = categorical_variables, drop_first=True)

print(df.head(10))

   PassengerId  Survived  Sex   Age     Fare  Pclass_2  Pclass_3  SibSp_1  \
0            1         0    1  22.0   7.2500         0         1        1   
1            2         1    0  38.0  71.2833         0         0        1   
2            3         1    0  26.0   7.9250         0         1        0   
3            4         1    0  35.0  53.1000         0         0        1   
4            5         0    1  35.0   8.0500         0         1        0   
5            6         0    1  28.0   8.4583         0         1        0   
6            7         0    1  54.0  51.8625         0         0        0   
7            8         0    1   2.0  21.0750         0         1        0   
8            9         1    0  27.0  11.1333         0         1        0   
9           10         1    0  14.0  30.0708         1         0        1   

   SibSp_2  SibSp_3  Parch_1  Parch_2  Parch_3  Embarked_Q  Embarked_S  
0        0        0        0        0        0           0           1  
1     

In [9]:
# %% scaling ###

y = df['Survived']
X = df.drop(columns=['Survived'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mm = MinMaxScaler()

for column in [numeric_variables]:
    X_train[column] = mm.fit_transform(X_train[column])
    X_test[column] = mm.fit_transform(X_test[column])

In [10]:
#%% Logistic regression ###

grid_values = {'penalty': ['l2'], 'C': [1,2,4,10,15,20,25,30,40,100]}
lr = LogisticRegression()
model_lr = GridSearchCV(lr, param_grid=grid_values, cv = 20)
model_lr.fit(X_train, y_train)
print(model_lr.best_score_, model_lr.best_params_)

# model_lr.predict(X_test)

0.8046428571428572 {'C': 2, 'penalty': 'l2'}


In [None]:
#%% KNN ###

grid_values = dict(n_neighbors=np.arange(1,40))
knnm = KNeighborsClassifier()   
model_knn = GridSearchCV(knnm, param_grid=grid_values, cv = 20)
model_knn.fit(X_train, y_train)
print(model_knn.best_score_, model_knn.best_params_)

In [None]:
#%% SVM ###

grid_values = {'C': np.arange(0.05, 1, 0.05)} 
svmm = svm.SVC(kernel='rbf')
model_svm = GridSearchCV(svmm, param_grid=grid_values, cv = 20)
model_svm.fit(X_train, y_train)
print(model_svm.best_score_, model_svm.best_params_)

In [None]:
#%% RF ###

# may look here: https://www.geeksforgeeks.org/hyperparameter-tuning/

grid_values = [{'max_depth': list(range(2, 9)), 'max_features': list(np.arange(0.2,0.71,0.05))}]
rfc = RandomForestClassifier(random_state=42)
model_rf = GridSearchCV(rfc, grid_values, cv = 20, scoring='accuracy')
model_rf.fit(X_train, y_train)
print(model_rf.best_score_, model_rf.best_params_)

In [16]:
#%% XGBoost? ###
# run this code only on Kaggle with GPU

estimator = XGBClassifier(
    nthread=4,
    seed=42,
    use_label_encoder=False
)

parameters = {
    'max_depth': range (2, 5, 1),
    'n_estimators': range(10, 50, 5),
    'learning_rate': [0.01, 0.05, 0.1, 0.15]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 10,
    cv = 10,
    verbose=True
)

grid_search.fit(X_train, y_train, eval_metric='rmse')
print(grid_search.best_score_, grid_search.best_params_)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  40 tasks      | elapsed:    1.2s
[Parallel(n_jobs=10)]: Done 300 tasks      | elapsed:   12.7s
[Parallel(n_jobs=10)]: Done 550 tasks      | elapsed:   24.0s
[Parallel(n_jobs=10)]: Done 900 tasks      | elapsed:   39.9s


0.8585024029468474 {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10}


[Parallel(n_jobs=10)]: Done 960 out of 960 | elapsed:   44.0s finished


In [None]:
# evaluate performance oos