In [24]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the Data

In [198]:
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
submission = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

# EDA

In [26]:
train_data.head()

### Let's observe what titles the passengers have in their names

In [27]:
# Gather the honorific ("Mr", "Mrs", ...) of every passenger in both splits.
# The title is taken as the text between the first comma and the next period
# of each name. pd.concat is used because Series.append was removed in pandas 2.0.
all_names = pd.concat([train_data["Name"], test_data["Name"]], ignore_index=True)
titles = [name.split(",")[1].split(".")[0].strip() for name in all_names]

# Count occurrences in a single pass (first-seen key order is preserved)
titles_freq = {}
for title in titles:
    titles_freq[title] = titles_freq.get(title, 0) + 1
titles_freq

### Let's encode the titles into the following categories:
* 0 : Mr 
* 1 : Mrs
* 2 : Miss
* 3 : Master
* 4 : other

In [None]:
# Maps each raw title extracted from the Name column to one of five integer
# categories: 0 = Mr, 1 = Mrs, 2 = Miss, 3 = Master, 4 = other.
# Rarer titles are folded into one of those five codes.
encoder = {
    'Mr': 0,
    'Mrs': 1,
    'Miss': 2,
    'Master': 3,
    'Don': 4,
    'Rev': 4,
    'Dr': 4,
    'Mme': 2,
    'Ms': 2,
    'Major': 4,
    'Lady': 2,
    'Sir': 4,
    'Mlle': 2,
    'Col': 4,
    'Capt': 4,
    'the Countess': 1,
    'Jonkheer': 4 ,
    'Dona' : 2
}

### Let's create a function that extracts the Title from the Name column

In [28]:
def treat_names(data, encoder, default_code=4):
    """Return a copy of ``data`` with an integer ``Title`` column added.

    The title is the text between the first comma and the following period of
    each entry in ``data["Name"]``, stripped of surrounding whitespace.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``Name`` column. It is not modified.
    encoder : dict
        Mapping from title string to integer category.
    default_code : int, optional
        Category assigned to titles missing from ``encoder``. Defaults to 4,
        the "other" category, so an unseen title no longer raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the extra ``Title`` column.
    """
    data = data.copy()
    titles = [name.split(",")[1].split(".")[0].strip() for name in data["Name"]]
    # .get() keeps the pipeline running when a title is absent from the encoder
    data['Title'] = [encoder.get(title, default_code) for title in titles]
    return data

In [29]:
temp = treat_names(train_data,encoder)
temp

### Let's observe the variation of the survival probability depending on the name title

In [30]:
%matplotlib inline    
plot = sns.catplot(data= temp, x= 'Title', y= 'Survived', palette="muted", kind = 'point')
title_plot = sns.catplot(data= temp, 
               y= 'Title',kind="count", palette="muted")
# sns.factorplot("Pclass", col="Embarked",  data=train,
#                    size=6, kind="count", palette="muted")

### We can see that the title is an important feature for survival: the probability of surviving for someone with Title 0 (Mr) is 15%, compared to 80% for someone with Title 1 (Mrs)

In [118]:
temp.head()

### We create a function that creates new columns for each Title category

In [32]:
def treat_titles(data):
    """Replace the encoded title with five 0/1 indicator columns.

    Runs ``treat_names`` with the module-level ``encoder`` on a copy of
    ``data``, adds ``Title_0`` ... ``Title_4`` (1 where the row's title code
    equals the suffix, else 0), and returns the frame without ``Title``.
    """
    with_titles = treat_names(data.copy(), encoder)
    for code in range(5):
        column = "Title_" + str(code)
        with_titles[column] = (with_titles["Title"] == code) * 1
    return with_titles.drop(columns="Title")

In [119]:
treat_titles(temp).head()

## All data transformation and treatment happens in this function
* extract the titles from the name column and remove the name column
* remove the following columns : ['PassengerId','Ticket','Cabin'] seeing as they don't give meaningful or useful information
* add columns for each port a passenger can embark from
* add Female and Male columns
* fill missing values in the age column

In [199]:
def treat_data(data):
    """Build the model-ready feature frame from a raw Titanic frame.

    Steps, all applied to a copy of ``data``:
      * add the ``Title_0`` ... ``Title_4`` indicators via ``treat_titles``;
      * drop ``Name``, ``PassengerId``, ``Ticket`` and ``Cabin``;
      * replace ``Embarked`` with 0/1 columns ``Cherbourg``, ``Queenstown``
        and ``Southampton`` (a missing port gives 0 in all three);
      * replace ``Sex`` with 0/1 columns ``Male`` and ``Female``;
      * fill missing ``Age`` with the average of the mean ages of the
        module-level ``train_data`` and ``test_data`` frames.

    Returns the transformed frame; the input is left untouched.
    """
    data = treat_titles(data.copy())
    data = data.drop(['Name','PassengerId','Ticket','Cabin'], axis=1)

    # Port of embarkation as indicator columns
    data['Cherbourg'] = (data['Embarked'] == 'C')*1
    data['Queenstown'] = (data['Embarked'] == 'Q')*1
    data['Southampton'] = (data['Embarked'] == 'S')*1

    # Sex as indicator columns
    data['Male'] = (data['Sex'] == 'male')*1
    data['Female'] = (data['Sex'] == 'female')*1
    data = data.drop(['Embarked','Sex'], axis=1)

    # Assign the result back: an in-place fillna on data['Age'] acts on a
    # temporary column object and is not guaranteed to update the frame.
    age_fill = (train_data['Age'].mean() + test_data['Age'].mean()) * 0.5
    data['Age'] = data['Age'].fillna(age_fill)
    return data

In [200]:
train = treat_data(train_data)
test = treat_data(test_data)

In [36]:
train.head()

In [203]:
test.isna().value_counts()

In [202]:
test.fillna(0,inplace=True)

In [39]:
# !pip install -U dataprep

In [40]:
# from dataprep.eda import create_report
# report = create_report(train, title='My Report')
# report

# Modelling

In [156]:
from sklearn.model_selection import train_test_split
from lightgbm import Dataset
import lightgbm as lgb
from xgboost import XGBClassifier
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score as ac
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, KFold, StratifiedKFold, StratifiedShuffleSplit

def scorer(estimator, X, y):
    """Accuracy of ``estimator`` on ``(X, y)``.

    Takes ``(estimator, X, y)`` so it can be passed as the ``scoring``
    callable of ``cross_val_score``.
    """
    predictions = estimator.predict(X)
    return ac(y, predictions)

### Splitting the data

In [204]:
# Separate the feature columns from the target column
X = train.drop('Survived',axis=1)
y = train['Survived']
# Hold out 33% of the labelled rows for a quick validation score; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.33)
# LightGBM Dataset built from all labelled rows (X, y), not just the training split
train_set = Dataset(X, label=y)

# Scaling the data
Did not seem to improve performance

In [207]:
# # Feature Scaling
# ## We will be using standardscaler to transform
# from sklearn.preprocessing import StandardScaler
# st_scale = StandardScaler()

# st_scale.fit(X.append(test))

# ## transforming X
# X = st_scale.transform(X)

# ## transforming "train_x"
# X_train = st_scale.transform(X_train)

# ## transforming "test_x"
# X_test = st_scale.transform(X_test)

# ## transforming "The testset"
# test = st_scale.transform(test)

### Observing the performance of different models

In [43]:
models = {
    'logreg' : LogisticRegression(solver='liblinear',penalty='l2'),
    'lgbm' : lgb.LGBMClassifier(objective='binary',n_estimators=8),
    'xgboost' : XGBClassifier(eval_metric='error',use_label_encoder=False,n_estimators=8),
    'KNN' : KNN(n_neighbors=10),
    'AdaBoost' : AdaBoostClassifier()
}

In [44]:
# Fit every baseline model on the training split and report its accuracy on
# both the training and the held-out split.
for name, model in models.items():
    model.fit(X_train, y_train)
    train_accuracy = ac(y_train, model.predict(X_train))
    test_accuracy = ac(y_test, model.predict(X_test))
    print("{} :\nTraining : {} , Testing : {}".format(name, train_accuracy, test_accuracy))

In [45]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

In [46]:
# Same report as for the first batch of models: fit on the training split,
# then print training and held-out accuracy for each named classifier.
for name, model in zip(names, classifiers):
    model.fit(X_train, y_train)
    train_accuracy = ac(y_train, model.predict(X_train))
    test_accuracy = ac(y_test, model.predict(X_test))
    print("{} :\nTraining : {} , Testing : {}".format(name, train_accuracy, test_accuracy))

# Dropping more features
This did not improve the performance of the models so let's skip this part

In [48]:
def treat_data_1(data):
    """Reduced-feature variant of the preprocessing.

    Returns a new frame (the input is not modified) in which:
      * ``Name``, ``SibSp``, ``Parch``, ``PassengerId``, ``Ticket`` and
        ``Cabin`` are dropped;
      * ``Sex`` is replaced by 0/1 columns ``Male`` and ``Female``;
      * ``Embarked`` is dropped;
      * missing ``Age`` values are filled with the mean age of the
        module-level ``train_data`` frame.
    """
    # drop() returns a new frame, so the caller's data stays untouched
    data = data.drop(['Name','SibSp','Parch','PassengerId','Ticket','Cabin'], axis=1)
    data['Male'] = (data['Sex'] == 'male')*1
    data['Female'] = (data['Sex'] == 'female')*1
    data = data.drop(['Embarked','Sex'], axis=1)
    # Assign the result back: an in-place fillna on data['Age'] acts on a
    # temporary column object and is not guaranteed to update the frame.
    data['Age'] = data['Age'].fillna(train_data['Age'].mean())
    return data

In [49]:
# train = treat_data_1(train_data)
# test = treat_data_1(test_data)
# test.fillna(0,inplace=True)
# X = train.drop('Survived',axis=1)
# y = train['Survived']
# X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.3)

In [50]:
train.head()

# Hyperparameter Tuning

## Logistic Regression

In [51]:
import threading

In [52]:
# Search space for the logistic-regression grid search.
# C is the inverse regularisation strength and must be positive, so only the
# positive points of the linspace(-3, 3, 60) sweep are kept. The negative half
# could only produce failed fits.
c_values = np.linspace(-3.0,3.0,60)
grid={
    "C" : c_values[c_values > 0],
    "penalty" : ["l1","l2"],
    "fit_intercept" : [True,False],
    "solver" : ["liblinear","saga"]
}

logreg=LogisticRegression(max_iter=500,n_jobs=-1)
logreg_cv=GridSearchCV(logreg,grid,cv=5,verbose=2,n_jobs=-1)

def thread_function(name):
    """Thread target that runs the grid search; ``name`` is not used."""
    logreg_cv.fit(X_train,y_train)

# The search is started on a background thread; the following cell calls
# th.join() to wait for it before the results are read.
th = threading.Thread(target=thread_function, args=(1,))
th.start()

In [53]:
th.join()

In [54]:
print(logreg_cv.best_estimator_, logreg_cv.best_score_, logreg_cv.best_params_,sep='\n')

In [71]:
# Hard-coded logistic-regression parameter sets (presumably copied from earlier
# search runs). p_less, p_even_less and new_p are not used below.
p = {'C': 2.593220338983051, 'fit_intercept': True, 'penalty': 'l1', 'solver': 'liblinear'}
p_less = {'C': 0.8644067796610173, 'fit_intercept': True, 'penalty': 'l1', 'solver': 'liblinear'}
p_even_less = {'C': 0.6610169491525424, 'fit_intercept': True, 'penalty': 'l1', 'solver': 'liblinear'}
new_p = {'C': 1.2711864406779663, 'fit_intercept': True, 'penalty': 'l1', 'solver': 'liblinear'}
# This reassignment overrides the first p; it is the set passed to the model
p = {'C': 1.4745762711864412, 'fit_intercept': False, 'penalty': 'l2', 'solver': 'liblinear'}

# Fit on the training split and report accuracy on both splits
final_logreg = LogisticRegression(**p)
final_logreg.fit(X_train,y_train)
print("Optimized LogReg :\nTraining : {} , Testing : {}".format(ac(y_train,final_logreg.predict(X_train)),ac(y_test,final_logreg.predict(X_test))))

In [72]:
cv_results = cross_val_score(final_logreg,X,y,scoring=scorer)
cv_results

In [57]:
y_submit = final_logreg.predict(test)
submission['Survived'] = y_submit
print(submission.head())
submission.to_csv('titles.csv', index=False)

## AdaBoost

In [58]:
# Search space for AdaBoost.
# learning_rate must be strictly positive, so the leading 0 of the
# linspace(0, 3, 30) sweep is dropped; every other grid point is unchanged.
grid = {
    'n_estimators' : np.linspace(2,60,58,dtype=int),
    'learning_rate' : np.linspace(0,3,30)[1:] ,
}
ada = AdaBoostClassifier()
# 7-fold cross-validated exhaustive search, fitted on the training split
ada_cv=GridSearchCV(ada,grid,cv=7,verbose=1,n_jobs=-1)
ada_cv.fit(X_train,y_train)

In [59]:
print(ada_cv.best_estimator_, ada_cv.best_score_, ada_cv.best_params_,sep='\n')

In [69]:
ada_p = {'learning_rate': 1.6271186440677967, 'n_estimators': 58}
ada_p = {'learning_rate': 1.5517241379310345, 'n_estimators': 43}
ada_p = {'learning_rate': 1.2413793103448276, 'n_estimators': 32}
# Last assignment wins: these are the hyper-parameters actually used
ada_p = {'learning_rate': 1.3448275862068966, 'n_estimators': 36}

final_ada = AdaBoostClassifier(**ada_p)
final_ada.fit(X_train,y_train)
# Label fixed: this report is for AdaBoost, not for the logistic regression
print("Optimized AdaBoost :\nTraining : {} , Testing : {}".format(ac(y_train,final_ada.predict(X_train)),ac(y_test,final_ada.predict(X_test))))

# Cross-validated accuracy over all labelled rows
cross_val_score(final_ada,X,y,scoring=scorer)

In [70]:
y_submit = final_ada.predict(test)
submission['Survived'] = y_submit
print(submission.head())
submission.to_csv('AdaBoostwithTitles.csv', index=False)

## Bagging Classifier

In [177]:
from sklearn.ensemble import BaggingClassifier

# Ensemble sizes to try
n_estimators = [10,30,50,70,80,150,160, 170,175,180,185]
# 10 random stratified 70/30 splits, seeded for repeatability
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)

parameters = {'n_estimators': n_estimators}

# The base learner is left at its default (a decision tree). The explicit
# base_estimator=None keyword only restated that default and is not accepted
# by newer scikit-learn releases, where the parameter is named `estimator`.
grid = GridSearchCV(BaggingClassifier(bootstrap_features=False),
                    param_grid=parameters,
                    cv=cv,
                    n_jobs=-1)
grid.fit(X,y)

In [178]:
print(grid.best_estimator_)

In [179]:
bg = BaggingClassifier(n_estimators=160)
bg.fit(X_train,y_train)

cvs = cross_val_score(bg,X,y,scoring=scorer)
print("Bagging Classifier :\nTraining : {} , Testing : {}".format(ac(y_train,bg.predict(X_train)),ac(y_test,bg.predict(X_test))))
print("Cross Validation Scores mean = {} : ".format(cvs.mean()),cvs )

In [180]:
y_submit = bg.predict(test)
submission['Survived'] = y_submit
print(submission.head())
submission.to_csv('BaggingClassifierWithScaling.csv', index=False)

## Random Forest

In [163]:
# Search space for the random-forest randomized search.
grid = {
    "n_estimators" : np.linspace(2,62,60,dtype=int) ,
    "criterion" : ["gini", "entropy"] ,
    "max_depth" : np.linspace(2,60,58,dtype=int) ,
    # A float min_samples_split is a fraction of the samples and must be > 0,
    # so the leading 0.0 of the sweep is dropped.
    "min_samples_split" : np.linspace(0,1,100)[1:] ,
    # "auto" is omitted: for classifiers it was an alias of "sqrt", and newer
    # scikit-learn releases no longer accept it.
    "max_features" : ["sqrt", "log2"] ,
    "bootstrap" : [True, False] ,
    # NOTE(review): oob_score=True is only meaningful together with
    # bootstrap=True; sampled combinations with bootstrap=False will fail to fit.
    "oob_score" : [True, False] ,
}

**RandomSearchCV**

In [164]:
rf = RandomForestClassifier()
rf_RSCV = RandomizedSearchCV(rf,grid,cv=5,verbose=1,n_jobs=-1,scoring='accuracy',n_iter=400)
rf_RSCV.fit(X_train,y_train)

In [165]:
print(rf_RSCV.best_estimator_, rf_RSCV.best_score_, rf_RSCV.best_params_,sep='\n')

### Optuna

In [209]:
def optimize(trial):
    """Optuna objective: mean 5-fold accuracy of a random forest.

    Samples one set of ``RandomForestClassifier`` hyper-parameters from
    ``trial``, evaluates it with stratified 5-fold cross-validation on the
    module-level ``X`` / ``y``, and returns the mean fold accuracy (to be
    maximised by the study).

    ``X`` and ``y`` may be pandas objects or NumPy arrays: both are converted
    to arrays so the fold indices are always applied positionally to rows.
    Indexing a DataFrame directly with ``X[train_idx]`` would select columns
    and fail.
    """
    # Search space definition
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    n_estimators = trial.suggest_int('n_estimators', 2, 100)
    max_depth = trial.suggest_int('max_depth', 2, 30)
    min_samples_split = trial.suggest_float('min_samples_split',0.001,0.4,step=0.003)
    # "auto" is omitted: for classifiers it was an alias of "sqrt", and newer
    # scikit-learn releases no longer accept it.
    max_features = trial.suggest_categorical('max_features',["sqrt", "log2"])
    bootstrap = trial.suggest_categorical('bootstrap',[True, False])

    # Classifier built from the sampled hyper-parameters
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   criterion=criterion,
                                   min_samples_split=min_samples_split,
                                   max_features=max_features,
                                   bootstrap=bootstrap)

    # Positional row access that works for DataFrames and ndarrays alike
    features = np.asarray(X)
    labels = np.asarray(y)

    k_fold = StratifiedKFold(n_splits=5)
    fold_accuracy = []
    for train_idx, test_idx in k_fold.split(features, labels):
        # fit() retrains from scratch, so reusing the instance across folds is safe
        model.fit(features[train_idx], labels[train_idx])
        fold_accuracy.append(model.score(features[test_idx], labels[test_idx]))

    return np.mean(fold_accuracy)


In [137]:
import joblib

In [210]:
# Study initialization
# direction = 'maximize' : the objective returns an accuracy, so higher is better
# sampler = 'TPESampler' : default parameter for single-objective optimization
# pruner = 'MedianPruner' : default parameter for pruning unpromising configurations
# NOTE(review): the study is named 'lgbm' although the objective passed below
# (optimize) tunes a RandomForestClassifier.

study = optuna.create_study(study_name='lgbm',direction='maximize',
# #                             storage='/kaggle/working'
                           )

# study = joblib.load('/kaggle/working/scaled_rf_optuna.pkl')

# Passed as the timeout of study.optimize below (60*6 = 360)
timeout = 60*6


# Optimization
# Receives the function to be optimized, the maximum number of trials and the timeout
study.optimize(optimize, n_trials=1000, timeout=timeout, n_jobs=-1)
# Persist the finished study object to the working directory
joblib.dump(study, '/kaggle/working/scaled_rf_optuna.pkl')

In [211]:
print(study.best_params, study.best_value, sep='\n')

**Finalizing model**

In [213]:
p = {'oob_score': False, 'n_estimators': 37, 'min_samples_split': 0.07070707070707072, 'max_features': 'sqrt', 'max_depth': 29, 'criterion': 'entropy', 'bootstrap': False}
p = {'oob_score': True, 'n_estimators': 21, 'min_samples_split': 0.020202020202020204, 'max_features': 'sqrt', 'max_depth': 41, 'criterion': 'entropy', 'bootstrap': True}
p = {'oob_score': False, 'n_estimators': 44, 'min_samples_split': 0.020202020202020204, 'max_features': 'auto', 'max_depth': 7, 'criterion': 'entropy', 'bootstrap': False}
p = {'criterion': 'entropy', 'n_estimators': 36, 'max_depth': 14, 'min_samples_split': 0.04, 'max_features': 'sqrt', 'bootstrap': False}
p = {'criterion': 'gini', 'n_estimators': 50, 'max_depth': 28, 'min_samples_split': 0.01, 'max_features': 'auto', 'bootstrap': True}
p = {'oob_score': False, 'n_estimators': 45, 'min_samples_split': 0.030303030303030304, 'max_features': 'auto', 'max_depth': 6, 'criterion': 'gini', 'bootstrap': False}
p = {'criterion': 'gini', 'n_estimators': 32, 'max_depth': 11, 'min_samples_split': 0.01, 'max_features': 'auto', 'bootstrap': False}
p = {'criterion': 'gini', 'n_estimators': 37, 'max_depth': 11, 'min_samples_split': 0.04, 'max_features': 'auto', 'bootstrap': False}
p = {'oob_score': False, 'n_estimators': 35, 'min_samples_split': 0.020202020202020204, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'gini', 'bootstrap': False}
p = {'criterion': 'gini', 'n_estimators': 99, 'max_depth': 20, 'min_samples_split': 0.01, 'max_features': 'sqrt', 'bootstrap': True}
p = {'criterion': 'gini', 'n_estimators': 10, 'max_depth': 19, 'min_samples_split': 0.01, 'max_features': 'log2', 'bootstrap': True}
p = {'oob_score': True, 'n_estimators': 57, 'min_samples_split': 0.010101010101010102, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'entropy', 'bootstrap': True}
p = {'criterion': 'gini', 'n_estimators': 69, 'max_depth': 23, 'min_samples_split': 0.016, 'max_features': 'auto', 'bootstrap': True}
# Last assignment wins: these are the hyper-parameters actually used.
# max_features is spelled 'sqrt' instead of 'auto': for classifiers the two
# were equivalent, and newer scikit-learn releases no longer accept 'auto'.
p = {'criterion': 'gini', 'n_estimators': 74, 'max_depth': 14, 'min_samples_split': 0.016, 'max_features': 'sqrt', 'bootstrap': True}


final_rf = RandomForestClassifier(**p)
final_rf.fit(X_train,y_train)

# Cross-validated accuracy over all labelled rows, plus the single-split report
cvs = cross_val_score(final_rf,X,y,scoring=scorer)
print("Optimized RandomForest :\nTraining : {} , Testing : {}".format(ac(y_train,final_rf.predict(X_train)),ac(y_test,final_rf.predict(X_test))))
print("Cross Validation Scores mean = {} : ".format(cvs.mean()),cvs )

In [214]:
y_submit = final_rf.predict(test)
submission['Survived'] = y_submit
print(submission.head())
submission.to_csv('ScaledOptunaRandomForestwithTitles.csv', index=False)

## LightGBM

In [95]:
import warnings
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')

In [104]:
# LightGBM Dataset built from the training split only (replaces the earlier train_set)
train_set = Dataset(X_train, label=y_train)

# Fixed LightGBM parameters handed to the tuner
params = {'objective': 'binary',
         'learning_rate': 0.009,
         'random_state': 42,
          'metric': 'auc',
          'verbose': -1
         }

# 60*6 = 360, passed as the tuner's time_budget (presumably seconds — confirm against the Optuna docs)
time_budget = 60*6

# Set Optuna's logging verbosity to -1 before running the tuner
optuna.logging.set_verbosity(-1)
# Cross-validated (5-fold) tuner over the fixed params above
tuner = optuna.integration.lightgbm.LightGBMTunerCV(params=params,
                                                   train_set=train_set,
                                                   num_boost_round=200000,
                                                   nfold=5,
                                                   early_stopping_rounds=160,
                                                   verbose_eval=False,
                                                   time_budget=time_budget,
                                                   verbosity=-1
                                                   )
tuner.run()

In [105]:
print(f'Best score: {tuner.best_score}')
print(f'Best parameters: {tuner.best_params}')

In [106]:
p = {'objective': 'binary', 'learning_rate': 0.009, 'random_state': 42, 'metric': 'auc', 'verbose': -1, 'feature_pre_filter': False, 'lambda_l1': 0.2719534677136317, 'lambda_l2': 0.00019702222007845848, 'num_leaves': 31, 'feature_fraction': 0.4, 'bagging_fraction': 0.9642119561001264, 'bagging_freq': 7, 'min_child_samples': 20}
p = {'objective': 'binary', 'learning_rate': 0.009, 'random_state': 42, 'metric': 'auc', 'verbose': -1, 'feature_pre_filter': False, 'lambda_l1': 2.3093048528649408e-08, 'lambda_l2': 5.6117886784529134e-05, 'num_leaves': 18, 'feature_fraction': 1.0, 'bagging_fraction': 0.4615785697477765, 'bagging_freq': 6, 'min_child_samples': 20}


lgbmm = lgb.LGBMClassifier(**p)
lgbmm.fit(X_train,y_train)

cvs = cross_val_score(lgbmm,X,y,scoring=scorer)
print("Optimized LGBM :\nTraining : {} , Testing : {}".format(ac(y_train,lgbmm.predict(X_train)),ac(y_test,lgbmm.predict(X_test))))
print("Cross Validation Scores mean = {} : ".format(cvs.mean()),cvs )

In [116]:
y_submit = lgbmm.predict(test)
submission['Survived'] = y_submit
print(submission.head())
submission.to_csv('OptunaLGBMwithTitles.csv', index=False)