In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV, ParameterGrid
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score,accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tqdm import tqdm

In [166]:
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.graph_objects as go
from catboost import cv, Pool
from sklearn.model_selection import StratifiedKFold

In [167]:
gender_submission = pd.read_csv('gender_submission.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')
train = pd.read_csv('train.csv', index_col='PassengerId')

In [168]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [169]:
train.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [170]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=train[train['Survived']==0]['Age'], name='Not Survived'))
fig.add_trace(go.Histogram(x=train[train['Survived']==1]['Age'], name='Survived'))
fig.update_layout(xaxis_title='Age', yaxis_title='Count')
fig.update_layout(barmode='overlay')

In [171]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=train[train['Survived']==0]['Pclass'], name='Not Survived'))
fig.add_trace(go.Histogram(x=train[train['Survived']==1]['Pclass'], name='Survived'))
fig.update_layout(xaxis_title='Pclass', yaxis_title='Count')
fig.update_layout(barmode='overlay')

In [172]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=train[train['Survived']==0]['Fare'], name='Not Survived'))
fig.add_trace(go.Histogram(x=train[train['Survived']==1]['Fare'], name='Survived'))
fig.update_layout(xaxis_title='Fare', yaxis_title='Count')
fig.update_layout(barmode='overlay')

In [173]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=train[train['Survived']==0]['Sex'], name='Not Survived'))
fig.add_trace(go.Histogram(x=train[train['Survived']==1]['Sex'], name='Survived'))
fig.update_layout(xaxis_title='Sex', yaxis_title='Count')
fig.update_layout(barmode='overlay')

In [174]:
train.dtypes

Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [175]:
train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [176]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [177]:
train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [178]:
train['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [179]:
train['Ticket'].unique()

array(['A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450',
       '330877', '17463', '349909', '347742', '237736', 'PP 9549',
       '113783', 'A/5. 2151', '347082', '350406', '248706', '382652',
       '244373', '345763', '2649', '239865', '248698', '330923', '113788',
       '347077', '2631', '19950', '330959', '349216', 'PC 17601',
       'PC 17569', '335677', 'C.A. 24579', 'PC 17604', '113789', '2677',
       'A./5. 2152', '345764', '2651', '7546', '11668', '349253',
       'SC/Paris 2123', '330958', 'S.C./A.4. 23567', '370371', '14311',
       '2662', '349237', '3101295', 'A/4. 39886', 'PC 17572', '2926',
       '113509', '19947', 'C.A. 31026', '2697', 'C.A. 34651', 'CA 2144',
       '2669', '113572', '36973', '347088', 'PC 17605', '2661',
       'C.A. 29395', 'S.P. 3464', '3101281', '315151', 'C.A. 33111',
       'S.O.C. 14879', '2680', '1601', '348123', '349208', '374746',
       '248738', '364516', '345767', '345779', '330932', '113059',
       'SO/C 14885', '31012

In [180]:
columns_with_text = ['Name', 'Ticket']

In [181]:
def prepare_df(df):
    df['Embarked'] = df['Embarked'].fillna('Unknown')
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    df['Cabin_type'] = df['Cabin'].str[0]
    df['Sex'] = df['Sex'].replace({'male':1, 'female':0})
    df.rename({'Sex':'Male'}, axis=1, inplace=True)
    df['Age'] = df.groupby('Male')['Age'].transform('mean')
    df.drop(columns_with_text, axis=1, inplace=True)
    return df

In [182]:
def find_object_columns(df):
    object_columns = []
    for i in df.columns:
        if df[i].dtype == np.dtype(object):
            object_columns.append(i)
    return object_columns

In [183]:
X = train.drop(['Survived'], axis=1)
y = train['Survived']

In [184]:
y = np.array(y.to_list())

In [185]:
X = prepare_df(X)

In [186]:
object_columns = find_object_columns(X)
print(object_columns)

['Cabin', 'Embarked', 'Cabin_type']


In [187]:
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True)

params = {'loss_function':'Logloss',
          'eval_metric':'AUC',
          'verbose': 100
         }
scores = []
prediction = np.zeros(X_test.shape[0])

In [188]:
X_test = prepare_df(test)

In [189]:
test_data = Pool(data=X_test,
                 cat_features=object_columns)

In [190]:
for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y_t)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    
    train_data = Pool(data=X_train, 
                      label=y_train,
                      cat_features=object_columns)
    valid_data = Pool(data=X_valid, 
                      label=y_valid,
                      cat_features=object_columns)
    model = CatBoostClassifier(**params)
    model.fit(train_data,
              eval_set=valid_data, 
              use_best_model=True
             )
    score = model.get_best_score()['validation']['AUC']
    scores.append(score)
    
    y_pred = model.predict_proba(test_data)[:, 1]
    prediction += y_pred

Learning rate set to 0.029139
0:	test: 0.8052042	best: 0.8052042 (0)	total: 13.6ms	remaining: 13.6s
100:	test: 0.8102767	best: 0.8150856 (76)	total: 1.31s	remaining: 11.7s
200:	test: 0.8158762	best: 0.8158762 (199)	total: 2.73s	remaining: 10.9s
300:	test: 0.8123188	best: 0.8158762 (199)	total: 4.23s	remaining: 9.82s
400:	test: 0.8061265	best: 0.8158762 (199)	total: 5.78s	remaining: 8.64s
500:	test: 0.8089592	best: 0.8158762 (199)	total: 7.25s	remaining: 7.22s
600:	test: 0.8013175	best: 0.8158762 (199)	total: 8.7s	remaining: 5.78s
700:	test: 0.8011858	best: 0.8158762 (199)	total: 10.2s	remaining: 4.34s
800:	test: 0.8006588	best: 0.8158762 (199)	total: 11.8s	remaining: 2.94s
900:	test: 0.7994730	best: 0.8158762 (199)	total: 13.4s	remaining: 1.47s
999:	test: 0.7985507	best: 0.8158762 (199)	total: 14.9s	remaining: 0us

bestTest = 0.8158761528
bestIteration = 199

Shrink model to first 200 iterations.
Learning rate set to 0.029149
0:	test: 0.7673797	best: 0.7673797 (0)	total: 10.6ms	remaini

In [195]:
prediction /= n_fold

In [197]:
print('CV mean: {:.4f}, CV std: {:.4f}'.format(np.mean(scores), np.std(scores)))

CV mean: 0.8668, CV std: 0.0370


In [215]:
prediction_binary = np.where(prediction > 0.5, 1, 0)

In [216]:
gender_submission['Survived'] = prediction_binary

In [217]:
gender_submission.to_csv('titanic_catboost_submission.csv', index=True)

In [None]:
#private scor: 0.77751