In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsRegressor

from catboost import Pool, CatBoostClassifier
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

In [2]:
seed = np.random.randint(1,300)

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
split_index = len(train)
target = train.Survived
Id = test.PassengerId

data = pd.read_csv("vova_titanic_to_modeling.csv").iloc[:, 1:]
cats = ["Pclass", "Sex", "Embarked", "IsCabin", "Ticket_First_Bin", "Title_Bin"]
for col in cats:
    data[col] = data[col].astype("category")

In [4]:
X_TRAIN = data.iloc[:split_index]
X_TEST = data.iloc[split_index:]

In [5]:
data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,IsCabin,Ticket_First_Bin,Title_Bin
0,3,male,22.0,0.693147,0.000000,2.110213,S,0,new1,Mr
1,1,female,38.0,0.693147,0.000000,4.280593,C,1,P,Mrs
2,3,female,26.0,0.000000,0.000000,2.188856,S,0,S,Mrs
3,1,female,35.0,0.693147,0.000000,3.990834,S,1,1,Mrs
4,3,male,35.0,0.000000,0.000000,2.202765,S,0,3,Mr
...,...,...,...,...,...,...,...,...,...,...
1304,3,male,26.0,0.000000,0.000000,2.202765,S,0,new1,Mr
1305,1,female,39.0,0.000000,0.000000,4.699571,C,1,P,Other
1306,3,male,38.5,0.000000,0.000000,2.110213,S,0,S,Mr
1307,3,male,26.0,0.000000,0.000000,2.202765,S,0,3,Mr


#### CatBoost

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X_TRAIN, target, test_size=0.25, random_state=seed)

cb = CatBoostClassifier()
train_pool = Pool(X_train, y_train, cat_features=cats)
val_pool = Pool(X_val, y_val, cat_features=cats)
cb.fit(train_pool)

Learning rate set to 0.008672
0:	learn: 0.6882481	total: 173ms	remaining: 2m 53s
1:	learn: 0.6834049	total: 185ms	remaining: 1m 32s
2:	learn: 0.6779979	total: 203ms	remaining: 1m 7s
3:	learn: 0.6728784	total: 220ms	remaining: 54.7s
4:	learn: 0.6677992	total: 236ms	remaining: 47s
5:	learn: 0.6630285	total: 252ms	remaining: 41.7s
6:	learn: 0.6574510	total: 276ms	remaining: 39.1s
7:	learn: 0.6524788	total: 291ms	remaining: 36.1s
8:	learn: 0.6473913	total: 307ms	remaining: 33.8s
9:	learn: 0.6426815	total: 322ms	remaining: 31.9s
10:	learn: 0.6387704	total: 338ms	remaining: 30.4s
11:	learn: 0.6340090	total: 354ms	remaining: 29.1s
12:	learn: 0.6294229	total: 369ms	remaining: 28s
13:	learn: 0.6251111	total: 385ms	remaining: 27.1s
14:	learn: 0.6227621	total: 392ms	remaining: 25.8s
15:	learn: 0.6184032	total: 406ms	remaining: 24.9s
16:	learn: 0.6152081	total: 416ms	remaining: 24.1s
17:	learn: 0.6108082	total: 471ms	remaining: 25.7s
18:	learn: 0.6073026	total: 481ms	remaining: 24.8s
19:	learn: 0.

166:	learn: 0.4022530	total: 2.84s	remaining: 14.2s
167:	learn: 0.4018643	total: 2.85s	remaining: 14.1s
168:	learn: 0.4016509	total: 2.87s	remaining: 14.1s
169:	learn: 0.4012348	total: 2.88s	remaining: 14.1s
170:	learn: 0.4010075	total: 2.9s	remaining: 14.1s
171:	learn: 0.4004604	total: 2.92s	remaining: 14s
172:	learn: 0.4001736	total: 2.93s	remaining: 14s
173:	learn: 0.3995543	total: 2.95s	remaining: 14s
174:	learn: 0.3991540	total: 2.96s	remaining: 14s
175:	learn: 0.3988475	total: 2.98s	remaining: 13.9s
176:	learn: 0.3986050	total: 2.99s	remaining: 13.9s
177:	learn: 0.3982327	total: 3s	remaining: 13.9s
178:	learn: 0.3977426	total: 3.02s	remaining: 13.9s
179:	learn: 0.3973941	total: 3.04s	remaining: 13.8s
180:	learn: 0.3970725	total: 3.06s	remaining: 13.8s
181:	learn: 0.3965754	total: 3.07s	remaining: 13.8s
182:	learn: 0.3964956	total: 3.08s	remaining: 13.7s
183:	learn: 0.3958712	total: 3.09s	remaining: 13.7s
184:	learn: 0.3954429	total: 3.11s	remaining: 13.7s
185:	learn: 0.3951456	to

329:	learn: 0.3595682	total: 5.49s	remaining: 11.1s
330:	learn: 0.3594110	total: 5.5s	remaining: 11.1s
331:	learn: 0.3592546	total: 5.52s	remaining: 11.1s
332:	learn: 0.3591453	total: 5.54s	remaining: 11.1s
333:	learn: 0.3588272	total: 5.55s	remaining: 11.1s
334:	learn: 0.3586329	total: 5.57s	remaining: 11.1s
335:	learn: 0.3585235	total: 5.59s	remaining: 11s
336:	learn: 0.3583326	total: 5.6s	remaining: 11s
337:	learn: 0.3583043	total: 5.61s	remaining: 11s
338:	learn: 0.3581357	total: 5.63s	remaining: 11s
339:	learn: 0.3577984	total: 5.64s	remaining: 11s
340:	learn: 0.3576041	total: 5.66s	remaining: 10.9s
341:	learn: 0.3575036	total: 5.67s	remaining: 10.9s
342:	learn: 0.3574332	total: 5.69s	remaining: 10.9s
343:	learn: 0.3573873	total: 5.7s	remaining: 10.9s
344:	learn: 0.3572367	total: 5.72s	remaining: 10.9s
345:	learn: 0.3570442	total: 5.74s	remaining: 10.8s
346:	learn: 0.3569545	total: 5.75s	remaining: 10.8s
347:	learn: 0.3567799	total: 5.77s	remaining: 10.8s
348:	learn: 0.3564537	tot

500:	learn: 0.3363726	total: 8.33s	remaining: 8.3s
501:	learn: 0.3361136	total: 8.35s	remaining: 8.28s
502:	learn: 0.3360401	total: 8.37s	remaining: 8.27s
503:	learn: 0.3358345	total: 8.38s	remaining: 8.25s
504:	learn: 0.3357118	total: 8.4s	remaining: 8.23s
505:	learn: 0.3356406	total: 8.41s	remaining: 8.21s
506:	learn: 0.3354092	total: 8.43s	remaining: 8.2s
507:	learn: 0.3353116	total: 8.45s	remaining: 8.18s
508:	learn: 0.3351212	total: 8.46s	remaining: 8.16s
509:	learn: 0.3349491	total: 8.48s	remaining: 8.15s
510:	learn: 0.3348194	total: 8.49s	remaining: 8.13s
511:	learn: 0.3347169	total: 8.51s	remaining: 8.11s
512:	learn: 0.3346842	total: 8.52s	remaining: 8.09s
513:	learn: 0.3345712	total: 8.54s	remaining: 8.07s
514:	learn: 0.3344948	total: 8.55s	remaining: 8.05s
515:	learn: 0.3344600	total: 8.57s	remaining: 8.04s
516:	learn: 0.3342615	total: 8.58s	remaining: 8.02s
517:	learn: 0.3340007	total: 8.6s	remaining: 8s
518:	learn: 0.3338765	total: 8.62s	remaining: 7.99s
519:	learn: 0.33370

663:	learn: 0.3190445	total: 11s	remaining: 5.57s
664:	learn: 0.3190282	total: 11s	remaining: 5.55s
665:	learn: 0.3190069	total: 11.1s	remaining: 5.54s
666:	learn: 0.3189254	total: 11.1s	remaining: 5.53s
667:	learn: 0.3188043	total: 11.1s	remaining: 5.51s
668:	learn: 0.3186476	total: 11.1s	remaining: 5.5s
669:	learn: 0.3184195	total: 11.1s	remaining: 5.48s
670:	learn: 0.3183403	total: 11.1s	remaining: 5.46s
671:	learn: 0.3179704	total: 11.2s	remaining: 5.45s
672:	learn: 0.3178644	total: 11.2s	remaining: 5.43s
673:	learn: 0.3178338	total: 11.2s	remaining: 5.41s
674:	learn: 0.3177420	total: 11.2s	remaining: 5.4s
675:	learn: 0.3177277	total: 11.2s	remaining: 5.38s
676:	learn: 0.3175729	total: 11.2s	remaining: 5.37s
677:	learn: 0.3175235	total: 11.3s	remaining: 5.36s
678:	learn: 0.3173138	total: 11.3s	remaining: 5.34s
679:	learn: 0.3171266	total: 11.3s	remaining: 5.32s
680:	learn: 0.3169841	total: 11.3s	remaining: 5.31s
681:	learn: 0.3167599	total: 11.4s	remaining: 5.3s
682:	learn: 0.31673

828:	learn: 0.3034693	total: 13.9s	remaining: 2.86s
829:	learn: 0.3034602	total: 13.9s	remaining: 2.85s
830:	learn: 0.3033432	total: 13.9s	remaining: 2.83s
831:	learn: 0.3032261	total: 14s	remaining: 2.82s
832:	learn: 0.3031311	total: 14s	remaining: 2.8s
833:	learn: 0.3030615	total: 14s	remaining: 2.79s
834:	learn: 0.3030496	total: 14s	remaining: 2.77s
835:	learn: 0.3029516	total: 14s	remaining: 2.75s
836:	learn: 0.3028974	total: 14s	remaining: 2.73s
837:	learn: 0.3027623	total: 14.1s	remaining: 2.72s
838:	learn: 0.3027506	total: 14.1s	remaining: 2.7s
839:	learn: 0.3026608	total: 14.1s	remaining: 2.68s
840:	learn: 0.3025698	total: 14.1s	remaining: 2.67s
841:	learn: 0.3025180	total: 14.1s	remaining: 2.65s
842:	learn: 0.3023255	total: 14.1s	remaining: 2.63s
843:	learn: 0.3021135	total: 14.2s	remaining: 2.62s
844:	learn: 0.3020169	total: 14.2s	remaining: 2.6s
845:	learn: 0.3019529	total: 14.2s	remaining: 2.58s
846:	learn: 0.3018129	total: 14.2s	remaining: 2.57s
847:	learn: 0.3017413	total

990:	learn: 0.2886855	total: 16.7s	remaining: 152ms
991:	learn: 0.2885596	total: 16.8s	remaining: 135ms
992:	learn: 0.2885253	total: 16.8s	remaining: 118ms
993:	learn: 0.2884096	total: 16.8s	remaining: 101ms
994:	learn: 0.2883935	total: 16.8s	remaining: 84.5ms
995:	learn: 0.2883767	total: 16.8s	remaining: 67.6ms
996:	learn: 0.2882172	total: 16.8s	remaining: 50.7ms
997:	learn: 0.2881719	total: 16.9s	remaining: 33.8ms
998:	learn: 0.2881073	total: 16.9s	remaining: 16.9ms
999:	learn: 0.2880682	total: 16.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x13693ecda00>

In [7]:
print(classification_report(y_val, cb.predict(val_pool)))

              precision    recall  f1-score   support

           0       0.86      0.89      0.87       146
           1       0.77      0.71      0.74        77

    accuracy                           0.83       223
   macro avg       0.81      0.80      0.81       223
weighted avg       0.83      0.83      0.83       223



#### LGBM

In [11]:
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train, categorical_feature=cats)
print(classification_report(y_val, lgbm.predict(X_val)))

              precision    recall  f1-score   support

           0       0.88      0.79      0.83       146
           1       0.67      0.79      0.73        77

    accuracy                           0.79       223
   macro avg       0.77      0.79      0.78       223
weighted avg       0.81      0.79      0.80       223





In [13]:
data2 = pd.get_dummies(data, columns=["Sex", "Embarked", "Ticket_First_Bin", "Title_Bin"], drop_first=True)
X_TRAIN2 = MinMaxScaler().fit_transform(data2.iloc[:split_index])
X_TEST2 = MinMaxScaler().fit_transform(data2.iloc[split_index:])
Xtrain2, Xval2, y_train2, y_val2 = train_test_split(X_TRAIN2, target, test_size=0.25, random_state=seed)