In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

##Download and Preparation data

In [3]:
titanic_df = pd.read_csv("https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/fa71405126017e6a37bea592440b4bee94bf7b9e/titanic.csv")
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic_df = titanic_df.dropna() # скидываем NaN

titanic_df = titanic_df.drop(columns=['Ticket', 'PassengerId', 'Cabin', 'Name'])

titanic_df = titanic_df.replace('male', 0)
titanic_df = titanic_df.replace('female', 1)

titanic_df['Age'] = titanic_df['Age'].astype(int)

titanic_df['FamilySize'] = titanic_df['SibSp'] + titanic_df['Parch'] + 1

titanic_df['Embarked'] = titanic_df['Embarked'].map({'C':0, 'Q':1, 'S':2}) 


In [5]:
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
1,1,1,1,38,1,0,71.2833,0,2
3,1,1,1,35,1,0,53.1,2,2
6,0,1,0,54,0,0,51.8625,2,1
10,1,3,1,4,1,1,16.7,2,3
11,1,1,1,58,0,0,26.55,2,1


In [6]:
from sklearn.model_selection import train_test_split

X = titanic_df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'SibSp', 'Parch']]
y = titanic_df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15) 

##Ensemble models

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
import lightgbm as lgb
!pip install catboost
from catboost import CatBoostClassifier

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c1/c1c4707013f9e2f8a96899dd3a87f66c9167d6d776a6dc8fe7ec8678d446/catboost-0.24.3-cp36-none-manylinux1_x86_64.whl (66.3MB)
[K     |████████████████████████████████| 66.3MB 45kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.3


##1 DecisionTreeClassifier

In [8]:
model1 = DecisionTreeClassifier(criterion='entropy', max_depth=3)     # дерево решений
model1.fit(X_train,y_train)
print(model1.score(X_test, y_test))

0.75


##2 KNeighborsClassifier

In [9]:
model2 = KNeighborsClassifier(n_neighbors=4, metric='manhattan')       # KNN
model2.fit(X_train,y_train)
print(model2.score(X_test, y_test))

0.42857142857142855


##3 LogisticRegression

In [10]:
model3 = LogisticRegression(penalty='l2')         # Логисттическая регрессия 
model3.fit(X_train,y_train)
print(model3.score(X_test, y_test))

0.8928571428571429


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


##4 SVC

In [11]:
model4 = SVC(gamma="auto", C=1.5)                        # SVC
model4.fit(X_train,y_train)
print(model4.score(X_test, y_test))

0.7142857142857143


##5 RandomForestClassifier

In [12]:
model5 = RandomForestClassifier(max_depth=3, random_state=0)     # RandomForest
model5.fit(X_train,y_train)
print(model5.score(X_test, y_test))

0.8214285714285714


##6 XGBClassifier

In [13]:
model6 = xgb.XGBClassifier()          # XGBoost
model6.fit(X_train,y_train)
print(model6.score(X_test, y_test))

0.7857142857142857


##7 LGB

In [14]:
model7 = lgb.LGBMClassifier(learning_rate=0.1, num_leaves=3)
model7.fit(X_train, y_train)
model7.score(X_test, y_test)

0.7857142857142857

##8 CatBoostClassifier

In [15]:
model8 = CatBoostClassifier(iterations=3, learning_rate=1, depth=2)         # CatBoost 
model8.fit(X_train,y_train)
model8.score(X_test, y_test)

0:	learn: 0.5127484	total: 48.7ms	remaining: 97.5ms
1:	learn: 0.4782973	total: 49.1ms	remaining: 24.6ms
2:	learn: 0.4489260	total: 49.5ms	remaining: 0us


0.8571428571428571

##Сравнение моделей


In [16]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [52]:
from sklearn.metrics import classification_report
target_names = ['dead', 'alive']
print(color.BLUE + "Decission tree\n" + color.END, classification_report(y_test, model1.predict(X_test), target_names=target_names))
print(color.BLUE + "KNN\n" + color.END, classification_report(y_test, model2.predict(X_test), target_names=target_names))
print(color.BLUE + "Log reg\n" + color.END, classification_report(y_test, model3.predict(X_test), target_names=target_names))
print(color.BLUE + "SVC\n" + color.END, classification_report(y_test, model4.predict(X_test), target_names=target_names))
print(color.BLUE + "Random Forest\n" + color.END, classification_report(y_test, model5.predict(X_test), target_names=target_names))
print(color.BLUE + "XGB\n" + color.END,classification_report(y_test, model6.predict(X_test), target_names=target_names))
print(color.BLUE + "LGB\n" + color.END, classification_report(y_test, model7.predict(X_test), target_names=target_names))
print(color.BLUE + "CatBoost\n" + color.END, classification_report(y_test, model8.predict(X_test), target_names=target_names))

[94mDecission tree
[0m               precision    recall  f1-score   support

        dead       0.50      0.86      0.63         7
       alive       0.94      0.71      0.81        21

    accuracy                           0.75        28
   macro avg       0.72      0.79      0.72        28
weighted avg       0.83      0.75      0.77        28

[94mKNN
[0m               precision    recall  f1-score   support

        dead       0.24      0.57      0.33         7
       alive       0.73      0.38      0.50        21

    accuracy                           0.43        28
   macro avg       0.48      0.48      0.42        28
weighted avg       0.60      0.43      0.46        28

[94mLog reg
[0m               precision    recall  f1-score   support

        dead       0.83      0.71      0.77         7
       alive       0.91      0.95      0.93        21

    accuracy                           0.89        28
   macro avg       0.87      0.83      0.85        28
weighted avg     

In [50]:
from sklearn.metrics import mean_squared_error  as mse, log_loss as lol
models = ([
   [
   mse(y_test, model1.predict(X_test)), mse(y_test, model2.predict(X_test)), mse(y_test, model3.predict(X_test)), 
   mse(y_test, model3.predict(X_test)), mse(y_test, model4.predict(X_test)), mse(y_test, model5.predict(X_test)), 
   mse(y_test, model6.predict(X_test)), mse(y_test, model7.predict(X_test)), mse(y_test, model8.predict(X_test))
   ], 
   [ 
   lol(y_test, model1.predict(X_test)), lol(y_test, model2.predict(X_test)), 
   lol(y_test, model3.predict(X_test)), lol(y_test, model4.predict(X_test)), 
   lol(y_test, model5.predict(X_test)), lol(y_test, model6.predict(X_test)), 
   lol(y_test, model7.predict(X_test)), lol(y_test, model8.predict(X_test))
   ] 
    ])

models_name = ['DecisionTreeClassifier', 'KNN', 'LogReg', 'SVC', 'RandomForest', 'XGB', 'LGB', 'CatBoost']
print(color.RED + "RMSE" + color.END + ' | ' + color.RED + "Log_loss" + color.END)
for i in range(0, 7):
  print('{:.2f}   {:.2f}     {}'.format((models[0][i]), (models[1][i]), models_name[i]))


[91mRMSE[0m | [91mLog_loss[0m
0.25   8.63     DecisionTreeClassifier
0.57   19.74     KNN
0.11   3.70     LogReg
0.11   9.87     SVC
0.29   6.17     RandomForest
0.18   7.40     XGB
0.21   7.40     LGB


In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

#sns.heatmap(a, annot=True, fmt='.2%')

heat_map = sns.heatmap((confusion_matrix(y_test, model2.predict(X_test))/ 15),  annot=True, fmt='.2%')

##Ансамбль

In [None]:
from sklearn.ensemble import BaggingClassifier