In [202]:
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas as pd
import numpy as np
import math

from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from sklearn.svm import LinearSVC

import warnings
warnings.filterwarnings('ignore')

### Чтение и предобработка

In [203]:
# Load both CSV splits (train first, then test) from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)
# Bare last expression: rich display of the training frame.
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [204]:
# Print column dtypes and non-null counts to see which columns have gaps.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [205]:
def scaling_min_max(column):
    """Min-max scale a numeric pandas Series to the [0, 1] range.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to scale. The input is not modified.

    Returns
    -------
    pandas.Series
        A new float64 Series with the same index, where the smallest value
        maps to 0.0 and the largest to 1.0. NaN entries are ignored when
        looking for the extremes and stay NaN in the result. If every
        non-missing value is identical (or there are none), the non-missing
        entries become 0.0 instead of the NaN a 0/0 division would produce.
    """
    column = column.astype('float64')
    # Series.min/max skip NaN; the builtin min()/max() are order-dependent
    # as soon as a NaN is present.
    min_item, max_item = column.min(), column.max()
    diff = max_item - min_item
    # `not diff > 0` covers both a zero range and a NaN range (empty/all-NaN).
    if not diff > 0:
        return column.where(column.isna(), 0.0)
    # Vectorised arithmetic works for any index; positional `column[i]`
    # lookups only worked when the index happened to be 0..n-1.
    return (column - min_item) / diff

In [206]:
def _cabin_group(cabin):
    """Map a raw Cabin value to 0/1: 1 for decks B-F (unless an A cabin is listed), else 0."""
    if pd.isna(cabin):
        return 0
    if 'A' in cabin:
        return 0
    # Anything left without a B-F letter (G, T, unknown) falls into group 0.
    return 1 if any(deck in cabin for deck in 'BCDEF') else 0


def preprocessing_data(data):
    """Encode, impute and scale a raw passenger frame **in place**.

    Expects the columns Sex, Embarked, Pclass, Cabin, Name, Age, Parch and
    Fare. Call it exactly once per freshly loaded frame: the transformations
    are not idempotent, so a frame whose ``Sex`` column is already numeric is
    rejected before anything is modified.

    Steps
    -----
    * Sex      -> 2 for 'female', 0 otherwise.
    * Embarked -> missing values filled with 'S'.
    * Pclass   -> (3 - Pclass) / 2, i.e. 1st class = 1.0, 3rd class = 0.0.
    * Cabin    -> binary deck group (see ``_cabin_group``).
    * Age      -> missing values filled with a per-title default, remaining
                  gaps with the column median; then floored to 5-year bins
                  and capped at 60.
    * Fare     -> missing values filled with the column median.
    * Age, Parch, Fare -> min-max scaled with ``scaling_min_max``, using the
                  extremes of *this* frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``data['Sex']`` is already numeric (frame was preprocessed before).
    """
    # Re-running on the same frame would silently turn every Sex value into 0
    # and re-map Pclass, so fail early instead.
    if pd.api.types.is_numeric_dtype(data['Sex']):
        raise ValueError(
            "preprocessing_data: 'Sex' is already numeric - the frame looks "
            "preprocessed; reload the raw CSV before calling again"
        )

    # Women get the larger code (2, men 0) because they survived more often.
    data['Sex'] = (data['Sex'] == 'female').astype('int64') * 2
    data['Embarked'] = data['Embarked'].fillna('S')
    data['Pclass'] = (3 - data['Pclass']) / 2

    # Cabin grouping (chosen by the author according to survival rate).
    data['Cabin'] = data['Cabin'].apply(_cabin_group).astype(int)

    # Crude age imputation from the title in the name. Only the Age column is
    # touched: filling whole rows would also overwrite other missing fields
    # (e.g. Fare) with an age constant.
    title_default_ages = (
        ('Mrs.', 33),
        ('Mr.', 31),
        ('Ms.', 33),
        ('Miss.', 20),
        ('Master.', 5),
        ('Dr.', 43),
    )
    for title, default_age in title_default_ages:
        has_title = data['Name'].str.contains(title, regex=False, na=False)
        data.loc[has_title & data['Age'].isna(), 'Age'] = default_age
    # Titles outside the list above (if any) fall back to the median age.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

    # Cap age at 60 and floor it to the start of its 5-year bin.
    capped_age = data['Age'].clip(upper=60)
    data['Age'] = capped_age - capped_age % 5

    # Normalisation to [0, 1].
    data['Age'] = scaling_min_max(data['Age'])
    data['Parch'] = scaling_min_max(data['Parch'])
    data['Fare'] = scaling_min_max(data['Fare'])

In [207]:
# Preprocess both splits in place (run once per freshly loaded frame).
preprocessing_data(train_data)
preprocessing_data(test_data)
# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    float64
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    float64
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    int32  
 11  Embarked     891 non-null    object 
dtypes: float64(4), int32(1), int64(4), object(3)
memory usage: 80.2+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    float64
 2   Name         418 non-null    object 
 3   Sex          418 non-null    int64  
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    float64
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        418 non-null    int32  
 10  Embarked     418 non-null    object 
dtypes: float64(4), int32(1), int64(3), object(3)
memory usage: 34.4+ KB


None

### Валидация и предсказание

In [208]:
from sklearn.model_selection import train_test_split
# Target and the feature columns fed to the models.
y = train_data['Survived']
features = [
    'Parch',
    'Cabin',
    'Embarked',
    # 'SibSp',
    'Fare',
    'Pclass',  # Candidate for removal: correlates fairly strongly with Cabin
    'Age',
    'Sex',
]
X = pd.get_dummies(train_data[features])
# The two frames are one-hot encoded independently, so a category missing
# from one split would give the matrices different columns. Align the test
# matrix to the training columns (absent dummies -> 0, extras dropped).
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

In [209]:
# corrmat = X.corr()
# sns.set(font_scale=1.25)
# plt.figure(figsize = (10, 10))
# hm = sns.heatmap(corrmat, cbar=True, annot=True, square=True,
# fmt='.3f', annot_kws={'size': 10}, cmap='RdYlGn',
# center=0, vmax=1, vmin=-1)

In [210]:
# res = [0, 0, 0, 0, 0, 0, 0, 0, 0]
# iterations = 100
# for i in tqdm(range(iterations)):
#     train_x, val_x, train_y, val_y = train_test_split(X, y, test_size = 0.3)
    
#     model = KNeighborsClassifier(n_neighbors=69, weights='distance', metric='manhattan',
#                                  algorithm='kd_tree')
#     model.fit(train_x, train_y)
#     predictions = model.predict(val_x)
#     score = accuracy_score(predictions, val_y)
#     res[0] += score
    
#     model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=6, min_samples_split=12)
#     model.fit(train_x, train_y)
#     predictions = model.predict(val_x)
#     ansamb_predictions = predictions.astype(float)
#     score = accuracy_score(predictions, val_y)
#     res[1] += score
    
# #     model = LogisticRegression(solver='liblinear', penalty='l2', C=0.4)
# #     model.fit(train_x, train_y)
# #     predictions = model.predict(val_x)
# #     score = accuracy_score(predictions, val_y)
# #     res[2] += score
    
# #     model = GaussianNB()
# #     model.fit(train_x, train_y)
# #     predictions = model.predict(val_x)
# #     score = accuracy_score(predictions, val_y)
# #     res[3] += score

# #     model = LinearSVC()
# #     model.fit(train_x, train_y)
# #     predictions = model.predict(val_x)
# #     score = accuracy_score(predictions, val_y)
# #     res[4] += score
    
#     model = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=5)
#     model.fit(train_x, train_y)
#     predictions = model.predict(val_x)
#     ansamb_predictions += predictions
#     score = accuracy_score(predictions, val_y)
#     res[5] += score
    
#     model = CatBoostClassifier(verbose=False)
#     model.fit(train_x, train_y)
#     predictions = model.predict(val_x)
#     ansamb_predictions += predictions*2
#     score = accuracy_score(predictions, val_y)
#     res[6] += score
    
#     model = ExtraTreesClassifier(n_estimators=100, max_depth=9, random_state=5)
#     model.fit(train_x, train_y)
#     predictions = model.predict(val_x)
#     ansamb_predictions += predictions
#     score = accuracy_score(predictions, val_y)
#     res[7] += score
    
#     vfunc = np.vectorize(lambda x: round(x/5.5))
#     score = accuracy_score(vfunc(ansamb_predictions), val_y)
#     res[8] += score
    
# # print('Метод ближайших соседей', res[0]/iterations)
# print('Деревья решений', res[1]/iterations)
# # print('Логистическая регрессия', res[2]/iterations)
# # print('Наивный метод Байеса', res[3]/iterations)
# # print('Метод опорных векторов', res[4]/iterations)
# print('Случайный лес', res[5]/iterations)
# print('CatBoost', res[6]/iterations)
# print('Extra лес', res[7]/iterations)
# print('Ансамбль', res[8]/iterations)

In [211]:
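# # KNN hyperparameter search (roughly the same procedure was used for the other models)
# # NOTE(review): the model below is built with fixed n_neighbors=11, weights='distance', metric='manhattan', so the loop variables n, weight and metric are never used.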
# data_score = pd.DataFrame()

# iterations = 100
# start, end, step = 1, 51, 5
# for n in tqdm(range(start, end, step)):
#     n_row = dict()
#     for weight in ['uniform', 'distance']:
#         for metric in ['euclidean', 'manhattan', 'cosine', 'cityblock']:
#             score = 0
#             for i in range(iterations):
#                 train_x, val_x, train_y, val_y = train_test_split(X, y, test_size = 0.3)
#                 model = KNeighborsClassifier(n_neighbors=11, weights='distance', metric='manhattan')
#                 model.fit(train_x, train_y)
#                 predictions = model.predict(val_x)
#                 score += accuracy_score(predictions, val_y)
#             n_row[weight+' '+metric] = score/iterations
#     data_score = data_score.append(n_row, ignore_index=True)
# data_score.index = list(range(start, end, step)) 
# data_score

In [222]:
# Relative weight of each model in the vote (CatBoost counts twice).
CATBOOST_WEIGHT, TREE_WEIGHT = 2, 1

catboost_model = CatBoostClassifier(verbose=False)
catboost_model.fit(X, y)

# Fixed seed so tie-breaking between equally good splits is reproducible.
tree_model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=6,
                                    min_samples_split=12, random_state=0)
tree_model.fit(X, y)

# Weighted soft vote on P(Survived == 1). A 2:1 vote on hard 0/1 labels from
# two models always equals the heavier model's label, so the tree would never
# influence the result; averaging probabilities lets it do so.
# Column 1 of predict_proba is class 1 because y holds the labels 0 and 1.
survival_proba = (
    CATBOOST_WEIGHT * catboost_model.predict_proba(X_test)[:, 1]
    + TREE_WEIGHT * tree_model.predict_proba(X_test)[:, 1]
) / (CATBOOST_WEIGHT + TREE_WEIGHT)
predictions = (survival_proba >= 0.5).astype(int)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)