In [47]:
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas as pd
import numpy as np
import math
import random

from sklearn.metrics import accuracy_score
from sklearn.decomposition import FactorAnalysis

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from sklearn.svm import LinearSVC

import warnings
warnings.filterwarnings('ignore')

### Чтение и предобработка

In [48]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [49]:
def scaling_min_max(column):
    column = column.astype('float64')
    column = (column-column.min())/(column.max()-column.min())
    return column

In [50]:
def preprocessing_data(data):
    data['Sex'] = data['Sex'].apply(lambda x: 1 if x == 'female' else 0) 
    data['Embarked'] = data['Embarked'].fillna('S')
    data['Cabin'] = data['Cabin'].apply(lambda x: 0 if pd.isna(x) else 1)
    
    # Восстановление возрастов (подробнее описано в ноутбуке titanicAnalytics.ipynb)
    indexes = data[(data['Parch'] > 2)&(data['Age'].isna())].index
    data.loc[indexes, 'Age'] = data.loc[indexes, 'Age'].fillna(train_data[(train_data['Parch'] > 2)]['Age'].median())

    indexes = data[(data['SibSp'] > 1)&(data['Age'].isna())].index
    data.loc[indexes, 'Age'] = data.loc[indexes, 'Age'].fillna(train_data[(train_data['SibSp'] > 1)]['Age'].median())

    indexes = data[(data['Parch'] == 0)&(data['Age'].isna())].index
    data.loc[indexes, 'Age'] = data.loc[indexes, 'Age'].fillna(train_data[(train_data['Parch'] == 0)]['Age'].median())

    data['Age'] = data['Age'].fillna(15)
    
    
    data['Fare'] = data['Fare'].fillna(1)
    data['Fare'] = data['Fare'].apply(lambda x: 80*x/data['Fare'].max() if x > 80 else x)
    data['Fare'] = np.log(data['Fare']+1)
    data['Age'] = scaling_min_max(data['Age'])
    data['Fare'] = scaling_min_max(data['Fare'])
    data['Pclass'] = scaling_min_max(data['Pclass'])

In [51]:
preprocessing_data(train_data)
preprocessing_data(test_data)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    float64
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    int64  
 11  Embarked     891 non-null    object 
dtypes: float64(3), int64(6), object(3)
memory usage: 83.7+ KB


### Валидация и предсказание

In [52]:
from sklearn.model_selection import train_test_split
y = train_data['Survived']
features = [
    'Sex',
    'Pclass',
    'Age',
    'Fare',
    'Cabin',
    'Embarked'
]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])
X

Unnamed: 0,Sex,Pclass,Age,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,0,1.0,0.271174,0.480200,0,0,0,1
1,1,0.0,0.472229,0.974091,1,1,0,0
2,1,1.0,0.321438,0.498096,0,0,0,1
3,1,0.0,0.434531,0.908153,1,0,0,1
4,0,1.0,0.434531,0.501261,0,0,0,1
...,...,...,...,...,...,...,...,...
886,0,0.5,0.334004,0.600543,0,0,0,1
887,1,0.0,0.233476,0.781437,1,0,0,1
888,1,1.0,0.183212,0.727425,0,0,0,1
889,0,0.0,0.321438,0.781437,1,1,0,0


In [55]:
train_x, val_x, train_y, val_y = train_test_split(X, y, test_size = 0.3)


model = KNeighborsClassifier(n_neighbors=215, weights='distance', metric='manhattan',
                                 algorithm='kd_tree')
model.fit(train_x, train_y)
prediction = model.predict(val_x)
print(f"KNN accuracy: {accuracy_score(prediction, val_y)}")

model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=6, min_samples_split=12)
model.fit(train_x, train_y)
prediction = model.predict(val_x)
print(f"DecisionTree accuracy: {accuracy_score(prediction, val_y)}")

model = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=5)
model.fit(train_x, train_y)
prediction = model.predict(val_x)
print(f"RandomForest accuracy: {accuracy_score(prediction, val_y)}")

model = CatBoostClassifier(verbose=False)
model.fit(train_x, train_y)
prediction = model.predict(val_x)
combined_prediction = prediction.copy()
print(f"CatBoost accuracy: {accuracy_score(prediction, val_y)}")

model = ExtraTreesClassifier(n_estimators=100, max_depth=9, random_state=5)
model.fit(train_x, train_y)
prediction = model.predict(val_x)
combined_prediction += prediction
print(f"ExtraTree accuracy: {accuracy_score(prediction, val_y)}")

model = LogisticRegression(solver='liblinear', penalty='l2', C=0.4)
model.fit(train_x, train_y)
prediction = model.predict(val_x)
print(f"LogisticRegression accuracy: {accuracy_score(prediction, val_y)}")

model = GaussianNB()
model.fit(train_x, train_y)
prediction = model.predict(val_x)
print(f"GaussianNB accuracy: {accuracy_score(prediction, val_y)}")

model = LinearSVC()
model.fit(train_x, train_y)
prediction = model.predict(val_x)
print(f"LinearSVC accuracy: {accuracy_score(prediction, val_y)}")

combined_prediction = pd.Series(combined_prediction).apply(lambda x: math.ceil(x/3))
print(f"Combined accuracy: {accuracy_score(combined_prediction, val_y)}")

KNN accuracy: 0.8171641791044776
DecisionTree accuracy: 0.8097014925373134
RandomForest accuracy: 0.8246268656716418
CatBoost accuracy: 0.835820895522388
ExtraTree accuracy: 0.8246268656716418
LogisticRegression accuracy: 0.7985074626865671
GaussianNB accuracy: 0.7761194029850746
LinearSVC accuracy: 0.7873134328358209
Combined accuracy: 0.835820895522388
