In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import math
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
import warnings
# Silence every Python warning for the rest of the session (no category or module
# filter is given, so this applies to all warnings).
warnings.filterwarnings('ignore')

### Чтение и предобработка

In [3]:
# Read the training and test tables from the working directory in one pass.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Leave the training frame as the cell's last expression so it is rendered.
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Columns removed from both frames before modelling.
features_for_drop = ['PassengerId', 'Cabin']

# Replacement value for a missing `Age`, keyed by the title substring searched for
# in `Name`. Insertion order is the order in which the fills are applied; a row
# whose name contains none of these titles keeps its missing `Age`.
age_by_title = {
    'Mrs.': 36,
    'Mr.': 31,
    'Ms.': 28,
    'Miss.': 20,
    'Master.': 5,
    'Dr.': 43,
}

for frame in (train_data, test_data):
    # Encode sex as 1 for 'male' and 0 for any other value.
    frame['Sex'] = (frame['Sex'] == 'male').astype('int64')
    frame.drop(columns=features_for_drop, inplace=True)

# Only the training frame loses rows: those with no `Embarked` value.
train_data.dropna(subset=['Embarked'], inplace=True)

for frame in (train_data, test_data):
    for title, age in age_by_title.items():
        has_title = frame['Name'].str.contains(title, regex=False, na=False)
        # Restrict the fill to the `Age` column: a row-wide fillna would also write
        # the age value into any other missing column of the matched rows.
        frame.loc[has_title & frame['Age'].isna(), 'Age'] = age

train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Name      889 non-null    object 
 3   Sex       889 non-null    int64  
 4   Age       889 non-null    float64
 5   SibSp     889 non-null    int64  
 6   Parch     889 non-null    int64  
 7   Ticket    889 non-null    object 
 8   Fare      889 non-null    float64
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(5), object(3)
memory usage: 76.4+ KB


### Валидация и предсказание

In [None]:
from sklearn.model_selection import train_test_split

# Target vector.
y = train_data['Survived']

# Input columns shared by the training and test matrices.
features = [
    'Pclass',
    'Parch',
    'SibSp',
    'Embarked',
    'Age',
    'Sex',
]

# One-hot encode the non-numeric columns among `features`.
X = pd.get_dummies(train_data[features])

# Encoding the two frames separately can yield different dummy columns (or a
# different column order) when a category is absent from one of them, so force the
# test matrix onto the training layout. Dummy columns missing from the test frame
# are added as all-zero; existing values are left untouched.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# Running sum of validation accuracy, one slot per model compared below.
res = [0, 0, 0, 0, 0]

In [None]:
# Constructors for the compared models. Their position here is the slot in `res`
# that receives the model's summed accuracy; the reporting cell reads `res` by
# these positions, so the order must not change.
model_factories = [
    lambda: KNeighborsClassifier(n_neighbors=11, weights='distance', metric='manhattan'),
    lambda: DecisionTreeClassifier(max_depth=11, min_samples_split=2, min_samples_leaf=7),
    lambda: LogisticRegression(solver='liblinear', penalty='l2', C=0.4),
    GaussianNB,
    LinearSVC,
]

# Start from zero here so that re-running this cell does not add onto the totals
# of a previous run (which would inflate the averages printed afterwards).
res = [0] * len(model_factories)

for i in range(50000):
    # Draw a new random 80/20 split each iteration; no seed is fixed, so the
    # accumulated totals differ slightly from run to run.
    train_x, val_x, train_y, val_y = train_test_split(X, y, test_size=0.2)

    for slot, make_model in enumerate(model_factories):
        # A fresh, unfitted estimator is built for every split.
        model = make_model()
        model.fit(train_x, train_y)
        predictions = model.predict(val_x)
        # Documented argument order is (y_true, y_pred).
        score = accuracy_score(val_y, predictions)
        res[slot] += score

In [5]:
# Display names in the same order as the accumulator slots in `res`.
model_labels = (
    'Метод ближайших соседей',
    'Деревья решений',
    'Логистическая регрессия',
    'Наивный метод Байеса',
    'Метод опорных векторов',
)

# Each entry of `res` is a sum over the 50000 splits of the loop cell, so dividing
# by that count gives the mean validation accuracy.
for label, total in zip(model_labels, res):
    print(label, total/50000)

Метод ближайших соседей 0.7901923595505396
Деревья решений 0.8175126966291801
Логистическая регрессия 0.7935734831460529
Наивный метод Байеса 0.7636366292134732
Метод опорных векторов 0.7558164044943755


In [None]:
# k-nearest neighbours scored once on the `train_x`/`val_x` split currently in the
# kernel (the loop cell reassigns these names on every iteration).
model = KNeighborsClassifier(
    metric='manhattan',
    weights='distance',
    n_neighbors=11,
).fit(train_x, train_y)

predictions = model.predict(val_x)
score = accuracy_score(predictions, val_y)
print('score =', score)

In [None]:
# Decision tree scored once on the `train_x`/`val_x` split currently in the kernel.
# `min_samples_leaf` previously referenced an undefined name `leaf`, which raises
# NameError on a fresh kernel; 7 is the value used for this model in the loop cell.
model = DecisionTreeClassifier(max_depth=11, min_samples_split=2, min_samples_leaf=7)
model.fit(train_x, train_y)
predictions = model.predict(val_x)
score = accuracy_score(predictions, val_y)
print('score =',score)

In [None]:
# Logistic regression (liblinear solver, L2 penalty, C=0.4) scored once on the
# `train_x`/`val_x` split currently in the kernel.
model = LogisticRegression(
    C=0.4,
    penalty='l2',
    solver='liblinear',
).fit(train_x, train_y)

predictions = model.predict(val_x)
score = accuracy_score(predictions, val_y)
print(f'score = {score}')

In [None]:
# Gaussian naive Bayes with default settings, scored once on the `train_x`/`val_x`
# split currently in the kernel.
model = GaussianNB().fit(train_x, train_y)

predictions = model.predict(val_x)
score = accuracy_score(predictions, val_y)
print(f'score = {score}')

In [None]:
# Linear support-vector classifier with default settings, scored once on the
# `train_x`/`val_x` split currently in the kernel.
model = LinearSVC().fit(train_x, train_y)

predictions = model.predict(val_x)
score = accuracy_score(predictions, val_y)
print(f'score = {score}')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Final model: fit on the full training matrix, then predict for the test matrix.
# `random_state` is fixed, so the fitted forest is reproducible for the same data.
model = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=5)
model.fit(X, y)
predictions = model.predict(X_test)

# `PassengerId` was dropped from `test_data` during preprocessing, so reading
# `test_data.PassengerId` here raises AttributeError. Read the ids back from the
# source file instead; no rows are ever removed from `test_data`, so they line up
# positionally with `predictions`.
passenger_ids = pd.read_csv('test.csv', usecols=['PassengerId'])['PassengerId']

output = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})
output.to_csv('submission.csv', index=False)