In [15]:
import numpy as np 
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin

### Справка о данных:

Данные содержат информацию о пассажирах Титаника (Билет, Фио, Пол и тд).

Целевой переменной является Survived (информация о том, выжил пассажир или нет)

In [16]:
data = pd.read_csv('train (3).csv')

In [17]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Посмотрим на пропущенные значения в данных

In [18]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Видно, что в данных много пропусков. Попробуем как-то заменить пропуски в переменной Age. Посмотрим на среднее значение Age по всем данным.

In [19]:
data.Age.mean()

29.69911764705882

Однако средний восраст пассажиров в зависимости от Pclass и Sex сильно различается. Посмотрим на это.

In [20]:
data.groupby(['Pclass', 'Sex'])['Age'].agg([len, 'mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,len,mean
Pclass,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
1,female,94.0,34.611765
1,male,122.0,41.281386
2,female,76.0,28.722973
2,male,108.0,30.740707
3,female,144.0,21.75
3,male,347.0,26.507589


In [21]:
class mean_replace(BaseEstimator, TransformerMixin):
    def __init__(self):
        
        self.age_means_ = {}

    def fit(self, X, y=None):
        
        self.age_means_ = X.groupby(['Pclass', 'Sex']).Age.mean()

        return self

    def transform(self, X, y=None):
        
        Xc = X.copy()
        
        for k, val in self.age_means_.items():
            
            Xc.loc[((np.isnan(Xc["Age"]))&(Xc.Pclass == k[0])&(Xc.Sex == k[1])), 'Age'] = val

        return Xc

In [22]:
gbc = GradientBoostingClassifier()

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer())
])

categ_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())
])

age_transformer = Pipeline(steps=[
    ('imputer', mean_replace())
])

prepr = ColumnTransformer(
    transformers=[
        ('numerical', num_transformer, ['Pclass', 'Fare']),
        ('categorial', categ_transformer, ['Sex', 'Embarked'])
    ])

pipeline = Pipeline(steps=[
    ('mean replace', age_transformer),
    ('preprocess', prepr),
    ('model', gbc)
])

In [25]:
params = { 
           'model__n_estimators': [100, 200, 300],
           'model__max_depth': [1, 2, 3],
           'model__random_state': [42],
         }

grid_search = GridSearchCV(pipeline, params, cv=10, scoring='accuracy')
grid_search.fit(
    data.loc[:, data.columns != 'Survived'], data['Survived']
)

print('Best accuracy score: {:1.2f}%'.format(grid_search.best_score_ * 100))

print('Best params: {}'.format(grid_search.best_params_))

Best accuracy score: 82.38%
Best params: {'model__max_depth': 2, 'model__n_estimators': 300, 'model__random_state': 42}


Таким образом, мы получили, что при вышенаписанных параметрах, точность нашей модели 82,38%