In [150]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

In [151]:
cols = [
    'pclass',
    'survived',
    'sex',
    'age',
    'fare',
    'embarked',
    'sibsp',
    'parch'
]

data = pd.read_csv('../titanic.csv', usecols=cols)

data.head(15)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.55,S
2,1,0,female,2.0,1,2,151.55,S
3,1,0,male,30.0,1,2,151.55,S
4,1,0,female,25.0,1,2,151.55,S
5,1,1,male,48.0,0,0,26.55,S
6,1,1,female,63.0,1,0,77.9583,S
7,1,0,male,39.0,0,0,0.0,S
8,1,1,female,53.0,2,0,51.4792,S
9,1,0,male,71.0,0,0,49.5042,C


In [152]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pclass,1309.0,2.294882,0.837836,1.0,2.0,3.0,3.0,3.0
survived,1309.0,0.381971,0.486055,0.0,0.0,0.0,1.0,1.0
age,1046.0,29.881135,14.4135,0.1667,21.0,28.0,39.0,80.0
sibsp,1309.0,0.498854,1.041658,0.0,0.0,0.0,1.0,8.0
parch,1309.0,0.385027,0.86556,0.0,0.0,0.0,0.0,9.0
fare,1308.0,33.295479,51.758668,0.0,7.8958,14.4542,31.275,512.3292


In [153]:
data.isnull().sum()

pclass        0
survived      0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [154]:
data['embarked'].fillna(data['embarked'].mode()[0], inplace=True)
data['embarked'].head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['embarked'].fillna(data['embarked'].mode()[0], inplace=True)


0    S
1    S
2    S
3    S
4    S
Name: embarked, dtype: object

In [155]:
data['fare'].fillna(data['fare'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['fare'].fillna(data['fare'].median(), inplace=True)


In [156]:
data['age'].fillna(data['age'].mean(), inplace=True)
data['age'].isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['age'].fillna(data['age'].mean(), inplace=True)


np.int64(0)

In [157]:
data['family_size'] = data['sibsp'] + data['parch']
data['family_size'].head(15)

0     0
1     3
2     3
3     3
4     3
5     0
6     1
7     0
8     2
9     0
10    1
11    1
12    0
13    0
14    0
Name: family_size, dtype: int64

In [158]:
data.drop(['sibsp', 'parch'], axis=1, inplace=True)
data.head(15)

Unnamed: 0,pclass,survived,sex,age,fare,embarked,family_size
0,1,1,female,29.0,211.3375,S,0
1,1,1,male,0.9167,151.55,S,3
2,1,0,female,2.0,151.55,S,3
3,1,0,male,30.0,151.55,S,3
4,1,0,female,25.0,151.55,S,3
5,1,1,male,48.0,26.55,S,0
6,1,1,female,63.0,77.9583,S,1
7,1,0,male,39.0,0.0,S,0
8,1,1,female,53.0,51.4792,S,2
9,1,0,male,71.0,49.5042,C,0


In [159]:
data.isnull().sum()

pclass         0
survived       0
sex            0
age            0
fare           0
embarked       0
family_size    0
dtype: int64

In [160]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pclass       1309 non-null   int64  
 1   survived     1309 non-null   int64  
 2   sex          1309 non-null   object 
 3   age          1309 non-null   float64
 4   fare         1309 non-null   float64
 5   embarked     1309 non-null   object 
 6   family_size  1309 non-null   int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 71.7+ KB


In [161]:
ohe = OneHotEncoder(handle_unknown='ignore')
encoder = ohe.fit_transform(data[['sex']])

In [162]:
data.head(10)

Unnamed: 0,pclass,survived,sex,age,fare,embarked,family_size
0,1,1,female,29.0,211.3375,S,0
1,1,1,male,0.9167,151.55,S,3
2,1,0,female,2.0,151.55,S,3
3,1,0,male,30.0,151.55,S,3
4,1,0,female,25.0,151.55,S,3
5,1,1,male,48.0,26.55,S,0
6,1,1,female,63.0,77.9583,S,1
7,1,0,male,39.0,0.0,S,0
8,1,1,female,53.0,51.4792,S,2
9,1,0,male,71.0,49.5042,C,0


In [163]:
tmp = pd.get_dummies(data['sex'], dtype = 'int')
tmp.head()

Unnamed: 0,female,male
0,1,0
1,0,1
2,1,0
3,0,1
4,1,0


In [164]:
tmp1 = pd.get_dummies(data['embarked'], dtype=int)
tmp1

Unnamed: 0,C,Q,S
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
1304,1,0,0
1305,1,0,0
1306,1,0,0
1307,1,0,0


In [165]:
data = data.drop(columns=['sex', 'embarked'])
data = pd.concat([data, tmp, tmp1], axis=1)
data.head(10)

Unnamed: 0,pclass,survived,age,fare,family_size,female,male,C,Q,S
0,1,1,29.0,211.3375,0,1,0,0,0,1
1,1,1,0.9167,151.55,3,0,1,0,0,1
2,1,0,2.0,151.55,3,1,0,0,0,1
3,1,0,30.0,151.55,3,0,1,0,0,1
4,1,0,25.0,151.55,3,1,0,0,0,1
5,1,1,48.0,26.55,0,0,1,0,0,1
6,1,1,63.0,77.9583,1,1,0,0,0,1
7,1,0,39.0,0.0,0,0,1,0,0,1
8,1,1,53.0,51.4792,2,1,0,0,0,1
9,1,0,71.0,49.5042,0,0,1,1,0,0


In [166]:
data.dtypes

pclass           int64
survived         int64
age            float64
fare           float64
family_size      int64
female           int64
male             int64
C                int64
Q                int64
S                int64
dtype: object

In [176]:
пайплайн:
1 масштабируем переменные
2 тренировку модели на масшт.пер. => предсказываем на масштабированных данных
3 результат для отрисовки графиков мы возвращаем предсказания в виде датафрейма

SyntaxError: invalid syntax (299353504.py, line 2)

In [171]:
def scale(data):
    scaler = StandardScaler()
    df_scaled = data.copy()
    df_scaled.loc[:, :] = scaler.fit_transform(data)
    return df_scaled, scaler

In [173]:
data_scaled, scaler = scale(data)
data_scaled

  0.84191642]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.loc[:, :] = scaler.fit_transform(data)
 -0.78615963]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.loc[:, :] = scaler.fit_transform(data)
 -0.55834605]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.loc[:, :] = scaler.fit_transform(data)
 -0.74349692]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.loc[:, :] = scaler.fit_transform(data)
  0.74349692]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.loc[:, :] = scaler.fit_transform(data)
 -0.50976981]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.loc[:, :] = scaler.fit_transform(data)
 -0.32204029]' has dtype incompatible with int64, please explicitly cast to a comp

Unnamed: 0,pclass,survived,age,fare,family_size,female,male,C,Q,S
0,-1.546098,1.272006,-6.842022e-02,3.442584,-0.558346,1.344995,-1.344995,-0.50977,-0.32204,0.655011
1,-1.546098,1.272006,-2.249092e+00,2.286639,1.336749,-0.743497,0.743497,-0.50977,-0.32204,0.655011
2,-1.546098,-0.786160,-2.164974e+00,2.286639,1.336749,1.344995,-1.344995,-0.50977,-0.32204,0.655011
3,-1.546098,-0.786160,9.229922e-03,2.286639,1.336749,-0.743497,0.743497,-0.50977,-0.32204,0.655011
4,-1.546098,-0.786160,-3.790208e-01,2.286639,1.336749,1.344995,-1.344995,-0.50977,-0.32204,0.655011
...,...,...,...,...,...,...,...,...,...,...
1304,0.841916,-0.786160,-1.194347e+00,-0.364003,0.073352,1.344995,-1.344995,1.96167,-0.32204,-1.526692
1305,0.841916,-0.786160,2.758687e-16,-0.364003,0.073352,1.344995,-1.344995,1.96167,-0.32204,-1.526692
1306,0.841916,-0.786160,-2.625456e-01,-0.503774,-0.558346,-0.743497,0.743497,1.96167,-0.32204,-1.526692
1307,0.841916,-0.786160,-2.237205e-01,-0.503774,-0.558346,-0.743497,0.743497,1.96167,-0.32204,-1.526692


In [175]:
data.head(10)

Unnamed: 0,pclass,survived,age,fare,family_size,female,male,C,Q,S
0,1,1,29.0,211.3375,0,1,0,0,0,1
1,1,1,0.9167,151.55,3,0,1,0,0,1
2,1,0,2.0,151.55,3,1,0,0,0,1
3,1,0,30.0,151.55,3,0,1,0,0,1
4,1,0,25.0,151.55,3,1,0,0,0,1
5,1,1,48.0,26.55,0,0,1,0,0,1
6,1,1,63.0,77.9583,1,1,0,0,0,1
7,1,0,39.0,0.0,0,0,1,0,0,1
8,1,1,53.0,51.4792,2,1,0,0,0,1
9,1,0,71.0,49.5042,0,0,1,1,0,0
