In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
STATE = 20221212

In [4]:
df = pd.read_csv('train.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
df['Embarked'].isna().sum()

2

In [8]:
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [9]:
len(df['Ticket'].unique())

681

In [10]:
df[df['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


Заполним пропуски в столбце Embarked модой, сгруппировав данные по столбцам 'Pclass', 'Parch', 'Sex'.

In [11]:
df['Embarked'] = (df['Embarked'].
                       fillna(df.groupby(['Pclass', 'Parch', 'Sex'])['Embarked'].
                       transform(lambda x: x.mode()[0] if not x.mode().empty else np.nan)))

In [12]:
df['Embarked'].isna().sum()

0

Заполним пропуски в столбце Cabin модой, сгруппировав данные по столбцам 'Pclass', 'Parch', 'Sex'.

In [13]:
df['Cabin'] = (df['Cabin'].
                       fillna(df.groupby(['Pclass', 'Parch', 'Sex'])['Cabin'].
                       transform(lambda x: x.mode()[0] if not x.mode().empty else np.nan)))

In [14]:
df['Cabin'].isna().sum()

128

Заполним пропуски в столбце Cabin модой, сгруппировав данные по столбцам 'Pclass', 'Sex'.

In [15]:
df['Cabin'] = (df['Cabin'].
                       fillna(df.groupby([ 'Sex', 'Pclass'])['Cabin'].
                       transform(lambda x: x.mode()[0] if not x.mode().empty else np.nan)))

In [16]:
df['Cabin'].isna().sum()

0

In [17]:
df['Age'] = (df['Age'].
                       fillna(df.groupby(['Pclass', 'SibSp', 'Parch', 'Fare'])['Age'].
                       transform(lambda x: x.mean())))#[0])))# if not x.mean().empty else np.nan)))

In [18]:
df['Age'].isna().sum()

66

In [19]:
df['Age'] = (df['Age'].
                       fillna(df.groupby(['Pclass', 'SibSp', 'Parch'])['Age'].
                       transform(lambda x: x.mean())))#[0] if not x.mean().empty else np.nan)))

In [20]:
df['Age'].isna().sum()

7

In [21]:
df['Age'] = (df['Age'].
                       fillna(df.groupby(['Pclass', 'Parch'])['Age'].
                       transform(lambda x: x.mean())))#[0] if not x.mean().empty else np.nan)))

In [22]:
df['Age'].isna().sum()

0

In [23]:
df['Age'] = df['Age'].astype('int')

Удалим из датасета столбцы 'PassengerId', 'Name', 'Ticket' - которые являются практически уникальными для каждого объекта, 'Cabin' - поскольку слишком много пропусков 

In [24]:
columns_to_drop = ['PassengerId', 'Name','Ticket']# 'Cabin', 

Определим столбцы для кодирования OneHotEncoder

In [25]:
columns_to_ohe = ['Sex', 'Embarked', 'Cabin']

In [26]:
df_crop = df.drop(columns_to_drop, axis=1)

Кодируем категориальные признаки с помощью OneHotEncoder

In [27]:
OHE = OneHotEncoder(sparse=False, drop='first')
df_crop_ohe = OHE.fit_transform(df_crop[columns_to_ohe])

In [28]:
df_crop_ohe = pd.DataFrame(df_crop_ohe)
df_crop_ohe.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,139,140,141,142,143,144,145,146,147,148
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


Удалим исходные столбцы с категориальными признаками

In [29]:
df_conc = pd.concat([df_crop, df_crop_ohe], sort=False, axis=1)
df_conc.drop(['Sex', 'Embarked', 'Cabin'], axis=1, inplace=True)
df_conc.head(5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,...,139,140,141,142,143,144,145,146,147,148
0,0,3,22,1,0,7.25,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,38,1,0,71.2833,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,3,26,0,0,7.925,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,1,35,1,0,53.1,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,3,35,0,0,8.05,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
df_conc.corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,...,139,140,141,142,143,144,145,146,147,148
Survived,1.000000,-0.338481,-0.051781,-0.035322,0.081629,0.257307,-0.543351,0.003650,-0.155660,-0.026456,...,0.060095,0.042470,-0.026456,-0.398669,0.017895,0.073642,-0.026456,0.171046,0.097698,-0.026456
Pclass,-0.338481,1.000000,-0.400069,0.083081,0.018443,-0.549500,0.131900,0.221009,0.081720,-0.052496,...,-0.074282,0.027734,0.027734,0.610352,-0.053038,-0.021469,0.027734,-0.054522,0.360252,-0.052496
Age,-0.051781,-0.400069,1.000000,-0.318938,-0.216574,0.101165,0.099492,-0.015589,-0.027074,0.042856,...,0.001610,-0.023403,0.030586,-0.109124,-0.106676,-0.002283,0.013408,-0.066088,-0.227445,0.037948
SibSp,-0.035322,0.083081,-0.318938,1.000000,0.414838,0.159651,-0.114631,-0.026354,0.070941,-0.015907,...,0.020528,0.014507,-0.015907,-0.102798,0.040431,-0.027582,-0.015907,0.035689,0.149494,-0.015907
Parch,0.081629,0.018443,-0.216574,0.414838,1.000000,0.216225,-0.245489,-0.081228,0.063036,-0.015878,...,-0.022467,0.025731,-0.015878,-0.209447,0.159737,-0.027532,-0.015878,0.113311,0.223857,-0.015878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,0.073642,-0.021469,-0.002283,-0.027582,-0.027532,-0.024425,-0.078791,-0.017877,0.035996,-0.001948,...,-0.002757,-0.001948,-0.001948,-0.042878,-0.008346,1.000000,-0.001948,-0.008580,-0.025308,-0.001948
145,-0.026456,0.027734,0.013408,-0.015907,-0.015878,-0.016505,0.024728,0.108986,-0.054125,-0.001124,...,-0.001590,-0.001124,-0.001124,-0.024728,-0.004813,-0.001948,1.000000,-0.004948,-0.014595,-0.001124
146,0.171046,-0.054522,-0.066088,0.035689,0.113311,-0.012339,-0.183835,-0.045400,0.091416,-0.004948,...,-0.007001,-0.004948,-0.004948,-0.108892,-0.021196,-0.008580,-0.004948,1.000000,-0.064272,-0.004948
147,0.097698,0.360252,-0.227445,0.149494,0.223857,-0.141180,-0.590237,0.226204,-0.107104,-0.014595,...,-0.020652,-0.014595,-0.014595,-0.321203,-0.062522,-0.025308,-0.014595,-0.064272,1.000000,-0.014595


In [31]:
df_conc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 155 entries, Survived to 148
dtypes: float64(150), int32(1), int64(4)
memory usage: 1.1 MB


In [32]:
scaler = StandardScaler()

Определим столбцы для масштабирования признаков с помощью StandardScaler

In [33]:
columns_to_scaler = ['Age', 'Parch', 'Fare']

Разделим имеющиеся данные на тренирововчную и тестовую выборки

In [34]:
train, test = train_test_split(df_conc, test_size=0.15, random_state=STATE)

In [35]:
train_features = train.drop('Survived', axis=1)
test_features = test.drop('Survived', axis=1)
train_target = train['Survived']
test_target = test['Survived']

Для масштабирования признаков обучим StandardScaler на тренировочной выборке и масштабируем обе выборки

In [36]:
train_features[columns_to_scaler] = scaler.fit_transform(train_features[columns_to_scaler])
test_features[columns_to_scaler] = scaler.transform(test_features[columns_to_scaler])

Построим модель линейной регрессии с параметрами по умолчанию

In [37]:
lr = LogisticRegression(random_state=STATE)

In [38]:
%%time
lr.fit(train_features, train_target.values)

CPU times: total: 15.6 ms
Wall time: 24.9 ms


In [39]:
print("Test error: %.7f" % (accuracy_score(test_target.values, lr.predict(test_features))))

Test error: 0.8059701


In [40]:
print(classification_report(test_target.values, lr.predict(test_features)))

              precision    recall  f1-score   support

           0       0.76      0.95      0.84        74
           1       0.90      0.63      0.75        60

    accuracy                           0.81       134
   macro avg       0.83      0.79      0.79       134
weighted avg       0.83      0.81      0.80       134



Получили метрики accuracy=0.8059, f1=0.79

Построим модель классификации с помощью К ближайших соседей с параметрами по умолчанию

In [41]:
knn = KNeighborsClassifier()

In [42]:
knn.fit(train_features, train_target.values)

In [43]:
print("Test error: %.7f" % (accuracy_score(test_target.values, knn.predict(test_features))))

Test error: 0.7910448


In [44]:
print(classification_report(test_target.values, knn.predict(test_features)))

              precision    recall  f1-score   support

           0       0.77      0.89      0.82        74
           1       0.83      0.67      0.74        60

    accuracy                           0.79       134
   macro avg       0.80      0.78      0.78       134
weighted avg       0.80      0.79      0.79       134



Получили метрики accuracy=0.7910, f1=0.79

Произведем подбор параметров для обеих моделей, используя RandomizedSearchCV

зададим набор параметров для модели классификации с помощью К ближайших соседей

In [45]:
parametr_knn = {'n_neighbors': range(1, 7),
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                'leaf_size': range(5, 20, 2),
                'p': [1, 2]}

In [46]:
knn_rs = RandomizedSearchCV(estimator=knn, param_distributions=parametr_knn, cv=10, n_iter=400, random_state=STATE)

In [47]:
%%time
knn_rs.fit(train_features, train_target.values)

CPU times: total: 4min 59s
Wall time: 47.2 s


In [48]:
knn_rs.best_estimator_

In [49]:
print("Test error: %.7f" % (accuracy_score(test_target.values, knn_rs.best_estimator_.predict(test_features))))

Test error: 0.7910448


In [50]:
knn_rs.best_score_

0.8072631578947368

In [51]:
print(classification_report(test_target.values, knn_rs.best_estimator_.predict(test_features)))

              precision    recall  f1-score   support

           0       0.75      0.93      0.83        74
           1       0.88      0.62      0.73        60

    accuracy                           0.79       134
   macro avg       0.82      0.77      0.78       134
weighted avg       0.81      0.79      0.78       134



Получили метрики accuracy=0.7910, f1=0.79, что не отличается от метрик, полученных с параметрами по умолчанию

зададим набор параметров для модели логистической регресии

In [52]:
c_space = np.logspace(-1, 1.5, 50)
c_space

array([ 0.1       ,  0.11246578,  0.12648552,  0.14225293,  0.15998587,
        0.17992936,  0.20235896,  0.22758459,  0.25595479,  0.28786156,
        0.32374575,  0.36410319,  0.40949151,  0.46053783,  0.51794747,
        0.58251367,  0.65512856,  0.73679546,  0.82864277,  0.93193958,
        1.04811313,  1.17876863,  1.32571137,  1.49097166,  1.67683294,
        1.88586328,  2.12095089,  2.38534401,  2.6826958 ,  3.01711481,
        3.39322177,  3.81621341,  4.29193426,  4.82695744,  5.42867544,
        6.1054023 ,  6.86648845,  7.72244995,  8.68511374,  9.7677811 ,
       10.98541142, 12.35482888, 13.89495494, 15.62706977, 17.57510625,
       19.76598072, 22.22996483, 25.00110383, 28.11768698, 31.6227766 ])

In [53]:
parametr_lr = {'penalty': ['l1', 'l2', 'None', 'elasticnet'],
                'class_weight': ['balanced', 'None'],
                'C': c_space,
                'solver': ['lbfgs', 'liblinear', 'sag'],
                'max_iter': [100]}

In [54]:
lr_rs = RandomizedSearchCV(estimator=lr, param_distributions=parametr_lr, cv=10, n_iter=1000, random_state=STATE)

In [55]:
%%time
lr_rs.fit(train_features, train_target.values)

CPU times: total: 1min 34s
Wall time: 1min 25s


In [56]:
lr_rs.best_score_

0.8402456140350877

In [57]:
print("Test error: %.7f" % (accuracy_score(test_target.values, lr_rs.best_estimator_.predict(test_features))))

Test error: 0.8134328


In [58]:
lr_rs.best_estimator_

In [59]:
print(classification_report(test_target.values, lr_rs.best_estimator_.predict(test_features)))

              precision    recall  f1-score   support

           0       0.76      0.96      0.85        74
           1       0.93      0.63      0.75        60

    accuracy                           0.81       134
   macro avg       0.85      0.80      0.80       134
weighted avg       0.84      0.81      0.81       134



Получили метрики accuracy=0.8134, f1=0.81, что выше метрик, полученных с параметрами по умолчанию.

Можно сделать вывод, что на данном наборе данных логистическая регрессия показала себя лучше на опробованных параметрах, чем модель классификации с помощью К ближайших соседей на заданных параметрах. Однако можно отметить, что логистическая регрессия оказалась чуть переобученной и метрика accuracy на тренировочной выборке с разбиением на 10 сплитов выше чем на тестовой. 