In [2]:
import pandas as pd
import numpy as np

In [3]:
df_train = pd.read_csv('train.csv', index_col='PassengerId')
df_test = pd.read_csv('test.csv', index_col='PassengerId')

In [4]:
def take_name_part_from_df(df, col_name='Name', want_to_add_parts=['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Dr.'], missing_name_part='no_name_part'):
    dict_name_parts = {}
    for name in df[col_name]:
        name_parts = name.split(' ')
        for name_part in name_parts:
            if name_part not in dict_name_parts.keys():
                dict_name_parts[name_part] = 1
            else:
                dict_name_parts[name_part] += 1
    
    df_name_parts = pd.DataFrame.from_dict(dict_name_parts, orient='index')
    df_name_parts = df_name_parts.reset_index()
    df_name_parts.columns = ['name_part', 'cnt']
    
    name_part_to_df = []

    for p_name in df[col_name]:
        counter = 0
        for name_part in want_to_add_parts:
            if name_part in p_name:
                name_part_to_df.append(name_part)
                continue
            else:
                counter += 1
                if counter == len(want_to_add_parts):
                    name_part_to_df.append(missing_name_part)
                    
    return name_part_to_df

In [5]:
def adding_data_to_df(df, list_of_cols_to_add, list_of_names_of_cols_to_add):
    for col_index in range(len(list_of_cols_to_add)):
        col = list_of_cols_to_add[col_index]
        col_name = list_of_names_of_cols_to_add[col_index]
        df[col_name] = col
    return df

In [9]:
def transform_data_for_model(df):

    list_of_cols_to_add = []
    list_of_names_of_cols_to_add = []

    name_parts = take_name_part_from_df(df, col_name='Name', want_to_add_parts=['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Dr.'], missing_name_part='no_name_part')
    list_of_cols_to_add.append(name_parts)
    list_of_names_of_cols_to_add.append('name_part')

    cab_num_for_df = []
    for cab_num in df.Cabin:
        if pd.isna(cab_num):
            cab_num_for_df.append('N')
        else:
            cab_num_for_df.append(cab_num.split(' ')[0][0])
    list_of_cols_to_add.append(cab_num_for_df)
    list_of_names_of_cols_to_add.append('cabin_letter')

    list_of_cols_to_add.append(df['Sex'].map({'male':0, 'female':1}))
    list_of_names_of_cols_to_add.append('sex_binary')
    
    df = adding_data_to_df(df, list_of_cols_to_add, list_of_names_of_cols_to_add)
    list_of_cols_to_add = []
    list_of_names_of_cols_to_add = []
    
    df['Embarked'] = df.Embarked.fillna('S')
    mean_age_dict = df[['name_part', 'Age']].groupby('name_part').agg({'Age':'median'}).to_dict(orient='dict')['Age']
    list_of_cols_to_add.append(df.apply(lambda row: mean_age_dict[row['name_part']] if np.isnan(row['Age']) else row['Age'], axis=1))
    list_of_names_of_cols_to_add.append('age_no_nan')

    df_for_return = adding_data_to_df(df, list_of_cols_to_add, list_of_names_of_cols_to_add)

    return df_for_return

def add_kaggle_cols(df, age_no_nun_col_name='age_no_nan', age_groups_col_name='age_no_nan_groups'):
    df.loc[df[age_no_nun_col_name] <= 16, age_groups_col_name] = 0
    df.loc[(df[age_no_nun_col_name] > 16) & (df[age_no_nun_col_name] <= 32), age_groups_col_name] = 1
    df.loc[(df[age_no_nun_col_name] > 32) & (df[age_no_nun_col_name] <= 48), age_groups_col_name] = 2
    df.loc[(df[age_no_nun_col_name] > 48) & (df[age_no_nun_col_name] <= 64), age_groups_col_name] = 3
    df.loc[df[age_no_nun_col_name] > 64, age_groups_col_name] = 4

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = df['FamilySize'].apply(lambda x: 1 if x == 1 else 0)
    df['Age*Class'] = df[age_groups_col_name] * df.Pclass
    return df
    

df_train_for_model = transform_data_for_model(df_train)
df_train_for_model = add_kaggle_cols(df_train_for_model)
df_train_for_model = df_train_for_model[['Survived', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked', 'name_part', 'cabin_letter', 'sex_binary', 'age_no_nan_groups', 'FamilySize', 'IsAlone','Age*Class']]
df_train_for_model = pd.get_dummies(df_train_for_model, prefix=['Embarked','name_part','cabin_letter'], columns=['Embarked','name_part','cabin_letter'])
df_train_for_model

Unnamed: 0_level_0,Survived,Pclass,SibSp,Parch,Fare,sex_binary,age_no_nan_groups,FamilySize,IsAlone,Age*Class,...,name_part_no_name_part,cabin_letter_A,cabin_letter_B,cabin_letter_C,cabin_letter_D,cabin_letter_E,cabin_letter_F,cabin_letter_G,cabin_letter_N,cabin_letter_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,1,0,7.2500,0,1.0,2,0,3.0,...,0,0,0,0,0,0,0,0,1,0
2,1,1,1,0,71.2833,1,2.0,2,0,2.0,...,0,0,0,1,0,0,0,0,0,0
3,1,3,0,0,7.9250,1,1.0,1,1,3.0,...,0,0,0,0,0,0,0,0,1,0
4,1,1,1,0,53.1000,1,2.0,2,0,2.0,...,0,0,0,1,0,0,0,0,0,0
5,0,3,0,0,8.0500,0,2.0,1,1,6.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,0,0,13.0000,0,1.0,1,1,2.0,...,1,0,0,0,0,0,0,0,1,0
888,1,1,0,0,30.0000,1,1.0,1,1,1.0,...,0,0,1,0,0,0,0,0,0,0
889,0,3,1,2,23.4500,1,1.0,4,0,3.0,...,0,0,0,0,0,0,0,0,1,0
890,1,1,0,0,30.0000,0,1.0,1,1,1.0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from catboost import CatBoostClassifier
import statistics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from xgboost import XGBClassifier

In [22]:
feature_cols = df_train_for_model.columns[1:]
y = df_train_for_model['Survived']
X = df_train_for_model[feature_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=9) # 70% training and 30% test

In [23]:
clf_dt = DecisionTreeClassifier(max_depth=4,  min_samples_leaf=20)
# clf_dt = clf_dt.fit(X_train, y_train)

clf_rf = RandomForestClassifier(max_depth=4)
# clf_rf = clf_rf.fit(X_train, y_train)

clf_xgb = XGBClassifier()
# clf_xgb = clf_xgb.fit(X_train, y_train)

cat_features = [0, 1]
clf_cb = CatBoostClassifier(iterations=100, learning_rate=1, depth=4)
# clf_cb.fit(X_train, y_train, cat_features)

eclf2 = VotingClassifier(estimators=[('dt', clf_dt), ('rf', clf_rf), ('xgb', clf_xgb), ('catb', clf_cb)], voting='soft')
eclf2 = eclf2.fit(X_train, y_train)

0:	learn: 0.4465672	total: 1.02ms	remaining: 101ms
1:	learn: 0.4047935	total: 2.02ms	remaining: 99ms
2:	learn: 0.3930578	total: 2.91ms	remaining: 94.1ms
3:	learn: 0.3798664	total: 3.67ms	remaining: 88.1ms
4:	learn: 0.3670385	total: 4.42ms	remaining: 83.9ms
5:	learn: 0.3390712	total: 5.22ms	remaining: 81.7ms
6:	learn: 0.3296203	total: 5.99ms	remaining: 79.6ms
7:	learn: 0.3180409	total: 6.86ms	remaining: 78.9ms
8:	learn: 0.3104681	total: 7.75ms	remaining: 78.4ms
9:	learn: 0.3047009	total: 8.51ms	remaining: 76.5ms
10:	learn: 0.2928493	total: 11ms	remaining: 89.1ms
11:	learn: 0.2834047	total: 11.9ms	remaining: 87ms
12:	learn: 0.2751210	total: 12.6ms	remaining: 84.5ms
13:	learn: 0.2680625	total: 13.5ms	remaining: 82.7ms
14:	learn: 0.2625920	total: 14.3ms	remaining: 81ms
15:	learn: 0.2547784	total: 15.2ms	remaining: 79.9ms
16:	learn: 0.2463692	total: 16.2ms	remaining: 78.9ms
17:	learn: 0.2417819	total: 17ms	remaining: 77.2ms
18:	learn: 0.2352983	total: 17.7ms	remaining: 75.7ms
19:	learn: 0.2



In [24]:
clf_xgb = clf_xgb.fit(X_train, y_train)
clf_rf = RandomForestClassifier(max_depth=4)
clf_rf = clf_rf.fit(X_train, y_train)



In [21]:
df_test_for_model = transform_data_for_model(df_test)
df_test_for_model = add_kaggle_cols(df_test_for_model)
df_test_for_model = df_test_for_model[['Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked', 'name_part', 'cabin_letter', 'sex_binary', 'age_no_nan_groups', 'FamilySize', 'IsAlone','Age*Class']]
df_test_for_model = pd.get_dummies(df_test_for_model, prefix=['Embarked','name_part','cabin_letter'], columns=['Embarked','name_part','cabin_letter'])
# Жуткий костыль!
df_test_for_model['cabin_letter_T'] = 0
df_test_for_model['Fare'] = df_test_for_model['Fare'].fillna(df_test_for_model['Fare'].median())
df_test_for_model['Survived'] = clf_rf.predict(df_test_for_model[feature_cols])

In [17]:
df_test_for_model[['Survived']].to_csv('result_after_kaggle.csv')