In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv('train.csv', index_col='PassengerId')
df_test = pd.read_csv('test.csv', index_col='PassengerId')

In [3]:
def take_name_part_from_df(df, col_name='Name', want_to_add_parts=['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Dr.'], missing_name_part='no_name_part'):
    dict_name_parts = {}
    for name in df[col_name]:
        name_parts = name.split(' ')
        for name_part in name_parts:
            if name_part not in dict_name_parts.keys():
                dict_name_parts[name_part] = 1
            else:
                dict_name_parts[name_part] += 1
    
    df_name_parts = pd.DataFrame.from_dict(dict_name_parts, orient='index')
    df_name_parts = df_name_parts.reset_index()
    df_name_parts.columns = ['name_part', 'cnt']
    
    name_part_to_df = []

    for p_name in df[col_name]:
        counter = 0
        for name_part in want_to_add_parts:
            if name_part in p_name:
                name_part_to_df.append(name_part)
                continue
            else:
                counter += 1
                if counter == len(want_to_add_parts):
                    name_part_to_df.append(missing_name_part)
                    
    return name_part_to_df

In [4]:
def adding_data_to_df(df, list_of_cols_to_add, list_of_names_of_cols_to_add):
    for col_index in range(len(list_of_cols_to_add)):
        col = list_of_cols_to_add[col_index]
        col_name = list_of_names_of_cols_to_add[col_index]
        df[col_name] = col
    return df

In [5]:
def transform_data_for_model(df):

    list_of_cols_to_add = []
    list_of_names_of_cols_to_add = []

    name_parts = take_name_part_from_df(df, col_name='Name', want_to_add_parts=['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Dr.'], missing_name_part='no_name_part')
    list_of_cols_to_add.append(name_parts)
    list_of_names_of_cols_to_add.append('name_part')

    cab_num_for_df = []
    for cab_num in df.Cabin:
        if pd.isna(cab_num):
            cab_num_for_df.append('N')
        else:
            cab_num_for_df.append(cab_num.split(' ')[0][0])
    list_of_cols_to_add.append(cab_num_for_df)
    list_of_names_of_cols_to_add.append('cabin_letter')

    list_of_cols_to_add.append(df['Sex'].map({'male':0, 'female':1}))
    list_of_names_of_cols_to_add.append('sex_binary')
    
    df = adding_data_to_df(df, list_of_cols_to_add, list_of_names_of_cols_to_add)
    list_of_cols_to_add = []
    list_of_names_of_cols_to_add = []
    
    df['Embarked'] = df.Embarked.fillna('S')
    mean_age_dict = df[['name_part', 'Age']].groupby('name_part').agg({'Age':'median'}).to_dict(orient='dict')['Age']
    list_of_cols_to_add.append(df.apply(lambda row: mean_age_dict[row['name_part']] if np.isnan(row['Age']) else row['Age'], axis=1))
    list_of_names_of_cols_to_add.append('age_no_nan')

    df_for_return = adding_data_to_df(df, list_of_cols_to_add, list_of_names_of_cols_to_add)

    return df_for_return

df_train_for_model = transform_data_for_model(df_train)
df_train_for_model = df_train_for_model[['Survived', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked', 'name_part', 'cabin_letter', 'sex_binary', 'age_no_nan']]
df_train_for_model = pd.get_dummies(df_train_for_model, prefix=['Embarked','name_part','cabin_letter'], columns=['Embarked','name_part','cabin_letter'])

In [6]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from catboost import CatBoostClassifier
import statistics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from xgboost import XGBClassifier

In [7]:
feature_cols = df_train_for_model.columns[1:]
y = df_train_for_model['Survived']
X = df_train_for_model[feature_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7) # 70% training and 30% test

In [8]:
clf_dt = DecisionTreeClassifier(max_depth=4,  min_samples_leaf=20)
# clf_dt = clf_dt.fit(X_train, y_train)

clf_rf = RandomForestClassifier(max_depth=4)
# clf_rf = clf_rf.fit(X_train, y_train)

clf_xgb = XGBClassifier()
# clf_xgb = clf_xgb.fit(X_train, y_train)

cat_features = [0, 1]
clf_cb = CatBoostClassifier(iterations=100, learning_rate=1, depth=4)
# clf_cb.fit(X_train, y_train, cat_features)

eclf2 = VotingClassifier(estimators=[('dt', clf_dt), ('rf', clf_rf), ('xgb', clf_xgb), ('catb', clf_cb)], voting='soft')
eclf2 = eclf2.fit(X_train, y_train)



0:	learn: 0.4117569	total: 151ms	remaining: 15s
1:	learn: 0.3970321	total: 152ms	remaining: 7.47s
2:	learn: 0.3747669	total: 153ms	remaining: 4.95s
3:	learn: 0.3450040	total: 154ms	remaining: 3.69s
4:	learn: 0.3277946	total: 155ms	remaining: 2.94s
5:	learn: 0.3209012	total: 156ms	remaining: 2.44s
6:	learn: 0.3044565	total: 156ms	remaining: 2.08s
7:	learn: 0.3014353	total: 157ms	remaining: 1.81s
8:	learn: 0.2952029	total: 158ms	remaining: 1.6s
9:	learn: 0.2737604	total: 159ms	remaining: 1.43s
10:	learn: 0.2634041	total: 159ms	remaining: 1.29s
11:	learn: 0.2554961	total: 160ms	remaining: 1.17s
12:	learn: 0.2459044	total: 161ms	remaining: 1.08s
13:	learn: 0.2374581	total: 162ms	remaining: 993ms
14:	learn: 0.2294192	total: 162ms	remaining: 921ms
15:	learn: 0.2239704	total: 163ms	remaining: 857ms
16:	learn: 0.2212638	total: 164ms	remaining: 800ms
17:	learn: 0.2128907	total: 165ms	remaining: 750ms
18:	learn: 0.2023029	total: 165ms	remaining: 705ms
19:	learn: 0.1976940	total: 166ms	remaining:

In [9]:
clf_xgb = clf_xgb.fit(X_train, y_train)
clf_rf = RandomForestClassifier(max_depth=4)
clf_rf = clf_rf.fit(X_train, y_train)



In [15]:
df_test_for_model = transform_data_for_model(df_test)
df_test_for_model = df_test_for_model[['Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked', 'name_part', 'cabin_letter', 'sex_binary', 'age_no_nan']]
df_test_for_model = pd.get_dummies(df_test_for_model, prefix=['Embarked','name_part','cabin_letter'], columns=['Embarked','name_part','cabin_letter'])
# Жуткий костыль!
df_test_for_model['cabin_letter_T'] = 0
df_test_for_model['Fare'] = df_test_for_model['Fare'].fillna(df_test_for_model['Fare'].median())
df_test_for_model['Survived'] = clf_rf.predict(df_test_for_model[feature_cols])

In [11]:
df_test_for_model[['Survived']].to_csv('result_rf.csv')