In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
numeric_cols = list(df_train.select_dtypes(exclude='object').columns)
numeric_cols.remove('PassengerId')
numeric_cols.remove('Survived')
numeric_cols

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [5]:
cat_cols = list(df_train.select_dtypes(include='object').columns)
cat_cols.remove('Name')
cat_cols.remove('Ticket')
cat_cols.remove('Cabin')
cat_cols

['Sex', 'Embarked']

# Fill missing data

In [6]:
from sklearn.impute import SimpleImputer
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

imputer_num.fit(df_train[numeric_cols])
imputer_cat.fit(df_train[cat_cols])

df_train[numeric_cols] = imputer_num.transform(df_train[numeric_cols])
df_train[cat_cols] = imputer_cat.transform(df_train[cat_cols])

df_test[numeric_cols] = imputer_num.transform(df_test[numeric_cols])
df_test[cat_cols] = imputer_cat.transform(df_test[cat_cols])

In [7]:
df_train[numeric_cols].isna().sum(), df_train[cat_cols].isna().sum()

(Pclass    0
 Age       0
 SibSp     0
 Parch     0
 Fare      0
 dtype: int64,
 Sex         0
 Embarked    0
 dtype: int64)

In [8]:
df_test[numeric_cols].isna().sum(), df_test[cat_cols].isna().sum()

(Pclass    0
 Age       0
 SibSp     0
 Parch     0
 Fare      0
 dtype: int64,
 Sex         0
 Embarked    0
 dtype: int64)

In [9]:
df_train['Title'] = df_train['Name'].apply(lambda name:name.split('.')[0].split(' ')[1])
df_test['Title'] = df_test['Name'].apply(lambda name:name.split('.')[0].split(' ')[1])

In [10]:
titles = list(df_train['Title'].value_counts().iloc[:4].index)
titles

['Mr', 'Miss', 'Mrs', 'Master']

In [11]:
df_train['Title'] = df_train['Title'].apply(lambda X:X if X in titles else 'other')
df_test['Title'] = df_test['Title'].apply(lambda X:X if X in titles else 'other')

In [12]:
df_train['Title']

0         Mr
1        Mrs
2       Miss
3        Mrs
4         Mr
       ...  
886    other
887     Miss
888     Miss
889       Mr
890       Mr
Name: Title, Length: 891, dtype: object

# Feature Encoding

In [13]:
cat_cols.append('Title')
cat_cols

['Sex', 'Embarked', 'Title']

In [14]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
ohe.fit(df_train[cat_cols])

OneHotEncoder(sparse=False)

In [17]:
new_cols = list(ohe.get_feature_names())
new_cols

['x0_female',
 'x0_male',
 'x1_C',
 'x1_Q',
 'x1_S',
 'x2_Master',
 'x2_Miss',
 'x2_Mr',
 'x2_Mrs',
 'x2_other']

In [18]:
df_train[new_cols] = ohe.transform(df_train[cat_cols])

In [19]:
df_test[new_cols] = ohe.transform(df_test[cat_cols])

In [20]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,x0_female,x0_male,x1_C,x1_Q,x1_S,x2_Master,x2_Miss,x2_Mr,x2_Mrs,x2_other
0,1,0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,2,1,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,1,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,4,1,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,5,0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [25]:
df_train['Cabin'].isna()

0       True
1      False
2       True
3      False
4       True
       ...  
886     True
887    False
888     True
889    False
890     True
Name: Cabin, Length: 891, dtype: bool

In [23]:
df_train['Cabin'].isna().apply(lambda x:int(not x))

0      0
1      1
2      0
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Cabin, Length: 891, dtype: int64

# Lets try for is_cabin

In [26]:
df_train['is_cabin'] = df_train['Cabin'].isna().apply(lambda X:int(not X))
df_test['is_cabin'] = df_test['Cabin'].isna().apply(lambda X:int(not X))

In [27]:
numeric_cols.append('is_cabin')

# Model Bulding

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [30]:
numeric_cols, cat_cols, new_cols

(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'is_cabin'],
 ['Sex', 'Embarked', 'Title'],
 ['x0_female',
  'x0_male',
  'x1_C',
  'x1_Q',
  'x1_S',
  'x2_Master',
  'x2_Miss',
  'x2_Mr',
  'x2_Mrs',
  'x2_other'])

In [31]:
params = {'n_estimators':[25,50,75,100,125,150,175,200],
          'max_depth':[1,2,3,4,5],
          'min_samples_leaf':[2,4,6,8]}
random_cv = RandomizedSearchCV(RandomForestClassifier(), params, cv=5)
random_cv.fit(df_train[numeric_cols + new_cols], df_train['Survived'])

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': [1, 2, 3, 4, 5],
                                        'min_samples_leaf': [2, 4, 6, 8],
                                        'n_estimators': [25, 50, 75, 100, 125,
                                                         150, 175, 200]})

In [33]:
random_cv.best_estimator_

RandomForestClassifier(max_depth=5, min_samples_leaf=4)

In [34]:
model = random_cv.best_estimator_

In [35]:
yp = model.predict(df_test[numeric_cols+new_cols])

In [36]:
df_test['Survived'] = yp

In [37]:
df_test[['PassengerId','Survived']].to_csv('sub_rf.csv', index=False)