# Loading Dataset And Dependencies

In [660]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [661]:
# Read both Kaggle splits, then stack them (train rows first) under a fresh
# 0..n-1 index so feature engineering can run once over the combined frame.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)
df_data = pd.concat((df_train, df_test), ignore_index=True)

In [662]:
# Binary-encode sex: female -> 1, male -> 0.
# The int cast raises if any 'Sex' value falls outside the mapping (it would map to NaN).
sex_codes = {'female' : 1, 'male' : 0}
df_data['Sex_Code'] = df_data['Sex'].map(sex_codes).astype('int')

# Recover the two splits from the combined frame by position:
# the first len(df_train) rows are the training rows, the rest are the test rows.
n_train = len(df_train)
df_train = df_data.iloc[:n_train]
df_test = df_data.iloc[n_train:]

# Model inputs (everything except the target and the row ID) and target labels.
X = df_train.drop(columns=['Survived', 'PassengerId'])
Y = df_train['Survived']

In [663]:
# Baseline: a random forest trained on sex and passenger class only.
# oob_score=True makes the fitted model expose an out-of-bag accuracy estimate.
Base = ['Sex_Code','Pclass']
Base_Model = RandomForestClassifier(
    n_estimators=250,
    min_samples_split=20,
    oob_score=True,
    random_state=2,
)
Base_Model.fit(X[Base], Y)

# The LB_Public figure is a hard-coded literal, not something computed here.
base_oob_text = 'Base oob score :%.5f' % Base_Model.oob_score_
print(base_oob_text, '   LB_Public : 0.76555')

Base oob score :0.73176    LB_Public : 0.76555


In [664]:
# Impute missing fares with the median fare of the combined (train + test) frame.
fare_median = df_data['Fare'].median()
df_data['Fare'] = df_data['Fare'].fillna(fare_median)

# Split fares into 5 quantile-based bins (interval-valued column) ...
df_data['FareBin_5'] = pd.qcut(df_data['Fare'], q=5)

# ... and turn each bin into an integer code for the tree models.
label = LabelEncoder()
fare_bin_codes = label.fit_transform(df_data['FareBin_5'])
df_data['FareBin_Code_5'] = fare_bin_codes

In [665]:
# Family size = siblings/spouses + parents/children + 1
# (the + 1 presumably counts the passenger themself).
df_data['Family_size'] = df_data['SibSp'] + df_data['Parch'] + 1

# How do the fare values of duplicate tickets look?
# Collect, for inspection, the rows of every ticket that carries more than one
# non-null fare. A single groupby pass replaces the previous per-ticket scan of
# the whole frame; sort=False keeps tickets in order of first appearance and
# rows in their original order within each ticket, matching the old output.
ticket_cols = ['Name','Ticket','Fare','Cabin','Family_size','Survived']
deplicate_ticket = [
    ticket_grp[ticket_cols]
    for _, ticket_grp in df_data.groupby('Ticket', sort=False)
    if ticket_grp['Fare'].count() > 1
]
# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when no ticket is shared.
if deplicate_ticket:
    deplicate_ticket = pd.concat(deplicate_ticket)
else:
    deplicate_ticket = df_data.iloc[:0][ticket_cols]

In [666]:
# Connected_Survival encodes what is known about the *other* passengers who
# share the same ticket:
#   1   -> at least one of them has Survived == 1
#   0   -> none has Survived == 1, but at least one has Survived == 0
#   0.5 -> default: ticket not shared, or no known outcome among the others
# Rows with a missing 'Survived' are ignored by max()/min(), so they never
# trigger either branch on their own.
df_data['Connected_Survival'] = 0.5 # default
for _, df_grp in df_data.groupby('Ticket'):
    if len(df_grp) < 2:
        continue
    for ind in df_grp.index:
        # Outcomes of everyone on this ticket except the current passenger;
        # computed once per row instead of dropping the row twice.
        others = df_grp['Survived'].drop(ind)
        # Write back through the row's own index label rather than scanning the
        # whole frame for a matching PassengerId. This relies on the index
        # labels being unique, which holds for the frame built with
        # ignore_index=True. NOTE(review): equivalent to the PassengerId lookup
        # only if PassengerId is unique per row — confirm for other datasets.
        if others.max() == 1.0:
            df_data.loc[ind, 'Connected_Survival'] = 1
        elif others.min() == 0.0:
            df_data.loc[ind, 'Connected_Survival'] = 0

In [667]:
# Extract the title from the name: the run of letters that follows a space and
# ends at a period. A raw string is used so that `\.` reaches the regex engine
# as written instead of being an invalid Python string escape (which warns).
df_data['Title'] = df_data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# Fold uncommon titles into broader groups before encoding.
df_data['Title'] = df_data['Title'].replace(['Capt', 'Col', 'Countess', 'Don',
                                               'Dr', 'Dona', 'Jonkheer', 
                                                'Major','Rev','Sir'],'Rare') 
df_data['Title'] = df_data['Title'].replace(['Mlle', 'Ms','Mme'],'Miss')
df_data['Title'] = df_data['Title'].replace(['Lady'],'Mrs')
# Integer-encode the grouped titles; any title not listed here becomes NaN.
df_data['Title'] = df_data['Title'].map({"Mr":0, "Rare" : 1, "Master" : 2,"Miss" : 3, "Mrs" : 4 })

# Median known age per title code.
title_median_age = df_data.groupby('Title')['Age'].median()
Ti_pred = title_median_age.values
# Filling the missing age with the median of the passenger's own title.
# The lookup goes through the title code itself rather than the position in
# Ti_pred, so a title code with no rows cannot shift the medians onto the
# wrong titles.
df_data['Ti_Age'] = df_data['Age'].fillna(df_data['Title'].map(title_median_age))
# Raises if any age is still missing (unmapped title, or a title with no known ages).
df_data['Ti_Age'] = df_data['Ti_Age'].astype('int')

# Extract minors from the recorded ages only: missing ages are filled with the
# sentinel -1 so that they fail the `>= 0` test and are never flagged.
df_data['Age_copy'] = df_data['Age'].fillna(-1)
df_data['Minor'] = (df_data['Age_copy'] < 14.0) & (df_data['Age_copy']>= 0)
df_data['Minor'] = df_data['Minor'] * 1
# Same flag, but based on the title-imputed age. Original author's note: filling
# the missing ages this way captures 8 more 'Master' passengers in Pclass = 3.
df_data['Ti_Minor'] = ((df_data['Ti_Age']) < 14.0) * 1
print('The # of masters we found in missing Age by Title : ', (df_data['Ti_Minor'] - df_data['Minor']).sum())

The # of masters we found in missing Age by Title :  8


In [668]:
from sklearn.ensemble import RandomForestRegressor

def random_forest_impute(df):
    """Fill missing 'New_Age' values with random-forest predictions.

    A RandomForestRegressor is fitted on the rows whose 'New_Age' is known,
    using 'Pclass' and 'Title' as features, and its predictions are written
    into the rows whose 'New_Age' is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Pclass', 'Title' and 'New_Age'.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'New_Age' filled.
        When no value is missing the frame is returned untouched and no
        model is fitted.
    """
    missing_age = df['New_Age'].isnull()

    # Nothing to impute: return early, because asking scikit-learn to predict
    # on zero rows raises a ValueError.
    if not missing_age.any():
        return df

    features = ['Pclass', 'Title']
    known_age = df[~missing_age]

    rfr = RandomForestRegressor(random_state=0, n_estimators=100)
    rfr.fit(known_age[features], known_age['New_Age'])

    df.loc[missing_age, 'New_Age'] = rfr.predict(df.loc[missing_age, features])

    return df

# Start from the recorded ages, then let the chosen imputer fill the gaps.
df_data['New_Age'] = df_data['Age']
impute_function = random_forest_impute
df_data = impute_function(df_data)

# Bucket the imputed ages. With pd.cut's defaults the intervals are
# right-closed: (0, 14], (14, 60], (60, 100]; ages outside them become NaN.
age_bins = [0, 14, 60, 100]
age_labels = ['0', '1', '2']  # ['Child', 'Adult', 'Senior']
df_data['Age_Group'] = pd.cut(df_data['New_Age'], bins=age_bins, labels=age_labels)

In [669]:
# Split again because new features were just engineered on the combined frame.
n_train = len(df_train)
df_train = df_data.iloc[:n_train]
df_test = df_data.iloc[n_train:]
# Model inputs (everything except the target and the row ID) and target labels.
X = df_train.drop(columns=['Survived', 'PassengerId'])
Y = df_train['Survived']
# Bare expression: it displays nothing because it is not the cell's last statement.
X.columns

# Baseline features plus fare bin, ticket-mate survival and the title-based minor flag.
minor = ['Sex_Code','Pclass','FareBin_Code_5','Connected_Survival','Ti_Minor']
minor_Model = RandomForestClassifier(
    n_estimators=250,
    min_samples_split=20,
    oob_score=True,
    random_state=2,
)
minor_Model.fit(X[minor], Y)

# The LB_Public figure is a hard-coded literal, not something computed here.
minor_oob_text = 'Minor oob score :%.5f' % minor_Model.oob_score_
print(minor_oob_text, '   LB_Public : 0.82296')

Minor oob score :0.84400    LB_Public : 0.82296


In [670]:
# Split again because new features were just engineered on the combined frame.
n_train = len(df_train)
df_train = df_data.iloc[:n_train]
df_test = df_data.iloc[n_train:]
# Model inputs (everything except the target and the row ID) and target labels.
X = df_train.drop(columns=['Survived', 'PassengerId'])
Y = df_train['Survived']
# Bare expression: it displays nothing because it is not the cell's last statement.
X.columns

# Baseline features plus fare bin, ticket-mate survival and the imputed age group.
# This rebinds the names `minor` and `minor_Model`.
minor = ['Sex_Code','Pclass','FareBin_Code_5','Connected_Survival','Age_Group']
minor_Model = RandomForestClassifier(
    n_estimators=250,
    min_samples_split=20,
    oob_score=True,
    random_state=2,
)
minor_Model.fit(X[minor], Y)

# NOTE(review): this model uses 'Age_Group', yet the message still says "Minor"
# and prints a hard-coded LB_Public literal — confirm that the label and the
# leaderboard figure really belong to this feature set.
minor_oob_text = 'Minor oob score :%.5f' % minor_Model.oob_score_
print(minor_oob_text, '   LB_Public : 0.82296')

Minor oob score :0.84512    LB_Public : 0.82296


In [671]:
# Submission export — currently disabled: every statement below is commented out.
# If re-enabled, it would predict on the test rows with `minor_Model` using the
# `minor` feature list and write PassengerId/Survived pairs to "submit_minor.csv".
# NOTE(review): `minor` and `minor_Model` are whatever the most recently executed
# model cell bound them to — confirm that is the model intended for this file name.
# X_Submit = df_test.drop(labels=['PassengerId'],axis=1)

# minor_pred = minor_Model.predict(X_Submit[minor])

# submit = pd.DataFrame({"PassengerId": df_test['PassengerId'],
#                       "Survived":minor_pred.astype(int)})
# submit.to_csv("submit_minor.csv",index=False)