In [3]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [4]:
# Load dataset
df_titanic = pd.read_csv(r'C:\Users\marku\Desktop\ML\MLGit\datasets\titanic.csv')

In [5]:
# Sets null values equal to the median of that column
def handle_null_median(df):
    # Need to set inplace=True, so it doesn't create a copy of the dataframe. Tried without and this led to null-values not being removed
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    df['Age'].fillna(df['Age'].median(), inplace=True)

    df['Embarked'].fillna('S', inplace=True)

    return  df

df_titanic = handle_null_median(df_titanic)
df_titanic['Sex'] = df_titanic['Sex'].replace(['female', 'male'], [0,1])
df_titanic['FamilyMembersCount'] = df_titanic['Parch'] + df_titanic['SibSp'] + 1
df_titanic = df_titanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'], axis=1)

In [6]:
df_XGB = df_titanic.copy()
df_RF = df_titanic.copy()
df_GNB = df_titanic.copy()

In [7]:
def bins(df): # This is kind of feature engineering as well
    df['AgeGroup'] = 0
    df.loc[df['Age'] < 16.336, 'AgeGroup' ] = 0
    df.loc[(df['Age'] >= 16.336) & (df['Age'] < 32.252), 'AgeGroup'] = 1
    df.loc[(df['Age'] >= 32.252) & (df['Age'] < 48.168), 'AgeGroup'] = 2
    df.loc[(df['Age'] >= 48.168) & (df['Age'] < 64.084), 'AgeGroup'] = 3
    df.loc[df['Age'] >= 64.084, 'AgeGroup'] = 4

    # Could also create bins for fare, but not sure Fare is needed.
    return df

df_RF = bins(df_titanic)

In [8]:
df_titanic['Pclass'] =  df_titanic['Pclass'].astype('category')
df_titanic['Embarked'] =  df_titanic['Embarked'].astype('category')
df_titanic['AgeGroup'] =  df_titanic['AgeGroup'].astype('category')
df_titanic['Sex'] =  df_titanic['Sex'].astype('category')
df_titanic = df_titanic.drop('Age', axis=1)

# Random Forest

In [9]:
df_RF = pd.get_dummies(df_titanic)
df_RF = df_RF.drop(['Fare'], axis=1)
df_RF.head()

Unnamed: 0,Survived,FamilyMembersCount,Pclass_1,Pclass_2,Pclass_3,Sex_0,Sex_1,Embarked_C,Embarked_Q,Embarked_S,AgeGroup_0,AgeGroup_1,AgeGroup_2,AgeGroup_3,AgeGroup_4
0,0,2,0,0,1,0,1,0,0,1,0,1,0,0,0
1,1,2,1,0,0,1,0,1,0,0,0,0,1,0,0
2,1,1,0,0,1,1,0,0,0,1,0,1,0,0,0
3,1,2,1,0,0,1,0,0,0,1,0,0,1,0,0
4,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0


In [10]:
train_X, test_X, train_y, test_y = train_test_split(df_RF.drop('Survived', axis=1), df_RF['Survived'], random_state=42)

In [11]:
RFParam1 = {
    "max_depth":[4, 5, 6]
}

RF_Grid1 = GridSearchCV(RandomForestClassifier(verbose=0,
                                               n_estimators=100,
                                               min_samples_leaf=6,
                                               max_samples=.2,
                                               max_features=3,
                                               min_samples_split=12), RFParam1)
RF_Grid1.fit(train_X, train_y)
print(RF_Grid1.best_params_, RF_Grid1.best_score_)

{'max_depth': 5} 0.7934238581528448


In [12]:
RFParam2 = {
    "n_estimators":[50, 100, 200, 300]
}

RF_Grid2 = GridSearchCV(RandomForestClassifier(verbose=0,
                                               max_depth=5,
                                               min_samples_leaf=6,
                                               max_samples=.2,
                                               max_features=3,
                                               min_samples_split=12), RFParam2)
RF_Grid2.fit(train_X, train_y)
print(RF_Grid2.best_params_, RF_Grid2.best_score_)

{'n_estimators': 50} 0.7934238581528448


In [18]:
RFParam3 = {
    "min_samples_leaf": range(0, 10, 2)
}

RF_Grid3 = GridSearchCV(RandomForestClassifier(verbose=0,
                                               max_depth=5,
                                               n_estimators=50,
                                               max_samples=.2,
                                               max_features=3,
                                               min_samples_split=12), RFParam3)
RF_Grid3.fit(train_X, train_y)
print(RF_Grid3.best_params_, RF_Grid3.best_score_)

{'min_samples_leaf': 2} 0.814330602625968


5 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\marku\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\marku\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\Users\marku\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\marku\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\marku\anaconda

In [21]:
RFParam4 = {
    "max_samples": [.2,.3,.4,.5]
}

RF_Grid4 = GridSearchCV(RandomForestClassifier(verbose=0,
                                               max_depth=5,
                                               n_estimators=50,
                                               min_samples_leaf=6,
                                               max_features=3,
                                               min_samples_split=12), RFParam4)
RF_Grid4.fit(train_X, train_y)
print(RF_Grid4.best_params_, RF_Grid4.best_score_)

{'max_samples': 0.4} 0.8053529345752442


In [22]:
RFParam5 = {
    'max_features': [2,3,4,5]
}

RF_Grid5 = GridSearchCV(RandomForestClassifier(verbose=0,
                                               max_depth=5,
                                               n_estimators=50,
                                               min_samples_leaf=6,
                                               max_samples=.4,
                                               min_samples_split=12), RFParam5)
RF_Grid5.fit(train_X, train_y)
print(RF_Grid5.best_params_, RF_Grid5.best_score_)

{'max_features': 4} 0.7993827853215127


In [26]:
RFParam6 = {
    'min_samples_split': [6, 8, 12, 16]
}

RF_Grid6 = GridSearchCV(RandomForestClassifier(verbose=0,
                                               max_depth=5,
                                               n_estimators=50,
                                               min_samples_leaf=6,
                                               max_samples=.4), RFParam6)
RF_Grid6.fit(train_X, train_y)
print(RF_Grid6.best_params_, RF_Grid6.best_score_)

{'min_samples_split': 12} 0.8053753787453708


In [30]:
RFModel = RandomForestClassifier(verbose=0,
                                 max_depth=5,
                                 n_estimators=50,
                                 min_samples_leaf=6,
                                 max_samples=.4)
RF_scores = cross_val_score(RFModel, train_X, train_y)
(RF_scores.mean(), RF_scores.std())

(0.8053641566603075, 0.035885543474865195)

In [51]:
RFModel.fit(train_X, train_y)
pred = RFModel.predict(test_X)
accuracy_score(test_y, pred)

0.820627802690583