In [111]:
import pandas as pd

In [112]:
df_train = pd.read_csv("train.csv")

In [113]:
df_train.shape

(891, 12)

In [114]:
df_test = pd.read_csv('test.csv')

In [115]:
df_test.shape

(418, 11)

In [116]:
df = pd.concat([df_train, df_test])

In [117]:
df.shape

(1309, 12)

In [118]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Getting rid of all the columns that are not going to support anything to the model.

In [119]:
df.drop(['Ticket', 'PassengerId', 'Cabin', 'Name'], axis=1, inplace=True)

In [120]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   int64  
 2   Sex       1309 non-null   object 
 3   Age       1046 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Fare      1308 non-null   float64
 7   Embarked  1307 non-null   object 
dtypes: float64(3), int64(3), object(2)
memory usage: 92.0+ KB


### Replace all the NaN values of the column "Age" with the current mean of that column

In [121]:
df['Age'].fillna((df['Age'].mean()), inplace=True)
df['Age'] = df['Age'].apply(lambda x: round(x, 1))

#### You can see that we still are missing a few values in the Survived column. 
#### We do no have another choice
#### that to eliminate all the rows that doesnt have any values for this column

In [122]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   int64  
 2   Sex       1309 non-null   object 
 3   Age       1309 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Fare      1308 non-null   float64
 7   Embarked  1307 non-null   object 
dtypes: float64(3), int64(3), object(2)
memory usage: 92.0+ KB


In [123]:
df['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
413    29.9
414    39.0
415    38.5
416    29.9
417    29.9
Name: Age, Length: 1309, dtype: float64

In [124]:
df.dropna(inplace=True)

In [125]:
df.shape

(889, 8)

## Now that we cleaned a little more the dataset. We can start to build the model. But remember that we still have some categorical attributes that we will need to take into account when we want to feed our algorithm.

In [126]:
from sklearn.model_selection import train_test_split

In [127]:
X = df.drop(['Survived'], axis=1)

In [128]:
y = df['Survived'].values

In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### This is an important step. We are going to separate each the categorical and numerical columns
#### So that then we can pass it though a pipeline and make the correct transformation

In [130]:
X_train_cat = X_train[['Sex', 'Embarked']]
X_train_num = X_train[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

X_test_cat = X_test[['Sex', 'Embarked']]
X_test_num = X_test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

In [131]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [104]:
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
    ])

#### We are going to only fit the training numerical data and not for the test numerical values. 

In [105]:
num_pipeline.fit(X_train_num)

Pipeline(steps=[('std_scaler', StandardScaler())])

#### Then we can transform the training and testing sets

In [106]:
X_train_num_tr = num_pipeline.transform(X_train_num)

In [107]:
X_test_num_tr = num_pipeline.transform(X_test_num)

In [108]:
cat_pipeline = Pipeline([
    ('one_hot', OneHotEncoder()),
    ])

In [109]:
X_train_cat_1h = cat_pipeline.fit_transform(X_train_cat)
X_test_cat_1h = cat_pipeline.fit_transform(X_test_cat)

In [134]:
df_cat_train = pd.DataFrame(X_train_cat_1h.toarray())
df_cat_test = pd.DataFrame(X_test_cat_1h.toarray())

In [135]:
df_cat_train

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...
706,0.0,1.0,0.0,0.0,1.0
707,0.0,1.0,0.0,0.0,1.0
708,1.0,0.0,0.0,0.0,1.0
709,1.0,0.0,0.0,0.0,1.0


In [143]:
df_num_train = pd.DataFrame(X_train_num_tr)
df_num_test = pd.DataFrame(X_test_num_tr)

In [144]:
df_num_train

Unnamed: 0,0,1,2,3,4
0,-1.584104,-0.602830,-0.474516,-0.475644,2.430597
1,0.812275,-0.000354,0.381780,-0.475644,-0.358135
2,0.812275,0.159798,-0.474516,-0.475644,-0.490949
3,0.812275,-0.000354,6.375852,2.010994,0.762595
4,0.812275,-1.212933,3.806964,2.010994,0.301860
...,...,...,...,...,...
706,0.812275,-0.000354,-0.474516,-0.475644,-0.494000
707,0.812275,-0.374042,-0.474516,-0.475644,-0.652155
708,-1.584104,1.380003,-0.474516,-0.475644,-0.124717
709,0.812275,-0.679093,1.238076,2.010994,0.047083


In [145]:
df_full_tranform_X_train = pd.concat([df_num_train, df_cat_train], axis=1)

In [146]:
df_full_tranform_X_train.shape

(711, 10)

In [147]:
df_full_tranform_X_test = pd.concat([df_num_test, df_cat_test], axis=1)

In [149]:
df_full_tranform_X_test.shape

(178, 10)

#### We finally got a cleaned and transformed dataset. As you can see the column names disapear.
#### This can be fixed manually if you wanted

In [148]:
df_full_tranform_X_train.head()

Unnamed: 0,0,1,2,3,4,0.1,1.1,2.1,3.1,4.1
0,-1.584104,-0.60283,-0.474516,-0.475644,2.430597,1.0,0.0,0.0,0.0,1.0
1,0.812275,-0.000354,0.38178,-0.475644,-0.358135,1.0,0.0,1.0,0.0,0.0
2,0.812275,0.159798,-0.474516,-0.475644,-0.490949,0.0,1.0,0.0,0.0,1.0
3,0.812275,-0.000354,6.375852,2.010994,0.762595,1.0,0.0,0.0,0.0,1.0
4,0.812275,-1.212933,3.806964,2.010994,0.30186,0.0,1.0,0.0,0.0,1.0


### Now we are ready to create our Model

In [160]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [151]:
rand_forest_clf = RandomForestClassifier()

In [159]:
y_pred = cross_val_predict(rand_forest_clf, df_full_tranform_X_train, y_train, cv=10)

### Wow a really good start...? Well we will need to see a little bit more

In [156]:
accuracy_score(y_train, y_pred)

0.9859353023909986

In [162]:
precision_score(y_train, y_pred)

0.796

In [163]:
recall_score(y_train, y_pred)

0.7343173431734318

#### Now we can see clearly the true values of these model. We will try and play with the hyperparameters of this model

In [164]:
confusion_matrix(y_train, y_pred)

array([[389,  51],
       [ 72, 199]])

In [166]:
cross_val_score(rand_forest_clf, df_full_tranform_X_train, y_train, cv=10, scoring="accuracy")

array([0.80555556, 0.78873239, 0.78873239, 0.88732394, 0.81690141,
       0.87323944, 0.84507042, 0.85915493, 0.78873239, 0.83098592])

## Let us Tune a bit the model with RandomSearch

In [175]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

In [176]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [177]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(
    estimator = rf, 
    param_distributions = random_grid, 
    n_iter = 100, 
    cv = 3, 
    verbose=2, 
    random_state=42, 
    n_jobs = -1)

rf_random.fit(df_full_tranform_X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   36.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  5.7min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [178]:
rf_random.best_params_

{'n_estimators': 1200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': False}

In [179]:
y_pred = rf_random.predict(df_full_tranform_X_train)

In [181]:
accuracy_score(y_train, y_pred)

0.9254571026722925

In [182]:
precision_score(y_train, y_pred)

0.957983193277311

In [183]:
recall_score(y_train, y_pred)

0.8413284132841329

In [184]:
confusion_matrix(y_train, y_pred)

array([[430,  10],
       [ 43, 228]])

##### Now that we got a better result we can still try to get better, now that we know some 
##### of the best Hyperparameters for RandomForestClassifier we can try to dive deeper 
##### using GridSearchCV

In [187]:
from sklearn.model_selection import GridSearchCV

In [188]:
param_grid = {
    'n_estimators': [1100, 1250, 1200, 1250, 1300],
    'min_samples_split': [3, 4, 5, 6, 7],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': ['sqrt'],
    'max_depth': [6, 8, 10, 12, 14, 16, 18],
    'bootstrap': [False]}

In [189]:
rf_grid_sch = RandomForestClassifier()

In [190]:
grid_search = GridSearchCV(
    estimator = rf_grid_sch, 
    param_grid = param_grid, 
    cv = 3, 
    n_jobs = -1, 
    verbose = 2)

In [191]:
grid_search.fit(df_full_tranform_X_train, y_train)

Fitting 3 folds for each of 875 candidates, totalling 2625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 24.2min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 32.0min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 43.5min
[Parallel(n_jobs=-1)]: Done 2625 out of 2625 | elapsed: 44.3min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [False],
                         'max_depth': [6, 8, 10, 12, 14, 16, 18],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [3, 4, 5, 6, 7],
                         'n_estimators': [1100, 1250, 1200, 1250, 1300]},
             verbose=2)

In [196]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 12,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 1250}

In [192]:
y_pred = grid_search.predict(df_full_tranform_X_train)

In [193]:
accuracy_score(y_train, y_pred)

0.929676511954993

In [194]:
precision_score(y_train, y_pred)

0.9585062240663901

In [195]:
recall_score(y_train, y_pred)

0.8523985239852399

In [197]:
confusion_matrix(y_train, y_pred)

array([[430,  10],
       [ 40, 231]])

### Now we are ready to test our predictions with the test set

In [198]:
y_pred_test = grid_search.predict(df_full_tranform_X_test)

In [199]:
accuracy_score(y_test, y_pred_test)

0.7865168539325843

In [200]:
precision_score(y_test, y_pred_test)

0.7313432835820896

In [201]:
recall_score(y_test, y_pred_test)

0.7101449275362319

In [202]:
confusion_matrix(y_test, y_pred_test)

array([[91, 18],
       [20, 49]])