In [1]:
import pandas as pd

In [2]:
# Read the training data and the test data from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("excel_full_train.csv", "excel_test.csv")
)

In [3]:
# Target vector: the 'Survived' column.
y = train['Survived']
# Feature matrix: every column except the row identifier and the target.
X = train.drop(columns=['PassengerId', 'Survived'])

In [4]:
# Hold out 30% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable across runs.
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=21,
)

In [5]:
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Baseline random forest with 100 trees; every other hyper-parameter keeps its default.
RF_Model = RandomForestClassifier(
    n_estimators=100,
)

In [7]:
# Train the baseline forest on the training split.
RF_Model.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Grid Search

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
# Hyper-parameter grid for the random forest search. Every key listed here is
# varied; parameters that are not listed keep the estimator's defaults.
# Grid size: 6 * 2 * 5 * 2 * 3 = 360 candidate combinations.
# NOTE: 'auto' is intentionally not used for max_features. For classifiers it
# is an alias of 'sqrt' (so it only duplicated candidates), and it was removed
# in scikit-learn 1.3. 'log2' is searched instead as a genuinely different option.
parameters = {
    'n_estimators': (10, 30, 50, 70, 90, 100),
    'criterion': ('gini', 'entropy'),
    'max_depth': (3, 5, 7, 9, 10),
    'max_features': ('sqrt', 'log2'),
    'min_samples_split': (2, 4, 6),
}

In [11]:
# Exhaustive search over `parameters`, evaluating each candidate with 3-fold
# cross-validation. n_jobs=-1 is set on the forest itself; GridSearchCV's own
# n_jobs is not passed here.
RF_grid = GridSearchCV(
    estimator=RandomForestClassifier(oob_score=False, n_jobs=-1),
    param_grid=parameters,
    cv=3,
    verbose=True,
)

In [12]:
# Run the grid search on the training split and keep a handle on the fitted search.
RF_grid_model = RF_grid.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1080 out of 1080 | elapsed:  3.7min finished


In [13]:
# Display the estimator selected by the grid search (its repr lists the chosen hyper-parameters).
RF_grid_model.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=90, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [14]:
# Display the cross-validated score of the selected candidate (the search above uses cv=3).
RF_grid_model.best_score_

0.8346709470304976

In [15]:
# Build the final model with the hyper-parameters reported by the grid search
# (see the best_estimator_ output above).
# Only the tuned arguments and n_jobs are passed. The remaining values shown in
# the estimator repr are library defaults, and one of them (min_impurity_split)
# was removed in scikit-learn 1.0, so passing it explicitly raises a TypeError
# on current versions.
RF_Model = RandomForestClassifier(
    n_estimators=90,
    criterion='gini',
    max_depth=7,
    max_features='sqrt',
    min_samples_split=6,
    n_jobs=-1,
)

In [16]:
# Train the tuned forest on the training split.
RF_Model.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=90, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [17]:
# Score the tuned model on the same rows it was trained on; compare with the
# hold-out score printed in the evaluation section further down.
RF_Model.score(X_train, y_train)

0.8780096308186196

In [18]:
# Pair each training column with the importance the fitted forest assigned to it,
# then show the table ordered from most to least important.
best_feat = pd.DataFrame(
    {
        'Features': X_train.columns,
        'Importance': RF_Model.feature_importances_,
    }
)
best_feat.sort_values(by='Importance', ascending=False)

Unnamed: 0,Features,Importance
4,Sex_1,0.182989
6,Title_Mr,0.174139
1,Fare_1,0.087799
2,Age_1,0.086056
0,Pclass,0.076328
7,Title_Mrs,0.07411
8,Title_Miss,0.073721
3,Family,0.052938
12,Cabin_M,0.037125
10,Embarked_S,0.01897


### Evaluation on the Hold-out Split

In [19]:
# Predicted labels for the hold-out split created by train_test_split.
y_pred = RF_Model.predict(X=X_test)

In [20]:
# Report the model score on the hold-out split and on the training split, to three decimals.
print('Test : {:.3f}'.format(RF_Model.score(X_test, y_test)))
print('Train : {:.3f}'.format(RF_Model.score(X_train, y_train)))

Test : 0.854
Train : 0.878


### Prediction on the Kaggle Test Set

In [21]:
# Feature frame for the submission: the test data without its identifier column.
sub_test = test.drop(columns=['PassengerId'])

In [22]:
# Predict labels for the submission rows and cast them to integers.
sub_test_pred = RF_Model.predict(X=sub_test).astype(int)

In [24]:
# Assemble the submission table: one row per test passenger with its predicted label.
AllSub = test[['PassengerId']].assign(Survived=sub_test_pred)

# Write the submission without the DataFrame index.
AllSub.to_csv("All_Var_Video_Random_Forest.csv", index=False)

In [25]:
# Kaggle leaderboard (LB) score for this submission: 0.79425