In [None]:
import pandas as pd

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
train.head(1)

In [None]:
test.head(1)

In [None]:
all = pd.concat([train, test], sort = False)
all.info()

In [None]:
#Fill Missing numbers with median
all['Age'] = all['Age'].fillna(value=all['Age'].median())
all['Fare'] = all['Fare'].fillna(value=all['Fare'].median())

In [None]:
all['Embarked'] = all['Embarked'].fillna('S')
all.info()

In [None]:
#Drop unwanted variables
all_1 = all.drop(['Name', 'Ticket', 'Cabin'], axis = 1)
all_1.head(1)

In [None]:
all_dummies = pd.get_dummies(all_1, drop_first = True)
all_dummies.head(1)

In [17]:
all_train = all_dummies[all_dummies['Survived'].notna()]
all_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null float64
Pclass         891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Sex_male       891 non-null uint8
Embarked_Q     891 non-null uint8
Embarked_S     891 non-null uint8
dtypes: float64(3), int64(4), uint8(3)
memory usage: 58.3 KB


In [18]:
all_test = all_dummies[all_dummies['Survived'].isna()]
all_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 10 columns):
PassengerId    418 non-null int64
Survived       0 non-null float64
Pclass         418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Sex_male       418 non-null uint8
Embarked_Q     418 non-null uint8
Embarked_S     418 non-null uint8
dtypes: float64(3), int64(4), uint8(3)
memory usage: 27.3 KB


In [19]:
X = all_train.drop(['PassengerId','Survived'],axis=1)

In [20]:
y = all_train['Survived']

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test 
= train_test_split(X, y, test_size=0.30, random_state=101)

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
rfModel = RandomForestClassifier(n_estimators = 100)

In [24]:
rfModel.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
rfModel.score(X_train, y_train)

0.9807383627608347

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
#Using max_depth, criterion will suffice for DT Models, rest all will remain constant 
parameters = {'criterion' : ('gini', 'entropy')
              , 'max_depth' : (3,4,5,6,7,8,9,10,11,12,13,14,15,20)
              , 'max_features' : ('auto', 'sqrt', 'log2')
              , 'n_estimators' : (10,100)
              #, 'min_samples_leaf' : (1,2,3,4,5,6)
              #, 'min_impurity_decrease' : (0.0,0.1,0.2,0.3)
              #, 'min_weight_fraction_leaf' : (0.0,0.1,0.2,0.3)
             }

In [31]:
rf_grid  = GridSearchCV(RandomForestClassifier(), param_grid = parameters, cv = 3)

In [32]:
rf_grid_model = rf_grid.fit(X_train, y_train)

In [33]:
rf_grid_model.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [34]:
#Build model with best estimates 
rfModel = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [35]:
rfModel.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [36]:
rfModel.score(X_train, y_train)

0.8571428571428571

In [37]:
best_feat = pd.DataFrame({'Features': X_train.columns,'Importance': rfModel.feature_importances_})
best_feat.sort_values('Importance', ascending = False)

Unnamed: 0,Features,Importance
5,Sex_male,0.445215
4,Fare,0.195512
0,Pclass,0.118974
1,Age,0.110036
2,SibSp,0.055628
3,Parch,0.035435
7,Embarked_S,0.026654
6,Embarked_Q,0.012546


In [38]:
y_pred = rfModel.predict(X_test)

In [39]:
print(f'Test : {rfModel.score(X_test, y_test):.3f}')
print(f'Train : {rfModel.score(X_train, y_train):.3f}')

Test : 0.813
Train : 0.857


In [42]:
sub_test = all_test.drop(['PassengerId', 'Survived'], axis = 1)

In [43]:
sub_test_pred = rfModel.predict(sub_test).astype(int)

In [44]:
AllSub = pd.DataFrame({ 'PassengerId': test['PassengerId'],
                       'Survived' : sub_test_pred
    
})

AllSub.to_csv("All_Var_Video_rf_all_var.csv", index = False)