In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import acquire
import prepare
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix




In [2]:
train, test, validate = prepare.prep_titanic()
train, test, validate = prepare.impute(train, test, validate, my_strategy = 'mean', column_list = 'age')


In [3]:
train = train.replace({'class':{'First':'1', 'Second': '2', 'Third': '3'}})
validate = validate.replace({'class':{'First':'1', 'Second': '2', 'Third': '3'}})
test = test.replace({'class':{'First':'1', 'Second': '2', 'Third': '3'}})

In [4]:
train
validate
test

Unnamed: 0,survived,age,sibsp,parch,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
519,0,32.000000,0,0,7.8958,3,Southampton,1,0,1,1
330,1,29.794372,2,0,23.2500,3,Queenstown,0,1,0,0
381,1,1.000000,0,2,15.7417,3,Cherbourg,0,0,0,0
234,0,24.000000,0,0,10.5000,2,Southampton,1,0,1,1
736,0,48.000000,1,3,34.3750,3,Southampton,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
738,0,29.794372,0,0,7.8958,3,Southampton,1,0,1,1
272,1,41.000000,0,1,19.5000,2,Southampton,0,0,1,0
122,0,32.500000,1,0,30.0708,2,Cherbourg,0,0,0,1
291,1,19.000000,1,0,91.0792,1,Cherbourg,0,0,0,0


In [5]:
train.survived.value_counts()

0    351
1    217
Name: survived, dtype: int64

## Random Forest Model 1
#### with max_dept = 20 and min_samples_leaf = 1

In [6]:
x_train1 = train[['sex_male', 'class', 'fare', 'age']]

y_train = train.survived


In [7]:
rf1 = RandomForestClassifier(random_state = 123, max_depth = 20, min_samples_leaf = 1)

In [8]:
rf1.fit(x_train1, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [9]:
# Calculating accuracy
a1 = rf1.score(x_train1, y_train)
a1

0.9806338028169014

In [10]:
y_pred1 = rf1.predict(x_train1)

In [11]:
print(classification_report(y_train, y_pred1
                           ))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       351
           1       0.99      0.96      0.97       217

    accuracy                           0.98       568
   macro avg       0.98      0.98      0.98       568
weighted avg       0.98      0.98      0.98       568



In [12]:
confusion_matrix(y_train, y_pred1)

array([[349,   2],
       [  9, 208]])

In [13]:
print(f'the accuracy of the model is: {rf1.score(x_train1, y_train)}')
print()
print("the classification report of the model is")
print(classification_report(y_train, y_pred1))


the accuracy of the model is: 0.9806338028169014

the classification report of the model is
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       351
           1       0.99      0.96      0.97       217

    accuracy                           0.98       568
   macro avg       0.98      0.98      0.98       568
weighted avg       0.98      0.98      0.98       568



## Model 2 
#### with max_depth = 3, min_samples_leaf = 5

In [14]:
x_train2 = train[['sex_male', 'class', 'fare', 'age']]

y_train = train.survived

In [15]:
rf2 = RandomForestClassifier(random_state = 129, max_depth = 3, min_samples_leaf = 5)

In [16]:
rf2.fit(x_train2, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=129,
                       verbose=0, warm_start=False)

In [17]:
# Calculating accuracy
a2 = rf2.score(x_train2, y_train)
a2

0.8098591549295775

In [18]:
y_pred2 = rf2.predict(x_train2)

In [19]:
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.81      0.91      0.85       351
           1       0.81      0.65      0.72       217

    accuracy                           0.81       568
   macro avg       0.81      0.78      0.79       568
weighted avg       0.81      0.81      0.81       568



In [20]:
confusion_matrix(y_train, y_pred2)

array([[318,  33],
       [ 75, 142]])

In [21]:
print(f'the accuracy of the model is: {rf2.score(x_train2, y_train)}')
print()
print("the classification report of the model is")
print(classification_report(y_train, y_pred2))


the accuracy of the model is: 0.8098591549295775

the classification report of the model is
              precision    recall  f1-score   support

           0       0.81      0.91      0.85       351
           1       0.81      0.65      0.72       217

    accuracy                           0.81       568
   macro avg       0.81      0.78      0.79       568
weighted avg       0.81      0.81      0.81       568



- All of the evaluation metrics changed(decreased) significantly when we changed our hypermeters. Model 1 performed betterwith the sample data and its because in model 1, the min number of samples required in the leaf before it stops splittig was 2 compared to model 1which had 5. Also, in model 1, max_depth of the tree  was 20, whereas in model 2, it was 3

## Model 3 
#### with max_depth = 3, min_samples_leaf = 2

In [22]:
x_train3 = train[['sex_male', 'class', 'fare', 'age']]

y_train = train.survived

In [23]:
rf3 = RandomForestClassifier(max_depth = 3, min_samples_leaf = 2, random_state = 121)

In [24]:
rf3.fit(x_train3, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=121,
                       verbose=0, warm_start=False)

In [25]:
# Calculating accuracy
a3 = rf3.score(x_train3, y_train)
a3

0.8169014084507042

In [26]:
y_pred3 = rf3.predict(x_train3)

In [27]:
print(classification_report(y_train, y_pred3))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86       351
           1       0.82      0.66      0.73       217

    accuracy                           0.82       568
   macro avg       0.82      0.79      0.80       568
weighted avg       0.82      0.82      0.81       568



## Model 4
#### with max_depth = 5, min_samples_leaf = 2

In [28]:
x_train4 = train[['sex_male', 'class', 'fare', 'age']]

y_train = train.survived

In [29]:
rf4 = RandomForestClassifier(max_depth = 5, min_samples_leaf = 2, random_state = 119, n_estimators = 100)

In [30]:
rf4.fit(x_train4, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=119,
                       verbose=0, warm_start=False)

In [31]:
# Calculating Accuracy
a4 = rf4.score(x_train4, y_train)
a4

0.8538732394366197

In [32]:
y_pred4 = rf4.predict(x_train4)

In [33]:
print(classification_report(y_train, y_pred4))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89       351
           1       0.90      0.70      0.78       217

    accuracy                           0.85       568
   macro avg       0.87      0.82      0.84       568
weighted avg       0.86      0.85      0.85       568



## Model 5
#### with max_depth = 10, min_samples_leaf = 2

In [34]:
x_train5 = train[['sex_male', 'class', 'fare', 'age']]

y_train = train.survived

In [35]:
rf5 = RandomForestClassifier(max_depth = 10, min_samples_leaf = 2, random_state = 118, n_estimators = 100)

In [36]:
rf5.fit(x_train5, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=118,
                       verbose=0, warm_start=False)

In [37]:
# Calculating Accuracy
a5 = rf5.score(x_train5, y_train)
a5

0.9014084507042254

In [38]:
y_pred5 = rf5.predict(x_train5)

In [39]:
print(classification_report(y_train, y_pred5))

              precision    recall  f1-score   support

           0       0.89      0.97      0.92       351
           1       0.94      0.80      0.86       217

    accuracy                           0.90       568
   macro avg       0.91      0.88      0.89       568
weighted avg       0.90      0.90      0.90       568



In [40]:
print('Accuracy of model 5:', a5)
print()
print('Accuracy of model 4:', a4)
print()
print('Accuracy of model 3:', a3)
print()
print('Accuracy of model 2:', a2)
print()
print('Accuracy of model 1:', a1)

Accuracy of model 5: 0.9014084507042254

Accuracy of model 4: 0.8538732394366197

Accuracy of model 3: 0.8169014084507042

Accuracy of model 2: 0.8098591549295775

Accuracy of model 1: 0.9806338028169014


### Out of 5 models we will chose model 1, model 4, and model 5 to run with validate data


### Validate with model 1

In [41]:
x_val1 = validate[['sex_male', 'class', 'fare', 'age']]
y_val = validate.survived

In [42]:
# Calculating accuracy

a1_val = rf1.score(x_val1, y_val)
a1_val


### looking at this accuracy score, model 1 might have been overfitted. Its accuracy score with training data was
### 98%

0.8461538461538461

In [43]:
y_val_pred1 = rf1.predict(x_val1)

In [44]:
print(classification_report(y_val, y_val_pred1))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88        88
           1       0.81      0.78      0.80        55

    accuracy                           0.85       143
   macro avg       0.84      0.83      0.84       143
weighted avg       0.85      0.85      0.85       143



### Validate with model 4

In [45]:
x_val4 = validate[['sex_male', 'class', 'fare', 'age']]
y_val = validate.survived

In [46]:
# Calculating accuracy

a4_val = rf4.score(x_val4, y_val)
a4_val

# model 4 is doing well with both training and validate data set, there is no difference in accuracy 

0.8461538461538461

In [47]:
y_val_pred4 = rf4.predict(x_val4)

In [48]:
print(classification_report(y_val, y_val_pred4))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88        88
           1       0.85      0.73      0.78        55

    accuracy                           0.85       143
   macro avg       0.85      0.82      0.83       143
weighted avg       0.85      0.85      0.84       143



### Validate with model 5

In [49]:
x_val5 = validate[['sex_male', 'class', 'fare', 'age']]
y_val = validate.survived

In [50]:
# Calculating accuracy

a5_val = rf5.score(x_val5, y_val)
a5_val


# this models accuracy decreased by 4 percent compared to training data set. Overall, much better accuracy than 
# other models

0.8671328671328671

In [51]:
y_val_pred5 = rf5.predict(x_val5)

In [52]:
print(classification_report(y_val, y_val_pred5))

              precision    recall  f1-score   support

           0       0.86      0.93      0.90        88
           1       0.88      0.76      0.82        55

    accuracy                           0.87       143
   macro avg       0.87      0.85      0.86       143
weighted avg       0.87      0.87      0.87       143



## Model 5 accuracy was cosistently better with training and validate data set, so we will use this model with our test data set

In [53]:
x_test= test[['sex_male', 'class', 'fare', 'age']]
y_test = test.survived

In [54]:
# Calculating accuracy
a_test = rf5.score(x_test, y_test)
a_test

0.8202247191011236

In [55]:
y_test_pred = rf5.predict(x_test)

In [56]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       110
           1       0.81      0.69      0.75        68

    accuracy                           0.82       178
   macro avg       0.82      0.80      0.80       178
weighted avg       0.82      0.82      0.82       178



### Our model was able to accurately classify at the rate of 82% on previously unseen data

# KNN Model Building

### Model 1
 #### with n_neighbors = 5

In [57]:
from sklearn.neighbors import KNeighborsClassifier

In [58]:
x_train1 = train[['sex_male', 'class', 'fare', 'age']]
y_train = train.survived

In [59]:
knn1 = KNeighborsClassifier(n_neighbors = 5)

In [60]:
knn1.fit(x_train1, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [61]:
knn1.score(x_train1, y_train)

0.7887323943661971

In [62]:
y_pred1 = knn1.predict(x_train1)

In [63]:
print(classification_report(y_train, y_pred1))

              precision    recall  f1-score   support

           0       0.80      0.87      0.84       351
           1       0.76      0.65      0.70       217

    accuracy                           0.79       568
   macro avg       0.78      0.76      0.77       568
weighted avg       0.79      0.79      0.78       568



## Model 2
### with n_neighbors = 10

In [64]:
x_train2 = train[['sex_male', 'class', 'fare', 'age']]
y_train = train.survived

In [65]:
knn2 = KNeighborsClassifier(n_neighbors = 10)

In [66]:
knn2.fit(x_train2, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [67]:
knn2.score(x_train2, y_train)

0.7535211267605634

In [68]:
y_pred2 = knn2.predict(x_train2)

In [69]:
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.75      0.91      0.82       351
           1       0.77      0.50      0.61       217

    accuracy                           0.75       568
   macro avg       0.76      0.71      0.71       568
weighted avg       0.76      0.75      0.74       568



## Model 3
### with n_neighbors = 20

In [70]:
x_train3 = train[['sex_male', 'class', 'fare', 'age']]
y_train = train.survived

In [71]:
knn3 = KNeighborsClassifier(n_neighbors = 20)

In [72]:
knn3.fit(x_train3, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')

In [73]:
knn3.score(x_train3, y_train)

0.7200704225352113

In [74]:
y_pred3 = knn3.predict(x_train3)

In [75]:
print(classification_report(y_train, y_pred3))

              precision    recall  f1-score   support

           0       0.72      0.91      0.80       351
           1       0.74      0.41      0.53       217

    accuracy                           0.72       568
   macro avg       0.73      0.66      0.67       568
weighted avg       0.72      0.72      0.70       568



## Model 4
### with default n_neighbors but reducing the number of predictors to 3

In [76]:
x_train4 = train[['sex_male', 'class', 'fare']]
y_train = train.survived

In [77]:
knn4 = KNeighborsClassifier()

In [78]:
knn4.fit(x_train4, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [79]:
knn4.score(x_train4, y_train)

0.8257042253521126

In [80]:
y_pred4 = knn4.predict(x_train4)

In [81]:
print(classification_report(y_train, y_pred4))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86       351
           1       0.80      0.73      0.76       217

    accuracy                           0.83       568
   macro avg       0.82      0.81      0.81       568
weighted avg       0.82      0.83      0.82       568



## Model 5
### with default n_neighbors but reducing the number of predictors to 2

In [82]:
x_train5 = train[['sex_male', 'class']]
y_train = train.survived

In [83]:
knn5 = KNeighborsClassifier()

In [84]:
knn5.fit(x_train5, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [85]:
knn5.score(x_train5, y_train)

0.5070422535211268

In [86]:
y_pred5 = knn5.predict(x_train5)

In [87]:
print(classification_report(y_train, y_pred5))

              precision    recall  f1-score   support

           0       0.74      0.31      0.44       351
           1       0.42      0.82      0.56       217

    accuracy                           0.51       568
   macro avg       0.58      0.57      0.50       568
weighted avg       0.62      0.51      0.49       568



### Out of all 5 models, 1, 2, 4 have the top 3 accuracy, so we will use these with our validation data



### Validating with model 1

In [88]:
x_val1 = validate[['sex_male', 'class', 'fare', 'age']]
y_val = validate.survived

In [89]:
knn1.score(x_val1, y_val)

0.7132867132867133

In [90]:
y_val_pred1 = knn1.predict(x_val1)

In [91]:
print(classification_report(y_val, y_val_pred1))

              precision    recall  f1-score   support

           0       0.75      0.80      0.77        88
           1       0.64      0.58      0.61        55

    accuracy                           0.71       143
   macro avg       0.70      0.69      0.69       143
weighted avg       0.71      0.71      0.71       143



### Model 1 accuracy on validate data is 71%




### Validating with model 2

In [92]:
x_val2 = validate[['sex_male', 'class', 'fare', 'age']]
y_val = validate.survived

In [93]:
knn2.score(x_val2, y_val)

0.6923076923076923

In [94]:
y_val_pred2 = knn2.predict(x_val2)

In [95]:
print(classification_report(y_val, y_val_pred2))

              precision    recall  f1-score   support

           0       0.70      0.86      0.78        88
           1       0.66      0.42      0.51        55

    accuracy                           0.69       143
   macro avg       0.68      0.64      0.64       143
weighted avg       0.69      0.69      0.67       143



### Model 2 accuracy on validate data is 69%




### Validating with model 4. Model 4 only uses three features

In [96]:
x_val4 = validate[['sex_male', 'class', 'fare']]
y_val = validate.survived

In [97]:
knn4.score(x_val4, y_val)

0.7762237762237763

In [98]:
y_val_pred4 = knn4.predict(x_val4)

In [99]:
print(classification_report(y_val, y_val_pred4))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82        88
           1       0.71      0.71      0.71        55

    accuracy                           0.78       143
   macro avg       0.76      0.76      0.76       143
weighted avg       0.78      0.78      0.78       143



### Model 4 accuracy with validate data is 78%. 


### So we will use model 4 with out Test Data. Model 4 uses three features




## Model 4 on Test Data

In [100]:
x_test = test[['sex_male', 'class', 'fare']]
y_test = test.survived

In [101]:
knn4.score(x_test, y_test)

0.8146067415730337

In [102]:
y_test_pred = knn4.predict(x_test)

In [103]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       110
           1       0.75      0.76      0.76        68

    accuracy                           0.81       178
   macro avg       0.80      0.81      0.80       178
weighted avg       0.82      0.81      0.81       178



## Model 4 predicted the right outcome with 81 % accuracy on unseen data