In [1]:
import pandas as pd
import prepare
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
from acquire import get_titanic_data

## Logistic Regression

### In this exercise, we'll continue working with the Titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluating, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

Do your work for these exercises in either a notebook or a python script named model within your classification-exercises repository. Add, commit, and push your work.

1. Start by defining your baseline model.

In [2]:
train, validate, test = prepare.prep_titanic()
train

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male
583,0,1,36.000000,0,0,40.1250,1,0,0,1
337,1,1,41.000000,0,0,134.5000,1,0,0,0
50,0,3,7.000000,4,1,39.6875,0,0,1,1
218,1,1,32.000000,0,0,76.2917,1,0,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
313,0,3,28.000000,0,0,7.8958,1,0,1,1
636,0,3,32.000000,0,0,7.9250,1,0,1,1
222,0,3,51.000000,0,0,8.0500,1,0,1,1
485,0,3,29.916875,3,1,25.4667,0,0,1,0


In [3]:
train.survived.value_counts(normalize=True)
# Majority-class baseline: with no model at all, predicting that everyone died (class 0) would be right about 62% of the time on train.

0    0.617706
1    0.382294
Name: survived, dtype: float64

In [4]:
# Model 1: logistic regression on every column except the target.
X_train1 = train.drop(columns=['survived'])
y_train = train.survived

logit1 = LogisticRegression().fit(X_train1, y_train)

# Print the coefficients followed by the column order they line up with.
print(logit1.coef_)
print(X_train1.columns)

[[-1.07859679e+00 -3.10510561e-02 -5.17601592e-01 -2.04025452e-01
   1.66729023e-03 -9.16177527e-01  8.99227745e-01  2.28830240e-01
  -2.42572095e+00]]
Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'alone', 'embarked_Q',
       'embarked_S', 'sex_male'],
      dtype='object')


In [5]:
y_pred1 = logit1.predict(X_train1)

In [6]:
logit1.score(X_train1, y_train)

0.8048289738430584

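This first model, fit on all features, has about 80% accuracy on train, which is much more accurate than the 62% obtained by just assuming everyone died. It is the model referred to as the "baseline" in the comparisons below.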

2. Create another model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [7]:
# Model 2: only age, fare and passenger class as predictors.
X_train2 = train[['age', 'fare', 'pclass']]
logit2 = LogisticRegression().fit(X_train2, y_train)

# Coefficients, then the feature order they correspond to.
print(logit2.coef_)
print(X_train2.columns)

[[-0.03051881  0.00266519 -0.97983178]]
Index(['age', 'fare', 'pclass'], dtype='object')


In [8]:
logit2.score(X_train2, y_train)

0.716297786720322

This model only has a 72% accuracy which isn't as good as the baseline.

3. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [9]:
# Model 3: model 2's features plus the already-encoded sex_male dummy column.
X_train3 = train[['age', 'fare', 'pclass', 'sex_male']]
logit3 = LogisticRegression().fit(X_train3, y_train)

# Coefficients, then the feature order they correspond to.
print(logit3.coef_)
print(X_train3.columns)

[[-2.66594879e-02  9.02716903e-04 -1.11402368e+00 -2.45878213e+00]]
Index(['age', 'fare', 'pclass', 'sex_male'], dtype='object')


In [10]:
logit3.score(X_train3, y_train)

0.7987927565392354

This model has an accuracy of ~80% which is roughly on par with the baseline

4. Try out other combinations of features and models.

In [11]:
def my_logit(X_train, y=None):
    """Fit a default LogisticRegression and report its in-sample accuracy.

    Parameters
    ----------
    X_train : feature columns to fit on.
    y : optional target values aligned with ``X_train``. When omitted, the
        notebook-level ``y_train`` is used, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    tuple
        ``(fitted model, model.coef_, accuracy on the data it was fit on)``.
    """
    # Fall back to the notebook-level target only when no target is passed in,
    # so the function no longer has to rely on hidden global state.
    target = y_train if y is None else y
    logreg = LogisticRegression().fit(X_train, target)
    return logreg, logreg.coef_, logreg.score(X_train, target)

In [12]:
X_train4 = train[['pclass', 'sex_male', 'alone']]

logit4, coefs, acc = my_logit(X_train4)

print(coefs, acc)

[[-0.95701015 -2.40744024 -0.30828946]] 0.7847082494969819


The accuracy of this model is only 78% which is still worse than the baseline model.

In [13]:
X_train5 = train[['pclass', 'sex_male', 'alone', 'age']]

logit5, coefs, acc = my_logit(X_train5)

print(coefs, acc)

[[-1.12720398 -2.41479961 -0.17176794 -0.02570129]] 0.7967806841046278


The accuracy of this model is also ~80% which is roughly on par with the baseline model.

In [14]:
X_train6 = train[['sex_male']]

logit6, coefs, acc = my_logit(X_train6)

print(coefs, acc)

[[-2.37681345]] 0.7847082494969819


The accuracy of this model is only 78% which isn't quite as good.

5. Use your best 3 models to predict and evaluate on your validate sample.

My three best models are model 1 (all vars) & 3 (pclass, sex_male, fare, age) & 5 (pclass, sex_male, alone, age).

In [15]:
# Target first, then one validate feature frame per model, with the same
# columns (in the same order) that each model was fit on.
y_validate = validate.survived
X_validate1 = validate.drop(columns='survived')
X_validate3 = validate[['age', 'fare', 'pclass', 'sex_male']]
X_validate5 = validate[['pclass', 'sex_male', 'alone', 'age']]

# Out-of-sample accuracy for models 1, 3 and 5.
acc1, acc3, acc5 = (
    model.score(features, y_validate)
    for model, features in (
        (logit1, X_validate1),
        (logit3, X_validate3),
        (logit5, X_validate5),
    )
)

print(acc1, acc3, acc5)

0.7990654205607477 0.780373831775701 0.7850467289719626


In [16]:
# Validate-set predictions for the three shortlisted models.
y1_pred, y3_pred, y5_pred = (
    model.predict(features)
    for model, features in (
        (logit1, X_validate1),
        (logit3, X_validate3),
        (logit5, X_validate5),
    )
)

# One labelled classification report per model.
print('y1: All Vars:\n', classification_report(y_validate, y1_pred))
print('y3: pclass, sex_male, fare, age:\n', classification_report(y_validate, y3_pred))
print('y5: pclass, sex_male, alone, age:\n', classification_report(y_validate, y5_pred))

y1: All Vars:
               precision    recall  f1-score   support

           0       0.82      0.87      0.84       132
           1       0.77      0.68      0.72        82

    accuracy                           0.80       214
   macro avg       0.79      0.78      0.78       214
weighted avg       0.80      0.80      0.80       214

y3: pclass, sex_male, fare, age:
               precision    recall  f1-score   support

           0       0.81      0.83      0.82       132
           1       0.72      0.70      0.71        82

    accuracy                           0.78       214
   macro avg       0.77      0.76      0.77       214
weighted avg       0.78      0.78      0.78       214

y5: pclass, sex_male, alone, age:
               precision    recall  f1-score   support

           0       0.82      0.83      0.83       132
           1       0.72      0.71      0.72        82

    accuracy                           0.79       214
   macro avg       0.77      0.77      0.77 

6. Choose your best model based on validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

The best model is model 1 which has all X variables.

In [17]:
# One-time evaluation of the chosen model (logit1, all features) on the test split.
X_test = test.drop(columns=['survived'])
y_test = test.survived

test_acc = logit1.score(X_test, y_test)
# Store test predictions under their own name so the validate predictions held
# in y1_pred are not silently replaced by rows from a different split.
y_test_pred = logit1.predict(X_test)

print(test_acc)
print('test report:\n', classification_report(y_test, y_test_pred))

0.797752808988764
test report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84       110
           1       0.74      0.74      0.74        68

    accuracy                           0.80       178
   macro avg       0.79      0.79      0.79       178
weighted avg       0.80      0.80      0.80       178



## Decision Tree

### In this exercise, we'll continue working with the Titanic dataset and building decision tree models. Throughout this exercise, be sure you are training, evaluating, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

Continue working in your model file. Add, commit, and push your changes.

In [18]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
from graphviz import Graph
from sklearn.metrics import precision_recall_fscore_support, f1_score

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [19]:
train

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embarked_Q,embarked_S,sex_male
583,0,1,36.000000,0,0,40.1250,1,0,0,1
337,1,1,41.000000,0,0,134.5000,1,0,0,0
50,0,3,7.000000,4,1,39.6875,0,0,1,1
218,1,1,32.000000,0,0,76.2917,1,0,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
313,0,3,28.000000,0,0,7.8958,1,0,1,1
636,0,3,32.000000,0,0,7.9250,1,0,1,1
222,0,3,51.000000,0,0,8.0500,1,0,1,1
485,0,3,29.916875,3,1,25.4667,0,0,1,0


In [20]:
X_train = train[['pclass', 'sex_male', 'age', 'alone']]
y_train = train.survived

In [21]:
clf1 = DecisionTreeClassifier(max_depth=5, random_state=123)

In [22]:
clf1.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [23]:
y_pred1 = clf1.predict(X_train)
y_pred1

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [24]:
print('Accuracy of Decision Tree classifier on training data: {:.2f}'.format(clf1.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training data: 0.83


In [25]:
print(classification_report(y_train, y_pred1))

              precision    recall  f1-score   support

           0       0.79      0.98      0.88       307
           1       0.96      0.58      0.73       190

    accuracy                           0.83       497
   macro avg       0.87      0.78      0.80       497
weighted avg       0.86      0.83      0.82       497



In [26]:
confusion_matrix(y_train, y_pred1)

array([[302,   5],
       [ 79, 111]])

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [27]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred1).ravel()

In [28]:
# Rates are derived from the tn / fp / fn / tp counts unpacked in the previous
# cell (clf1 on train); every value is rounded to two decimals for display.
acc = round((tn + tp) / (tn + fp + fn + tp), 2)
recall = round(tp / (fn + tp), 2)  # share of actual positives that were caught
fpr = round(fp / (tn + fp), 2)
tnr = round(tn / (tn + fp), 2)
fnr = round(fn / (fn + tp), 2)
precision = round(tp / (fp + tp), 2)
f1 = round(f1_score(y_train, y_pred1), 2)
# Per-class arrays; only support (rows per class) is printed below.
p, r, f, support = precision_recall_fscore_support(y_train, y_pred1)

print('\n'.join([
    f'accuracy: {acc}',
    f'recall(true positive rate): {recall}',
    f'false positive rate: {fpr}',
    f'true negative rate: {tnr}',
    f'false negative rate: {fnr}',
    f'precision: {precision}',
    f'f1-score: {f1}',
    f'support: {support}',
]))

accuracy: 0.83
recall(true positive rate): 0.58
false positive rate: 0.02
true negative rate: 0.98
false negative rate: 0.42
precision: 0.96
f1-score: 0.73
support: [307 190]


4. Run through steps 2-4 using a different max_depth value.

In [29]:
clf2 = DecisionTreeClassifier(max_depth = 10, random_state = 75)

In [30]:
clf2.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=75, splitter='best')

In [31]:
y_pred2 = clf2.predict(X_train)
y_pred2

array([1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,

In [32]:
print('Accuracy of Decision Tree classifier with more depth on training set: {:.2f}'.format(clf2.score(X_train, y_train)))

Accuracy of Decision Tree classifier with more depth on training set: 0.88


In [33]:
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91       307
           1       0.92      0.76      0.83       190

    accuracy                           0.88       497
   macro avg       0.89      0.86      0.87       497
weighted avg       0.88      0.88      0.88       497



In [34]:
confusion_matrix(y_train, y_pred2)

array([[294,  13],
       [ 46, 144]])

In [35]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred2).ravel()

In [36]:
# Same metric set as for clf1, now from the clf2 (max_depth=10) counts unpacked
# in the previous cell; values are rounded to two decimals.
acc = round((tn + tp) / (tn + fp + fn + tp), 2)
recall = round(tp / (fn + tp), 2)  # true positive rate
fpr = round(fp / (tn + fp), 2)
tnr = round(tn / (tn + fp), 2)
fnr = round(fn / (fn + tp), 2)
precision = round(tp / (fp + tp), 2)
f1 = round(f1_score(y_train, y_pred2), 2)
# Per-class arrays; only support (rows per class) is printed below.
p, r, f, support = precision_recall_fscore_support(y_train, y_pred2)

print(
    f'accuracy: {acc}',
    f'recall(true positive rate): {recall}',
    f'false positive rate: {fpr}',
    f'true negative rate: {tnr}',
    f'false negative rate: {fnr}',
    f'precision: {precision}',
    f'f1-score: {f1}',
    f'support: {support}',
    sep='\n',
)

accuracy: 0.88
recall(true positive rate): 0.76
false positive rate: 0.04
true negative rate: 0.96
false negative rate: 0.24
precision: 0.92
f1-score: 0.83
support: [307 190]


5. Which model performs better on your in-sample data?

The second model with more depth performs better on my in-sample data.

6. Which model performs best on your out-of-sample data, the validate set?

In [37]:
X_validate = validate[['pclass', 'sex_male', 'age', 'alone']]
y_validate = validate.survived

In [38]:
y_pred = clf1.predict(X_validate)

In [39]:
print(f'Accuracy of first decision tree(max_depth = 5) on validate data is: {clf1.score(X_validate, y_validate)}')

Accuracy of first decision tree(max_depth = 5) on validate data is: 0.7710280373831776


In [40]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.93      0.83       132
           1       0.82      0.51      0.63        82

    accuracy                           0.77       214
   macro avg       0.79      0.72      0.73       214
weighted avg       0.78      0.77      0.76       214



In [41]:
y_pred = clf2.predict(X_validate)

In [42]:
print(f'Accuracy of second decision tree(max_depth = 10) on validate data is: {clf2.score(X_validate, y_validate)}')

Accuracy of second decision tree(max_depth = 10) on validate data is: 0.7897196261682243


In [43]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.86      0.84       132
           1       0.75      0.67      0.71        82

    accuracy                           0.79       214
   macro avg       0.78      0.77      0.77       214
weighted avg       0.79      0.79      0.79       214



On the validate set, the second model (max_depth = 10) is roughly 2 percentage points more accurate than the first model (79% vs. 77%).

## Random Forest

In [44]:
from sklearn.ensemble import RandomForestClassifier

### Continue working in your model file. Be sure to add, commit, and push your changes.

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [45]:
X_train = train[['pclass', 'sex_male', 'age', 'alone']]
y_train = train.survived

In [46]:
rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=123)

In [47]:
rf = rf.fit(X_train, y_train)

In [48]:
y_pred_rf = rf.predict(X_train)
y_pred_rf

array([1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [49]:
rf.score(X_train, y_train)

0.9114688128772636

In [50]:
print(classification_report(y_train, y_pred_rf))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93       307
           1       0.91      0.85      0.88       190

    accuracy                           0.91       497
   macro avg       0.91      0.90      0.91       497
weighted avg       0.91      0.91      0.91       497



In [51]:
print(confusion_matrix(y_train, y_pred_rf))

[[291  16]
 [ 28 162]]


3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [52]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_rf).ravel()

In [53]:
# Metrics for the first random forest (rf) on train, built from the counts
# unpacked in the previous cell and rounded to two decimals.
acc = round((tn + tp) / (tn + fp + fn + tp), 2)
recall = round(tp / (fn + tp), 2)  # true positive rate
fpr = round(fp / (tn + fp), 2)
tnr = round(tn / (tn + fp), 2)
fnr = round(fn / (fn + tp), 2)
precision = round(tp / (fp + tp), 2)
f1 = round(f1_score(y_train, y_pred_rf), 2)
# Per-class arrays; only support (rows per class) is printed below.
p, r, f, support = precision_recall_fscore_support(y_train, y_pred_rf)

print('\n'.join([
    f'accuracy: {acc}',
    f'recall(true positive rate): {recall}',
    f'false positive rate: {fpr}',
    f'true negative rate: {tnr}',
    f'false negative rate: {fnr}',
    f'precision: {precision}',
    f'f1-score: {f1}',
    f'support: {support}',
]))

accuracy: 0.91
recall(true positive rate): 0.85
false positive rate: 0.05
true negative rate: 0.95
false negative rate: 0.15
precision: 0.91
f1-score: 0.88
support: [307 190]


4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [54]:
# Shallower, more constrained forest: max_depth 3 and at least 5 samples per leaf.
# random_state is pinned (same seed as rf) so this fit, and every metric computed
# from it, is reproducible on Restart & Run All.
rf2 = RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_leaf=5, random_state=123)
rf2 = rf2.fit(X_train, y_train)

In [55]:
y_pred_rf2 = rf2.predict(X_train)
y_pred_rf2

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,

In [56]:
rf2.score(X_train, y_train)

0.8169014084507042

In [57]:
print(classification_report(y_train, y_pred_rf2))

              precision    recall  f1-score   support

           0       0.81      0.93      0.86       307
           1       0.85      0.64      0.73       190

    accuracy                           0.82       497
   macro avg       0.83      0.78      0.79       497
weighted avg       0.82      0.82      0.81       497



In [58]:
print(confusion_matrix(y_train, y_pred_rf2))

[[285  22]
 [ 69 121]]


In [59]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_rf2).ravel()

In [60]:
# Metrics for the second random forest (rf2) on train, built from the counts
# unpacked in the previous cell and rounded to two decimals.
acc = round((tn + tp) / (tn + fp + fn + tp), 2)
recall = round(tp / (fn + tp), 2)  # true positive rate
fpr = round(fp / (tn + fp), 2)
tnr = round(tn / (tn + fp), 2)
fnr = round(fn / (fn + tp), 2)
precision = round(tp / (fp + tp), 2)
f1 = round(f1_score(y_train, y_pred_rf2), 2)
# Per-class arrays; only support (rows per class) is printed below.
p, r, f, support = precision_recall_fscore_support(y_train, y_pred_rf2)

print(
    f'accuracy: {acc}',
    f'recall(true positive rate): {recall}',
    f'false positive rate: {fpr}',
    f'true negative rate: {tnr}',
    f'false negative rate: {fnr}',
    f'precision: {precision}',
    f'f1-score: {f1}',
    f'support: {support}',
    sep='\n',
)

accuracy: 0.82
recall(true positive rate): 0.64
false positive rate: 0.07
true negative rate: 0.93
false negative rate: 0.36
precision: 0.85
f1-score: 0.73
support: [307 190]


5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

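- The accuracy of model 1 is about 9 percentage points higher than model 2 (0.91 vs. 0.82).
- The recall of model 1 is about 21 percentage points higher than model 2 (0.85 vs. 0.64).
- The precision of model 1 is about 6 percentage points higher than model 2 (0.91 vs. 0.85).
- The f1-score of model 1 is about 15 percentage points higher than model 2 (0.88 vs. 0.73).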

Clearly model 1 performs better on the in-sample data but with a max_depth of 20 it could be overfitting the data.
Model 2, with a max_depth of 3 and a min_samples_leaf of 5, could be underfitting the data.

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [61]:
# Third forest: same seed and tree count as rf, but depth capped at 10 instead of 20.
rf3 = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=1,
    random_state=123,
).fit(X_train, y_train)

In [62]:
y_pred_rf3 = rf3.predict(X_train)
rf3.score(X_train, y_train)

0.9114688128772636

In [63]:
print(classification_report(y_train, y_pred_rf3))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93       307
           1       0.91      0.85      0.88       190

    accuracy                           0.91       497
   macro avg       0.91      0.90      0.91       497
weighted avg       0.91      0.91      0.91       497



In [64]:
print(confusion_matrix(y_train, y_pred_rf3))

[[291  16]
 [ 28 162]]


In [65]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_rf3).ravel()

In [66]:
# Metrics for the third random forest (rf3) on train, built from the counts
# unpacked in the previous cell and rounded to two decimals.
acc = round((tn + tp) / (tn + fp + fn + tp), 2)
recall = round(tp / (fn + tp), 2)  # true positive rate
fpr = round(fp / (tn + fp), 2)
tnr = round(tn / (tn + fp), 2)
fnr = round(fn / (fn + tp), 2)
precision = round(tp / (fp + tp), 2)
f1 = round(f1_score(y_train, y_pred_rf3), 2)
# Per-class arrays; only support (rows per class) is printed below.
p, r, f, support = precision_recall_fscore_support(y_train, y_pred_rf3)

print('\n'.join([
    f'accuracy: {acc}',
    f'recall(true positive rate): {recall}',
    f'false positive rate: {fpr}',
    f'true negative rate: {tnr}',
    f'false negative rate: {fnr}',
    f'precision: {precision}',
    f'f1-score: {f1}',
    f'support: {support}',
]))

accuracy: 0.91
recall(true positive rate): 0.85
false positive rate: 0.05
true negative rate: 0.95
false negative rate: 0.15
precision: 0.91
f1-score: 0.88
support: [307 190]


In [67]:
# Fourth forest: fewer trees (30) at depth 10.
# random_state is pinned (same seed as rf and rf3) so this fit, and every metric
# computed from it, is reproducible on Restart & Run All.
rf4 = RandomForestClassifier(n_estimators=30, max_depth=10, min_samples_leaf=1, random_state=123)
rf4 = rf4.fit(X_train, y_train)

In [68]:
y_pred_rf4 = rf4.predict(X_train)

In [69]:
rf4.score(X_train, y_train)

0.9054325955734407

In [70]:
print(classification_report(y_train, y_pred_rf4))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       307
           1       0.94      0.81      0.87       190

    accuracy                           0.91       497
   macro avg       0.91      0.89      0.90       497
weighted avg       0.91      0.91      0.90       497



In [71]:
print(confusion_matrix(y_train, y_pred_rf4))

[[297  10]
 [ 37 153]]


In [72]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_rf4).ravel()

In [73]:
# Metrics for the fourth random forest (rf4) on train, built from the counts
# unpacked in the previous cell and rounded to two decimals.
acc = round((tn + tp) / (tn + fp + fn + tp), 2)
recall = round(tp / (fn + tp), 2)  # true positive rate
fpr = round(fp / (tn + fp), 2)
tnr = round(tn / (tn + fp), 2)
fnr = round(fn / (fn + tp), 2)
precision = round(tp / (fp + tp), 2)
f1 = round(f1_score(y_train, y_pred_rf4), 2)
# Per-class arrays; only support (rows per class) is printed below.
p, r, f, support = precision_recall_fscore_support(y_train, y_pred_rf4)

print(
    f'accuracy: {acc}',
    f'recall(true positive rate): {recall}',
    f'false positive rate: {fpr}',
    f'true negative rate: {tnr}',
    f'false negative rate: {fnr}',
    f'precision: {precision}',
    f'f1-score: {f1}',
    f'support: {support}',
    sep='\n',
)

accuracy: 0.91
recall(true positive rate): 0.81
false positive rate: 0.03
true negative rate: 0.97
false negative rate: 0.19
precision: 0.94
f1-score: 0.87
support: [307 190]


So far my best 3 models are model 1, model 3, and model 4. So now we try them on validate.

In [74]:
# Validate-set predictions for the three shortlisted forests.
# NOTE: y_pred_rf3 and y_pred_rf4 held train predictions in the earlier cells;
# from this cell on they hold validate predictions instead.
y_pred_rf1, y_pred_rf3, y_pred_rf4 = (
    forest.predict(X_validate) for forest in (rf, rf3, rf4)
)

In [75]:
print(classification_report(y_validate, y_pred_rf1))
print(confusion_matrix(y_validate, y_pred_rf1))

              precision    recall  f1-score   support

           0       0.81      0.86      0.84       132
           1       0.76      0.68      0.72        82

    accuracy                           0.79       214
   macro avg       0.79      0.77      0.78       214
weighted avg       0.79      0.79      0.79       214

[[114  18]
 [ 26  56]]


In [76]:
print(classification_report(y_validate, y_pred_rf3))
print(confusion_matrix(y_validate, y_pred_rf3))

              precision    recall  f1-score   support

           0       0.81      0.86      0.84       132
           1       0.76      0.68      0.72        82

    accuracy                           0.79       214
   macro avg       0.79      0.77      0.78       214
weighted avg       0.79      0.79      0.79       214

[[114  18]
 [ 26  56]]


In [77]:
print(classification_report(y_validate, y_pred_rf4))
print(confusion_matrix(y_validate, y_pred_rf4))

              precision    recall  f1-score   support

           0       0.81      0.86      0.84       132
           1       0.75      0.67      0.71        82

    accuracy                           0.79       214
   macro avg       0.78      0.77      0.77       214
weighted avg       0.79      0.79      0.79       214

[[114  18]
 [ 27  55]]


##  K Nearest Neighbor

### Continue working in your model notebook or python script.

In [78]:
from sklearn.neighbors import KNeighborsClassifier

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [79]:
X_train

Unnamed: 0,pclass,sex_male,age,alone
583,1,1,36.000000,1
337,1,0,41.000000,1
50,3,1,7.000000,0
218,1,0,32.000000,1
31,1,0,29.916875,0
...,...,...,...,...
313,3,1,28.000000,1
636,3,1,32.000000,1
222,3,1,51.000000,1
485,3,0,29.916875,0


In [80]:
# k = 5 neighbours with equal vote weights, fit on the training features.
# NOTE(review): X_train is used unscaled; age spans tens of years while the other
# columns are 0/1 or 1-3, so age will likely dominate the distance. Consider scaling.
knn1 = KNeighborsClassifier(weights='uniform', n_neighbors=5).fit(X_train, y_train)

In [81]:
y_pred_knn1 = knn1.predict(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [82]:
acc = knn1.score(X_train, y_train)
acc

0.8289738430583501

In [83]:
print(confusion_matrix(y_train, y_pred_knn1))
print(classification_report(y_train, y_pred_knn1))

[[273  34]
 [ 51 139]]
              precision    recall  f1-score   support

           0       0.84      0.89      0.87       307
           1       0.80      0.73      0.77       190

    accuracy                           0.83       497
   macro avg       0.82      0.81      0.82       497
weighted avg       0.83      0.83      0.83       497



3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [84]:
# Unpack the 2x2 confusion matrix row by row: actual-0 row, then actual-1 row.
(tn, fp), (fn, tp) = confusion_matrix(y_train, y_pred_knn1)

# Rates for knn1 (k=5) on train, rounded to two decimals.
acc = round((tn + tp) / (tn + fp + fn + tp), 2)
recall = round(tp / (fn + tp), 2)  # true positive rate
fpr = round(fp / (tn + fp), 2)
tnr = round(tn / (tn + fp), 2)
fnr = round(fn / (fn + tp), 2)
precision = round(tp / (fp + tp), 2)
f1 = round(f1_score(y_train, y_pred_knn1), 2)
# Per-class arrays; only support (rows per class) is printed below.
p, r, f, support = precision_recall_fscore_support(y_train, y_pred_knn1)

print('\n'.join([
    f'accuracy: {acc}',
    f'recall(true positive rate): {recall}',
    f'false positive rate: {fpr}',
    f'true negative rate: {tnr}',
    f'false negative rate: {fnr}',
    f'precision: {precision}',
    f'f1-score: {f1}',
    f'support: {support}',
]))

accuracy: 0.83
recall(true positive rate): 0.73
false positive rate: 0.11
true negative rate: 0.89
false negative rate: 0.27
precision: 0.8
f1-score: 0.77
support: [307 190]


4. Run through steps 2-4 setting k to 10

In [85]:
# Same setup as knn1, but voting over the 10 nearest neighbours.
knn2 = KNeighborsClassifier(weights='uniform', n_neighbors=10).fit(X_train, y_train)

In [86]:
y_pred_knn2 = knn2.predict(X_train)
acc = knn2.score(X_train, y_train)
acc

0.7826961770623743

In [87]:
print(confusion_matrix(y_train, y_pred_knn2))
print(classification_report(y_train, y_pred_knn2))

[[288  19]
 [ 89 101]]
              precision    recall  f1-score   support

           0       0.76      0.94      0.84       307
           1       0.84      0.53      0.65       190

    accuracy                           0.78       497
   macro avg       0.80      0.73      0.75       497
weighted avg       0.79      0.78      0.77       497



In [88]:
# Unpack the 2x2 confusion matrix row by row: actual-0 row, then actual-1 row.
(tn, fp), (fn, tp) = confusion_matrix(y_train, y_pred_knn2)

# Rates for knn2 (k=10) on train, rounded to two decimals.
acc = round((tn + tp) / (tn + fp + fn + tp), 2)
recall = round(tp / (fn + tp), 2)  # true positive rate
fpr = round(fp / (tn + fp), 2)
tnr = round(tn / (tn + fp), 2)
fnr = round(fn / (fn + tp), 2)
precision = round(tp / (fp + tp), 2)
f1 = round(f1_score(y_train, y_pred_knn2), 2)
# Per-class arrays; only support (rows per class) is printed below.
p, r, f, support = precision_recall_fscore_support(y_train, y_pred_knn2)

print(
    f'accuracy: {acc}',
    f'recall(true positive rate): {recall}',
    f'false positive rate: {fpr}',
    f'true negative rate: {tnr}',
    f'false negative rate: {fnr}',
    f'precision: {precision}',
    f'f1-score: {f1}',
    f'support: {support}',
    sep='\n',
)

accuracy: 0.78
recall(true positive rate): 0.53
false positive rate: 0.06
true negative rate: 0.94
false negative rate: 0.47
precision: 0.84
f1-score: 0.65
support: [307 190]


5. Run through steps 2-4 setting k to 20

In [89]:
# k = 20 neighbours with equal vote weights; fit, then predict in-sample.
knn3 = KNeighborsClassifier(weights='uniform', n_neighbors=20).fit(X_train, y_train)
y_pred_knn3 = knn3.predict(X_train)

In [90]:
print(classification_report(y_train, y_pred_knn3))
print(confusion_matrix(y_train, y_pred_knn3))

              precision    recall  f1-score   support

           0       0.70      0.90      0.79       307
           1       0.71      0.39      0.50       190

    accuracy                           0.71       497
   macro avg       0.71      0.65      0.65       497
weighted avg       0.71      0.71      0.68       497

[[277  30]
 [116  74]]


In [92]:
# Unpack the 2x2 confusion matrix row by row: actual-0 row, then actual-1 row.
(tn, fp), (fn, tp) = confusion_matrix(y_train, y_pred_knn3)

# Rates for knn3 (k=20) on train, rounded to two decimals. Recall and precision
# come from the sklearn scorers here; the remaining rates use the raw counts.
acc = round((tn + tp) / (tn + fp + fn + tp), 2)
recall = round(recall_score(y_train, y_pred_knn3), 2)
fpr = round(fp / (tn + fp), 2)
tnr = round(tn / (tn + fp), 2)
fnr = round(fn / (fn + tp), 2)
precision = round(precision_score(y_train, y_pred_knn3), 2)
f1 = round(f1_score(y_train, y_pred_knn3), 2)
# Per-class arrays; only support (rows per class) is printed below.
p, r, f, support = precision_recall_fscore_support(y_train, y_pred_knn3)

print('\n'.join([
    f'accuracy: {acc}',
    f'recall(true positive rate): {recall}',
    f'false positive rate: {fpr}',
    f'true negative rate: {tnr}',
    f'false negative rate: {fnr}',
    f'precision: {precision}',
    f'f1-score: {f1}',
    f'support: {support}',
]))

accuracy: 0.71
recall(true positive rate): 0.39
false positive rate: 0.1
true negative rate: 0.9
false negative rate: 0.61
precision: 0.71
f1-score: 0.5
support: [307 190]


6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

7. Which model performs best on our out-of-sample data from validate?