In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from pydataset import data

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from acquire import get_titanic_data
from prepare_pro import generic_split

import graphviz
from graphviz import Graph

import warnings
warnings.filterwarnings("ignore")

from jupyterthemes import jtplot
jtplot.style()

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Using the titanic data, in your classification-exercises repository,
# create a notebook, model.ipynb where you will do the following:
titanic = get_titanic_data()
titanic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           714 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      889 non-null    object 
 9   class         891 non-null    object 
 10  deck          203 non-null    object 
 11  embark_town   889 non-null    object 
 12  alone         891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [3]:
titanic = titanic[titanic.age.isna() == False]

In [4]:
titanic['is_male'] = titanic.sex == 'male'
dummy_df = pd.get_dummies(titanic[['class','embark_town']])
dummy_df

Unnamed: 0,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,0,1,0,0,1
1,1,0,0,1,0,0
2,0,0,1,0,0,1
3,1,0,0,0,0,1
4,0,0,1,0,0,1
...,...,...,...,...,...,...
885,0,0,1,0,1,0
886,0,1,0,0,0,1
887,1,0,0,0,0,1
889,1,0,0,1,0,0


In [5]:
titanic = pd.concat([titanic, dummy_df],axis=1)

In [6]:
titanic.drop(columns=['passenger_id','pclass','embarked','deck','sex','embark_town','class'],inplace=True)
titanic.head(1)

Unnamed: 0,survived,age,sibsp,parch,fare,alone,is_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,22.0,1,0,7.25,0,True,0,0,1,0,0,1


In [7]:
titanic.survived.value_counts()

0    424
1    290
Name: survived, dtype: int64

In [8]:
#1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for
# a classification problem is predicting the most prevelant class in the training dataset (the mode). 
# When you make those predictions, what is your accuracy? This is your baseline accuracy.
titanic['baseline_prediction'] = 0
titanic.head(1) #baseline prediction is 0 == did not survive

Unnamed: 0,survived,age,sibsp,parch,fare,alone,is_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,baseline_prediction
0,0,22.0,1,0,7.25,0,True,0,0,1,0,0,1,0


In [9]:
train, validate, test = generic_split(titanic, stratify_by='survived')

In [10]:
baseline_accuracy = (train.survived == train.baseline_prediction).mean()
baseline_accuracy

0.5939849624060151

In [11]:
#2. Fit the decision tree classifier to your training sample and transform 
# (i.e. make predictions on the training sample)
x_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

x_test = test.drop(columns=['survived'])
y_test = test.survived

In [12]:
clf = DecisionTreeClassifier(max_depth=2, random_state=123)
clf = clf.fit(x_train,y_train)

In [13]:
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('Titanic_decision_tree', view=True, format="pdf")

'Titanic_decision_tree.pdf'

In [14]:
y_pred = clf.predict(x_train)
y_pred[0:3]

array([0, 0, 1])

In [15]:
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:3]

array([[0.825     , 0.175     ],
       [0.58      , 0.42      ],
       [0.06315789, 0.93684211]])

In [16]:
confusion_matrix(y_train, y_pred,
                 labels = [0, 1])

array([[227,  10],
       [ 63,  99]])

In [17]:
#3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(x_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [18]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.96      0.86       237
           1       0.91      0.61      0.73       162

    accuracy                           0.82       399
   macro avg       0.85      0.78      0.80       399
weighted avg       0.83      0.82      0.81       399



In [19]:
#4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, 
# false negative rate, precision, recall, f1-score, and support.
print("Model 1")
pd.DataFrame(classification_report(y_train, y_pred,output_dict=True)).T

Model 1


Unnamed: 0,precision,recall,f1-score,support
0,0.782759,0.957806,0.86148,237.0
1,0.908257,0.611111,0.730627,162.0
accuracy,0.817043,0.817043,0.817043,0.817043
macro avg,0.845508,0.784459,0.796054,399.0
weighted avg,0.833713,0.817043,0.808352,399.0


In [20]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(x_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.76


In [21]:
#5. Run through steps 2-4 using a different max_depth value.
clf_difdepth = DecisionTreeClassifier(max_depth=4, random_state=123)
clf_difdepth = clf_difdepth.fit(x_train,y_train)

In [22]:
dot_data2 = export_graphviz(clf_difdepth, feature_names= x_train.columns, rounded=True, filled=True, out_file=None, class_names=['did_not_survive','survived'])
graph2 = graphviz.Source(dot_data2) 

graph2.render('Titanic_decision_tree', view=True, format="pdf")

'Titanic_decision_tree.pdf'

In [23]:
y_pred2 = clf_difdepth.predict(x_train)

In [24]:
confusion_matrix(y_train, y_pred2,
                 labels = [0, 1])

array([[231,   6],
       [ 52, 110]])

In [25]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf_difdepth.score(x_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.85


In [26]:
print("Model 2")
pd.DataFrame(classification_report(y_train, y_pred2,output_dict=True)).T

Model 2


Unnamed: 0,precision,recall,f1-score,support
0,0.816254,0.974684,0.888462,237.0
1,0.948276,0.679012,0.791367,162.0
accuracy,0.854637,0.854637,0.854637,0.854637
macro avg,0.882265,0.826848,0.839914,399.0
weighted avg,0.869857,0.854637,0.84904,399.0


In [27]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf_difdepth.score(x_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.78


In [28]:
#6. Which model performs better on your in-sample data? 
# model 2 has higher scores overall

print("Model 1")
pd.DataFrame(classification_report(y_train, y_pred,output_dict=True)).T

Model 1


Unnamed: 0,precision,recall,f1-score,support
0,0.782759,0.957806,0.86148,237.0
1,0.908257,0.611111,0.730627,162.0
accuracy,0.817043,0.817043,0.817043,0.817043
macro avg,0.845508,0.784459,0.796054,399.0
weighted avg,0.833713,0.817043,0.808352,399.0


In [29]:
print("Model 2")
pd.DataFrame(classification_report(y_train, y_pred2,output_dict=True)).T

Model 2


Unnamed: 0,precision,recall,f1-score,support
0,0.816254,0.974684,0.888462,237.0
1,0.948276,0.679012,0.791367,162.0
accuracy,0.854637,0.854637,0.854637,0.854637
macro avg,0.882265,0.826848,0.839914,399.0
weighted avg,0.869857,0.854637,0.84904,399.0


In [30]:
#7. Which model performs best on your out-of-sample data, the validate set?
print('Accuracy of Decision Tree classifier (2 depth) on validate set: {:.2f}'
     .format(clf.score(x_validate, y_validate)))


Accuracy of Decision Tree classifier (2 depth) on validate set: 0.76


In [31]:
print('Accuracy of Decision Tree classifier (4 depth) on validate set: {:.2f}'
     .format(clf_difdepth.score(x_validate, y_validate)))

Accuracy of Decision Tree classifier (4 depth) on validate set: 0.78


In [32]:
# model 2 has higher accuracy on the validate set

In [33]:
y_pred_validate = clf.predict(x_validate)
y_pred_validate2 = clf_difdepth.predict(x_validate)

In [34]:
print("Model 1 validate")
pd.DataFrame(classification_report(y_validate, y_pred_validate,output_dict=True)).T

Model 1 validate


Unnamed: 0,precision,recall,f1-score,support
0,0.744,0.911765,0.819383,102.0
1,0.808511,0.542857,0.649573,70.0
accuracy,0.761628,0.761628,0.761628,0.761628
macro avg,0.776255,0.727311,0.734478,172.0
weighted avg,0.770254,0.761628,0.750274,172.0


In [35]:
print("Model 2 validate")
pd.DataFrame(classification_report(y_validate, y_pred_validate2,output_dict=True)).T

Model 2 validate


Unnamed: 0,precision,recall,f1-score,support
0,0.75,0.941176,0.834783,102.0
1,0.863636,0.542857,0.666667,70.0
accuracy,0.77907,0.77907,0.77907,0.77907
macro avg,0.806818,0.742017,0.750725,172.0
weighted avg,0.796247,0.77907,0.766363,172.0


In [36]:
confusion_matrix(y_validate, y_pred_validate,
                 labels = [0, 1])

array([[93,  9],
       [32, 38]])

In [37]:
confusion_matrix(y_validate, y_pred_validate2,
                 labels = [0, 1])

array([[96,  6],
       [32, 38]])

In [38]:
# model 2 reduced false positive rate

In [39]:
# 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample)
# setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

rf = RandomForestClassifier(bootstrap=True,
                            class_weight=None,
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10,
                            random_state=123)

In [40]:
rf = rf.fit(x_train, y_train)

In [41]:
print(rf.feature_importances_)

[0.25746175 0.04075686 0.04345531 0.23247315 0.01531672 0.25673994
 0.02867526 0.0194732  0.07377139 0.01385568 0.00612958 0.01189114
 0.        ]


In [42]:
y_pred = rf.predict(x_train)

In [43]:
#2. Evaluate your results using the model score, confusion matrix, and classification report.

print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(x_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [44]:
print(confusion_matrix(y_train, y_pred))

[[236   1]
 [ 12 150]]


In [45]:
print("Model 1 Random Forest")
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T

Model 1 Random Forest


Unnamed: 0,precision,recall,f1-score,support
0,0.951613,0.995781,0.973196,237.0
1,0.993377,0.925926,0.958466,162.0
accuracy,0.967419,0.967419,0.967419,0.967419
macro avg,0.972495,0.960853,0.965831,399.0
weighted avg,0.96857,0.967419,0.967216,399.0


In [46]:
# 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate,
# false negative rate, precision, recall, f1-score, and support.
report1 = pd.DataFrame(classification_report(
    y_train, y_pred, output_dict=True)).T

In [47]:
print(f'Accuracy: {round(report1.precision.accuracy,3)}')
print(f'True Positive Rate: {round(report1.recall[1],3)}')
print(f'False Positive Rate: {round(1 - report1.recall[1],3)}')
print(f'True Negative Rate: {round(report1.recall[0],3)}')
print(f'False Negative Rate: {round(1 - report1.recall[0],3)}')

Accuracy: 0.967
True Positive Rate: 0.926
False Positive Rate: 0.074
True Negative Rate: 0.996
False Negative Rate: 0.004


In [48]:
# 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.
rf2 = RandomForestClassifier(max_depth=5, min_samples_leaf=5, random_state=123)

In [49]:
rf2 = rf2.fit(x_train, y_train)

In [50]:
y_pred2 = rf2.predict(x_train)

In [51]:
print('Accuracy of random forest classifier 2 on training set: {:.2f}'
      .format(rf2.score(x_train, y_train)))

Accuracy of random forest classifier 2 on training set: 0.83


In [52]:
print(confusion_matrix(y_train, y_pred2))

[[227  10]
 [ 57 105]]


In [53]:
report2 = pd.DataFrame(classification_report(
    y_train, y_pred2, output_dict=True)).T
report2

Unnamed: 0,precision,recall,f1-score,support
0,0.799296,0.957806,0.871401,237.0
1,0.913043,0.648148,0.758123,162.0
accuracy,0.83208,0.83208,0.83208,0.83208
macro avg,0.85617,0.802977,0.814762,399.0
weighted avg,0.845479,0.83208,0.825408,399.0


In [54]:
print(f'Accuracy: {round(report2.precision.accuracy,3)}')
print(f'True Positive Rate: {round(report2.recall[1],3)}')
print(f'False Positive Rate: {round(1 - report2.recall[1],3)}')
print(f'True Negative Rate: {round(report2.recall[0],3)}')
print(f'False Negative Rate: {round(1 - report2.recall[0],3)}')

Accuracy: 0.832
True Positive Rate: 0.648
False Positive Rate: 0.352
True Negative Rate: 0.958
False Negative Rate: 0.042


In [55]:
#5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
#model 2 has terrible recall. Model 1 performs better on train data possibly because it's overfit

In [56]:
# After making a few models, which one has the best performance (or closest metrics) on both train and validate?
y_pred_val = rf.predict(x_validate)
y_pred2_val = rf2.predict(x_validate)

In [57]:
print('Accuracy of random forest classifier 1 on validate set: {:.2f}'
     .format(rf.score(x_validate, y_validate)))

Accuracy of random forest classifier 1 on validate set: 0.78


In [58]:
print('Accuracy of random forest classifier 2 on validate set: {:.2f}'
      .format(rf2.score(x_validate, y_validate)))

Accuracy of random forest classifier 2 on validate set: 0.79


In [59]:
report1_val = pd.DataFrame(classification_report(y_validate, y_pred_val,output_dict=True)).T
report1_val

Unnamed: 0,precision,recall,f1-score,support
0,0.782609,0.882353,0.829493,102.0
1,0.789474,0.642857,0.708661,70.0
accuracy,0.784884,0.784884,0.784884,0.784884
macro avg,0.786041,0.762605,0.769077,172.0
weighted avg,0.785403,0.784884,0.780317,172.0


In [60]:
print(f'Accuracy: {round(report1_val.precision.accuracy,3)}')
print(f'True Positive Rate: {round(report1_val.recall[1],3)}')
print(f'False Positive Rate: {round(1 - report1_val.recall[1],3)}')
print(f'True Negative Rate: {round(report1_val.recall[0],3)}')
print(f'False Negative Rate: {round(1 - report1_val.recall[0],3)}')

Accuracy: 0.785
True Positive Rate: 0.643
False Positive Rate: 0.357
True Negative Rate: 0.882
False Negative Rate: 0.118


In [61]:
report2_val = pd.DataFrame(classification_report(y_validate, y_pred2_val,output_dict=True)).T
report2_val

Unnamed: 0,precision,recall,f1-score,support
0,0.75,0.970588,0.846154,102.0
1,0.925,0.528571,0.672727,70.0
accuracy,0.790698,0.790698,0.790698,0.790698
macro avg,0.8375,0.74958,0.759441,172.0
weighted avg,0.821221,0.790698,0.775573,172.0


In [62]:
print(f'Accuracy: {round(report2_val.precision.accuracy,3)}')
print(f'True Positive Rate: {round(report2_val.recall[1],3)}')
print(f'False Positive Rate: {round(1 - report2_val.recall[1],3)}')
print(f'True Negative Rate: {round(report2_val.recall[0],3)}')
print(f'False Negative Rate: {round(1 - report2_val.recall[0],3)}')

Accuracy: 0.791
True Positive Rate: 0.529
False Positive Rate: 0.471
True Negative Rate: 0.971
False Negative Rate: 0.029


In [63]:
#model 2 has closest train and validate metrics


In [64]:
#1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [72]:
y_pred = knn.predict(x_train)

In [71]:
#2. Evaluate your results using the model score, confusion matrix, and classification report.
accuracy = knn.score(x_train, y_train)
print(f"accuracy is {accuracy:.3}")

accuracy is 0.762


In [73]:
print(confusion_matrix(y_train, y_pred))

[[197  40]
 [ 55 107]]


In [74]:
report = pd.DataFrame(classification_report(y_train, y_pred,output_dict=True)).T
report

Unnamed: 0,precision,recall,f1-score,support
0,0.781746,0.831224,0.805726,237.0
1,0.727891,0.660494,0.692557,162.0
accuracy,0.761905,0.761905,0.761905,0.761905
macro avg,0.754819,0.745859,0.749141,399.0
weighted avg,0.75988,0.761905,0.759778,399.0


In [77]:
#3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, 
# false negative rate, precision, recall, f1-score, and support.
def get_metrics_bin(knn, X, y):
    '''
    get_metrics_bin will take in a sklearn classifier model, an X and a y variable and utilize
    the model to make a prediction and then gather accuracy, class report evaluations

    return:  a classification report as a pandas DataFrame
    '''
    y_pred = knn.predict(X)
    accuracy = knn.score(X, y)
    conf = confusion_matrix(y, y_pred)
    class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [80]:
class_report = get_metrics_bin(knn, x_train, y_train)
class_report


    The accuracy for our model is 0.7619
    The True Positive Rate is 0.66, The False Positive Rate is 0.169,
    The True Negative Rate is 0.831, and the False Negative Rate is 0.34
    


Unnamed: 0,precision,recall,f1-score,support
0,0.781746,0.831224,0.805726,237.0
1,0.727891,0.660494,0.692557,162.0
accuracy,0.761905,0.761905,0.761905,0.761905
macro avg,0.754819,0.745859,0.749141,399.0
weighted avg,0.75988,0.761905,0.759778,399.0


In [82]:
#4. Run through steps 2-4 setting k to 10
knn2 = KNeighborsClassifier(n_neighbors=10)
knn2.fit(x_train, y_train)

class_report2 = get_metrics_bin(knn2, x_train,y_train)
class_report2


    The accuracy for our model is 0.7143
    The True Positive Rate is 0.506, The False Positive Rate is 0.143,
    The True Negative Rate is 0.857, and the False Negative Rate is 0.494
    


Unnamed: 0,precision,recall,f1-score,support
0,0.717314,0.85654,0.780769,237.0
1,0.706897,0.506173,0.589928,162.0
accuracy,0.714286,0.714286,0.714286,0.714286
macro avg,0.712106,0.681356,0.685349,399.0
weighted avg,0.713085,0.714286,0.703285,399.0


In [84]:
#5. Run through setps 2-4 setting k to 20
knn3 = KNeighborsClassifier(n_neighbors=20)
knn3.fit(x_train, y_train)

class_report3 = get_metrics_bin(knn3, x_train,y_train)
class_report3


    The accuracy for our model is 0.6892
    The True Positive Rate is 0.488, The False Positive Rate is 0.173,
    The True Negative Rate is 0.827, and the False Negative Rate is 0.512
    


Unnamed: 0,precision,recall,f1-score,support
0,0.702509,0.827004,0.75969,237.0
1,0.658333,0.487654,0.560284,162.0
accuracy,0.689223,0.689223,0.689223,0.689223
macro avg,0.680421,0.657329,0.659987,399.0
weighted avg,0.684573,0.689223,0.678728,399.0


In [69]:
#6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
#The higher the K after 5 the worse the metrics. Default performs better because the number of observations is not large for this dataset 

In [85]:
#7. Which model performs best on our out-of-sample data from validate?
class_report1_val = get_metrics_bin(knn, x_validate,y_validate)
class_report1_val


    The accuracy for our model is 0.6977
    The True Positive Rate is 0.643, The False Positive Rate is 0.265,
    The True Negative Rate is 0.735, and the False Negative Rate is 0.357
    


Unnamed: 0,precision,recall,f1-score,support
0,0.75,0.735294,0.742574,102.0
1,0.625,0.642857,0.633803,70.0
accuracy,0.697674,0.697674,0.697674,0.697674
macro avg,0.6875,0.689076,0.688189,172.0
weighted avg,0.699128,0.697674,0.698307,172.0


In [86]:
class_report2_val = get_metrics_bin(knn2, x_validate,y_validate)
class_report2_val


    The accuracy for our model is 0.7326
    The True Positive Rate is 0.529, The False Positive Rate is 0.127,
    The True Negative Rate is 0.873, and the False Negative Rate is 0.471
    


Unnamed: 0,precision,recall,f1-score,support
0,0.729508,0.872549,0.794643,102.0
1,0.74,0.528571,0.616667,70.0
accuracy,0.732558,0.732558,0.732558,0.732558
macro avg,0.734754,0.70056,0.705655,172.0
weighted avg,0.733778,0.732558,0.722211,172.0


In [87]:
class_report3_val = get_metrics_bin(knn3, x_validate,y_validate)
class_report3_val


    The accuracy for our model is 0.75
    The True Positive Rate is 0.629, The False Positive Rate is 0.167,
    The True Negative Rate is 0.833, and the False Negative Rate is 0.371
    


Unnamed: 0,precision,recall,f1-score,support
0,0.765766,0.833333,0.798122,102.0
1,0.721311,0.628571,0.671756,70.0
accuracy,0.75,0.75,0.75,0.75
macro avg,0.743539,0.730952,0.734939,172.0
weighted avg,0.747674,0.75,0.746694,172.0


In [None]:
#Model 3 with K=20 performs better on the validate set....

In [96]:
#1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?
from sklearn.linear_model import LogisticRegression
print(baseline_accuracy)
x_train.head(1)

0.5939849624060151


Unnamed: 0,age,sibsp,parch,fare,alone,is_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,baseline_prediction
652,21.0,0,0,8.4333,1,True,0,0,1,0,0,1,0


In [100]:
x_train_log = x_train[['age','fare','class_First','class_Second','class_Third']]
logit = LogisticRegression(C=1, class_weight={0:1, 1:99},
                           random_state=123)
logit.fit(x_train_log, y_train)

y_pred_log = logit.predict(x_train_log)
confusion_matrix(y_train,y_pred_log)

array([[  0, 237],
       [  0, 162]])

In [99]:
class_report_log1 = get_metrics_bin(logit, x_train_log,y_train)
class_report_log1#worse than baseline


    The accuracy for our model is 0.406
    The True Positive Rate is 1.0, The False Positive Rate is 1.0,
    The True Negative Rate is 0.0, and the False Negative Rate is 0.0
    


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,237.0
1,0.406015,1.0,0.57754,162.0
accuracy,0.406015,0.406015,0.406015,0.406015
macro avg,0.203008,0.5,0.28877,399.0
weighted avg,0.164848,0.406015,0.23449,399.0


In [110]:
#2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.
x_train_log2 = x_train[['age','is_male','fare','class_First','class_Second','class_Third']]

logit2 = logit.fit(x_train_log2, y_train)

y_pred_log2 = logit2.predict(x_train_log2)
confusion_matrix(y_train,y_pred_log2)

array([[  0, 237],
       [  0, 162]])

In [102]:
class_report_log2 = get_metrics_bin(logit2, x_train_log2,y_train)
class_report_log2


    The accuracy for our model is 0.406
    The True Positive Rate is 1.0, The False Positive Rate is 1.0,
    The True Negative Rate is 0.0, and the False Negative Rate is 0.0
    


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,237.0
1,0.406015,1.0,0.57754,162.0
accuracy,0.406015,0.406015,0.406015,0.406015
macro avg,0.203008,0.5,0.28877,399.0
weighted avg,0.164848,0.406015,0.23449,399.0


In [115]:
#3. Try out other combinations of features and models.
x_train_log3 = x_train[['age','is_male','fare','class_First','class_Second',
                        'class_Third','embark_town_Cherbourg','embark_town_Queenstown','embark_town_Southampton']]
x_train_log3.columns.tolist()

['age',
 'is_male',
 'fare',
 'class_First',
 'class_Second',
 'class_Third',
 'embark_town_Cherbourg',
 'embark_town_Queenstown',
 'embark_town_Southampton']

In [118]:
x_train_log3.columns=['age',
 'is_male',
 'fare',
 'class_First',
 'class_Second',
 'class_Third',
 'embarktown_Cherbourg',
 'embarktown_Queenstown',
 'embarktown_Southampton']
x_train_log3.columns

Index(['age', 'is_male', 'fare', 'class_First', 'class_Second', 'class_Third',
       'embarktown_Cherbourg', 'embarktown_Queenstown',
       'embarktown_Southampton'],
      dtype='object')

In [122]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.23.2.


In [119]:
logit3 = logit.fit(x_train_log3, y_train)

#y_pred_log3 = logit3.predict(x_train)
#confusion_matrix(y_train,y_pred_log3)

AttributeError: 'str' object has no attribute 'decode'

In [None]:
#4. Use you best 3 models to predict and evaluate on your validate sample.

#5. Choose you best model from the validation performation, and evaluate it on the test dataset.
# How do the performance metrics compare to validate? to train?