In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import acquire
import prepare
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
titanic = acquire.get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
titanic = prepare.prep_titanic(titanic)
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [4]:
train, validate, test = prepare.my_train_test_split(titanic, 'survived')

In [5]:
train.shape, validate.shape, test.shape

((534, 12), (178, 12), (179, 12))

In [6]:
[dataset.drop(columns=['sex', 'embark_town', 'passenger_id'], inplace=True) for dataset in [train, validate, test]]

[None, None, None]

In [7]:
train.shape, validate.shape, test.shape

((534, 9), (178, 9), (179, 9))

1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [9]:
X_train = train.drop(columns='survived')
y_train = train.survived

In [10]:
train.survived.value_counts()

0    329
1    205
Name: survived, dtype: int64

In [11]:
# Those who didn't survive is my baseline prediction.
(y_train == 0).mean()
print(f'Our baseline accuracy for nonsurvival in all cases on the Titanic is {(y_train == 0).mean():.2}')


Our baseline accuracy for nonsurvival in all cases on the Titanic is 0.62


2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [12]:
clf = DecisionTreeClassifier()
clf

DecisionTreeClassifier()

3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [13]:
clf.fit(X_train, y_train)
print(f'training score: {clf.score(X_train, y_train):.2%}')


training score: 94.57%


In [14]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 1, 0, 1, 0])

In [15]:
y_preds = pd.DataFrame({'y_act': y_train, 'baseline': 0, 'model_1': clf.predict(X_train)})
y_preds

Unnamed: 0,y_act,baseline,model_1
455,1,0,0
380,1,0,1
492,0,0,0
55,1,0,1
243,0,0,0
...,...,...,...
695,0,0,0
128,1,0,1
636,0,0,0
222,0,0,0


In [16]:
confusion_matrix(y_train, y_pred)

array([[327,   2],
       [ 27, 178]])

4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [17]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.95


In [18]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,327,2
1,27,178


In [19]:
pd.DataFrame(classification_report(y_preds.y_act, y_preds.model_1, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.923729,0.988889,0.945693,0.956309,0.948743
recall,0.993921,0.868293,0.945693,0.931107,0.945693
f1-score,0.95754,0.924675,0.945693,0.941108,0.944924
support,329.0,205.0,0.945693,534.0,534.0


In [20]:
conf = confusion_matrix(y_preds.y_act, y_preds.model_1)
conf

array([[327,   2],
       [ 27, 178]])

In [21]:
rubric_df = pd.DataFrame({
    'pred_death': ['True Negative', 'False Negative'],
    'pred_survive':['False Positive','True Positive']
    }, index=['actual_death', 'actual_survive'])
rubric_df

Unnamed: 0,pred_death,pred_survive
actual_death,True Negative,False Positive
actual_survive,False Negative,True Positive


In [22]:
rubric_df + ': ' + conf.astype(str)

Unnamed: 0,pred_death,pred_survive
actual_death,True Negative: 327,False Positive: 2
actual_survive,False Negative: 27,True Positive: 178


In [23]:
TN = conf[0,0] 
FP = conf[0,1]
FN = conf[1,0]
TP = conf[1,1]

In [24]:
TN, FP, FN, TP

(327, 2, 27, 178)

In [25]:
# accuracy
ALL = TP + FP + FN + TN
acc = (TP + TN) / ALL
# true positive rate, also recall
TPR = recall = TP/ (TP + FN)
# false positive rate
FPR = FP / (FP + TN)
# true negative rate
TNR = TN / (TN + FP)
# false negative rate
FNR = FN / (FN + TP)
# precision
precision = TP / (TP + FP)
# f1
f1_score = 2 * (precision*recall) / (precision+recall)
# support
support_pos = TP + FN
support_neg = FP + TN

5. Run through steps 2-4 using a different max_depth value.

In [26]:
clf2 = DecisionTreeClassifier(max_depth=3)
clf2

DecisionTreeClassifier(max_depth=3)

In [27]:
clf2.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3)

In [28]:
print(f'training score: {clf2.score(X_train, y_train):.2%}')


training score: 81.84%


In [29]:
y_preds['model2'] = clf2.predict(X_train)
y_preds[0:5]

Unnamed: 0,y_act,baseline,model_1,model2
455,1,0,0,0
380,1,0,1,1
492,0,0,0,0
55,1,0,1,0
243,0,0,0,0


In [30]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [31]:
X_val, y_val = validate.drop(columns='survived'), validate.survived

In [32]:
clf.score(X_train, y_train)

0.9456928838951311

In [33]:
X_train.shape, X_val.shape

((534, 8), (178, 8))

In [34]:
clf.score(X_val, y_val)

0.7415730337078652

In [35]:
clf2.score(X_train, y_train)

0.8183520599250936

In [36]:
clf2.score(X_val, y_val)

0.7921348314606742

In [37]:
models = []
model_scores = []
for i in range(2,8):
    model = DecisionTreeClassifier(max_depth=i)
    model.fit(X_train, y_train)
    models.append(model)
    model_scores.append(model.score(X_train, y_train))

In [40]:
model_scores

[0.7921348314606742,
 0.8183520599250936,
 0.8202247191011236,
 0.8333333333333334,
 0.8595505617977528,
 0.8651685393258427]

6. Which model performs better on your in-sample data?

The second model has a better training score/ validate score ratio but the first model has better precision on detecting those who won't survive by a slight margin.  They both have the same accuracy though. 

7. Which model performs best on your out-of-sample data, the validate set?

The second model 

In [10]:
telco = acquire.get_telco_data()
telco.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [11]:
new_telco = prepare.prep_telco(telco)
new_telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,1,0,0,1,0,0,0,0,0,1
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,0,0,1,0,0,0,0,0,0,1
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,0,0,0,0,0,1,0,0,1,0
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,1,0,1,0,0,1,0,0,1,0
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,1,0,0,0,0,1,0,0,0,1


In [15]:
new_telco.gender_encoded

0       1
1       0
2       0
3       0
4       1
       ..
7038    1
7039    0
7040    0
7041    0
7042    0
Name: gender_encoded, Length: 7043, dtype: int64

In [4]:
train, validate, test = prepare.my_train_test_split(telco, 'churn')

In [5]:
train.shape, validate.shape, test.shape

((4225, 42), (1409, 42), (1409, 42))

In [6]:
train.churn.value_counts()

No     3104
Yes    1121
Name: churn, dtype: int64

In [7]:
train['baseline_prediction'] = 'No'

In [8]:
train.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,fiber_optic,internet_service_type_None,auto_payment,electronic_payment,mailed_check,baseline_prediction
5911,8319-QBEHW,Male,0,No,Yes,26,No,No phone service,No,Yes,...,0,0,1,0,0,0,0,0,0,No
2479,3547-LQRIK,Female,0,Yes,No,47,Yes,Yes,No internet service,No internet service,...,1,0,1,0,0,1,0,1,0,No
5889,8280-MQRQN,Female,0,No,No,1,Yes,No,No,No,...,0,0,0,0,0,0,0,0,1,No
6087,8626-PTQGE,Male,0,No,No,69,No,No phone service,Yes,Yes,...,0,1,0,1,0,0,0,0,0,No
785,1142-IHLOO,Female,0,No,No,51,Yes,No,No,No,...,0,1,1,0,1,0,1,0,0,No


In [9]:
baseline_acc = (train.baseline_prediction == train.churn).mean()

print(f'baseline accuracy: {baseline_acc:.2%}')

baseline accuracy: 73.47%


In [10]:
clf = DecisionTreeClassifier()
clf

DecisionTreeClassifier()

In [11]:
X_cols = ['fiber_optic', 'online_backup', 'multiple_lines', 'tenure', 'senior_citizen']
y_col = 'churn'

In [12]:
X_train, y_train = train[X_cols], train[y_col]

In [15]:
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

In [17]:
clf.fit(X_train, y_train)
print(f'training score: {clf.score(X_train, y_train):.2%}')
print(f'validate score: {clf.score(X_validate, y_validate):.2%}')

ValueError: could not convert string to float: 'Yes'