In [105]:
import numpy as np
import pandas as pd
from pydataset import data

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import acquire
import prepare


from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix

In [80]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)

Using cached csv


In [100]:
df= df.drop(columns = ['sex', 'embark_town'])
df.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,0,1
3,1,1,1,0,53.1,0,0,0,1
4,0,3,0,0,8.05,1,1,0,1


In [63]:
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [64]:
train, validate, test = train_validate_test_split(df, target='survived', seed = 123)

In [65]:
train.shape

(498, 9)

In [66]:
validate.shape

(214, 9)

In [67]:
test.shape

(179, 9)

In [68]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [69]:
X_train.shape

(498, 8)

In [97]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

## QUESTION 1 = BASELINE

In [99]:
# The mode(MOST OCCURING) is a great baseline
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

Baseline accuracy: 0.62


## QUESTION 2: FIT - TRANSFORMATION

In [114]:
# make the model

tree1 = DecisionTreeClassifier(max_depth=1, random_state=123)

# Fit the model (on train and only train)
tree1 = tree1.fit(X_train, y_train)

# Use the model
# We'll evaluate the model's performance on train, first
y_predictions = tree1.predict(X_train)


## QUESTION 3: EVALUATE PERFORMANCE

In [102]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree1.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.80


In [106]:
# confusion matrix - actual on left predicted on top
pd.DataFrame(confusion_matrix(y_train, y_predictions))

Unnamed: 0,0,1
0,265,42
1,58,133


In [107]:
# create classification report
print(classification_report(y_train, y_predictions))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



In [108]:
# Produce the classification report on the actual y values and this model's predicted

report = classification_report(y_train, y_predictions, output_dict = True)
print("Tree of 1 depth")
pd.DataFrame(report)

Tree of 1 depth


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.820433,0.76,0.799197,0.790217,0.797255
recall,0.863192,0.696335,0.799197,0.779764,0.799197
f1-score,0.84127,0.726776,0.799197,0.784023,0.797358
support,307.0,191.0,0.799197,498.0,498.0


## QUESTION 4: EXTRA RE-CALCULATED METRICS

In [134]:
# NOT SURVIVED IS OUR POSITIVE CASE '0'

TP = 265
FP = 58
FN = 42
TN = 133
ALL = TP + FP + FN + TN

accuracy = (TP + TN) / ALL
print(f'Accuracy: {accuracy}')

true_positive_rate = TP/(TP + TN)
print(f'True Positive Rate: {true_positive_rate}')

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")


Accuracy: 0.7991967871485943
True Positive Rate: 0.6658291457286433
False Positive Rate: 0.3036649214659686
True Negative Rate: 0.6963350785340314
False Negative Rate: 0.13680781758957655
Precision: 0.8204334365325078
Recall: 0.8631921824104235
F1 Score: 0.8412698412698413
Support (0): 307
Support (1): 191


## QUESTION 5: FINDING OPTIMAL MAX_DEPTH

In [136]:
for i in range (2,21):
    # Make the model
    tree = DecisionTreeClassifier (max_depth = i, random_state = 123)
    
    # Fit the model ( on train only)
    tree = tree.fit(X_train, y_train)
    
    # Use the model
    # We'll  evaluate the model's performance on train, first
    y_predictions = tree.predict(X_train)
    
    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict = True)
    print(f'Tree of {i} depth')
    print(pd.DataFrame(report))
    print()

Tree of 2 depth
                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Tree of 3 depth
                    0           1  accuracy   macro avg  weighted avg
precision    0.828829    0.812121  0.823293    0.820475      0.822421
recall       0.899023    0.701571  0.823293    0.800297      0.823293
f1-score     0.862500    0.752809  0.823293    0.807654      0.820430
support    307.000000  191.000000  0.823293  498.000000    498.000000

Tree of 4 depth
                    0           1  accuracy   macro avg  weighted avg
precision    0.829341    0.817073  0.825301    0.823207      0.824636
recall       0.902280    0.701571  0.825301    0.801925      0.825301
f1-score     0.864275    0.754930  0.825

## QUESTIONS 6: FURTHER EVALUATION

In [123]:
# max depth of 14+ produced the highest accuracy

In [129]:
metrics = []

for i in range (2, 25):
    tree = DecisionTreeClassifier(max_depth = i, random_state=123)
    
    # Fit the model
    tree = tree.fit(X_train, y_train)
    
    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)
    
    output = {
        'max_depth':i,
        'train_accuracy': in_sample_accuracy,
        'validate_accuracy': out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df['difference'] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,2,0.799197,0.761682,0.037515
1,3,0.823293,0.785047,0.038246
2,4,0.825301,0.785047,0.040254
3,5,0.837349,0.757009,0.08034
4,6,0.859438,0.766355,0.093083
5,7,0.863454,0.761682,0.101772
6,8,0.89759,0.757009,0.140581
7,9,0.909639,0.761682,0.147956
8,10,0.923695,0.766355,0.15734
9,11,0.931727,0.761682,0.170045


In [130]:
df[df.difference <= 0.10].sort_values(by=['validate_accuracy', 'difference'], ascending =[False,True])

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
1,3,0.823293,0.785047,0.038246
2,4,0.825301,0.785047,0.040254
4,6,0.859438,0.766355,0.093083
0,2,0.799197,0.761682,0.037515
3,5,0.837349,0.757009,0.08034


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 891 non-null    int64  
 1   pclass                   891 non-null    int64  
 2   sex                      891 non-null    object 
 3   sibsp                    891 non-null    int64  
 4   parch                    891 non-null    int64  
 5   fare                     891 non-null    float64
 6   embark_town              891 non-null    object 
 7   alone                    891 non-null    int64  
 8   sex_male                 891 non-null    uint8  
 9   embark_town_Queenstown   891 non-null    uint8  
 10  embark_town_Southampton  891 non-null    uint8  
dtypes: float64(1), int64(5), object(2), uint8(3)
memory usage: 65.3+ KB


In [17]:
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [18]:
df.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [19]:
train_val, test = train_test_split(df,
                                  train_size = 0.8,
                                  random_state=123,
                                  stratify=df.survived)

In [20]:
train, val = train_test_split(train_val,
                                  train_size = 0.7,
                                  random_state=123,
                                  stratify=train_val.survived)

In [21]:
train.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,0,1,male,0,0,40.125,Cherbourg,1,1,0,0
165,1,3,male,0,2,20.525,Southampton,0,1,0,1
50,0,3,male,4,1,39.6875,Southampton,0,1,0,1
259,1,2,female,0,1,26.0,Southampton,0,0,0,1
306,1,1,female,0,0,110.8833,Cherbourg,1,0,0,0


In [22]:
x_train = train.drop(columns=['survived'])

y_train = train[['survived']]

In [23]:
type(y_train)

pandas.core.frame.DataFrame

In [24]:
type(train['survived'])

pandas.core.series.Series

In [32]:
y_train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [28]:
y_train['baseline'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['baseline'] = 0


In [29]:
baseline_score = accuracy_score(y_train.survived, y_train.baseline)

In [30]:
baseline_score

0.6164658634538153

## Model creation
step one: Create the thing step two: fit the thing step three: use the thing

In [33]:
#create the thing

In [34]:
selected_feats = ['sex_male', 'pclass']

In [35]:
# create the classifier object

clf = RandomForestClassifier(random_state = 123)

In [38]:
# fit the thing:
# invoke the fit method, don't assign it to anything
# this will train our model

selected_feats

['sex_male', 'pclass']

In [39]:
#use the thing

clf.fit(x_train[selected_feats], y_train.survived)

RandomForestClassifier(random_state=123)

In [40]:
y_train['y_pred'] = clf.predict(x_train[selected_feats])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['y_pred'] = clf.predict(x_train[selected_feats])


In [41]:
y_train.head()

Unnamed: 0,survived,baseline,y_pred
583,0,0,0
165,1,0,0
50,0,0,0
259,1,0,1
306,1,0,1
