In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from acquire import get_titanic_data

from pydataset import data
from prepare import tvt

# Exercises
## In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluating, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [2]:
# Load the titanic data through the project's acquire helper and preview the first rows.
titanic = get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
def dropped(titanic,columns={'sepal_length', 'petal_width'}):
    '''
    dropped returns a new dataframe without the columns named in `columns`;
    the dataframe passed in is left unchanged.

    titanic: dataframe to remove columns from
    columns: collection of column names to remove. The default names are not
             titanic columns, so every call in this notebook passes `columns` explicitly.
    return: dataframe without the listed columns
    '''
    return titanic.drop(columns=columns)

def prep_titanic(titanic, age_fill=30, embarked_fill='S'):
    '''
    prep_titanic cleans and encodes the raw titanic dataframe for modeling.
    It will:
    - drop the unused columns ('deck', 'embark_town', 'pclass', 'passenger_id')
    - fill null values in 'embarked' (with embarked_fill) and 'age' (with age_fill)
    - cast 'age' to int (fractional ages are truncated)
    - encode 'sex', 'embarked' and 'class' as integer codes in new '*_encoded' columns
    - drop the original 'sex', 'embarked' and 'class' columns

    titanic: raw titanic dataframe; it is not modified
    age_fill: number used for missing ages (default 30, the value the notebook
              later describes as the average age)
    embarked_fill: port code used for missing 'embarked' values (default 'S')

    return: a new, cleaned and encoded dataframe
    '''
    titanic = titanic.drop(columns=['deck', 'embark_town', 'pclass', 'passenger_id'])
    titanic['embarked'] = titanic['embarked'].fillna(value=embarked_fill)
    # fill with a number (not a string) so the column stays numeric before the int cast
    titanic['age'] = titanic['age'].fillna(value=age_fill).astype(int)
    # integer label encoding (not one-hot dummies)
    titanic['sex_encoded'] = titanic['sex'].map({'male': 1, 'female': 0})
    titanic['embarked_encoded'] = titanic['embarked'].map({'S': 1, 'C': 0, 'Q':2})
    titanic['class_encoded'] = titanic['class'].map({'Second': 1, 'First': 0, 'Third':2})
    return titanic.drop(columns=['sex', 'embarked', 'class'])

def print_and_label(df):
    '''
    print_and_label takes a confusion matrix dataframe and prints the metrics derived from it.

    df: confusion matrix whose columns are '0_actual' and '1_actual' and whose index
        labels are '0_predicted' and '1_predicted'. Class 0 (did not survive) is
        treated as the positive class.
    prints: accuracy, true positive rate, false positive rate, true negative rate,
            false negative rate, precision, recall, f1-score, and support.
    return: None (the report is printed, not returned)
    '''
    # rows are the predicted labels, columns are the actual labels
    TP = df.loc['0_predicted', '0_actual']
    FP = df.loc['0_predicted', '1_actual']
    FN = df.loc['1_predicted', '0_actual']
    TN = df.loc['1_predicted', '1_actual']

    accuracy = (TP + TN) / (TP + FP + TN + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    # true positive rate is the share of actual positives that were found (same as recall)
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)
    TNR = TN / (FP + TN)
    FNR = FN / (FN + TP)
    support_0 = TP + FN
    support_1 = FP + TN
    f1 = 2 * ((precision * recall) / (precision + recall))

    # `:2%` is a minimum field width, so percentages print with the default six decimals
    report = '\n\n'.join([
        f'Accuracy: \n{accuracy:2%}',
        f'True Positive Rate: \n{TPR:2%}',
        f'False Positive Rate: \n{FPR:2%}',
        f'True Negative Rate: \n{TNR:2%}',
        f'False Negative Rate: \n{FNR:2%}',
        f'Precision: \n{precision:2%}',
        f'Recall: \n{recall:2%}',
        f'F1 Score: \n{f1:2%}',
        f'Support for Did Not Survive: \n{support_0}',
        f'Support for Did Survive: \n{support_1}',
    ])
    print(report)

In [4]:
# Clean and encode the raw frame. This rebinds `titanic`, so running this cell a second
# time without reloading fails: prep_titanic would try to drop columns that are already gone.
titanic = prep_titanic(titanic)
titanic.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_encoded,embarked_encoded,class_encoded
0,0,22,1,0,7.25,0,1,1,2
1,1,38,1,0,71.2833,0,0,0,0
2,1,26,0,0,7.925,1,0,1,2
3,1,35,1,0,53.1,0,0,1,0
4,0,35,0,0,8.05,1,1,1,2


In [5]:
# Establish the baseline: the accuracy of always predicting the most frequent `survived` class.
# NOTE(review): this is computed on the full dataset, before the train/validate/test split.
baseline_accuracy = (titanic['survived'].value_counts().idxmax() == titanic.survived).mean()
baseline_accuracy

0.6161616161616161

## Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [6]:
# Split once, then build the model 1 inputs by removing the target and every feature
# this model does not use (which leaves age, fare and class_encoded).
train, validate, test = tvt(titanic, ['survived'])

X_train1, X_validate1, X_test1 = (
    subset.drop(columns=['survived', 'parch', 'sex_encoded', 'embarked_encoded', 'alone', 'sibsp'])
    for subset in (train, validate, test)
)
y_train1, y_validate1, y_test1 = train.survived, validate.survived, test.survived

In [7]:
# Sanity check: only the model 1 features should remain.
X_train1.head()

Unnamed: 0,age,fare,class_encoded
72,21,73.5,1
682,20,9.225,2
399,28,12.65,1
756,28,7.7958,2
394,24,16.7,2


In [8]:
# Model 1: logistic regression with scikit-learn's default hyperparameters, fit on train only.
logit1 = LogisticRegression()
logit1.fit(X_train1, y_train1)

In [9]:
# Hard class predictions on the training set; preview the first five.
y_pred1 = logit1.predict(X_train1)
y_pred1[:5]

array([1, 0, 0, 0, 0])

In [10]:
# Class probabilities on the training set: one row per passenger, with columns in the
# order of `logit1.classes_`.
y_pred_prob1 = logit1.predict_proba(X_train1)
y_pred_prob1[:5]

array([[0.4608449 , 0.5391551 ],
       [0.72074835, 0.27925165],
       [0.53746511, 0.46253489],
       [0.76877933, 0.23122067],
       [0.74327439, 0.25672561]])

In [11]:
# .score() is the mean accuracy on the given split.
model_1_train = logit1.score(X_train1,y_train1)
model_1_validate = logit1.score(X_validate1,y_validate1)
# train minus validate: a negative value means the model scored higher on validate
model_1_overfit = model_1_train-model_1_validate
# NOTE(review): the gain over baseline is measured with the training accuracy
model_1_beat_baseline = model_1_train - baseline_accuracy

In [12]:
# Model 1 report. `:2%` is a minimum field width, not a precision, so the values
# print with the default six decimals.
print('\n\n'.join([
    'Accuracy Scores:',
    f'Baseline Accuracy:\n{baseline_accuracy:2%}',
    f'Model 1 Training Set:\n{model_1_train:2%}',
    f'Model 1 Validation Set:\n{model_1_validate:2%}',
    f'Model 1 Overfit:\n{model_1_overfit:2%}',
    f'Model 1 Beat Baseline Accuracy by:\n{model_1_beat_baseline:2%}',
]))

Accuracy Scores:

Baseline Accuracy:
61.616162%

Model 1 Training Set:
69.477912%

Model 1 Validation Set:
70.560748%

Model 1 Overfit:
-1.082836%

Model 1 Beat Baseline Accuracy by:
7.861750%


## Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [13]:
# Reload the raw data: prep_titanic cannot be applied to the already-prepared frame.
# NOTE(review): the prepared frame from model 1 could presumably be reused instead.
titanic = get_titanic_data()

In [14]:
# Same cleaning and encoding as for model 1.
titanic = prep_titanic(titanic)

In [15]:
# Model 2 inputs: keep sex_encoded in addition to model 1's features
# (leaves age, fare, sex_encoded and class_encoded).
train, validate, test = tvt(titanic, ['survived'])

X_train2, X_validate2, X_test2 = (
    subset.drop(columns=['survived', 'parch', 'embarked_encoded', 'alone', 'sibsp'])
    for subset in (train, validate, test)
)
y_train2, y_validate2, y_test2 = train.survived, validate.survived, test.survived

In [16]:
# Model 2: logistic regression with default hyperparameters, fit on train only.
logit2 = LogisticRegression()
logit2.fit(X_train2, y_train2)

y_pred2 = logit2.predict(X_train2)
# NOTE(review): a bare expression in the middle of a cell is not rendered; only the
# cell's final expression (the probabilities below) is displayed.
y_pred2[:5]

y_pred_prob2 = logit2.predict_proba(X_train2)
y_pred_prob2[:5]

array([[0.71124139, 0.28875861],
       [0.88214019, 0.11785981],
       [0.19781408, 0.80218592],
       [0.90268428, 0.09731572],
       [0.42279187, 0.57720813]])

In [17]:
# Mean accuracy of model 2 on train and validate, plus the derived gaps.
model_2_train, model_2_validate = (
    logit2.score(X_train2, y_train2),
    logit2.score(X_validate2, y_validate2),
)
model_2_overfit = model_2_train - model_2_validate            # negative: validate scored higher
model_2_beat_baseline = model_2_train - baseline_accuracy     # gain measured on train

# `:2%` is a minimum field width, so values print with the default six decimals
print('\n\n'.join([
    'Accuracy Scores:',
    f'Baseline Accuracy:\n{baseline_accuracy:2%}',
    f'Model 2 Training Set:\n{model_2_train:2%}',
    f'Model 2 Validation Set:\n{model_2_validate:2%}',
    f'Model 2 Overfit:\n{model_2_overfit:2%}',
    f'Model 2 Beat Baseline Accuracy by:\n{model_2_beat_baseline:2%}',
]))

Accuracy Scores:

Baseline Accuracy:
61.616162%

Model 2 Training Set:
78.714859%

Model 2 Validation Set:
80.373832%

Model 2 Overfit:
-1.658972%

Model 2 Beat Baseline Accuracy by:
17.098698%


## Try out other combinations of features and models.

In [18]:
# Reload the raw data for model 3 (prep_titanic needs the original columns).
titanic = get_titanic_data()

In [19]:
# Same cleaning and encoding as for the earlier models.
titanic = prep_titanic(titanic)

In [20]:
# Model 3 inputs: everything except the target, alone and sibsp
# (leaves age, parch, fare, sex_encoded, embarked_encoded and class_encoded).
train, validate, test = tvt(titanic, ['survived'])

X_train3, X_validate3, X_test3 = (
    subset.drop(columns=['survived', 'alone', 'sibsp'])
    for subset in (train, validate, test)
)
y_train3, y_validate3, y_test3 = train.survived, validate.survived, test.survived

In [21]:
# Model 3: logistic regression with default hyperparameters, fit on train only.
logit3 = LogisticRegression()
logit3.fit(X_train3, y_train3)

In [22]:
# Hard class predictions on the training set; preview the first five.
y_pred3 = logit3.predict(X_train3)
y_pred3[:5]

array([0, 0, 1, 0, 0])

In [23]:
# Class probabilities on the training set, with columns in the order of `logit3.classes_`.
y_pred_prob3 = logit3.predict_proba(X_train3)
y_pred_prob3[:5]

array([[0.68378754, 0.31621246],
       [0.87271875, 0.12728125],
       [0.17961923, 0.82038077],
       [0.89585135, 0.10414865],
       [0.50043772, 0.49956228]])

In [24]:
# Mean accuracy of model 3 on train and validate, plus the derived gaps.
model_3_train, model_3_validate = (
    logit3.score(X_train3, y_train3),
    logit3.score(X_validate3, y_validate3),
)
model_3_overfit = model_3_train - model_3_validate            # negative: validate scored higher
model_3_beat_baseline = model_3_train - baseline_accuracy     # gain measured on train

# `:3%` only sets a minimum field width of 3; values still print with six decimals
print('\n\n'.join([
    'Accuracy Scores:',
    f'Baseline Accuracy:\n{baseline_accuracy:3%}',
    f'Model 3 Training Set:\n{model_3_train:3%}',
    f'Model 3 Validation Set:\n{model_3_validate:3%}',
    f'Model 3 Overfit:\n{model_3_overfit:3%}',
    f'Model 3 Beat Baseline Accuracy by:\n{model_3_beat_baseline:3%}',
]))

Accuracy Scores:

Baseline Accuracy:
61.616162%

Model 3 Training Set:
78.313253%

Model 3 Validation Set:
80.841121%

Model 3 Overfit:
-2.527868%

Model 3 Beat Baseline Accuracy by:
16.697091%


In [25]:
# Reload the raw data for model 4 (prep_titanic needs the original columns).
titanic = get_titanic_data()

In [26]:
# Same cleaning and encoding as for the earlier models.
titanic = prep_titanic(titanic)

In [27]:
# Model 4 inputs: model 2's features plus sibsp
# (leaves age, sibsp, fare, sex_encoded and class_encoded).
train, validate, test = tvt(titanic, ['survived'])

X_train4, X_validate4, X_test4 = (
    subset.drop(columns=['survived', 'parch', 'embarked_encoded', 'alone'])
    for subset in (train, validate, test)
)
y_train4, y_validate4, y_test4 = train.survived, validate.survived, test.survived

In [28]:
# Model 4: logistic regression with default hyperparameters, fit on train only.
logit4 = LogisticRegression()
logit4.fit(X_train4, y_train4)

In [29]:
# Hard class predictions on the training set; preview the first five.
y_pred4 = logit4.predict(X_train4)
y_pred4[:5]

array([0, 0, 1, 0, 1])

In [30]:
# Class probabilities on the training set, with columns in the order of `logit4.classes_`.
y_pred_prob4 = logit4.predict_proba(X_train4)
y_pred_prob4[:5]

array([[0.65169918, 0.34830082],
       [0.85889123, 0.14110877],
       [0.16278493, 0.83721507],
       [0.88898691, 0.11101309],
       [0.35077326, 0.64922674]])

In [31]:
# Mean accuracy of model 4 on train and validate, plus the derived gaps.
model_4_train, model_4_validate = (
    logit4.score(X_train4, y_train4),
    logit4.score(X_validate4, y_validate4),
)
model_4_overfit = model_4_train - model_4_validate            # negative: validate scored higher
model_4_beat_baseline = model_4_train - baseline_accuracy     # gain measured on train

# `:4%` only sets a minimum field width of 4; values still print with six decimals
print('\n\n'.join([
    'Accuracy Scores:',
    f'Baseline Accuracy:\n{baseline_accuracy:4%}',
    f'Model 4 Training Set:\n{model_4_train:4%}',
    f'Model 4 Validation Set:\n{model_4_validate:4%}',
    f'Model 4 Overfit:\n{model_4_overfit:4%}',
    f'Model 4 Beat Baseline Accuracy by:\n{model_4_beat_baseline:4%}',
]))

Accuracy Scores:

Baseline Accuracy:
61.616162%

Model 4 Training Set:
79.919679%

Model 4 Validation Set:
80.841121%

Model 4 Overfit:
-0.921443%

Model 4 Beat Baseline Accuracy by:
18.303517%


In [32]:
# Reload the raw data for model 5 (prep_titanic needs the original columns).
titanic = get_titanic_data()

In [33]:
# Same cleaning and encoding as for the earlier models.
titanic = prep_titanic(titanic)

In [34]:
# Model 5 inputs: every prepared column except the target.
train, validate, test = tvt(titanic, ['survived'])

X_train5, X_validate5, X_test5 = (
    subset.drop(columns=['survived'])
    for subset in (train, validate, test)
)
y_train5, y_validate5, y_test5 = train.survived, validate.survived, test.survived

In [35]:
# Model 5: logistic regression with default hyperparameters, fit on train only.
logit5 = LogisticRegression()
logit5.fit(X_train5, y_train5)

In [36]:
# Hard class predictions on the training set; preview the first five.
y_pred5 = logit5.predict(X_train5)
y_pred5[:5]

array([0, 0, 1, 0, 1])

In [37]:
# Class probabilities on the training set, with columns in the order of `logit5.classes_`.
y_pred_prob5 = logit5.predict_proba(X_train5)
y_pred_prob5[:5]

array([[0.66588286, 0.33411714],
       [0.865785  , 0.134215  ],
       [0.17964589, 0.82035411],
       [0.89515206, 0.10484794],
       [0.32515584, 0.67484416]])

In [38]:
# Mean accuracy of model 5 on train and validate, plus the derived gaps.
model_5_train, model_5_validate = (
    logit5.score(X_train5, y_train5),
    logit5.score(X_validate5, y_validate5),
)
model_5_overfit = model_5_train - model_5_validate            # positive: train scored higher
model_5_beat_baseline = model_5_train - baseline_accuracy     # gain measured on train

# `:5%` only sets a minimum field width of 5; values still print with six decimals
print('\n\n'.join([
    'Accuracy Scores:',
    f'Baseline Accuracy:\n{baseline_accuracy:5%}',
    f'Model 5 Training Set:\n{model_5_train:5%}',
    f'Model 5 Validation Set:\n{model_5_validate:5%}',
    f'Model 5 Overfit:\n{model_5_overfit:5%}',
    f'Model 5 Beat Baseline Accuracy by:\n{model_5_beat_baseline:5%}',
]))

Accuracy Scores:

Baseline Accuracy:
61.616162%

Model 5 Training Set:
80.522088%

Model 5 Validation Set:
80.373832%

Model 5 Overfit:
0.148257%

Model 5 Beat Baseline Accuracy by:
18.905927%


In [39]:
# Side-by-side summary of training accuracy and gain over baseline for all five models.
print('\n\n'.join(
    [f'Baseline Accuracy:\n{baseline_accuracy:2%}', 'Accuracy Scores:']
    + [
        f'Model {number} Training Set:\n{train_accuracy:2%}\n'
        f'Model {number} Beat Baseline Accuracy by:\n{gain:2%}'
        for number, train_accuracy, gain in (
            (1, model_1_train, model_1_beat_baseline),
            (2, model_2_train, model_2_beat_baseline),
            (3, model_3_train, model_3_beat_baseline),
            (4, model_4_train, model_4_beat_baseline),
            (5, model_5_train, model_5_beat_baseline),
        )
    ]
))

Baseline Accuracy:
61.616162%

Accuracy Scores:

Model 1 Training Set:
69.477912%
Model 1 Beat Baseline Accuracy by:
7.861750%

Model 2 Training Set:
78.714859%
Model 2 Beat Baseline Accuracy by:
17.098698%

Model 3 Training Set:
78.313253%
Model 3 Beat Baseline Accuracy by:
16.697091%

Model 4 Training Set:
79.919679%
Model 4 Beat Baseline Accuracy by:
18.303517%

Model 5 Training Set:
80.522088%
Model 5 Beat Baseline Accuracy by:
18.905927%


## Use your best 3 models to predict and evaluate on your validate sample.

Models 5, 4, and 2 performed the best (highest training accuracy).

In [40]:
# Validation accuracy of the three models chosen above, next to the baseline.
print('\n\n'.join(
    [f'Baseline Accuracy:\n{baseline_accuracy:2%}']
    + [
        f'Model {number} Validate Set:\n{validate_accuracy:2%}'
        for number, validate_accuracy in (
            (2, model_2_validate),
            (4, model_4_validate),
            (5, model_5_validate),
        )
    ]
))

Baseline Accuracy:
61.616162%

Model 2 Validate Set:
80.373832%

Model 4 Validate Set:
80.841121%

Model 5 Validate Set:
80.373832%


## Choose your best model based on validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

In [43]:
# Final check: score the chosen model (model 4) once on the held-out test split.
model_4_test = logit4.score(X_test4, y_test4)
model_4_test_beat_baseline = model_4_test - baseline_accuracy

# `:4%` only sets a minimum field width of 4; values still print with six decimals
print('\n\n'.join([
    'Accuracy Scores:',
    f'Baseline Accuracy:\n{baseline_accuracy:4%}',
    f'Model 4 Training Set:\n{model_4_train:4%}',
    f'Model 4 Validation Set:\n{model_4_validate:4%}',
    f'Model 4 Test Set:\n{model_4_test:4%}',
    f'Model 4 Overfit:\n{model_4_overfit:4%}',
    f'Model 4 Test Set Beat Baseline Accuracy by:\n{model_4_test_beat_baseline:4%}',
]))

Accuracy Scores:

Baseline Accuracy:
61.616162%

Model 4 Training Set:
79.919679%

Model 4 Validation Set:
80.841121%

Model 4 Test Set:
77.653631%

Model 4 Overfit:
-0.921443%

Model 4 Test Set Beat Baseline Accuracy by:
16.037470%


## Bonus 1: How do different strategies for handling the missing values in the age column affect model performance?

In [63]:
# Reload the raw data so the missing ages are still present for the bonus experiment.
titanic = get_titanic_data()

In [64]:
# Find the most frequent age in the raw data. value_counts() skips missing values by
# default, so this is the mode of the known ages.
titanic['age'].value_counts().head(1)

age
24.0    30
Name: count, dtype: int64

In [66]:
# Different strategy for handling missing values in the age column.
# These are the same cleaning steps as prep_titanic, with one change: missing ages are
# filled with the most frequent age (24) instead of the average (30).
titanic = dropped(titanic,columns={'deck', 'embark_town', 'pclass', 'passenger_id'})
titanic['embarked'] = titanic['embarked'].fillna(value='S')
# fill with a number (not the string '24') so the column stays numeric before the int cast
titanic['age'] = titanic['age'].fillna(value=24).astype(int)
titanic['sex_encoded'] = titanic['sex'].map({'male': 1, 'female': 0})
titanic['embarked_encoded'] = titanic['embarked'].map({'S': 1, 'C': 0, 'Q':2})
titanic['class_encoded'] = titanic['class'].map({'Second': 1, 'First': 0, 'Third':2})
titanic = dropped(titanic,columns={'sex', 'embarked', 'class'})

In [67]:
# Split the re-prepared data with the same helper and arguments as the earlier models.
train, validate, test = tvt(titanic, ['survived'])

In [68]:
# Rebuild the model 1 inputs (age, fare, class_encoded) from the new split.
# This reuses the model 1 names, so the earlier X/y variables are overwritten.
X_train1, X_validate1, X_test1 = (
    subset.drop(columns=['survived', 'parch', 'sex_encoded', 'embarked_encoded', 'alone', 'sibsp'])
    for subset in (train, validate, test)
)
y_train1, y_validate1, y_test1 = train.survived, validate.survived, test.survived

In [69]:
# Refit model 1 on the data with the new age fill. This rebinds `logit1`, replacing the
# original model 1 object.
logit1 = LogisticRegression()
logit1.fit(X_train1, y_train1)

In [70]:
# Hard class predictions on the training set; preview the first five.
y_pred1 = logit1.predict(X_train1)
y_pred1[:5]

array([1, 0, 0, 0, 0])

In [71]:
# Class probabilities on the training set, with columns in the order of `logit1.classes_`.
y_pred_prob1 = logit1.predict_proba(X_train1)
y_pred_prob1[:5]

array([[0.46963168, 0.53036832],
       [0.73051559, 0.26948441],
       [0.54539842, 0.45460158],
       [0.77643769, 0.22356231],
       [0.75197833, 0.24802167]])

In [72]:
# Mean accuracy of the refit model 1 on train and validate, plus the derived gaps.
# These assignments overwrite the original model 1 metrics computed earlier.
model_1_train, model_1_validate = (
    logit1.score(X_train1, y_train1),
    logit1.score(X_validate1, y_validate1),
)
model_1_overfit = model_1_train - model_1_validate            # positive: train scored higher
model_1_beat_baseline = model_1_train - baseline_accuracy     # gain measured on train


# `:2%` is a minimum field width, so values print with the default six decimals
print('\n\n'.join([
    'Accuracy Scores:',
    f'Baseline Accuracy:\n{baseline_accuracy:2%}',
    f'Model 1 Training Set:\n{model_1_train:2%}',
    f'Model 1 Validation Set:\n{model_1_validate:2%}',
    f'Model 1 Overfit:\n{model_1_overfit:2%}',
    f'Model 1 Beat Baseline Accuracy by:\n{model_1_beat_baseline:2%}',
]))

Accuracy Scores:

Baseline Accuracy:
61.616162%

Model 1 Training Set:
70.281124%

Model 1 Validation Set:
70.093458%

Model 1 Overfit:
0.187667%

Model 1 Beat Baseline Accuracy by:
8.664963%


Original Model 1: 
Accuracy Scores:

Baseline Accuracy:
61.616162%
Model 1 Training Set:
69.477912%
Model 1 Validation Set:
70.560748%
Model 1 Overfit:
-1.082836%
Model 1 Beat Baseline Accuracy by:
7.861750%

New Model 1:
Accuracy Scores:

Baseline Accuracy:
61.616162%
Model 1 Training Set:
70.281124%
Model 1 Validation Set:
70.093458%
Model 1 Overfit:
0.187667%
Model 1 Beat Baseline Accuracy by:
8.664963%

### Answer: By filling the null values in the age column with the most frequently occurring age (24) instead of the average (30), Model 1's training accuracy improved slightly (69.48% → 70.28%), while its validation accuracy was essentially unchanged (70.56% → 70.09%).