# Imports, acquire and prepare data:

Use the titanic data set:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report,
 confusion_matrix, accuracy_score, precision_score, recall_score)
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

import acquire, prepare

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
#Get titanic data set
titanic = acquire.get_titanic_data()

# Questions:

## 1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [3]:
cols_to_keep = ['survived', 'age', 'fare', 'pclass']

In [4]:
working_df = titanic[cols_to_keep]
working_df2 = titanic[cols_to_keep]

In [5]:
working_df.age = working_df.age.fillna(working_df.age.median())

In [6]:
working_df2 = working_df2.dropna(how='any')

In [7]:
#Prepare titanic data for processing
#titanic = prepare.prep_titanic(titanic)

In [8]:
#Split the data into train, validate and test sets
train, val, test = prepare.train_validate(working_df, 'survived')

In [9]:
#Split the data into train, validate and test sets
train1, val1, test1 = prepare.train_validate(working_df2, 'survived')

In [10]:
#Seperate the target columns in each set for input into the algorithm
X_train = train.drop(columns='survived')
y_train = train.survived

X_val = val.drop(columns='survived')
y_val = val.survived

X_test = test.drop(columns='survived')
y_test = test.survived

In [11]:
#Seperate the target columns in each set for input into the algorithm
X_train1 = train1.drop(columns='survived')
y_train1 = train1.survived

X_val1 = val1.drop(columns='survived')
y_val1 = val1.survived

X_test1 = test1.drop(columns='survived')
y_test1 = test1.survived

In [12]:
#Shows that deceased is our baseline:
train.survived.value_counts()

0    192
1    119
Name: survived, dtype: int64

In [13]:
#Shows that deceased is our baseline:
train1.survived.value_counts()

0    148
1    101
Name: survived, dtype: int64

In [14]:
logit = LogisticRegression()

In [15]:
logit1 = LogisticRegression()

In [16]:
logit.fit(X_train, y_train)

LogisticRegression()

In [17]:
logit1.fit(X_train1, y_train1)

LogisticRegression()

In [18]:
logit.score(X_train, y_train)

0.7202572347266881

In [19]:
logit.score(X_val, y_val)

0.6891025641025641

In [20]:
logit1.score(X_train1, y_train1)

0.7349397590361446

In [21]:
logit1.score(X_val1, y_val1)

0.712

In [22]:
bl_fill_na = round(len(train[train.survived == 0]) / len(train.survived),3)
bl_fill_na

0.617

In [23]:
bl_drop_na = round(len(train1[train1.survived == 0]) / len(train1.survived),3)
bl_drop_na

0.594

In [24]:
y_pred = logit.predict(X_train)
y_pred1 = logit1.predict(X_train1)


In [25]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=['Pred_live', 'Pred_death'], columns=['Act_live','Act_death'])

Unnamed: 0,Act_live,Act_death
Pred_live,172,20
Pred_death,67,52


In [26]:
pd.DataFrame(confusion_matrix(y_train1, y_pred1), index=['Pred_live', 'Pred_death'], columns=['Act_live','Act_death'])

Unnamed: 0,Act_live,Act_death
Pred_live,125,23
Pred_death,43,58


In [27]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.90      0.80       192
           1       0.72      0.44      0.54       119

    accuracy                           0.72       311
   macro avg       0.72      0.67      0.67       311
weighted avg       0.72      0.72      0.70       311



In [28]:
print(classification_report(y_train1, y_pred1))

              precision    recall  f1-score   support

           0       0.74      0.84      0.79       148
           1       0.72      0.57      0.64       101

    accuracy                           0.73       249
   macro avg       0.73      0.71      0.71       249
weighted avg       0.73      0.73      0.73       249



## 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [29]:
cols_to_keep = ['sex','survived', 'age', 'fare', 'pclass']
with_sex = titanic[cols_to_keep]
with_sex1 = titanic[cols_to_keep]

In [30]:
with_sex.age = with_sex.age.fillna(with_sex.age.median())

In [31]:
with_sex.sex.value_counts()

male      577
female    314
Name: sex, dtype: int64

In [32]:
dummy_df = pd.get_dummies(with_sex[['sex']], drop_first=True)

In [33]:
with_sex = pd.concat([with_sex, dummy_df], axis=1)

In [34]:
with_sex1 = with_sex1.dropna(how='any')

In [35]:
with_sex1.sex.value_counts()

male      453
female    261
Name: sex, dtype: int64

In [36]:
dummy_df1 = pd.get_dummies(with_sex1[['sex']], drop_first=True)

In [37]:
with_sex1 = pd.concat([with_sex1, dummy_df1], axis=1)

In [38]:
with_sex = with_sex.drop(columns='sex')

In [39]:
with_sex1 = with_sex1.drop(columns='sex')

In [40]:
#Split the data into train, validate and test sets
train2, val2, test2 = prepare.train_validate(with_sex, 'survived')

In [41]:
#Split the data into train, validate and test sets
train3, val3, test3 = prepare.train_validate(with_sex1, 'survived')

In [42]:
#Seperate the target columns in each set for input into the algorithm
#Instead of creating a new training set, you could just access the X_train via DataFrame X_train['sex','survived', 'age', 'fare', 'pclass'].

X_train2 = train2.drop(columns='survived')
y_train2 = train2.survived

X_val2 = val2.drop(columns='survived')
y_val2 = val2.survived

X_test2 = test2.drop(columns='survived')
y_test2 = test2.survived

In [43]:
#Seperate the target columns in each set for input into the algorithm
X_train3 = train3.drop(columns='survived')
y_train3 = train3.survived

X_val3 = val3.drop(columns='survived')
y_val3 = val3.survived

X_test3 = test3.drop(columns='survived')
y_test3 = test3.survived

In [44]:
logit2 = LogisticRegression()

In [45]:
logit3 = LogisticRegression()

In [46]:
logit2.fit(X_train2, y_train2)

LogisticRegression()

In [47]:
logit3.fit(X_train3, y_train3)

LogisticRegression()

In [48]:
logit2.score(X_train2, y_train2)

0.8135048231511254

In [49]:
logit2.score(X_val2, y_val2)

0.8205128205128205

In [50]:
logit3.score(X_train3, y_train3)

0.8112449799196787

In [51]:
logit3.score(X_val3, y_val3)

0.808

In [52]:
y_pred2 = logit2.predict(X_train2)
y_pred3 = logit3.predict(X_train3)

In [53]:
pd.DataFrame(confusion_matrix(y_train2, y_pred2), index=['Pred_live', 'Pred_death'], columns=['Act_live','Act_death'])

Unnamed: 0,Act_live,Act_death
Pred_live,165,27
Pred_death,31,88


In [54]:
pd.DataFrame(confusion_matrix(y_train3, y_pred3), index=['Pred_live', 'Pred_death'], columns=['Act_live','Act_death'])

Unnamed: 0,Act_live,Act_death
Pred_live,127,21
Pred_death,26,75


In [55]:
print(classification_report(y_train2, y_pred2))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       192
           1       0.77      0.74      0.75       119

    accuracy                           0.81       311
   macro avg       0.80      0.80      0.80       311
weighted avg       0.81      0.81      0.81       311



In [56]:
print(classification_report(y_train3, y_pred3))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       148
           1       0.78      0.74      0.76       101

    accuracy                           0.81       249
   macro avg       0.81      0.80      0.80       249
weighted avg       0.81      0.81      0.81       249



## 3. Try out other combinations of features and models.

In [57]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [58]:
cols_to_keep = ['sex','survived', 'age', 'fare', 'pclass', 'embark_town']
new_df = titanic[cols_to_keep]

new_df.age = new_df.age.fillna(new_df.age.median())

new_df.sex.value_counts()

dummy_df = pd.get_dummies(new_df[['sex', 'embark_town']], drop_first=True)

new_df = pd.concat([new_df, dummy_df], axis=1)

new_df = new_df.drop(columns=['sex', 'embark_town'])

#Split the data into train, validate and test sets
train4, val4, test4 = prepare.train_validate(new_df, 'survived')

#Seperate the target columns in each set for input into the algorithm
X_train4 = train4.drop(columns='survived')
y_train4 = train4.survived

X_val4 = val4.drop(columns='survived')
y_val4 = val4.survived

X_test4 = test4.drop(columns='survived')
y_test4 = test4.survived

logit4 = LogisticRegression()

logit4.fit(X_train4, y_train4)

print(f'Train score: {logit4.score(X_train4, y_train4)}')

print(f'Validate score: {logit4.score(X_val4, y_val4)}')

y_pred4 = logit4.predict(X_train4)

print(pd.DataFrame(confusion_matrix(y_train4, y_pred4), index=['Pred_live', 'Pred_death'], columns=['Act_live','Act_death']))
print("-----------------------------------------------")
print(classification_report(y_train4, y_pred4))

Train score: 0.8135048231511254
Validate score: 0.8076923076923077
            Act_live  Act_death
Pred_live        168         24
Pred_death        34         85
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       192
           1       0.78      0.71      0.75       119

    accuracy                           0.81       311
   macro avg       0.81      0.79      0.80       311
weighted avg       0.81      0.81      0.81       311



## 4. Use you best 3 models to predict and evaluate on your validate sample.

In [59]:
print(f'Model 0 validate score: {logit.score(X_val, y_val):.3f}')
print(f'Model 1 validate score: {logit1.score(X_val1, y_val1):.3f}')
print(f'Model 2 validate score: {logit2.score(X_val2, y_val2):.3f}')
print(f'Model 3 validate score: {logit3.score(X_val3, y_val3):.3f}')
print(f'Model 4 validate score: {logit4.score(X_val4, y_val4):.3f}')


Model 0 validate score: 0.689
Model 1 validate score: 0.712
Model 2 validate score: 0.821
Model 3 validate score: 0.808
Model 4 validate score: 0.808


## 5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [60]:
print(f'Model 2 test score: {logit2.score(X_test2, y_test2):.3f}')

Model 2 test score: 0.735


## Bonus1 How do different strategies for handling the missing values in the age column affect model performance?

In [61]:
print('These models used median age to fill NAs:')
print(f'Model 0 test score:     {logit.score(X_test, y_test):.3f}')
print(f'Model 0 validate score: {logit.score(X_val, y_val):.3f}')
print('------------------------------------')
print(f'Model 2 test score:     {logit2.score(X_test2, y_test2):.3f}')
print(f'Model 2 validate score: {logit2.score(X_val2, y_val2):.3f}')
print('====================================')
print('These models dropped age rows with NAs:')
print(f'Model 1 test score:     {logit1.score(X_test1, y_test1):.3f}')
print(f'Model 1 validate score: {logit1.score(X_val1, y_val1):.3f}')
print('------------------------------------')
print(f'Model 3 test score:     {logit3.score(X_test3, y_test3):.3f}')
print(f'Model 3 validate score: {logit3.score(X_val3, y_val3):.3f}')

These models used median age to fill NAs:
Model 0 test score:     0.698
Model 0 validate score: 0.689
------------------------------------
Model 2 test score:     0.735
Model 2 validate score: 0.821
These models dropped age rows with NAs:
Model 1 test score:     0.674
Model 1 validate score: 0.712
------------------------------------
Model 3 test score:     0.795
Model 3 validate score: 0.808


## Bonus2: How do different strategies for encoding sex affect model performance?

## Bonus3: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. 

This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.
Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

C= .01, .1, 1, 10, 100, 1000

## Bonus Bonus: how does scaling the data interact with your choice of C?