In [38]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from pydataset import data

import prepare as prep 
import acquire as acq
import modeling as mod

1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [32]:
train, validate, test = prep.get_prep_split_titanic()

In [33]:
train

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
455,455,1,3,male,29.0,0,0,7.8958,C,1,1,0,0
380,380,1,1,female,42.0,0,0,227.5250,C,1,0,0,0
492,492,0,1,male,55.0,0,0,30.5000,S,1,1,0,1
55,55,1,1,male,,0,0,35.5000,S,1,1,0,1
243,243,0,3,male,22.0,0,0,7.1250,S,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,695,0,2,male,52.0,0,0,13.5000,S,1,1,0,1
128,128,1,3,female,,1,1,22.3583,C,0,0,0,0
636,636,0,3,male,32.0,0,0,7.9250,S,1,1,0,1
222,222,0,3,male,51.0,0,0,8.0500,S,1,1,0,1


In [240]:
# train x & y
x_train1 = train[["age", "fare", "pclass"]]
x_train1["age"] = x_train1.age.fillna(29)
y_train1 = train.survived
baseline_prediction = y_train1.mode()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 455 to 496
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     534 non-null    float64
 1   fare    534 non-null    float64
 2   pclass  534 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 16.7 KB


In [183]:
baseline_prediction

0    0
Name: survived, dtype: int64

In [184]:
pd.Series(baseline_prediction[0], range(len(y_train1)))

0      0
1      0
2      0
3      0
4      0
      ..
529    0
530    0
531    0
532    0
533    0
Length: 534, dtype: int64

In [185]:
# create function to establish baseline
def establish_baseline(y_train):
    #est baseline
    baseline_prediction = y_train.mode()
    
    #create series of prediction with that baseline val
    #same len as y_train
    y_train_pred = pd.Series((baseline_prediction[0]), range(len(y_train)))
    
    #compute the confusion matrix for Accuracy
    cm= confusion_matrix(y_train, y_train_pred)
    tn, fp, fn, tp = cm.ravel()
    
    accuracy = (tp+tn) / (tp+tn+fp+fn)
    
    return accuracy

In [186]:
# baseline accuracy
establish_baseline(y_train1)

0.6161048689138576

In [187]:
x_validate1 = validate[["age", "fare", "pclass"]]
x_validate1["age"] = x_validate1.age.fillna(29)
y_validate1 = validate.survived

In [188]:
x_test1 = test[["age", "fare", "pclass"]]
x_test1["age"] = x_test1.age.fillna(29)
y_test1 = test.survived

In [189]:
logit1 = LogisticRegression(random_state=123)
logit1.fit(x_train1, y_train1)

LogisticRegression(random_state=123)

In [190]:
def coef_weight(logit):
    print('Coefficient: \n', logit.coef_)
    print('Intercept: \n', logit.intercept_)

In [191]:
coef_weight(logit1)

Coefficient: 
 [[-0.0294718   0.00226405 -0.92631523]]
Intercept: 
 [2.42588444]


In [192]:
def predictions(x_train, logit):    
    y_pred = logit.predict(x_train)
    y_pred_proba = logit.predict_proba(x_train)
    return y_pred, y_pred_proba       

In [193]:
y_pred, y_pred_proba = predictions(x_train1, logit1)

In [194]:
y_pred_proba

array([[0.76671728, 0.23328272],
       [0.31499246, 0.68500754],
       [0.51307863, 0.48692137],
       ...,
       [0.78214601, 0.21785399],
       [0.86269987, 0.13730013],
       [0.47869005, 0.52130995]])

In [195]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit1.score(x_train1, y_train1)))


Accuracy of Logistic Regression classifier on training set: 0.71


In [196]:
print(confusion_matrix(y_train1, y_pred))

[[287  42]
 [115  90]]


* true positive = predict did not survive, did not survived
* true negative = predict survive, did survive


In [139]:
print(classification_report(y_train1, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.87      0.79       329
           1       0.68      0.44      0.53       205

    accuracy                           0.71       534
   macro avg       0.70      0.66      0.66       534
weighted avg       0.70      0.71      0.69       534



2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [140]:
train["age"] = train.age.fillna(29)
x_train2 = train[["age", "fare", "pclass", "sex_male"]]
y_train2 = train.survived

In [141]:
validate["age"] = validate.age.fillna(29)
x_validate2 = validate[["age", "fare", "pclass", "sex_male"]]
y_validate2 = validate.survived

In [142]:
test["age"] = test.age.fillna(29)
x_test2 = test[["age", "fare", "pclass", "sex_male"]]
y_test2 = test.survived

In [143]:
logit2 = LogisticRegression(random_state=123)
logit2.fit(x_train2, y_train2)

LogisticRegression(random_state=123)

In [144]:
y_pred, y_pred_proba = predictions(x_train2, logit2)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(x_train2, y_train2)))

Accuracy of Logistic Regression classifier on training set: 0.81


In [145]:
print(confusion_matrix(y_train2, y_pred))

[[282  47]
 [ 57 148]]


In [146]:
print(classification_report(y_train2, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       329
           1       0.76      0.72      0.74       205

    accuracy                           0.81       534
   macro avg       0.80      0.79      0.79       534
weighted avg       0.80      0.81      0.80       534



3. Try out other combinations of features and models.

In [208]:
logit3 = LogisticRegression(C=.1, random_state=123)
logit3.fit(x_train2, y_train2)

LogisticRegression(C=0.1, random_state=123)

In [209]:
y_pred, y_pred_proba = predictions(x_train2, logit3)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit3.score(x_train2, y_train2)))

Accuracy of Logistic Regression classifier on training set: 0.80


In [210]:
x_train3 = train[["pclass", "fare", "alone", "sex_male"]]
y_train3 = train.survived

In [219]:
x_validate3 = validate[["pclass", "fare", "alone", "sex_male"]]
y_validate3 = validate.survived

In [212]:
x_test3 = test[["pclass", "fare", "alone", "sex_male"]]
y_test3 = test.survived

In [213]:
logit4 = LogisticRegression(random_state=123)
logit4.fit(x_train3, y_train3)

LogisticRegression(random_state=123)

In [214]:
y_pred, y_pred_proba = predictions(x_train3, logit4)

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit3.score(x_train3, y_train3)))

Accuracy of Logistic Regression classifier on training set: 0.38


In [215]:
print(confusion_matrix(y_train3, y_pred))

[[283  46]
 [ 65 140]]


In [216]:
print(classification_report(y_train3, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.86      0.84       329
           1       0.75      0.68      0.72       205

    accuracy                           0.79       534
   macro avg       0.78      0.77      0.78       534
weighted avg       0.79      0.79      0.79       534



4. Use you best 3 models to predict and evaluate on your validate sample.

In [231]:
logit4.score(x_train3, y_train3)

0.7921348314606742

In [230]:
logit4.score(x_train3, y_train3)
print(logit4.score(x_validate3, y_validate3))

0.7752808988764045


In [226]:
logit3.score(x_train2, y_train2)

0.795880149812734

In [222]:
logit3.score(x_validate2, y_validate2)

0.797752808988764

In [227]:
logit2.score(x_train2, y_train2)

0.8052434456928839

In [223]:
logit2.score(x_validate2, y_validate2)

0.7808988764044944

5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train? 

* The model best suited is logit4 primarily because with adding the age parameter, I had to chage several null values that potentially skewed the data more than I was comfortable with. 

In [232]:
logit4.score(x_test3, y_test3)

0.7821229050279329

Bonus3: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.
Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected

In [245]:
logit01 = LogisticRegression(C=.01, random_state=123)
logit01.fit(x_train3, y_train3)

LogisticRegression(C=0.01, random_state=123)

In [246]:
logit01.score(x_train3, y_train3)

0.6872659176029963

In [255]:
lst = [.01, .1, 1, 10, 100, 1000]

for i in lst:
    logit = LogisticRegression(C=i, random_state=123)
    logit.fit(x_train3, y_train3)
    dct = {
    "C=": i,
    "Train accuracy ": logit.score(x_train3, y_train3),
    "Validation accuracy ": logit.score(x_validate3, y_validate3),
    "Difference score ": logit.score(x_train3, y_train3) - logit.score(x_validate3, y_validate3)
    }
    print(dct)

{'C=': 0.01, 'Train accuracy ': 0.6872659176029963, 'Validation accuracy ': 0.7303370786516854, 'Difference score ': -0.04307116104868913}
{'C=': 0.1, 'Train accuracy ': 0.7696629213483146, 'Validation accuracy ': 0.7696629213483146, 'Difference score ': 0.0}
{'C=': 1, 'Train accuracy ': 0.7921348314606742, 'Validation accuracy ': 0.7752808988764045, 'Difference score ': 0.016853932584269704}
{'C=': 10, 'Train accuracy ': 0.7921348314606742, 'Validation accuracy ': 0.7752808988764045, 'Difference score ': 0.016853932584269704}
{'C=': 100, 'Train accuracy ': 0.7921348314606742, 'Validation accuracy ': 0.7752808988764045, 'Difference score ': 0.016853932584269704}
{'C=': 1000, 'Train accuracy ': 0.7921348314606742, 'Validation accuracy ': 0.7752808988764045, 'Difference score ': 0.016853932584269704}


In [None]:
logit01 = LogisticRegression(C=.01, random_state=123)
logit01.fit(x_train3, y_train3)