In [218]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
import acquire
import prepare
import warnings
warnings.filterwarnings("ignore")
df_titanic = acquire.get_titanic_data()
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [219]:
df_titanic = prepare.prep_titanic(df_titanic)

In [220]:
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,alone,embarked_Q,embarked_S
0,0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,1,0,38.0,1,0,71.2833,0,0,0
2,2,1,3,0,26.0,0,0,7.925,1,0,1
3,3,1,1,0,35.0,1,0,53.1,0,0,1
4,4,0,3,1,35.0,0,0,8.05,1,0,1


In [221]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(prep_titanic, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(prep_titanic, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df_titanic[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test


In [222]:
# split into train, validate, test
train, validate, test = train_validate_test_split(df_titanic, target='survived', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train1 = train.drop(columns=['survived', 'passenger_id','sex','alone','embarked_Q','embarked_S','parch','sibsp'])
y_train1 = train.survived

X_validate1 = validate.drop(columns=['survived', 'passenger_id','sex','alone','embarked_Q','embarked_S','parch','sibsp'])
y_validate1 = validate.survived

X_test1 = test.drop(columns=['survived', 'passenger_id','sex','alone','embarked_Q','embarked_S','parch','sibsp'])
y_test1 = test.survived


In [223]:
X_train1.head()

Unnamed: 0,pclass,age,fare
583,1,36.0,40.125
165,3,9.0,20.525
50,3,7.0,39.6875
259,2,50.0,26.0
306,1,28.0,110.8833


In [224]:
y_train1.head()

583    0
165    1
50     0
259    1
306    1
Name: survived, dtype: int64

In [225]:
from sklearn.linear_model import LogisticRegression
logit1 = LogisticRegression(C=1, random_state=123, intercept_scaling=1)

In [226]:
logit1.fit(X_train1, y_train1)

LogisticRegression(C=1, random_state=123)

In [227]:
logit1.fit(X_train1, y_train1)
LogisticRegression(C=1, random_state=123)
print('Coefficient: \n', logit1.coef_)
print('Intercept: \n', logit1.intercept_)


Coefficient: 
 [[-0.9526243  -0.0304467   0.00141806]]
Intercept: 
 [2.52032223]


In [228]:
logit1.score(X_train1, y_train1)

0.7028112449799196

In [229]:
logit1.predict(X_train1)

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,

In [230]:
logit1.predict_proba(X_train1).round(2)[:5]

array([[0.37, 0.63],
       [0.64, 0.36],
       [0.62, 0.38],
       [0.7 , 0.3 ],
       [0.29, 0.71]])

In [231]:
print(classification_report(y_train1, logit1.predict(X_train1)))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78       307
           1       0.67      0.43      0.53       191

    accuracy                           0.70       498
   macro avg       0.69      0.65      0.66       498
weighted avg       0.70      0.70      0.69       498



In [232]:
logit1.coef_

array([[-0.9526243 , -0.0304467 ,  0.00141806]])

In [233]:
X_train1.columns

Index(['pclass', 'age', 'fare'], dtype='object')

In [234]:
# Calculating the baseline
baseline_acc = (y_validate1.value_counts(normalize=True) * 100)[0]

print('Baseline accuracy:', baseline_acc)

Baseline accuracy: 61.6822429906542


Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.


In [235]:
# split into train, validate, test
train, validate, test = train_validate_test_split(df_titanic, target='survived', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train2 = train.drop(columns=['survived', 'passenger_id','alone','embarked_Q','embarked_S','parch','sibsp'])
y_train2 = train.survived

X_validate2 = validate.drop(columns=['survived', 'passenger_id','alone','embarked_Q','embarked_S','parch','sibsp'])
y_validate2 = validate.survived

X_test2 = test.drop(columns=['survived', 'passenger_id','alone','embarked_Q','embarked_S','parch','sibsp'])
y_test2 = test.survived

In [236]:
X_train2.head()

Unnamed: 0,pclass,sex,age,fare
583,1,1,36.0,40.125
165,3,1,9.0,20.525
50,3,1,7.0,39.6875
259,2,0,50.0,26.0
306,1,0,28.0,110.8833


In [237]:
logit2 = LogisticRegression(C=.1, random_state=123, intercept_scaling=1, solver='lbfgs')

In [238]:
logit2.fit(X_train2, y_train2)

LogisticRegression(C=0.1, random_state=123)

In [239]:
LogisticRegression(C=.1, random_state=123)

LogisticRegression(C=0.1, random_state=123)

In [240]:
X_train2.columns

Index(['pclass', 'sex', 'age', 'fare'], dtype='object')

In [241]:
print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

Coefficient: 
 [[-8.46566801e-01 -1.80647943e+00 -2.34435682e-02  7.62468559e-04]]
Intercept: 
 [3.21048959]


In [242]:
logit2.score(X_train2, y_train2)

0.7991967871485943

In [243]:
y_pred1 = logit2.predict(X_train2)

In [244]:
y_pred_proba1 = logit2.predict_proba(X_train2)

In [245]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train2, y_train2)))

Accuracy of Logistic Regression classifier on training set: 0.80


In [246]:
print(confusion_matrix(y_train2, y_pred1))

[[265  42]
 [ 58 133]]


In [247]:
print(classification_report(y_train2, y_pred1))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



## Model 3 "added embarked_Q"

In [248]:
# split into train, validate, test
train, validate, test = train_validate_test_split(df_titanic, target='survived', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train3 = train.drop(columns=['survived', 'passenger_id','embarked_Q','embarked_S','parch','sibsp','fare','age'])
y_train3 = train.survived

X_validate3 = validate.drop(columns=['survived', 'passenger_id','embarked_Q','embarked_S','parch','sibsp','fare','age'])
y_validate3 = validate.survived

X_test3 = test.drop(columns=['survived', 'passenger_id','embarked_Q','embarked_S','parch','sibsp','fare','age'])
y_test3 = test.survived

In [249]:
X_train3.head()

Unnamed: 0,pclass,sex,alone
583,1,1,1
165,3,1,0
50,3,1,0
259,2,0,0
306,1,0,1


In [250]:
# from sklearn.linear_model import LogisticRegression
logit3 = LogisticRegression(C=1, random_state=123, intercept_scaling=1, solver='lbfgs')

In [251]:
logit3.fit(X_train3, y_train3)

LogisticRegression(C=1, random_state=123)

In [252]:
LogisticRegression(C=1, random_state=123)

LogisticRegression(C=1, random_state=123)

In [253]:
X_train3.columns

Index(['pclass', 'sex', 'alone'], dtype='object')

In [254]:
print('Coefficient: \n', logit3.coef_)
print('Intercept: \n', logit3.intercept_)


Coefficient: 
 [[-0.96901625 -2.6918695  -0.08298946]]
Intercept: 
 [3.38975935]


In [255]:
y_pred3 = logit3.predict(X_train3)


In [256]:
y_pred_proba3 = logit3.predict_proba(X_train3)


In [257]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit3.score(X_train3, y_train3)))


Accuracy of Logistic Regression classifier on training set: 0.80


In [258]:
print(confusion_matrix(y_train3, y_pred3))

[[265  42]
 [ 58 133]]


In [259]:
print(classification_report(y_train3, y_pred3))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



# model 4

In [260]:
# split into train, validate, test
train, validate, test = train_validate_test_split(df_titanic, target='survived', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train4 = train.drop(columns=['survived', 'passenger_id','embarked_S','parch','sibsp','age','sex','fare'])
y_train4 = train.survived

X_validate4 = validate.drop(columns=['survived', 'passenger_id','embarked_S','parch','sibsp','age','sex','fare'])
y_validate4 = validate.survived

X_test4 = test.drop(columns=['survived', 'passenger_id','embarked_S','parch','sibsp','age','sex','fare'])
y_test4 = test.survived

In [261]:
X_train4.head()

Unnamed: 0,pclass,alone,embarked_Q
583,1,1,0
165,3,0,0
50,3,0,0
259,2,0,0
306,1,1,0


In [262]:
logit4 = LogisticRegression(C=1, random_state=123, intercept_scaling=1, solver='lbfgs')

In [263]:
logit4.fit(X_train4, y_train4)

LogisticRegression(C=1, random_state=123)

In [264]:
print('Coefficient: \n', logit4.coef_)
print('Intercept: \n', logit4.intercept_)


Coefficient: 
 [[-0.86978181 -0.75636072  0.99008189]]
Intercept: 
 [1.86628918]


In [265]:
y_pred4 = logit4.predict(X_train4)

In [266]:
y_pred_proba4 = logit4.predict_proba(X_train4)

In [267]:
X_train4.columns

Index(['pclass', 'alone', 'embarked_Q'], dtype='object')

In [268]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit4.score(X_train4, y_train4)))

Accuracy of Logistic Regression classifier on training set: 0.69


In [269]:
print(confusion_matrix(y_train4, y_pred4))

[[241  66]
 [ 86 105]]


In [270]:
print(classification_report(y_train4, y_pred4))

              precision    recall  f1-score   support

           0       0.74      0.79      0.76       307
           1       0.61      0.55      0.58       191

    accuracy                           0.69       498
   macro avg       0.68      0.67      0.67       498
weighted avg       0.69      0.69      0.69       498



In [272]:
# make predictions

y_pred1 = logit1.predict(X_train1)
y_pred2 = logit2.predict(X_train2)
y_pred3 = logit3.predict(X_train3)
y_pred4 = logit4.predict(X_train4)

print("Model 1: solver = lbfgs, c = 1")

# accuracy of model 1
print('Accuracy: {:.2f}'.format(logit1.score(X_train1, y_train1)))

# confusion matrix of model 1
print(confusion_matrix(y_train1, y_pred1))

# classification report of model 1
print(classification_report(y_train1, y_pred1))

print("Model 2: solver = lbfgs, c = .01")

# accuracy of model 2
print('Accuracy: {:.2f}'.format(logit2.score(X_train2, y_train2)))

# confusion matrix of model 2
print(confusion_matrix(y_train2, y_pred2))

# classification report of model 2
print(classification_report(y_train2, y_pred2))

print("Model 3: solver = lbfgs, c = 1")

# accuracy of model 3
print('Accuracy: {:.2f}'.format(logit3.score(X_train3, y_train3)))

# confusion matrix of model 3
print(confusion_matrix(y_train3, y_pred3))

# classification report of model 3
print(classification_report(y_train3, y_pred3))

print("Model 4: solver = lbfgs, c = 1")

# accuracy of model 4
print('Accuracy: {:.2f}'.format(logit4.score(X_train4, y_train4)))

# confusion matrix of model 4
print(confusion_matrix(y_train4, y_pred4))

# classification report of model 4
print(classification_report(y_train4, y_pred4))

print("Model 4: solver = lbfgs, c = 1")


Model 1: solver = lbfgs, c = 1
Accuracy: 0.70
[[267  40]
 [108  83]]
              precision    recall  f1-score   support

           0       0.71      0.87      0.78       307
           1       0.67      0.43      0.53       191

    accuracy                           0.70       498
   macro avg       0.69      0.65      0.66       498
weighted avg       0.70      0.70      0.69       498

Model 2: solver = lbfgs, c = .01
Accuracy: 0.80
[[265  42]
 [ 58 133]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498

Model 3: solver = lbfgs, c = 1
Accuracy: 0.80
[[265  42]
 [ 58 133]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76     

Use your best 3 models to predict and evaluate on your validate sample.

In [273]:
y_val1 = logit1.predict(X_validate1)
y_val2 = logit2.predict(X_validate2)
y_val3 = logit3.predict(X_validate3)


print("Model 1: solver = lbfgs, c = 1")

# accuracy of model 1
print('Accuracy: {:.2f}'.format(logit1.score(X_validate1, y_validate1)))

# confusion matrix of model 1
print(confusion_matrix(y_validate1, y_val1))

# classification report of model 1
print(classification_report(y_validate1, y_val1))

print("Model 2: solver = lbfgs, c = .01")

# accuracy of model 2
print('Accuracy: {:.2f}'.format(logit2.score(X_validate2, y_validate2)))

# confusion matrix of model 2
print(confusion_matrix(y_validate2, y_val2))

# classification report of model 2
print(classification_report(y_validate2, y_val2))

print("Model 3: solver = lbfgs, c = 1")

# accuracy of model 3
print('Accuracy: {:.2f}'.format(logit3.score(X_validate3, y_validate3)))

# confusion matrix of model 3
print(confusion_matrix(y_validate3, y_val3))

# classification report of model 3
print(classification_report(y_validate3, y_val3))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.71
[[117  15]
 [ 47  35]]
              precision    recall  f1-score   support

           0       0.71      0.89      0.79       132
           1       0.70      0.43      0.53        82

    accuracy                           0.71       214
   macro avg       0.71      0.66      0.66       214
weighted avg       0.71      0.71      0.69       214

Model 2: solver = lbfgs, c = .01
Accuracy: 0.79
[[115  17]
 [ 28  54]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.84       132
           1       0.76      0.66      0.71        82

    accuracy                           0.79       214
   macro avg       0.78      0.76      0.77       214
weighted avg       0.79      0.79      0.79       214

Model 3: solver = lbfgs, c = 1
Accuracy: 0.76
[[109  23]
 [ 28  54]]
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       132
           1       0.70     

Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [279]:
y_test3 = logit3.predict(X_test3)

print("Model 3")

# accuracy of model 3
print('Accuracy: {:.2f}'.format(logit3.score(X_test3, y_test3)))

# confusion matrix of model 3
print(confusion_matrix(y_test3, y_test3))

# classification report of model 3
print(classification_report(y_test3, y_test3))

print("Model 3")

Model 3
Accuracy: 1.00
[[117   0]
 [  0  62]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       117
           1       1.00      1.00      1.00        62

    accuracy                           1.00       179
   macro avg       1.00      1.00      1.00       179
weighted avg       1.00      1.00      1.00       179

Model 3
