# Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import acquire

from prepare import prep_titanic
from prepare import my_train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings("ignore")

Use titanic dataset

# 1
Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [2]:
prep_titanic().head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [3]:
df = prep_titanic().drop(columns=['sex','parch','embark_town','sibsp','alone','sex_male','embark_town_Queenstown','embark_town_Southampton'])
df.head()

Unnamed: 0,passenger_id,survived,pclass,fare
0,0,0,3,7.25
1,1,1,1,71.2833
2,2,1,3,7.925
3,3,1,1,53.1
4,4,0,3,8.05


In [4]:
df['survived'].value_counts() #baseline count of survived

0    549
1    342
Name: survived, dtype: int64

In [5]:
my_train_test_split(df, target='survived')
train, validate, test = my_train_test_split(df, 'survived')
print(train.shape)
print(validate.shape)
print(test.shape)

(534, 4)
(178, 4)
(179, 4)


In [6]:
total = train['survived'].value_counts().sum()
total
baseline = total / len(df)
baseline


train.survived.value_counts(normalize=True)

0    0.616105
1    0.383895
Name: survived, dtype: float64

In [7]:

x_train = train.drop(columns='survived')
y_train = train.survived

x_val = validate.drop(columns='survived')
y_val = validate.survived

x_test = test.drop(columns='survived')
y_test = test.survived

In [8]:
reg_o = LogisticRegression()
reg_o

In [9]:
reg_o.fit(x_train, y_train)

In [10]:
y_pred = reg_o.predict(x_train)

In [11]:
y_pred_proba = reg_o.predict_proba(x_train)

In [12]:
print('Accuracy of Log. Reg. object on train set: {:.3f}'
     .format(reg_o.score(x_train, y_train)))

Accuracy of Log. Reg. object on train set: 0.672


In [13]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.86      0.76       329
           1       0.62      0.37      0.46       205

    accuracy                           0.67       534
   macro avg       0.65      0.62      0.61       534
weighted avg       0.66      0.67      0.65       534



# 2
Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [14]:
# encoded sex is_male
df = prep_titanic().drop(columns=['sex','parch','embark_town','sibsp','alone','embark_town_Queenstown','embark_town_Southampton'])
df.head()

Unnamed: 0,passenger_id,survived,pclass,fare,sex_male
0,0,0,3,7.25,1
1,1,1,1,71.2833,0
2,2,1,3,7.925,0
3,3,1,1,53.1,0
4,4,0,3,8.05,1


In [15]:
my_train_test_split(df, target='survived')
train, validate, test = my_train_test_split(df, 'survived')
print(train.shape)
print(validate.shape)
print(test.shape)

x_train = train.drop(columns='survived')
y_train = train.survived

x_val = validate.drop(columns='survived')
y_val = validate.survived

x_test = test.drop(columns='survived')
y_test = test.survived


(534, 5)
(178, 5)
(179, 5)


In [16]:
reg_o = LogisticRegression()
reg_o

In [17]:
reg_o.fit(x_train, y_train)

y_pred = reg_o.predict(x_train)

y_pred_proba = reg_o.predict_proba(x_train)

In [18]:
print('Accuracy of Log. Reg. object on train set: {:.3f}'
     .format(reg_o.score(x_train, y_train)))

print(classification_report(y_train, y_pred))

Accuracy of Log. Reg. object on train set: 0.792
              precision    recall  f1-score   support

           0       0.81      0.86      0.84       329
           1       0.75      0.68      0.72       205

    accuracy                           0.79       534
   macro avg       0.78      0.77      0.78       534
weighted avg       0.79      0.79      0.79       534



# 3
Try out other combinations of features and models.

In [19]:
# use embark_town_southampton in the following model
df = prep_titanic().drop(columns=['sex','parch','embark_town','sibsp','alone','embark_town_Queenstown'])
df.head()

Unnamed: 0,passenger_id,survived,pclass,fare,sex_male,embark_town_Southampton
0,0,0,3,7.25,1,1
1,1,1,1,71.2833,0,0
2,2,1,3,7.925,0,1
3,3,1,1,53.1,0,1
4,4,0,3,8.05,1,1


In [20]:
my_train_test_split(df, target='survived')
train, validate, test = my_train_test_split(df, 'survived')
print(train.shape)
print(validate.shape)
print(test.shape)

x_train = train.drop(columns='survived')
y_train = train.survived

x_val = validate.drop(columns='survived')
y_val = validate.survived

x_test = test.drop(columns='survived')
y_test = test.survived


(534, 6)
(178, 6)
(179, 6)


In [21]:
reg_o = LogisticRegression()
reg_o

In [22]:
reg_o.fit(x_train, y_train)

y_pred = reg_o.predict(x_train)

y_pred_proba = reg_o.predict_proba(x_train)

In [23]:
print('Accuracy of Log. Reg. object on train set: {:.3f}'
     .format(reg_o.score(x_train, y_train)))

print(classification_report(y_train, y_pred))

Accuracy of Log. Reg. object on train set: 0.788
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       329
           1       0.73      0.72      0.72       205

    accuracy                           0.79       534
   macro avg       0.78      0.78      0.78       534
weighted avg       0.79      0.79      0.79       534



In [24]:
# use alone in following model, remove embark_town_southampton
df = prep_titanic().drop(columns=['sex','parch','embark_town','sibsp','embark_town_Queenstown','embark_town_Southampton'])
df.head()

Unnamed: 0,passenger_id,survived,pclass,fare,alone,sex_male
0,0,0,3,7.25,0,1
1,1,1,1,71.2833,0,0
2,2,1,3,7.925,1,0
3,3,1,1,53.1,0,0
4,4,0,3,8.05,1,1


In [25]:
my_train_test_split(df, target='survived')
train, validate, test = my_train_test_split(df, 'survived')
print(train.shape)
print(validate.shape)
print(test.shape)

(534, 6)
(178, 6)
(179, 6)


In [26]:
x_train = train.drop(columns='survived')
y_train = train.survived

x_val = validate.drop(columns='survived')
y_val = validate.survived

x_test = test.drop(columns='survived')
y_test = test.survived

In [27]:
reg_o = LogisticRegression()
reg_o

In [28]:
reg_o.fit(x_train, y_train)

y_pred = reg_o.predict(x_train)

y_pred_proba = reg_o.predict_proba(x_train)

In [29]:
print('Accuracy of Log. Reg. object on train set: {:.3f}'
     .format(reg_o.score(x_train, y_train)))

print(classification_report(y_train, y_pred))

Accuracy of Log. Reg. object on train set: 0.792
              precision    recall  f1-score   support

           0       0.81      0.86      0.84       329
           1       0.75      0.68      0.72       205

    accuracy                           0.79       534
   macro avg       0.78      0.77      0.78       534
weighted avg       0.79      0.79      0.79       534



In [30]:
# all encoded or numeric features model
df = prep_titanic().drop(columns=['sex','embark_town'])
df.head()

my_train_test_split(df, target='survived')
train, validate, test = my_train_test_split(df, 'survived')
print(train.shape)
print(validate.shape)
print(test.shape)

x_train = train.drop(columns='survived')
y_train = train.survived

x_val = validate.drop(columns='survived')
y_val = validate.survived

x_test = test.drop(columns='survived')
y_test = test.survived

reg_o = LogisticRegression()
reg_o

reg_o.fit(x_train, y_train)

y_pred = reg_o.predict(x_train)

y_pred_proba = reg_o.predict_proba(x_train)


print('Accuracy of Log. Reg. object on train set: {:.3f}'
     .format(reg_o.score(x_train, y_train)))

print(classification_report(y_train, y_pred))


(534, 10)
(178, 10)
(179, 10)
Accuracy of Log. Reg. object on train set: 0.792
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       329
           1       0.76      0.67      0.71       205

    accuracy                           0.79       534
   macro avg       0.78      0.77      0.78       534
weighted avg       0.79      0.79      0.79       534



# 4
Use your best 3 models to predict and evaluate on your validate sample.

# 5
Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train? 