#  Logistic Regression


In [1]:
# Data manipulation and modeling imports
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


# User defined functions
from acquire import get_titanic_data
from prepare import split_data
from model import preprocess_titanic

# ignore warnings
import warnings
warnings.filterwarnings("ignore")


## Use Titanic dataset

In [2]:
# Acquire data
titanic = get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
def clean_titanic(df):
    """
    Takes in the Titanic DataFrame as an argument and returns the Dataframe with unnecessary columns dropped, 
    casts [pclass] column to object since it will be handled as object, and
    fills null values for [embark_town] with the column mode.
    """
    df = df.drop(columns=['passenger_id', 'embarked', 'deck', 'class'])
    df.pclass = df.pclass.astype(object)
    df.embark_town = df.embark_town.fillna('Southampton')
    # fill null ages with the average age
    avg_age = df.age.mean()
    df.age = df.age.fillna(avg_age)
    df.age = df.age.astype(int)
    return df

In [4]:
# clean data
titanic = clean_titanic(titanic)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,3,male,22,1,0,7.25,Southampton,0
1,1,1,female,38,1,0,71.2833,Cherbourg,0
2,1,3,female,26,0,0,7.925,Southampton,1
3,1,1,female,35,1,0,53.1,Southampton,0
4,0,3,male,35,0,0,8.05,Southampton,1


In [5]:
#check for nulls
titanic.isna().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embark_town    0
alone          0
dtype: int64

In [6]:
titanic = preprocess_titanic(titanic)
titanic.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,22,1,0,7.25,0,1,0,1
1,1,1,38,1,0,71.2833,0,0,0,0
2,1,3,26,0,0,7.925,1,0,0,1
3,1,1,35,1,0,53.1,0,0,0,1
4,0,3,35,0,0,8.05,1,1,0,1


In [7]:
# Train, validate, split data
train, validate, test = split_data(titanic, 'survived')

train: 534 (60.0% of 891)
validate: 178 (20.0% of 891)
test: 179 (20.0% of 891)


In [8]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
776,0,3,29,0,0,7.75,1,1,1,0
829,1,1,62,0,0,80.0,1,0,0,1
215,1,1,31,1,0,113.275,0,0,0,0
258,1,1,35,0,0,512.3292,1,0,0,0
129,0,3,45,0,0,6.975,1,1,0,1


### 1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?



In [9]:
# Defining  our X and y variables for train, validate and test data

# create two variables from train, one with only features and one for target. 
X_train = train.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'sex_male', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train = train.survived

# create two variables from validate, one with only features and one for target. 
X_validate = validate.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'sex_male', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate = validate.survived

# create two variables from test, one with only features and one for target. 
X_test = test.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'sex_male', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test = test.survived

In [10]:
X_train.head()

Unnamed: 0,pclass,age,fare
776,3,29,7.75
829,1,62,80.0
215,1,31,113.275
258,1,35,512.3292
129,3,45,6.975


In [11]:
#defining the logistic regression model
logit=LogisticRegression(C=1,random_state=123)

In [12]:
#fit the model on train data
logit.fit(X_train,y_train)

In [13]:
# use model to make predictions
y_pred=logit.predict(X_train)

In [14]:
#take a look at predictions
y_pred

array([0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,

In [15]:
#Look at the predicted probabilities for the first 10 observations
logit.predict_proba(X_train)[:10]

array([[0.78281691, 0.21718309],
       [0.57980193, 0.42019807],
       [0.28019982, 0.71980018],
       [0.18954878, 0.81045122],
       [0.87078654, 0.12921346],
       [0.74780348, 0.25219652],
       [0.24781205, 0.75218795],
       [0.72510869, 0.27489131],
       [0.5851614 , 0.4148386 ],
       [0.92084565, 0.07915435]])

In [16]:
logit.classes_

array([0, 1])

In [17]:
#view raw probabilities(output from the model)
y_pred_proba=logit.predict_proba(X_train)

In [18]:
# Baseline
print(train['survived'].value_counts())
baseline_accuracy = round((train.survived == 0).mean(), 2)
baseline_accuracy

survived
0    329
1    205
Name: count, dtype: int64


0.62

In [19]:
# create algorithm object 
logit1 = LogisticRegression(C=1, random_state=42, intercept_scaling=1, solver='liblinear')

# fit model with age, pclass and fare as only features
logit1.fit(X_train[['age', 'pclass', 'fare']], y_train)

# compute accuracy
train_accuracy = logit1.score(X_train[['age', 'pclass', 'fare']], y_train)

# compare this model with baseline
print(f'Train Accuracy: {train_accuracy}')
print(f'Baseline Accuracy: {baseline_accuracy}')


Train Accuracy: 0.702247191011236
Baseline Accuracy: 0.62


In [20]:
# OR

In [21]:
y_train.value_counts()


survived
0    329
1    205
Name: count, dtype: int64

In [22]:
(y_train == 0).mean()

0.6161048689138576

In [23]:
X_train.head()

Unnamed: 0,pclass,age,fare
776,3,29,7.75
829,1,62,80.0
215,1,31,113.275
258,1,35,512.3292
129,3,45,6.975


In [24]:
features1 = ['age','fare','pclass']
X_train[features1].head()

Unnamed: 0,age,fare,pclass
776,29,7.75,3
829,62,80.0,1
215,31,113.275,1
258,35,512.3292,1
129,45,6.975,3


### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.



In [25]:
# Defining  our X and y variables for train, validate and test data

# create two variables from train, one with only features and one for target. 
X_train = train.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train = train.survived

# create two variables from validate, one with only features and one for target. 
X_validate = validate.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate = validate.survived

# create two variables from test, one with only features and one for target. 
X_test = test.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test = test.survived

In [26]:
X_train.head()

Unnamed: 0,pclass,age,fare,sex_male
776,3,29,7.75,1
829,1,62,80.0,0
215,1,31,113.275,0
258,1,35,512.3292,0
129,3,45,6.975,1


In [27]:
X_train.shape,X_validate.shape,X_test.shape

((534, 4), (178, 4), (179, 4))

In [28]:

# create algorithm object
logit2 = LogisticRegression(C=1, random_state=42, intercept_scaling=1, solver='liblinear')

# fit model with age, pclass, fare and sex_male as only features
logit2.fit(X_train[['age', 'pclass', 'fare', 'sex_male']], y_train)
    

# compute accuracy
train_accuracy = logit2.score(X_train[['age', 'pclass', 'fare', 'sex_male']], y_train)

# compare this model with baseline
print(f'Train Accuracy: {train_accuracy}')
print(f'Baseline Accuracy: {baseline_accuracy}')

Train Accuracy: 0.8014981273408239
Baseline Accuracy: 0.62


### 3. Try out other combinations of features and models.



In [29]:
# Test model with all features

# create algorithm object
logit3 = LogisticRegression(C=1, random_state=42, intercept_scaling=1, solver='liblinear')

# fit model with all features
logit3.fit(X_train, y_train)

# compute accuracy
train_accuracy = logit3.score(X_train, y_train)

# compare this model with baseline
print(f'Train Accuracy: {train_accuracy}')
print(f'Baseline Accuracy: {baseline_accuracy}')

Train Accuracy: 0.8014981273408239
Baseline Accuracy: 0.62


In [30]:
# Try changing 'solver' to 'lbfgs' feature

# create algorithm object
logit4 = LogisticRegression(C=1, random_state=42, intercept_scaling=1, solver='lbfgs')

# fit model with all features
logit4.fit(X_train, y_train)

# compute accuracy
train_acc4 = logit4.score(X_train, y_train)

# compare this model with baseline
print(f'Train Accuracy: {train_acc4}')
print(f'Baseline Accuracy: {baseline_accuracy}')

Train Accuracy: 0.8033707865168539
Baseline Accuracy: 0.62


In [31]:
# Try changing 'class_weight' to 'balanced'

# create algorithm object
logit5 = LogisticRegression(C=1, class_weight='balanced', random_state=42, intercept_scaling=1, solver='lbfgs')

# fit model with all features
logit5.fit(X_train, y_train)

# compute accuracy
train_acc5 = logit5.score(X_train, y_train)

# compare this model with baseline
print(f'Train Accuracy: {train_acc5}')
print(f'Baseline Accuracy: {baseline_accuracy}')

Train Accuracy: 0.8127340823970037
Baseline Accuracy: 0.62


In [32]:
# Try changing c-value (regularization strength) from 1 to 0.1

# create algorithm object
logit6 = LogisticRegression(C=0.1, random_state=123, intercept_scaling=1, solver='lbfgs')

# fit model with all features
logit6.fit(X_train, y_train)

# compute accuracy
train_acc6 = logit6.score(X_train, y_train)

# compare this model with baseline
print(f'Train Accuracy: {train_acc6}')
print(f'Baseline Accuracy: {baseline_accuracy}')


Train Accuracy: 0.8033707865168539
Baseline Accuracy: 0.62


### 4.Use you best 3 models to predict and evaluate on your validate sample.



In [33]:
# use logit to make predictions for the X_validate observations
y_val_pred3 = logit3.predict(X_validate)
# compute accuracy
val_acc3 = logit3.score(X_validate, y_validate)

# create a list and add to a dataframe at the end comparing all the models. 
model3 = [3, train_acc4, val_acc3]

y_val_pred5 = logit5.predict(X_validate)
val_acc5 = logit5.score(X_validate, y_validate) 
model5 = [5, train_acc5, val_acc5]

y_val_pred6 = logit6.predict(X_validate)
val_acc6 = logit6.score(X_validate, y_validate) 
model6 = [6, train_acc6, val_acc6]

pd.DataFrame([model3, model5, model6], columns=['model', 'in-sample accuracy', 'out-of-sample accuracy'])

Unnamed: 0,model,in-sample accuracy,out-of-sample accuracy
0,3,0.803371,0.769663
1,5,0.812734,0.747191
2,6,0.803371,0.764045


### 5.Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

Model logit3 performs best with out of sample accuracy

In [34]:
# Test Model 3

y_pred3 = logit3.predict(X_test)
y_pred_proba = logit3.predict_proba(X_test)
print("Model 3: solver = lbfgs, c = 1")
print('Accuracy: {:.2f}'.format(logit3.score(X_test, y_test)))
print(confusion_matrix(y_test, y_pred3))
print(classification_report(y_test, y_pred3))

Model 3: solver = lbfgs, c = 1
Accuracy: 0.79
[[94 16]
 [22 47]]
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       110
           1       0.75      0.68      0.71        69

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179



> Model 3 performs better on test than validate. There is still a small drop from how well it performs on train. All metrics are close to each other. 