In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.


In [44]:
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

import acquire
import prepare

In [2]:
df = acquire.get_titanic_data()


Found CSV


In [3]:
df.head()


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
df = prepare.prep_titanic(df)


In [5]:
df.head()


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
0,0,0,3,male,22.0,1,0,7.25,S,0,1,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
3,3,1,1,female,35.0,1,0,53.1,S,0,0,0,1
4,4,0,3,male,35.0,0,0,8.05,S,1,1,0,1


In [6]:
df.shape


(891, 13)

In [7]:
df.age.isnull().value_counts()


age
False    714
True     177
Name: count, dtype: int64

In [8]:
df.dropna(inplace=True)


In [9]:
df.shape


(714, 13)

In [10]:
train, validate, test = prepare.split_function(df, 'survived')


In [11]:
print(f'Train: {train.shape}')
print(f'Validate: {validate.shape}')
print(f'Test: {test.shape}')

Train: (428, 13)
Validate: (143, 13)
Test: (143, 13)


In [12]:
train.head()


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
548,548,0,3,male,33.0,1,1,20.525,S,0,1,0,1
133,133,1,2,female,29.0,1,0,26.0,S,0,0,0,1
540,540,1,1,female,36.0,0,2,71.0,S,0,0,0,1
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
649,649,1,3,female,23.0,0,0,7.55,S,1,0,0,1


In [13]:
train.age.isnull().value_counts()


age
False    428
Name: count, dtype: int64

In [14]:

  #  input train, validate, test, after using split function()
   # input target_variable as string
    #drop_cols formatted as: ['col1', 'col2', 'etc'] for multiple columns
       # This function will drop all 'object' columns. Identify additional 
        #columns you want to drop and insert 1 column as a string or multiple
        #columns in a list of strings.
    #X_train, X_validate, X_test, y_train, y_validate, y_test

    
def xy_train_val(train, validate, test, target_variable, drop_cols):
    
    baseline_accuracy = train[target_variable].value_counts().max() / train[target_variable].value_counts().sum()
    print(f'Baseline Accuracy: {baseline_accuracy:.2%}')
    
    X_train = train.select_dtypes(exclude=['object']).drop(columns=[target_variable]).drop(columns=drop_cols)
    X_validate = validate.select_dtypes(exclude=['object']).drop(columns=[target_variable]).drop(columns=drop_cols)
    X_test = test.select_dtypes(exclude=['object']).drop(columns=[target_variable]).drop(columns=drop_cols)
    
    y_train = train[target_variable]
    y_validate = validate[target_variable]
    y_test = test[target_variable]
    
    return X_train, X_validate, X_test, y_train, y_validate, y_test


In [15]:
X_train, X_validate, X_test, y_train, y_validate, y_test = xy_train_val(train, validate, test, 'survived', ['passenger_id'])


Baseline Accuracy: 59.35%


In [16]:
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
548,3,33.0,1,1,20.525,0,1,0,1
133,2,29.0,1,0,26.0,0,0,0,1
540,1,36.0,0,2,71.0,0,0,0,1
2,3,26.0,0,0,7.925,1,0,0,1
649,3,23.0,0,0,7.55,1,0,0,1


In [17]:
y_train[:5]


548    0
133    1
540    1
2      1
649    1
Name: survived, dtype: int64

In [18]:
y_train[:5]


548    0
133    1
540    1
2      1
649    1
Name: survived, dtype: int64

In [19]:
logit1 = LogisticRegression()
logit1

In [20]:
logit1.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
logit1.score(X_train, y_train)


0.8060747663551402

In [22]:
y_pred = logit1.predict(X_train)


1. 1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?


In [23]:
features1 = ['age', 'fare', 'pclass']


In [24]:
train.head()


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
548,548,0,3,male,33.0,1,1,20.525,S,0,1,0,1
133,133,1,2,female,29.0,1,0,26.0,S,0,0,0,1
540,540,1,1,female,36.0,0,2,71.0,S,0,0,0,1
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
649,649,1,3,female,23.0,0,0,7.55,S,1,0,0,1


In [25]:
X_train2, X_validate2, X_test2, y_train2, y_validate2, y_test2 = xy_train_val(train, validate, test, 'survived', ['passenger_id', 'sibsp', 'parch', 'alone',
                                                                                                            'sex_male', 'embarked_Q', 'embarked_S'])


Baseline Accuracy: 59.35%


In [26]:
X_train2.head()


Unnamed: 0,pclass,age,fare
548,3,33.0,20.525
133,2,29.0,26.0
540,1,36.0,71.0
2,3,26.0,7.925
649,3,23.0,7.55


In [27]:
logit2 = LogisticRegression()
logit2


In [28]:
logit2.fit(X_train2, y_train2)


In [29]:
y_pred2 = logit2.predict(X_train2)


In [30]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.81      0.88      0.84       254
           1       0.80      0.70      0.75       174

    accuracy                           0.81       428
   macro avg       0.80      0.79      0.79       428
weighted avg       0.81      0.81      0.80       428



In [78]:
logit2 = logit_run(X_train2, y_train2, X_validate2, y_validate2)


2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [46]:
train.head()


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
548,548,0,3,male,33.0,1,1,20.525,S,0,1,0,1
133,133,1,2,female,29.0,1,0,26.0,S,0,0,0,1
540,540,1,1,female,36.0,0,2,71.0,S,0,0,0,1
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
649,649,1,3,female,23.0,0,0,7.55,S,1,0,0,1


In [47]:
X_train3, X_validate3, X_test3, y_train3, y_validate3, y_test3 = xy_train_val(train, validate, test, 'survived', ['passenger_id', 'sibsp', 'parch', 'alone',
                                                                                                            'embarked_Q', 'embarked_S'])

Baseline Accuracy: 59.35%


In [48]:
X_train3.head()


Unnamed: 0,pclass,age,fare,sex_male
548,3,33.0,20.525,1
133,2,29.0,26.0,0
540,1,36.0,71.0,0
2,3,26.0,7.925,0
649,3,23.0,7.55,0


In [49]:
logit3 = LogisticRegression()
logit3

In [50]:
logit3.fit(X_train3, y_train3)

In [51]:
logit3.score(X_train3, y_train3)

0.7873831775700935

In [52]:
logit3 = logit_run(X_train3, y_train3, X_validate3, y_validate3)

NameError: name 'logit_run' is not defined

3. Try out other combinations of features and models.

In [53]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
548,548,0,3,male,33.0,1,1,20.525,S,0,1,0,1
133,133,1,2,female,29.0,1,0,26.0,S,0,0,0,1
540,540,1,1,female,36.0,0,2,71.0,S,0,0,0,1
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
649,649,1,3,female,23.0,0,0,7.55,S,1,0,0,1


In [54]:
X_train4, X_validate4, X_test4, y_train4, y_validate4, y_test4 = xy_train_val(train, validate, test, 'survived', ['passenger_id',
                                                                                                            'sibsp',
                                                                                                            'parch',
                                                                                                            'embarked_Q',
                                                                                                            'embarked_S'])

Baseline Accuracy: 59.35%


In [55]:
X_train4.head()


Unnamed: 0,pclass,age,fare,alone,sex_male
548,3,33.0,20.525,0,1
133,2,29.0,26.0,0,0
540,1,36.0,71.0,0,0
2,3,26.0,7.925,1,0
649,3,23.0,7.55,1,0


In [56]:
logit4 = LogisticRegression()
logit4

In [57]:
logit4.fit(X_train4, y_train4)

In [58]:
logit4.score(X_train4, y_train4)


0.7920560747663551

In [70]:
logit4 = logit_run(X_train4, y_train4, X_validate4, y_validate4)

In [71]:
X_train5, X_validate5, X_test5, y_train5, y_validate5, y_test5 = xy_train_val(train, validate, test, 'survived', ['passenger_id',
                                                                                                            'sibsp',
                                                                                                            'alone',
                                                                                                            'embarked_Q',
                                                                                                            'embarked_S'])

Baseline Accuracy: 59.35%


In [72]:
X_train5.head()

Unnamed: 0,pclass,age,parch,fare,sex_male
548,3,33.0,1,20.525,1
133,2,29.0,0,26.0,0
540,1,36.0,2,71.0,0
2,3,26.0,0,7.925,0
649,3,23.0,0,7.55,0


In [73]:
logit5 = LogisticRegression()
logit5

In [74]:
logit5.fit(X_train5, y_train5)


In [75]:
logit5.coef_


array([[-1.09612469e+00, -2.89332868e-02, -1.31195055e-01,
         6.29387681e-05, -2.20417572e+00]])

In [76]:
def logit_run(X_train, y_train, X_validate, y_validate):
    # Create and train a logistic regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    # Use the trained model to make predictions
    y_pred = model.predict(X_validate)
    
    # Calculate classification report and confusion matrix
    classification_report_result = classification_report(y_validate, y_pred)
    confusion_matrix_result = confusion_matrix(y_validate, y_pred)
    
    # Return the model and evaluation results
    return model, classification_report_result, confusion_matrix_result


In [77]:
logit5 = logit_run(X_train5, y_train5, X_validate5, y_validate5)

4. Use you best 3 models to predict and evaluate on your validate sample.



5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?



In [79]:
logit4.score(X_test4, y_test4)

AttributeError: 'tuple' object has no attribute 'score'

# Bonus1 
How do different strategies for handling the missing values in the age column affect model performance?