# <span style="color:blue">Imports</span>

In [1020]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import acquire

warnings.filterwarnings('ignore')

# <span style="color:blue">Acquire Data</span>

In [1021]:
# Load the Titanic data through the project's acquire helper, then list each column's dtype
df = acquire.get_titanic_data()
df.dtypes

csv file found and loaded


passenger_id      int64
survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class            object
deck             object
embark_town      object
alone             int64
dtype: object

# <span style="color:blue">Prepare Data</span>


In [1022]:
# Exercise: create a model that includes only age, fare, and pclass and compare it with the baseline.
# Then include sex as well; it must be encoded (dummy variable) before it can go into a model.
#
# Keep the needed columns by name rather than dropping the others: after a drop,
# re-running this cell raises KeyError because the dropped columns are already gone.
df = df[['survived', 'pclass', 'sex', 'age', 'fare']]

In [1023]:
# Display the reduced frame (target plus the four model-1 features)
df

Unnamed: 0,survived,pclass,sex,age,fare
0,0,3,male,22.0,7.2500
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.9250
3,1,1,female,35.0,53.1000
4,0,3,male,35.0,8.0500
...,...,...,...,...,...
886,0,2,male,27.0,13.0000
887,1,1,female,19.0,30.0000
888,0,3,female,,23.4500
889,1,1,male,26.0,30.0000


In [1024]:
# Separate the target (kept as a one-column DataFrame) from the feature columns
Y = df.loc[:, ['survived']]
X = df.drop('survived', axis=1)

In [1025]:
# Replace missing ages with the mean age, then verify that no nulls remain in the column.
# NOTE(review): the mean is computed on the full dataset before the train/validate/test
# split, so validation and test rows influence the imputed value; consider deriving
# the fill value from the training split only.
X = X.assign(age=X.age.fillna(X.age.mean()))
assert X.age.notna().all(), "age still contains nulls after imputation"

In [1026]:
# Partition the features by dtype: object columns are treated as categorical,
# every other column as numeric
cat = X.select_dtypes(include=['object'])
num = X.select_dtypes(exclude=['object'])

In [1027]:
# Create dummy (indicator) variables for the categorical columns;
# drop_first=True omits the first level of each column, which becomes the reference level
cat = pd.get_dummies(cat, drop_first=True)
cat

Unnamed: 0,sex_male
0,1
1,0
2,0
3,0
4,1
...,...
886,1
887,0
888,0
889,1


In [1028]:
# Reassemble the feature matrix: numeric columns first, then the dummy columns
# (both frames carry the same row index, so an index join lines the rows up)
X = num.join(cat)

# <span style="color:blue">Baseline</span>

In [1029]:
# Baseline: always predict the most frequent class; its accuracy is that class's share
survival_counts = Y.survived.value_counts()
baseline = survival_counts.max() / survival_counts.sum()
print(f'Baseline Prediction: {survival_counts.idxmax()}')
print(f'Baseline Accuracy: {baseline:.2%}')

Baseline Prediction: 0
Baseline Accuracy: 61.62%


# <span style="color:blue">Convert to Train, Validate, Test</span>

In [1030]:
# Split features and target into train / validate / test with the project helper
# (how it seeds or stratifies the split is defined in acquire and not visible here).
X_train, X_validate, X_test, y_train, y_validate, y_test = acquire.train_validate_test_split(X, Y)
print("Shape of Training Data", X_train.shape)
print("Shape of Validation Data", X_validate.shape)
print("Shape of Testing Data", X_test.shape)
# The target is `survived`, so the mean of each split is its survival rate
print("Survival Rate in Training Data", y_train.mean())
print("Survival Rate in Validation Data", y_validate.mean())
print("Survival Rate in Testing Data", y_test.mean())

Shape of Training Data (534, 4)
Shape of Validation Data (178, 4)
Shape of Testing Data (179, 4)
Churn Rate in Training Data survived    0.383895
dtype: float64
Churn Rate in Validation Data survived    0.382022
dtype: float64
Churn Rate in Testing Data survived    0.385475
dtype: float64


# <span style="color:blue">Logistic Regression Model</span>

In [1031]:
# Create the classifier with scikit-learn's default hyperparameters
logit = LogisticRegression()

In [1032]:
# Fit on the training split only.
# The target is passed as a 1-D Series: scikit-learn expects y of shape (n_samples,),
# and a one-column DataFrame raises a DataConversionWarning (silenced in this
# notebook by the global warnings filter).
logit.fit(X_train, y_train.survived)

In [1033]:
# Feature coefficients (one per X_train column, in column order) and the intercept
# NOTE(review): no scaling step is visible in this notebook, so coefficient magnitudes
# are not directly comparable across features; confirm before reading them as importance.
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-1.14920042e+00 -2.79796794e-02 -1.06846138e-03 -2.59942200e+00]]
Intercept: 
 [4.58257482]


In [1034]:
# Predict a class label for every row of the training split
y_pred = logit.predict(X_train)

In [1035]:
# Estimate the class probabilities for every row of the training split
y_pred_proba = logit.predict_proba(X_train)


# <span style="color:blue">Evaluate Model</span>

In [1036]:
# Mean accuracy of the fitted model on the training split
logit.score(X_train, y_train)

0.8052434456928839

In [1037]:
# Confusion Matrix on the training split (rows = actual class, columns = predicted class)
# The import is kept so the cell still works if the name was rebound earlier in the session.
from sklearn.metrics import confusion_matrix
# Store the result under its own name: assigning it to `confusion_matrix` would
# overwrite the imported function and make the next call to it fail.
train_confusion = confusion_matrix(y_train, y_pred)
print(train_confusion)

[[282  47]
 [ 57 148]]


In [1038]:
# Classification Report: per-class precision, recall and F1 on the training split
# (classification_report is imported once, in the imports cell at the top)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       329
           1       0.76      0.72      0.74       205

    accuracy                           0.81       534
   macro avg       0.80      0.79      0.79       534
weighted avg       0.80      0.81      0.80       534



# <span style="color:blue">Validate Model</span>

In [1039]:
# Accuracy on the validation split, which the model did not see during fitting
validate_accuracy = logit.score(X_validate, y_validate)
print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'.format(validate_accuracy))

Accuracy of Logistic Regression classifier on validation set: 0.78


# <span style="color:blue">Current Model Takeaways</span>

#### Model 1 beats the baseline (61.62% accuracy)
#### with 81% training accuracy and 78% validation accuracy


# <span style="color:blue">Model 2</span>
#### This model predicts survived from age, fare, pclass, sex, and alone

In [1040]:
# Reload the full Titanic data so model 2 starts from all original columns
df2 = acquire.get_titanic_data()

csv file found and loaded


# <span style="color:blue">Prepare Data</span>

In [1041]:
# Keep the target plus the model-2 features by name. Selecting (rather than dropping
# the other columns) keeps the cell re-runnable: a second drop would raise KeyError.
df2 = df2[['survived', 'pclass', 'sex', 'age', 'fare', 'alone']]

In [1042]:
# Confirm which columns remain and their dtypes
df2.dtypes

survived      int64
pclass        int64
sex          object
age         float64
fare        float64
alone         int64
dtype: object

In [1043]:
# Separate target from features, then prepare the features in one chain:
#   - alone is cast to object so it is dummy-encoded with the categorical columns
#   - missing ages are replaced with the mean age
Y2 = df2[['survived']]
X2 = df2.drop(columns=['survived']).assign(
    alone=lambda frame: frame.alone.astype('object'),
    age=lambda frame: frame.age.fillna(frame.age.mean()),
)
# Object columns are categorical; everything else is numeric
cat = X2.select_dtypes(include='object')
num = X2.select_dtypes(exclude='object')

In [1044]:
# Create dummy (indicator) variables; drop_first=True omits the first level of
# each column, which becomes the reference level
cat = pd.get_dummies(cat, drop_first=True)

In [1045]:
# Reassemble the feature matrix: numeric columns first, then the dummy columns
# (both frames carry the same row index, so an index join lines the rows up)
X2 = num.join(cat)

In [1046]:
# Display the assembled model-2 feature matrix
X2

Unnamed: 0,pclass,age,fare,sex_male,alone_1
0,3,22.000000,7.2500,1,0
1,1,38.000000,71.2833,0,0
2,3,26.000000,7.9250,0,1
3,1,35.000000,53.1000,0,0
4,3,35.000000,8.0500,1,1
...,...,...,...,...,...
886,2,27.000000,13.0000,1,1
887,1,19.000000,30.0000,0,1
888,3,29.699118,23.4500,0,0
889,1,26.000000,30.0000,1,1


# <span style="color:blue">Baseline</span>

In [1047]:
# Baseline: always predict the most frequent class; its accuracy is that class's share
survival_counts = Y2.survived.value_counts()
baseline = survival_counts.max() / survival_counts.sum()
print(f'Baseline Prediction: {survival_counts.idxmax()}')
print(f'Baseline Accuracy: {baseline:.2%}')

Baseline Prediction: 0
Baseline Accuracy: 61.62%


# <span style="color:blue">Convert to Train, Validate, Test</span>

In [1048]:
# Split features and target into train / validate / test with the project helper
# (how it seeds or stratifies the split is defined in acquire and not visible here).
X_train, X_validate, X_test, y_train, y_validate, y_test = acquire.train_validate_test_split(X2, Y2)
print("Shape of Training Data", X_train.shape)
print("Shape of Validation Data", X_validate.shape)
print("Shape of Testing Data", X_test.shape)
# The target is `survived`, so the mean of each split is its survival rate
print("Survival Rate in Training Data", y_train.mean())
print("Survival Rate in Validation Data", y_validate.mean())
print("Survival Rate in Testing Data", y_test.mean())

Shape of Training Data (534, 5)
Shape of Validation Data (178, 5)
Shape of Testing Data (179, 5)
Churn Rate in Training Data survived    0.383895
dtype: float64
Churn Rate in Validation Data survived    0.382022
dtype: float64
Churn Rate in Testing Data survived    0.385475
dtype: float64


# <span style="color:blue">Logistic Regression Model</span>

In [1049]:
# Create a fresh classifier with scikit-learn's default hyperparameters
# (rebinds `logit`, so the previously fitted model is no longer reachable by that name)
logit = LogisticRegression()

In [1050]:
# Fit on the training split only.
# The target is passed as a 1-D Series: scikit-learn expects y of shape (n_samples,),
# and a one-column DataFrame raises a DataConversionWarning (silenced in this
# notebook by the global warnings filter).
logit.fit(X_train, y_train.survived)

In [1051]:
# Feature coefficients (one per X_train column, in column order) and the intercept
# NOTE(review): no scaling step is visible in this notebook, so coefficient magnitudes
# are not directly comparable across features; confirm before reading them as importance.
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-1.14926935e+00 -2.80932791e-02 -1.02372060e-03 -2.60244800e+00
   1.36618628e-02]]
Intercept: 
 [4.57843775]


In [1052]:
# Predict a class label for every row of the training split
y_pred = logit.predict(X_train)

# <span style="color:blue">Evaluate Model</span>

In [1053]:
# Estimate the class probabilities for every row of the training split
y_pred_proba = logit.predict_proba(X_train)

In [1054]:
# Accuracy on the training split
train_accuracy = logit.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.81


In [1055]:
# Confusion Matrix on the training split (rows = actual class, columns = predicted class)
# The import is kept so the cell still works if the name was rebound earlier in the session.
from sklearn.metrics import confusion_matrix
# Store the result under its own name: assigning it to `confusion_matrix` would
# overwrite the imported function and make the next call to it fail.
train_confusion = confusion_matrix(y_train, y_pred)
print(train_confusion)

[[283  46]
 [ 56 149]]


In [1056]:
# Classification Report: per-class precision, recall and F1 on the training split
# (classification_report is imported once, in the imports cell at the top)
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.83      0.86      0.85       329
           1       0.76      0.73      0.74       205

    accuracy                           0.81       534
   macro avg       0.80      0.79      0.80       534
weighted avg       0.81      0.81      0.81       534



In [1057]:
# Accuracy on the validation split, which the model did not see during fitting
validate_accuracy = logit.score(X_validate, y_validate)
print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'.format(validate_accuracy))


Accuracy of Logistic Regression classifier on validation set: 0.78


# <span style="color:blue">Current Model Takeaways</span>
#### Model 2 beats the baseline (61.62% accuracy)
#### with 81% training accuracy and 78% validation accuracy

# <span style="color:blue">Model 3</span>
#### Same features as model 2 plus embarked, with pclass one-hot encoded instead of numeric

In [1058]:
# Reload the full Titanic data so model 3 starts from all original columns
df3 = acquire.get_titanic_data()

csv file found and loaded


In [1059]:
# Keep the target plus the model-3 features by name. Selecting (rather than dropping
# the other columns) keeps the cell re-runnable: a second drop would raise KeyError.
df3 = df3[['survived', 'pclass', 'sex', 'age', 'fare', 'embarked', 'alone']]

In [1060]:
# Confirm which columns remain and their dtypes
df3.dtypes

survived      int64
pclass        int64
sex          object
age         float64
fare        float64
embarked     object
alone         int64
dtype: object

In [1061]:
# Separate target from features, then prepare the features in one chain:
#   - alone and pclass are cast to object so they are dummy-encoded as categories
#   - missing ages are replaced with the mean age
Y3 = df3[['survived']]
X3 = df3.drop(columns=['survived']).assign(
    alone=lambda frame: frame.alone.astype('object'),
    pclass=lambda frame: frame.pclass.astype('object'),
    age=lambda frame: frame.age.fillna(frame.age.mean()),
)
# Object columns are categorical; everything else is numeric
cat = X3.select_dtypes(include='object')
num = X3.select_dtypes(exclude='object')

In [1062]:
# Create dummy (indicator) variables; drop_first=True omits the first level of
# each column, which becomes the reference level
# NOTE(review): embarked is not imputed above; any missing value would get all-zero
# indicators and be indistinguishable from the reference level. Confirm this is acceptable.
cat = pd.get_dummies(cat, drop_first=True)

In [1063]:
# Reassemble the feature matrix: numeric columns first, then the dummy columns
# (both frames carry the same row index, so an index join lines the rows up)
X3 = num.join(cat)

# <span style="color:blue">Baseline</span>

In [1064]:
# Baseline: always predict the most frequent class; its accuracy is that class's share
survival_counts = Y3.survived.value_counts()
baseline = survival_counts.max() / survival_counts.sum()
print(f'Baseline Prediction: {survival_counts.idxmax()}')
print(f'Baseline Accuracy: {baseline:.2%}')

Baseline Prediction: 0
Baseline Accuracy: 61.62%


# <span style="color:blue">Convert to Train, Validate, Test</span>

In [1065]:
# Split features and target into train / validate / test with the project helper
# (how it seeds or stratifies the split is defined in acquire and not visible here).
X_train, X_validate, X_test, y_train, y_validate, y_test = acquire.train_validate_test_split(X3, Y3)
print("Shape of Training Data", X_train.shape)
print("Shape of Validation Data", X_validate.shape)
print("Shape of Testing Data", X_test.shape)
# The target is `survived`, so the mean of each split is its survival rate
print("Survival Rate in Training Data", y_train.mean())
print("Survival Rate in Validation Data", y_validate.mean())
print("Survival Rate in Testing Data", y_test.mean())


Shape of Training Data (534, 8)
Shape of Validation Data (178, 8)
Shape of Testing Data (179, 8)
Churn Rate in Training Data survived    0.383895
dtype: float64
Churn Rate in Validation Data survived    0.382022
dtype: float64
Churn Rate in Testing Data survived    0.385475
dtype: float64


# <span style="color:blue">Logistic Regression Model</span>

In [1066]:
# Create a fresh classifier with scikit-learn's default hyperparameters
# (rebinds `logit`, so the previously fitted model is no longer reachable by that name)
logit = LogisticRegression()

In [1067]:
# Fit on the training split only.
# The target is passed as a 1-D Series: scikit-learn expects y of shape (n_samples,),
# and a one-column DataFrame raises a DataConversionWarning (silenced in this
# notebook by the global warnings filter).
logit.fit(X_train, y_train.survived)

In [1068]:
# Feature coefficients (one per X_train column, in column order) and the intercept
# NOTE(review): no scaling step is visible in this notebook, so coefficient magnitudes
# are not directly comparable across features; confirm before reading them as importance.
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)


Coefficient: 
 [[-2.40959439e-02 -1.24942561e-05 -6.35313892e-01 -2.05092650e+00
  -2.51086224e+00  4.98020743e-01 -3.69600519e-01 -2.50766412e-02]]
Intercept: 
 [3.20429613]


In [1069]:
# Predict a class label for every row of the training split
y_pred = logit.predict(X_train)

# <span style="color:blue">Evaluate Model</span>

In [1070]:
# Estimate the class probabilities for every row of the training split
y_pred_proba = logit.predict_proba(X_train)

In [1071]:
# Accuracy on the training split
train_accuracy = logit.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.80


In [1072]:
# Confusion Matrix on the training split (rows = actual class, columns = predicted class)
# The import is kept so the cell still works if the name was rebound earlier in the session.
from sklearn.metrics import confusion_matrix
# Store the result under its own name: assigning it to `confusion_matrix` would
# overwrite the imported function and make the next call to it fail.
train_confusion = confusion_matrix(y_train, y_pred)
print(train_confusion)

[[284  45]
 [ 60 145]]


In [1073]:
# Validate: from this cell on, `y_pred` holds predictions for the validation split
y_pred = logit.predict(X_validate)
# Accuracy on the validation split, which the model did not see during fitting
validate_accuracy = logit.score(X_validate, y_validate)
print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'.format(validate_accuracy))


Accuracy of Logistic Regression classifier on validation set: 0.78


In [1074]:
# Confusion Matrix on the validation split (rows = actual class, columns = predicted class)
# The import is kept so the cell still works if the name was rebound earlier in the session.
from sklearn.metrics import confusion_matrix
# Store the result under its own name: assigning it to `confusion_matrix` would
# overwrite the imported function and make the next call to it fail.
validate_confusion = confusion_matrix(y_validate, y_pred)
print(validate_confusion)

[[94 16]
 [23 45]]


In [1075]:
# Classification Report: per-class precision, recall and F1 on the validation split
# (classification_report is imported once, in the imports cell at the top)
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.85      0.83       110
           1       0.74      0.66      0.70        68

    accuracy                           0.78       178
   macro avg       0.77      0.76      0.76       178
weighted avg       0.78      0.78      0.78       178



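# <span style="color:blue">Current Model Takeaways</span>
#### Model 3 beats the baseline (61.62% accuracy)
#### with 80% training accuracy and 78% validation accuracy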

# <span style="color:blue">Model 4</span>

In [1076]:
# Reload the full Titanic data so model 4 starts from all original columns
df4 = acquire.get_titanic_data()

csv file found and loaded


In [1077]:
# Keep the target plus the model-4 features by name. Selecting (rather than dropping
# the other columns) keeps the cell re-runnable: a second drop would raise KeyError.
df4 = df4[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'deck', 'alone']]

In [1078]:
# Confirm which columns remain and their dtypes
df4.dtypes

survived      int64
pclass        int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
deck         object
alone         int64
dtype: object

In [1079]:
# Separate target from features; alone, parch and sibsp are cast to object so
# they are dummy-encoded as categories rather than used as numbers
Y4 = df4[['survived']]
X4 = df4.drop(columns=['survived']).assign(
    alone=lambda frame: frame.alone.astype('object'),
    parch=lambda frame: frame.parch.astype('object'),
    sibsp=lambda frame: frame.sibsp.astype('object'),
)

# Object columns are categorical; everything else is numeric
cat = X4.select_dtypes(include='object')
num = X4.select_dtypes(exclude='object')

In [1080]:
# Fill missing ages with 22. The result is reassigned instead of calling
# fillna(..., inplace=True) on `num.age`: that form modifies an intermediate Series
# and, under pandas Copy-on-Write, leaves `num` unchanged so NaNs would reach the model.
# NOTE(review): models 1-3 impute age with the column mean; confirm the constant 22 is intended.
num = num.assign(age=num.age.fillna(22))

In [1081]:
# Create dummy (indicator) variables; drop_first=True omits the first level of
# each column, which becomes the reference level
# NOTE(review): embarked and deck are not imputed above; any missing value would get
# all-zero indicators and be indistinguishable from the reference level. Confirm this is acceptable.
cat = pd.get_dummies(cat, drop_first=True)

In [1082]:
# Reassemble the feature matrix: numeric columns first, then the dummy columns
# (both frames carry the same row index, so an index join lines the rows up)
X4 = num.join(cat)

In [1083]:
# Baseline: always predict the most frequent class; its accuracy is that class's share
survival_counts = Y4.survived.value_counts()
baseline = survival_counts.max() / survival_counts.sum()
print(f'Baseline Prediction: {survival_counts.idxmax()}')
print(f'Baseline Accuracy: {baseline:.2%}')


Baseline Prediction: 0
Baseline Accuracy: 61.62%


In [1084]:
# Convert to Train, Validate, Test

In [1085]:
# Split features and target into train / validate / test with the project helper
# (how it seeds or stratifies the split is defined in acquire and not visible here).
X_train, X_validate, X_test, y_train, y_validate, y_test = acquire.train_validate_test_split(X4, Y4)
print("Shape of Training Data", X_train.shape)
print("Shape of Validation Data", X_validate.shape)
print("Shape of Testing Data", X_test.shape)
# The target is `survived`, so the mean of each split is its survival rate
print("Survival Rate in Training Data", y_train.mean())
print("Survival Rate in Validation Data", y_validate.mean())
print("Survival Rate in Testing Data", y_test.mean())


Shape of Training Data (534, 25)
Shape of Validation Data (178, 25)
Shape of Testing Data (179, 25)
Churn Rate in Training Data survived    0.383895
dtype: float64
Churn Rate in Validation Data survived    0.382022
dtype: float64
Churn Rate in Testing Data survived    0.385475
dtype: float64


# <span style="color:blue">Logistic Regression Model</span>

In [1086]:
# Create a fresh classifier with scikit-learn's default hyperparameters
# (rebinds `logit`, so the previously fitted model is no longer reachable by that name)
logit = LogisticRegression()

In [1087]:
# Fit on the training split only.
# The target is passed as a 1-D Series: scikit-learn expects y of shape (n_samples,),
# and a one-column DataFrame raises a DataConversionWarning (silenced in this
# notebook by the global warnings filter).
logit.fit(X_train, y_train.survived)

In [1088]:
# Feature coefficients (one per X_train column, in column order) and the intercept
# NOTE(review): no scaling step is visible in this notebook, so coefficient magnitudes
# are not directly comparable across features; confirm before reading them as importance.
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)


Coefficient: 
 [[-9.38724279e-01 -2.43475253e-02 -2.54982948e-04 -2.54736677e+00
   9.79268355e-01  4.88005532e-01 -1.01387351e+00 -2.58471397e-01
  -3.21196584e-01 -5.80873483e-01  6.86577834e-01  4.82431534e-01
  -2.72090250e-02 -1.90107364e-01 -8.01529976e-02  0.00000000e+00
   5.57971308e-01 -1.17641756e-01  7.36234715e-01  8.04394317e-02
   9.47996080e-01  1.47021600e+00  5.91621173e-01  2.36280228e-01
   8.11989111e-01]]
Intercept: 
 [2.97909669]


In [1089]:
# Predict a class label for every row of the training split
y_pred = logit.predict(X_train)

In [1090]:
# Estimate the class probabilities for every row of the training split
y_pred_proba = logit.predict_proba(X_train)


In [1091]:
# Accuracy on the training split
train_accuracy = logit.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.84


In [1092]:
# Confusion Matrix on the training split (rows = actual class, columns = predicted class)
# The import is kept so the cell still works if the name was rebound earlier in the session.
from sklearn.metrics import confusion_matrix
# Store the result under its own name: assigning it to `confusion_matrix` would
# overwrite the imported function and make the next call to it fail.
train_confusion = confusion_matrix(y_train, y_pred)
print(train_confusion)

[[291  38]
 [ 50 155]]


In [1093]:
# Validate: from this cell on, `y_pred` holds predictions for the validation split
y_pred = logit.predict(X_validate)
# Accuracy on the validation split, which the model did not see during fitting
validate_accuracy = logit.score(X_validate, y_validate)
print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'.format(validate_accuracy))

Accuracy of Logistic Regression classifier on validation set: 0.77


In [1094]:
# Confusion Matrix on the validation split (rows = actual class, columns = predicted class)
# The import is kept so the cell still works if the name was rebound earlier in the session.
from sklearn.metrics import confusion_matrix
# Store the result under its own name: assigning it to `confusion_matrix` would
# overwrite the imported function and make the next call to it fail.
validate_confusion = confusion_matrix(y_validate, y_pred)
print(validate_confusion)


[[90 20]
 [21 47]]


In [1095]:
# Classification Report: per-class precision, recall and F1 on the validation split
# (classification_report is imported once, in the imports cell at the top)
print(classification_report(y_validate, y_pred))


              precision    recall  f1-score   support

           0       0.81      0.82      0.81       110
           1       0.70      0.69      0.70        68

    accuracy                           0.77       178
   macro avg       0.76      0.75      0.76       178
weighted avg       0.77      0.77      0.77       178



Accuracy of Logistic Regression classifier on test set: 0.81


# <span style="color:blue">Current Model Takeaways</span>
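#### Model 4 beats the baseline (61.62% accuracy) with 84% training accuracy and 77% validation accuracy
#### The wider gap between training and validation accuracy suggests this model overfits more than models 1-3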