In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import prepare
import acquire

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df = data('titanic')
df.head()

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [3]:
df.replace({'class':{'1st class': 1, '2nd class': 2, '3rd class': 3}}, inplace=True)

In [4]:
df.replace({'age':{'adults': 2, 'child': 1}}, inplace=True)

In [5]:
df.replace({'sex':{'man': 0, 'women': 1}}, inplace=True)

In [6]:
df.survived.value_counts()

no     817
yes    499
Name: survived, dtype: int64

In [7]:
#checking for nulls, if there were, use: df.column.fillna(value=df.column.mode())
df.isna().sum()

class       0
age         0
sex         0
survived    0
dtype: int64

###### What is your baseline predicition?

In [8]:
df['baseline'] = 'no'
df.head()

Unnamed: 0,class,age,sex,survived,baseline
1,1,2,0,yes,no
2,1,2,0,yes,no
3,1,2,0,yes,no
4,1,2,0,yes,no
5,1,2,0,yes,no


###### What is your baseline accuracy?

In [9]:
round((df.survived == df.baseline).mean(), 2)

0.62

###### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [10]:
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [11]:
train, validate, test = train_validate_test_split(df, target='survived', seed=123)

In [12]:
X_train = train.drop(columns=['survived','baseline'])
y_train = train.survived

X_validate = validate.drop(columns=['survived','baseline'])
y_validate = validate.survived

X_test = test.drop(columns=['survived','baseline'])
y_test = test.survived

In [13]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [14]:
clf = clf.fit(X_train, y_train)

In [15]:
import graphviz
from graphviz import Graph

dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data)

In [16]:
# graph.render('titanic_decision_tree', view=True)

###### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [17]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array(['no', 'no', 'no', 'no', 'no'], dtype=object)

In [18]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.78


In [19]:
confusion_matrix(y_train, y_pred)

array([[443,  14],
       [145, 134]])

In [20]:
y_train.value_counts()

no     457
yes    279
Name: survived, dtype: int64

In [21]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,no,yes
no,443,14
yes,145,134


Create a classificaiton report

Precision: the higher this number is, the more you were able to pinpoint all positives correctly. If this is a low score, you predicted a lot of positives where there were none. 
TP/(TP+FP)

Recall: if this score is high, you didn’t miss a lot of positives. But as it gets lower, you are not predicting the positives that are actually there. 
TP/(TP+FN)

f1-score: The balanced harmonic mean of Recall and Precision, giving both metrics equal weight. The higher the F-Measure is, the better. 

Support: number of occurrences of each class in where y is true.

In [22]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

          no       0.75      0.97      0.85       457
         yes       0.91      0.48      0.63       279

    accuracy                           0.78       736
   macro avg       0.83      0.72      0.74       736
weighted avg       0.81      0.78      0.76       736



###### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [23]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.81


In [24]:
# Produce y_predictions that come from the X_validate
y_pred = clf.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

          no       0.77      0.98      0.87       196
         yes       0.95      0.53      0.68       120

    accuracy                           0.81       316
   macro avg       0.86      0.75      0.77       316
weighted avg       0.84      0.81      0.79       316



###### Run through steps 2-4 using a different max_depth value.



In [25]:
clf = DecisionTreeClassifier(max_depth=2, random_state=123)

clf = clf.fit(X_train, y_train)

dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data)

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [26]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array(['no', 'no', 'no', 'no', 'no'], dtype=object)

In [27]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.78


In [28]:
confusion_matrix(y_train, y_pred)

array([[443,  14],
       [147, 132]])

In [29]:
y_train.value_counts()

no     457
yes    279
Name: survived, dtype: int64

In [30]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,no,yes
no,443,14
yes,147,132


In [31]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

          no       0.75      0.97      0.85       457
         yes       0.90      0.47      0.62       279

    accuracy                           0.78       736
   macro avg       0.83      0.72      0.73       736
weighted avg       0.81      0.78      0.76       736



In [32]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.80


In [33]:
# Produce y_predictions that come from the X_validate
y_pred = clf.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

          no       0.76      0.98      0.86       196
         yes       0.95      0.50      0.66       120

    accuracy                           0.80       316
   macro avg       0.86      0.74      0.76       316
weighted avg       0.83      0.80      0.78       316



### Random Forest

In [34]:
df.head()

Unnamed: 0,class,age,sex,survived,baseline
1,1,2,0,yes,no
2,1,2,0,yes,no
3,1,2,0,yes,no
4,1,2,0,yes,no
5,1,2,0,yes,no


Train Validate Test

In [35]:
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [36]:
train, validate, test = train_validate_test_split(df, target='survived', seed=123)

In [37]:
X_train = train.drop(columns=['survived','baseline'])
y_train = train.survived

X_validate = validate.drop(columns=['survived','baseline'])
y_validate = validate.survived

X_test = test.drop(columns=['survived','baseline'])
y_test = test.survived

In [38]:
# Create the object
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [39]:
# Fit the model
rf.fit(X_train, y_train)
RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=123)

RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=123)

In [40]:
# Feature Importance
print(rf.feature_importances_)

[0.3647712  0.02423182 0.61099698]


In [41]:
# Make Predictions
y_pred = rf.predict(X_train)

In [42]:
# Estimate Probability
y_pred_proba = rf.predict_proba(X_train)

In [43]:
# Compute the Accuracy
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.79


In [44]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[443  14]
 [140 139]]


In [45]:
# Create a classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

          no       0.76      0.97      0.85       457
         yes       0.91      0.50      0.64       279

    accuracy                           0.79       736
   macro avg       0.83      0.73      0.75       736
weighted avg       0.82      0.79      0.77       736



In [46]:
# Validate Model
# Evaluate on Out-of-Sample data
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.82


In [47]:
# Run through steps increasing min_samples_leaf and decreasing max_depth
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=1, 
                            random_state=123)

In [48]:
# Fit the model
rf.fit(X_train, y_train)
RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=123)
# Make Predictions
y_pred = rf.predict(X_train)
# Estimate Probability
y_pred_proba = rf.predict_proba(X_train)
# Compute the Accuracy
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.73


In [49]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[455   2]
 [194  85]]


In [50]:
# Create a classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

          no       0.70      1.00      0.82       457
         yes       0.98      0.30      0.46       279

    accuracy                           0.73       736
   macro avg       0.84      0.65      0.64       736
weighted avg       0.81      0.73      0.69       736



# KNN

#### Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [51]:
df = data('titanic')
df.head()

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [52]:
df.replace({'class':{'1st class': 1, '2nd class': 2, '3rd class': 3}}, inplace=True)
df.replace({'age':{'adults': 2, 'child': 1}}, inplace=True)
df.replace({'sex':{'man': 0, 'women': 1}}, inplace=True)

In [53]:
df.head()

Unnamed: 0,class,age,sex,survived
1,1,2,0,yes
2,1,2,0,yes
3,1,2,0,yes
4,1,2,0,yes
5,1,2,0,yes


In [54]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [55]:
train, validate, test = train_validate_test_split(df, target='survived', seed=123)
train.shape, validate.shape, test.shape

((736, 4), (316, 4), (264, 4))

In [56]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [57]:
# Train the Model
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [58]:
# Fit the Model
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [59]:
# Make Predictions
y_pred = knn.predict(X_train)

In [60]:
y_pred[:10]

array(['no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no'],
      dtype=object)

In [61]:
# Calculate Probabilities
y_pred_proba = knn.predict_proba(X_train)

In [62]:
y_pred_proba[:10]

array([[0.6, 0.4],
       [1. , 0. ],
       [0.8, 0.2],
       [0.8, 0.2],
       [0.6, 0.4],
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [0.6, 0.4],
       [1. , 0. ]])

#### Evaluate your results using the model score, confusion matrix, and classification report.



In [63]:
# Evaluate the Model
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.75


In [64]:
# Create Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[376  81]
 [106 173]]


#### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [65]:
# Create a Classification Report
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

Unnamed: 0,no,yes,accuracy,macro avg,weighted avg
precision,0.780083,0.681102,0.745924,0.730593,0.742562
recall,0.822757,0.620072,0.745924,0.721414,0.745924
f1-score,0.800852,0.649156,0.745924,0.725004,0.743348
support,457.0,279.0,0.745924,736.0,736.0


In [66]:
# Train with 10 neighbors
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
# Fit the Model
knn.fit(X_train, y_train)
# Make Predictions
y_pred = knn.predict(X_train)
# Calculate Probabilities
y_pred_proba = knn.predict_proba(X_train)
y_pred_proba[:10]

array([[0.6, 0.4],
       [1. , 0. ],
       [0.9, 0.1],
       [0.9, 0.1],
       [0.7, 0.3],
       [1. , 0. ],
       [1. , 0. ],
       [0.2, 0.8],
       [0.7, 0.3],
       [1. , 0. ]])

In [67]:
# Evaluate the Model
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.79


In [68]:
# Create Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[443  14]
 [140 139]]


In [69]:
# Create a Classification Report
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

Unnamed: 0,no,yes,accuracy,macro avg,weighted avg
precision,0.759863,0.908497,0.790761,0.83418,0.816206
recall,0.969365,0.498208,0.790761,0.733787,0.790761
f1-score,0.851923,0.643519,0.790761,0.747721,0.772922
support,457.0,279.0,0.790761,736.0,736.0


In [70]:
# Train with 20 neighbors
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
# Fit the Model
knn.fit(X_train, y_train)
# Make Predictions
y_pred = knn.predict(X_train)
# Calculate Probabilities
y_pred_proba = knn.predict_proba(X_train)
y_pred_proba[:10]

array([[0.4 , 0.6 ],
       [0.9 , 0.1 ],
       [0.85, 0.15],
       [0.85, 0.15],
       [0.5 , 0.5 ],
       [0.9 , 0.1 ],
       [0.9 , 0.1 ],
       [0.45, 0.55],
       [0.5 , 0.5 ],
       [0.9 , 0.1 ]])

In [71]:
# Evaluate the Model
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.76


In [72]:
# Create Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[384  73]
 [101 178]]


In [73]:
# Create a Classification Report
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

Unnamed: 0,no,yes,accuracy,macro avg,weighted avg
precision,0.791753,0.709163,0.763587,0.750458,0.760445
recall,0.840263,0.637993,0.763587,0.739128,0.763587
f1-score,0.815287,0.671698,0.763587,0.743492,0.760856
support,457.0,279.0,0.763587,736.0,736.0


#### What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

n_neighbors = 10 was the best model, it's using more data points to compare without oversaturating the data. 

#### Which model performs best on our out-of-sample data from validate?

In [74]:
# n_neighbors = 5
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
# Fit the Model
knn.fit(X_validate, y_validate)
# Make Predictions
y_pred = knn.predict(X_validate)
# Calculate Probabilities
y_pred_proba = knn.predict_proba(X_validate)
y_pred_proba[:10]

array([[1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2],
       [0.8, 0.2],
       [0.8, 0.2],
       [1. , 0. ],
       [0.8, 0.2],
       [0.6, 0.4],
       [0.8, 0.2],
       [1. , 0. ]])

In [75]:
# Evaluate the Model
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on training set: 0.82


In [76]:
# n_neighbors = 10
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
# Fit the Model
knn.fit(X_validate, y_validate)
# Make Predictions
y_pred = knn.predict(X_validate)
# Calculate Probabilities
y_pred_proba = knn.predict_proba(X_validate)
y_pred_proba[:10]

array([[0.9, 0.1],
       [0.9, 0.1],
       [0.9, 0.1],
       [0.6, 0.4],
       [0.9, 0.1],
       [0.9, 0.1],
       [0.7, 0.3],
       [0.5, 0.5],
       [0.9, 0.1],
       [0.9, 0.1]])

In [77]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on training set: 0.81


In [78]:
# n_neighbors = 20
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
# Fit the Model
knn.fit(X_validate, y_validate)
# Make Predictions
y_pred = knn.predict(X_validate)
# Calculate Probabilities
y_pred_proba = knn.predict_proba(X_validate)
y_pred_proba[:10]

array([[0.8 , 0.2 ],
       [0.8 , 0.2 ],
       [0.9 , 0.1 ],
       [0.6 , 0.4 ],
       [0.9 , 0.1 ],
       [0.8 , 0.2 ],
       [0.65, 0.35],
       [0.35, 0.65],
       [0.9 , 0.1 ],
       [0.8 , 0.2 ]])

In [79]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on training set: 0.81


## Logistic Regression

#### Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [80]:
# Trying a different data set

train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [129]:
train.replace({'sex':{'male': 0, 'female': 1}}, inplace=True)
validate.replace({'sex':{'male': 0, 'female': 1}}, inplace=True)
test.replace({'sex':{'male': 0, 'female': 1}}, inplace=True)

In [151]:
train.replace({'embarked':{'C': 0, 'S': 1, 'Q': 2}}, inplace=True)
validate.replace({'embarked':{'C': 0, 'S': 1, 'Q': 2}}, inplace=True)
test.replace({'embarked':{'C': 0, 'S': 1, 'Q': 2}}, inplace=True)

In [152]:
train.embark_town.value_counts()

Southampton    363
Cherbourg       96
Queenstown      38
Name: embark_town, dtype: int64

In [153]:
# adding a baseline for future questions
train['baseline'] = 0
validate['baseline'] = 0
test['baseline'] = 0

In [154]:
x_cols = ['pclass','age','fare']
y_col = ['survived']

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

In [155]:
X_train.head()

Unnamed: 0,pclass,age,fare
583,1,36.0,40.125
337,1,41.0,134.5
50,3,7.0,39.6875
218,1,32.0,76.2917
31,1,29.916875,146.5208


In [156]:
y_train.head()

Unnamed: 0,survived
583,0
337,1
50,0
218,1
31,1


In [157]:
# Define the logistic regression model
logit = LogisticRegression(C=1, random_state=123)

In [158]:
#  fit the model on train data
logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [159]:
# now use the model to make predictions
y_pred = logit.predict(X_train)

In [160]:
#take a look at predictions
y_pred

array([1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,

In [161]:
# View raw probabilities (output from the model)

y_pred_proba = logit.predict_proba(X_train)
y_pred_proba = pd.DataFrame(y_pred_proba)
y_pred_proba.head()

Unnamed: 0,0,1
0,0.36398,0.63602
1,0.341399,0.658601
2,0.626598,0.373402
3,0.315053,0.684947
4,0.263599,0.736401


In [162]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79       307
           1       0.68      0.48      0.56       190

    accuracy                           0.72       497
   macro avg       0.71      0.67      0.68       497
weighted avg       0.71      0.72      0.70       497



Compared to baseline

In [163]:
x_cols = ['baseline']
y_col = ['survived']

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

In [164]:
logit = LogisticRegression(C=1, random_state=123)
logit.fit(X_train, y_train)
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)
y_pred_proba = pd.DataFrame(y_pred_proba)
y_pred_proba.head()
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.76       307
           1       0.00      0.00      0.00       190

    accuracy                           0.62       497
   macro avg       0.31      0.50      0.38       497
weighted avg       0.38      0.62      0.47       497



baseline accuracy = 62%

### Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [165]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S,baseline
583,583,0,1,0,36.0,0,0,40.125,0,First,Cherbourg,1,0,0,0
337,337,1,1,1,41.0,0,0,134.5,0,First,Cherbourg,1,0,0,0
50,50,0,3,0,7.0,4,1,39.6875,1,Third,Southampton,0,0,1,0
218,218,1,1,1,32.0,0,0,76.2917,0,First,Cherbourg,1,0,0,0
31,31,1,1,1,29.916875,1,0,146.5208,0,First,Cherbourg,0,0,0,0


In [166]:
x_cols = ['pclass','age','fare','sex']
y_col = ['survived']

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

In [167]:
logit = LogisticRegression(C=1, random_state=123)
logit.fit(X_train, y_train)
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)
y_pred_proba = pd.DataFrame(y_pred_proba)
y_pred_proba.head()

Unnamed: 0,0,1
0,0.547277,0.452723
1,0.09788,0.90212
2,0.838213,0.161787
3,0.082535,0.917465
4,0.073965,0.926035


In [168]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.71      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



In [173]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [174]:
train.replace({'sex':{'male': 0, 'female': 1}}, inplace=True)
validate.replace({'sex':{'male': 0, 'female': 1}}, inplace=True)
test.replace({'sex':{'male': 0, 'female': 1}}, inplace=True)
train.replace({'embarked':{'C': 0, 'S': 1, 'Q': 2}}, inplace=True)
validate.replace({'embarked':{'C': 0, 'S': 1, 'Q': 2}}, inplace=True)
test.replace({'embarked':{'C': 0, 'S': 1, 'Q': 2}}, inplace=True)

In [186]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,0,36.0,0,0,40.125,0,First,Cherbourg,1,0,0
337,337,1,1,1,41.0,0,0,134.5,0,First,Cherbourg,1,0,0
50,50,0,3,0,7.0,4,1,39.6875,1,Third,Southampton,0,0,1
218,218,1,1,1,32.0,0,0,76.2917,0,First,Cherbourg,1,0,0
31,31,1,1,1,29.916875,1,0,146.5208,0,First,Cherbourg,0,0,0


In [194]:
survived = train[train['survived']==1]
survived.sibsp.value_counts()

0    117
1     61
2      8
3      3
4      1
Name: sibsp, dtype: int64

In [195]:
x_cols = ['age','alone','sex']
y_col = ['survived']

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

In [196]:
logit = LogisticRegression(C=1, random_state=123)
logit.fit(X_train, y_train)
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)
y_pred_proba = pd.DataFrame(y_pred_proba)
y_pred_proba.head()

Unnamed: 0,0,1
0,0.816557,0.183443
1,0.317631,0.682369
2,0.749072,0.250928
3,0.323612,0.676388
4,0.228135,0.771865


In [197]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       307
           1       0.75      0.66      0.70       190

    accuracy                           0.78       497
   macro avg       0.78      0.76      0.77       497
weighted avg       0.78      0.78      0.78       497

