In [92]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from acquire import get_titanic_data, get_iris_data
from prepare import prep_titanic
import graphviz
from graphviz import Graph
import warnings
warnings.filterwarnings('ignore')

In [93]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

## Baseline Prediction and Basic Exploration

In [94]:
titanic = get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [95]:
for col, vals in titanic.iteritems():
    print(titanic[f'{col}'].value_counts())

0      1
598    1
587    1
588    1
589    1
      ..
300    1
301    1
302    1
303    1
890    1
Name: passenger_id, Length: 891, dtype: int64
0    549
1    342
Name: survived, dtype: int64
3    491
1    216
2    184
Name: pclass, dtype: int64
male      577
female    314
Name: sex, dtype: int64
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: age, Length: 88, dtype: int64
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: sibsp, dtype: int64
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: parch, dtype: int64
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
35.0000     1
28.5000     1
6.2375      1
14.0000     1
10.5167     1
Name: fare, Length: 248, dtype: int64
S    644
C    168
Q     77
Name: embarked, dtype: int64
Third     491
First     216
Second    184
Name: class, dtype: int64
C    59
B    47
D    33
E    32
A    15


In [96]:
titanic['baseline'] = 0
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,baseline
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,0
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,0


In [97]:
(titanic.survived == titanic.baseline).mean()

0.6161616161616161

Baseline estimate: ~61.62%

## Modeling

In [98]:
train, validate, test = prep_titanic(get_titanic_data())
train.shape, validate.shape, test.shape

((497, 12), (214, 12), (178, 12))

In [99]:
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

In [100]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)
clf = clf.fit(X_train, y_train)

In [101]:
dot_data = export_graphviz(clf, feature_names= X_train.columns,class_names=['died','lived'], rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [102]:
y_pred = clf.predict(X_train)
y_pred

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,

In [103]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.51666667, 0.48333333],
       [0.0326087 , 0.9673913 ],
       [0.88      , 0.12      ],
       [0.0326087 , 0.9673913 ],
       [0.0326087 , 0.9673913 ],
       [0.88      , 0.12      ],
       [0.88      , 0.12      ],
       [0.88      , 0.12      ],
       [0.88      , 0.12      ],
       [0.93333333, 0.06666667],
       [0.93333333, 0.06666667],
       [0.92857143, 0.07142857],
       [0.0326087 , 0.9673913 ],
       [0.51666667, 0.48333333],
       [0.88      , 0.12      ],
       [0.92857143, 0.07142857],
       [0.88      , 0.12      ],
       [0.51666667, 0.48333333],
       [0.88      , 0.12      ],
       [0.4137931 , 0.5862069 ],
       [0.0326087 , 0.9673913 ],
       [0.88      , 0.12      ],
       [0.0326087 , 0.9673913 ],
       [0.4137931 , 0.5862069 ],
       [0.51666667, 0.48333333],
       [0.88      , 0.12      ],
       [0.0326087 , 0.9673913 ],
       [0.0326087 , 0.9673913 ],
       [0.51666667, 0.48333333],
       [0.88      , 0.12      ],
       [0.

In [104]:
accuracy = round(clf.score(X_train, y_train), 2)
print(f'Training accuracy: {accuracy}')

Training accuracy: 0.82


In [105]:
train['prediction'] = clf.predict(X_train)
train[['prediction', 'survived']]

Unnamed: 0,prediction,survived
583,0,0
337,1,1
50,0,0
218,1,1
31,1,1
...,...,...
313,0,0
636,0,0
222,0,0
485,0,0


In [106]:
confusion1 = confusion_matrix(y_train, y_pred)
con_df = pd.DataFrame(confusion_matrix(y_train, y_pred))
con_df.rename(columns = {0: 'died', 1: 'lived'}, index = {0: 'died', 1: 'lived'})

Unnamed: 0,died,lived
died,279,28
lived,62,128


## Compute:

In [107]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497



In [108]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [109]:
TP = confusion1[0,0]
TN = confusion1[1,1]
FP = confusion1[0,1]
FN = confusion1[1,0]

AR = (TP+TN)/(TP+FP+FN+TN)
TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

print(f'Accuracy: {AR:.2%}\nTrue Positive Rate: {TPR:.2%}\nFalse Positive Rate: {FPR:.2%}\nTrue Negaitve Rate: {TNR:.2%}\nFalse Negative Rate: {FNR:.2%}')

Accuracy: 81.89%
True Positive Rate: 81.82%
False Positive Rate: 17.95%
True Negaitve Rate: 82.05%
False Negative Rate: 18.18%


#### Analysis: 
The model performs better on in-sample data compared to the baseline of 61.62%

### Out-of-sample

In [110]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.79


In [111]:
y_pred = clf.predict(X_validate)
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       132
           1       0.77      0.65      0.70        82

    accuracy                           0.79       214
   macro avg       0.78      0.76      0.77       214
weighted avg       0.79      0.79      0.79       214



In [112]:
confusion2 = confusion_matrix(y_validate, y_pred)
con_df2 = pd.DataFrame(confusion_matrix(y_validate, y_pred))
con_df2.rename(columns = {0: 'died', 1: 'lived'}, index = {0: 'died', 1: 'lived'})

Unnamed: 0,died,lived
died,116,16
lived,29,53


In [113]:
TP = confusion2[0,0]
TN = confusion2[1,1]
FP = confusion2[0,1]
FN = confusion2[1,0]

AR = (TP+TN)/(TP+FP+FN+TN)
TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

print(f'Accuracy: {AR:.2%}\nTrue Positive Rate: {TPR:.2%}\nFalse Positive Rate: {FPR:.2%}\nTrue Negaitve Rate: {TNR:.2%}\nFalse Negative Rate: {FNR:.2%}')

Accuracy: 78.97%
True Positive Rate: 80.00%
False Positive Rate: 23.19%
True Negaitve Rate: 76.81%
False Negative Rate: 20.00%


#### Analysis: 
The model performs better on out-of-sample data compared to the baseline of 61.62%

Additionally, the best way to minimize over/underfitting is to keep the `max_depth` value at 3

# Random Forrest

### Fitting Model

In [114]:
rfc = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

### Evaluate Model

In [115]:
for i in range(0, len(list(X_train.columns))):
    print(X_train.columns[i], rfc.feature_importances_[i])

passenger_id 0.17092083917232642
sex 0.24080376754637992
age 0.14761909049395786
sibsp 0.044382154196669944
parch 0.03053929084363289
fare 0.20166824074859663
class 0.09914946094562498
alone 0.026274267124618927
Cherbourg 0.013785358842932333
Queenstown 0.009458192575770026
Southampton 0.015399337509489968


In [116]:
y_pred = rfc.predict(X_train)
y_pred_proba = rfc.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.7767774 , 0.2232226 ],
       [0.025     , 0.975     ],
       [0.99156452, 0.00843548],
       [0.0125    , 0.9875    ],
       [0.06      , 0.94      ]])

In [117]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rfc.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [118]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       307
           1       1.00      0.92      0.96       190

    accuracy                           0.97       497
   macro avg       0.98      0.96      0.97       497
weighted avg       0.97      0.97      0.97       497



In [119]:
confusion3 = confusion_matrix(y_train, y_pred)
con_df3 = pd.DataFrame(confusion_matrix(y_train, y_pred))
con_df3.rename(columns = {0: 'died', 1: 'lived'}, index = {0: 'died', 1: 'lived'})

Unnamed: 0,died,lived
died,307,0
lived,16,174


In [120]:
TP = confusion3[0,0]
TN = confusion3[1,1]
FP = confusion3[0,1]
FN = confusion3[1,0]

AR = (TP+TN)/(TP+FP+FN+TN)
TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

print(f'Accuracy: {AR:.2%}\nTrue Positive Rate: {TPR:.2%}\nFalse Positive Rate: {FPR:.2%}\nTrue Negaitve Rate: {TNR:.2%}\nFalse Negative Rate: {FNR:.2%}')

Accuracy: 96.78%
True Positive Rate: 95.05%
False Positive Rate: 0.00%
True Negaitve Rate: 100.00%
False Negative Rate: 4.95%


In [121]:
#For loop that increases the min_leaf_samples in addition to decreasing the max_depth
for i in range(1, 10):
    # Make the model
    rfc = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=(i),
                            n_estimators=100,
                            max_depth=(11 - i), 
                            random_state=123)

    # Fit the model (on train and only train)
    rfc = rfc.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = rfc.predict(X_train)
    
    tscore = rfc.score(X_train, y_train)
    vscore = rfc.score(X_validate, y_validate)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(rfc)
    print(pd.DataFrame(report))
    print(f"Forrest with max depth of {11 - i} and minimum leaf samples {i}")
    print(f'Accuracy of random forest classifier on train set: {tscore:.2%}\nAccuracy of random forest classifier on validate set: {vscore:.2%}')
    print()

RandomForestClassifier(max_depth=10, random_state=123)
                    0           1  accuracy   macro avg  weighted avg
precision    0.950464    1.000000  0.967807    0.975232      0.969402
recall       1.000000    0.915789  0.967807    0.957895      0.967807
f1-score     0.974603    0.956044  0.967807    0.965324      0.967508
support    307.000000  190.000000  0.967807  497.000000    497.000000
Forrest with max depth of 10 and minimum leaf samples 1
Accuracy of random forest classifier on train set: 96.78%
Accuracy of random forest classifier on validate set: 80.84%

RandomForestClassifier(max_depth=9, min_samples_leaf=2, random_state=123)
                    0           1  accuracy   macro avg  weighted avg
precision    0.915152    0.970060  0.933602    0.942606      0.936143
recall       0.983713    0.852632  0.933602    0.918172      0.933602
f1-score     0.948195    0.907563  0.933602    0.927879      0.932661
support    307.000000  190.000000  0.933602  497.000000    497.00

In [122]:
#For loop that decreases the max_depth
for i in range(1, 10):
    # Make the model
    rfc = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=(1),
                            n_estimators=100,
                            max_depth=(11 - i), 
                            random_state=123)

    # Fit the model (on train and only train)
    rfc = rfc.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = rfc.predict(X_train)
    
    tscore = rfc.score(X_train, y_train)
    vscore = rfc.score(X_validate, y_validate)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(rfc)
    print(pd.DataFrame(report))
    print(f"Forrest with max depth of {11 - i} and minimum leaf samples 1")
    print(f'Accuracy of random forest classifier on train set: {tscore:.2%}\nAccuracy of random forest classifier on validate set: {vscore:.2%}')
    print('-------------------------------------------------------------------------------\n')

RandomForestClassifier(max_depth=10, random_state=123)
                    0           1  accuracy   macro avg  weighted avg
precision    0.950464    1.000000  0.967807    0.975232      0.969402
recall       1.000000    0.915789  0.967807    0.957895      0.967807
f1-score     0.974603    0.956044  0.967807    0.965324      0.967508
support    307.000000  190.000000  0.967807  497.000000    497.000000
Forrest with max depth of 10 and minimum leaf samples 1
Accuracy of random forest classifier on train set: 96.78%
Accuracy of random forest classifier on validate set: 80.84%
-------------------------------------------------------------------------------

RandomForestClassifier(max_depth=9, random_state=123)
                    0           1  accuracy   macro avg  weighted avg
precision    0.938650    0.994152  0.957746    0.966401      0.959868
recall       0.996743    0.894737  0.957746    0.945740      0.957746
f1-score     0.966825    0.941828  0.957746    0.954326      0.957269
suppo

In [123]:
#For loop that increases the min_samples_leaf
for i in range(1, 10):
    # Make the model
    rfc = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=(i),
                            n_estimators=100,
                            max_depth=(10), 
                            random_state=123)

    # Fit the model (on train and only train)
    rfc = rfc.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = rfc.predict(X_train)
    
    tscore = rfc.score(X_train, y_train)
    vscore = rfc.score(X_validate, y_validate)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(rfc)
    print(pd.DataFrame(report))
    print(f"Forrest with max depth of {11 - i} and minimum leaf samples 1")
    print(f'Accuracy of random forest classifier on train set: {tscore:.2%}\nAccuracy of random forest classifier on validate set: {vscore:.2%}')
    print()

RandomForestClassifier(max_depth=10, random_state=123)
                    0           1  accuracy   macro avg  weighted avg
precision    0.950464    1.000000  0.967807    0.975232      0.969402
recall       1.000000    0.915789  0.967807    0.957895      0.967807
f1-score     0.974603    0.956044  0.967807    0.965324      0.967508
support    307.000000  190.000000  0.967807  497.000000    497.000000
Forrest with max depth of 10 and minimum leaf samples 1
Accuracy of random forest classifier on train set: 96.78%
Accuracy of random forest classifier on validate set: 80.84%

RandomForestClassifier(max_depth=10, min_samples_leaf=2, random_state=123)
                    0           1  accuracy   macro avg  weighted avg
precision    0.920973    0.976190  0.939638    0.948582      0.942082
recall       0.986971    0.863158  0.939638    0.925064      0.939638
f1-score     0.952830    0.916201  0.939638    0.934516      0.938827
support    307.000000  190.000000  0.939638  497.000000    497.0

## kNN Models

In [124]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [125]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [126]:
y_pred = knn.predict(X_train)
y_pred[0:10]

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0])

In [127]:
y_pred_proba = knn.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.8, 0.2],
       [0.4, 0.6],
       [0.6, 0.4],
       [0.2, 0.8],
       [0.6, 0.4]])

In [128]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.86      0.80       307
           1       0.71      0.54      0.61       190

    accuracy                           0.74       497
   macro avg       0.73      0.70      0.71       497
weighted avg       0.73      0.74      0.73       497



In [129]:
confusion4 = confusion_matrix(y_train, y_pred)
con_df4 = pd.DataFrame(confusion_matrix(y_train, y_pred))
con_df4.rename(columns = {0: 'died', 1: 'lived'}, index = {0: 'died', 1: 'lived'})

Unnamed: 0,died,lived
died,264,43
lived,87,103


In [130]:
TP = confusion4[0,0]
TN = confusion4[1,1]
FP = confusion4[0,1]
FN = confusion4[1,0]

AR = (TP+TN)/(TP+FP+FN+TN)
TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

print(f'Accuracy: {AR:.2%}\nTrue Positive Rate: {TPR:.2%}\nFalse Positive Rate: {FPR:.2%}\nTrue Negaitve Rate: {TNR:.2%}\nFalse Negative Rate: {FNR:.2%}')

Accuracy: 73.84%
True Positive Rate: 75.21%
False Positive Rate: 29.45%
True Negaitve Rate: 70.55%
False Negative Rate: 24.79%


In [131]:
print('Accuracy of kNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of kNN classifier on validate set: 0.61


#### kNN n_neigbors = 10

In [132]:
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [133]:
y_pred = knn.predict(X_train)
y_pred[0:10]

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0])

In [134]:
y_pred_proba = knn.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.6, 0.4],
       [0.3, 0.7],
       [0.7, 0.3],
       [0.3, 0.7],
       [0.6, 0.4]])

In [135]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.92      0.79       307
           1       0.73      0.35      0.47       190

    accuracy                           0.70       497
   macro avg       0.71      0.63      0.63       497
weighted avg       0.71      0.70      0.67       497



In [136]:
confusion5 = confusion_matrix(y_train, y_pred)
con_df5 = pd.DataFrame(confusion_matrix(y_train, y_pred))
con_df5.rename(columns = {0: 'died', 1: 'lived'}, index = {0: 'died', 1: 'lived'})

Unnamed: 0,died,lived
died,283,24
lived,124,66


In [137]:
TP = confusion5[0,0]
TN = confusion5[1,1]
FP = confusion5[0,1]
FN = confusion5[1,0]

AR = (TP+TN)/(TP+FP+FN+TN)
TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

print(f'Accuracy: {AR:.2%}\nTrue Positive Rate: {TPR:.2%}\nFalse Positive Rate: {FPR:.2%}\nTrue Negaitve Rate: {TNR:.2%}\nFalse Negative Rate: {FNR:.2%}')

Accuracy: 70.22%
True Positive Rate: 69.53%
False Positive Rate: 26.67%
True Negaitve Rate: 73.33%
False Negative Rate: 30.47%


In [138]:
print('Accuracy of kNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of kNN classifier on validate set: 0.69


#### kNN n_neigbors = 20

In [139]:
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [140]:
y_pred = knn.predict(X_train)
y_pred[0:10]

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0])

In [141]:
y_pred_proba = knn.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.55, 0.45],
       [0.25, 0.75],
       [0.65, 0.35],
       [0.3 , 0.7 ],
       [0.7 , 0.3 ]])

In [142]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.94      0.78       307
           1       0.73      0.26      0.38       190

    accuracy                           0.68       497
   macro avg       0.70      0.60      0.58       497
weighted avg       0.69      0.68      0.63       497



In [143]:
confusion6 = confusion_matrix(y_train, y_pred)
con_df6 = pd.DataFrame(confusion_matrix(y_train, y_pred))
con_df6.rename(columns = {0: 'died', 1: 'lived'}, index = {0: 'died', 1: 'lived'})

Unnamed: 0,died,lived
died,289,18
lived,141,49


In [144]:
TP = confusion6[0,0]
TN = confusion6[1,1]
FP = confusion6[0,1]
FN = confusion6[1,0]

AR = (TP+TN)/(TP+FP+FN+TN)
TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

print(f'Accuracy: {AR:.2%}\nTrue Positive Rate: {TPR:.2%}\nFalse Positive Rate: {FPR:.2%}\nTrue Negaitve Rate: {TNR:.2%}\nFalse Negative Rate: {FNR:.2%}')

Accuracy: 68.01%
True Positive Rate: 67.21%
False Positive Rate: 26.87%
True Negaitve Rate: 73.13%
False Negative Rate: 32.79%


In [145]:
print('Accuracy of kNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of kNN classifier on validate set: 0.70


## Logistic/Linear Regression
### Model 1

In [146]:
logit = LogisticRegression(C=1, random_state=123)
logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [147]:
y_pred = logit.predict(X_train)
y_pred[0:10]

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0])

In [148]:
y_train[0:10]

583    0
337    1
50     0
218    1
31     1
308    0
314    0
883    0
459    0
180    0
Name: survived, dtype: int64

In [149]:
y_pred_proba = logit.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.72398781, 0.27601219],
       [0.09825586, 0.90174414],
       [0.93536842, 0.06463158],
       [0.17548538, 0.82451462],
       [0.09021716, 0.90978284]])

In [150]:
confusion7 = confusion_matrix(y_train, y_pred)
con_df7 = pd.DataFrame(confusion_matrix(y_train, y_pred))
con_df7.rename(columns = {0: 'died', 1: 'lived'}, index = {0: 'died', 1: 'lived'})

Unnamed: 0,died,lived
died,267,40
lived,63,127


In [151]:
TP = confusion7[0,0]
TN = confusion7[1,1]
FP = confusion7[0,1]
FN = confusion7[1,0]

AR = (TP+TN)/(TP+FP+FN+TN)
TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

print(f'Accuracy: {AR:.2%}\nTrue Positive Rate: {TPR:.2%}\nFalse Positive Rate: {FPR:.2%}\nTrue Negaitve Rate: {TNR:.2%}\nFalse Negative Rate: {FNR:.2%}')

Accuracy: 79.28%
True Positive Rate: 80.91%
False Positive Rate: 23.95%
True Negaitve Rate: 76.05%
False Negative Rate: 19.09%


In [152]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       307
           1       0.76      0.67      0.71       190

    accuracy                           0.79       497
   macro avg       0.78      0.77      0.77       497
weighted avg       0.79      0.79      0.79       497



In [153]:
print('Accuracy of Regression classifier on validate set: {:.2f}'
     .format(logit.score(X_validate, y_validate)))

Accuracy of Regression classifier on validate set: 0.78


### Model 2

In [154]:
X_train = train[['age', 'fare', 'class']]
y_train = train.survived
X_validate = validate[['age', 'fare', 'class']]
y_validate = validate.survived
X_test = test[['age', 'fare', 'class']]
y_test = test.survived

In [155]:
logit2 = LogisticRegression(C=1, random_state=123)
logit2.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [156]:
y_pred = logit2.predict(X_train)
y_pred[0:10]

array([1, 1, 0, 1, 1, 0, 0, 0, 0, 0])

In [157]:
y_train[0:10]

583    0
337    1
50     0
218    1
31     1
308    0
314    0
883    0
459    0
180    0
Name: survived, dtype: int64

In [158]:
y_pred_proba = logit2.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.36397951, 0.63602049],
       [0.34139883, 0.65860117],
       [0.6265983 , 0.3734017 ],
       [0.31505319, 0.68494681],
       [0.26359851, 0.73640149]])

In [159]:
confusion8 = confusion_matrix(y_train, y_pred)
con_df8 = pd.DataFrame(confusion_matrix(y_train, y_pred))
con_df8.rename(columns = {0: 'died', 1: 'lived'}, index = {0: 'died', 1: 'lived'})

Unnamed: 0,died,lived
died,265,42
lived,99,91


In [160]:
TP = confusion8[0,0]
TN = confusion8[1,1]
FP = confusion8[0,1]
FN = confusion8[1,0]

AR = (TP+TN)/(TP+FP+FN+TN)
TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

print(f'Accuracy: {AR:.2%}\nTrue Positive Rate: {TPR:.2%}\nFalse Positive Rate: {FPR:.2%}\nTrue Negaitve Rate: {TNR:.2%}\nFalse Negative Rate: {FNR:.2%}')

Accuracy: 71.63%
True Positive Rate: 72.80%
False Positive Rate: 31.58%
True Negaitve Rate: 68.42%
False Negative Rate: 27.20%


In [161]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79       307
           1       0.68      0.48      0.56       190

    accuracy                           0.72       497
   macro avg       0.71      0.67      0.68       497
weighted avg       0.71      0.72      0.70       497



In [162]:
print('Accuracy of Regression classifier on validate set: {:.2f}'
     .format(logit2.score(X_validate, y_validate)))

Accuracy of Regression classifier on validate set: 0.73


In [163]:
train.head()

Unnamed: 0,passenger_id,survived,sex,age,sibsp,parch,fare,class,alone,Cherbourg,Queenstown,Southampton,prediction
583,583,0,1,36.0,0,0,40.125,1,1,1,0,0,0
337,337,1,0,41.0,0,0,134.5,1,1,1,0,0,1
50,50,0,1,7.0,4,1,39.6875,3,0,0,0,1,0
218,218,1,0,32.0,0,0,76.2917,1,1,1,0,0,1
31,31,1,0,29.916875,1,0,146.5208,1,0,1,0,0,1


### Model 3

In [165]:
X_train = train[['age', 'fare', 'class', 'sex']]
y_train = train.survived
X_validate = validate[['age', 'fare', 'class', 'sex']]
y_validate = validate.survived
X_test = test[['age', 'fare', 'class', 'sex']]
y_test = test.survived

In [166]:
logit3 = LogisticRegression(C=1, random_state=123)
logit3.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [168]:
y_pred = logit3.predict(X_train)
y_pred[0:10]

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 1])

In [169]:
y_train[0:10]

583    0
337    1
50     0
218    1
31     1
308    0
314    0
883    0
459    0
180    0
Name: survived, dtype: int64