- [x] Create a new notebook, `knn_model`, and work with the titanic dataset to answer the following:

In [2]:
# Import full libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import personal modules
import acquire as a
import prepare as p
import model as m

# Selective imports
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Handle warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# import Titanic
raw_titanic = a.get_titanic_data()

# Check data
a.df_info(raw_titanic)

Reading from file...


Unnamed: 0,nunique,dtypes,isnull,352
passenger_id,891,int64,0,352
survived,2,int64,0,0
pclass,3,int64,0,3
sex,2,object,0,male
age,88,float64,177,15.0
sibsp,7,int64,0,1
parch,7,int64,0,1
fare,248,float64,0,7.2292
embarked,3,object,2,C
class,3,object,0,Third


In [4]:
# Clean Titanic
titanic = p.prep_titanic(raw_titanic)

# check data
a.df_info(titanic)

Unnamed: 0,nunique,dtypes,isnull,886
passenger_id,891,object,0,886
survived,2,object,0,0
pclass,3,object,0,2
sex,2,object,0,male
age,91,float64,0,27.0
sibsp,7,object,0,0
parch,7,object,0,0
fare,248,float64,0,13.0
embark_town,3,object,0,Southampton
alone,2,object,0,1


In [5]:
# Split Titanic
df_sets = train,validate,test = p.split_df(titanic,'survived')

# check train
a.df_info(train)

Unnamed: 0,nunique,dtypes,isnull,627
passenger_id,534,object,0,627
survived,2,object,0,1
pclass,3,object,0,1
sex,2,object,0,female
age,79,float64,0,21.0
sibsp,7,object,0,0
parch,6,object,0,0
fare,194,float64,0,77.9583
embark_town,3,object,0,Southampton
alone,2,object,0,1


In [6]:
for df in df_sets:
    print(f'Shape: {df.shape}')

Shape: (534, 10)
Shape: (178, 10)
Shape: (179, 10)


In [7]:
# Encode Titanic
encoded = e_train,e_val,e_test = [m.preprocess_titanic(df) for df in df_sets]

# check encoded train
a.df_info(e_train,samples=3)

Unnamed: 0,nunique,dtypes,isnull,92,207,243
survived,2,float64,0,0.0,1.0,0.0
pclass,3,float64,0,1.0,3.0,3.0
age,79,float64,0,46.0,26.0,22.0
sibsp,7,float64,0,1.0,0.0,0.0
parch,6,float64,0,0.0,0.0,0.0
fare,194,float64,0,61.175,18.7875,7.125
alone,2,float64,0,0.0,1.0,1.0
is_male,2,float64,0,1.0,1.0,1.0
is_Queenstown,2,float64,0,0.0,0.0,0.0
is_Southampton,2,float64,0,1.0,0.0,1.0


In [8]:
# Get X's
X_set = X_train,X_val,X_test = [df.drop(columns='survived') for df in encoded]

# check x_train
a.df_info(X_train,samples=3)

Unnamed: 0,nunique,dtypes,isnull,384,32,247
pclass,3,float64,0,3.0,3.0,2.0
age,79,float64,0,25.14,25.14,24.0
sibsp,7,float64,0,0.0,0.0,0.0
parch,6,float64,0,0.0,0.0,2.0
fare,194,float64,0,7.8958,7.75,14.5
alone,2,float64,0,1.0,1.0,0.0
is_male,2,float64,0,1.0,0.0,0.0
is_Queenstown,2,float64,0,0.0,1.0,0.0
is_Southampton,2,float64,0,1.0,0.0,1.0


In [9]:
# Get y's
y_set = y_train,y_val,y_test = [df.survived for df in encoded]

# check y_train
y_train.head()

776    0.0
829    1.0
215    1.0
258    1.0
129    0.0
Name: survived, dtype: float64

1. [x] Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [11]:
# Create the model
knn = KNeighborsClassifier()

In [12]:
# Fit the model
knn.fit(X_train,y_train)

In [13]:
# Make predictions
knn.predict(X_train)[:5]

array([0., 1., 1., 1., 0.])

In [14]:
# Get train score
train_acc = knn.score(X_train,y_train)
train_acc

0.8033707865168539

In [15]:
# Get validate score
val_acc = knn.score(X_val,y_val)
val_acc

0.6966292134831461

2. [x] Evaluate your results using the model score, confusion matrix, and classification report.

In [17]:
# Evaluate the difference between scores
train_acc - val_acc

0.10674157303370779

In [18]:
# Get the confusion matrix as proportions rather than counts:
# rows are the actual `survived` values, columns are the predictions,
# and normalize='index' scales each row (each actual class) to sum to 1.
pd.crosstab(y_train,knn.predict(X_train),normalize='index')

col_0,0.0,1.0
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.860182,0.139818
1.0,0.287805,0.712195


In [19]:
# Print the classification report
print(metrics.classification_report(y_train,knn.predict(X_train)))

              precision    recall  f1-score   support

         0.0       0.83      0.86      0.84       329
         1.0       0.76      0.71      0.74       205

    accuracy                           0.80       534
   macro avg       0.79      0.79      0.79       534
weighted avg       0.80      0.80      0.80       534



In [20]:
metrics.confusion_matrix(y_train,knn.predict(X_train))

array([[283,  46],
       [ 59, 146]])

In [21]:
def confusion_matrix(y_actual,y_pred,positive=None,get_rates=False):
    '''
    Return a 2x2 confusion matrix and a dictionary of its contents.
    
    Parameters:
    ----------
    y_actual: also known as y_true; a Series or array containing the target variable of a dataset
    y_pred: a Series or array containing the predictions made, in the same order as y_actual
    positive: default=None; the value to determine the positive values of the matrix. 
        If no value given, the most frequently occurring value in the target variable will be assigned as the positive.
    get_rates: bool, default=False; If True, then it will return the rates instead of the value counts themselves.
        'rates' refers to True Positive Rate, True Negative Rate, etc.
        (each column of actual values is scaled to sum to 1).
    
    Returns:
    -------
    matrix: DataFrame with the predicted labels as rows and the actual labels as columns,
        always ordered negative ('N=...') first and positive ('P=...') second.
    values: dict with the keys 'TN', 'FP', 'FN' and 'TP' read from the matrix.
    
    Raises:
    ------
    ValueError: if y_actual and y_pred differ in length, or if y_actual is empty
        and no positive value was given.
    '''
    # keep the target name (if any) before dropping down to plain arrays
    target_name = getattr(y_actual,'name',None)

    # work on positional arrays so Series, arrays and lists are all accepted
    actual = np.asarray(y_actual)
    predicted = np.asarray(y_pred)

    if len(actual) != len(predicted):
        raise ValueError(
            f'y_actual and y_pred must have the same length, got {len(actual)} and {len(predicted)}'
        )

    # get the positive if not defined
    if positive is None:
        if len(actual) == 0:
            raise ValueError('cannot infer the positive class from an empty y_actual')
        positive = pd.Series(actual).mode().iloc[0]

    # get the negative: the first other value seen in the actuals,
    # falling back to the predictions when the actuals hold a single class
    negative = None
    for candidates in (pd.unique(actual),pd.unique(predicted)):
        others = [value for value in candidates if value != positive]
        if others:
            negative = others[0]
            break

    pos_label = 'P='+str(positive)
    neg_label = 'N='+str(negative)
    labels = [neg_label,pos_label]

    # remap the arrays
    actual_labels = pd.Series(np.where(actual == positive,pos_label,neg_label),name=target_name)
    pred_labels = pd.Series(np.where(predicted == positive,pos_label,neg_label),name='predicted')

    # create the matrix
    if get_rates:
        matrix = pd.crosstab(pred_labels,actual_labels,normalize='columns')
    else:
        matrix = pd.crosstab(pred_labels,actual_labels)

    # crosstab drops labels that never occur; force the full 2x2 layout so a
    # model that predicts a single class yields zeros instead of an IndexError
    row_name,col_name = matrix.index.name,matrix.columns.name
    matrix = matrix.reindex(index=labels,columns=labels,fill_value=0)
    matrix.index.name,matrix.columns.name = row_name,col_name

    # get values (rows = predicted, columns = actual)
    TN = matrix.loc[neg_label,neg_label]
    FP = matrix.loc[pos_label,neg_label]
    FN = matrix.loc[neg_label,pos_label]
    TP = matrix.loc[pos_label,pos_label]

    return matrix,{'TN':TN,'FP':FP,'FN':FN,'TP':TP}

In [22]:
confuse_matrix,dict_ = confusion_matrix(y_train,knn.predict(X_train),get_rates=True,positive=1.0)

# How to return variable labels to each value?

In [23]:
confuse_matrix

survived,N=0.0,P=1.0
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
N=0.0,0.860182,0.287805
P=1.0,0.139818,0.712195


In [24]:
dict_

{'TN': 0.8601823708206687,
 'FP': 0.1398176291793313,
 'FN': 0.28780487804878047,
 'TP': 0.7121951219512195}

3. [x] Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [26]:
# use above-built function to get the rates
matrix, values = confusion_matrix(y_train,knn.predict(X_train))
values

{'TN': 146, 'FP': 59, 'FN': 46, 'TP': 283}

In [27]:
# `positive` is not passed here, so confusion_matrix() falls back to the most
# frequent value of y_train as the positive class. The 'TP'/'FN' rates below
# therefore describe that majority class, and 'TN'/'FP' the other class.
matrix_rates, rates = confusion_matrix(y_train,knn.predict(X_train),get_rates=True)
rates

{'TN': 0.7121951219512195,
 'FP': 0.28780487804878047,
 'FN': 0.1398176291793313,
 'TP': 0.8601823708206687}

In [28]:
print(f'True Positive Rate: {rates["TP"]}')
print(f'False Positive Rate: {rates["FP"]}')
print(f'True Negative Rate: {rates["TN"]}')
print(f'False Negative Rate: {rates["FN"]}')
print()
print(metrics.classification_report(y_train,knn.predict(X_train)))

True Positive Rate: 0.8601823708206687
False Positive Rate: 0.28780487804878047
True Negative Rate: 0.7121951219512195
False Negative Rate: 0.1398176291793313

              precision    recall  f1-score   support

         0.0       0.83      0.86      0.84       329
         1.0       0.76      0.71      0.74       205

    accuracy                           0.80       534
   macro avg       0.79      0.79      0.79       534
weighted avg       0.80      0.80      0.80       534



4. [x] Run through steps 1-3 setting k to 10

In [30]:
# Build knn10 model
knn10 = KNeighborsClassifier(n_neighbors=10)

In [31]:
# fit knn10 model
knn10.fit(X_train,y_train)

In [32]:
# get metrics
# predict once and reuse the result for both the rates and the report
train_pred_knn10 = knn10.predict(X_train)

# `positive` is not passed, so confusion_matrix() uses the most frequent value
# of y_train as the positive class; the rates printed below are relative to it.
matrix, rates = confusion_matrix(y_train,train_pred_knn10,get_rates=True)

print(f'True Positive Rate: {rates["TP"]}')
print(f'False Positive Rate: {rates["FP"]}')
print(f'True Negative Rate: {rates["TN"]}')
print(f'False Negative Rate: {rates["FN"]}')
print()
print(metrics.classification_report(y_train,train_pred_knn10))

True Positive Rate: 0.8966565349544073
False Positive Rate: 0.4585365853658537
True Negative Rate: 0.5414634146341464
False Negative Rate: 0.1033434650455927

              precision    recall  f1-score   support

         0.0       0.76      0.90      0.82       329
         1.0       0.77      0.54      0.63       205

    accuracy                           0.76       534
   macro avg       0.76      0.72      0.73       534
weighted avg       0.76      0.76      0.75       534



5. [x] Run through steps 1-3 setting k to 20

In [34]:
# build knn20 model
knn20 = KNeighborsClassifier(n_neighbors=20)

In [35]:
# fit knn20 model
knn20.fit(X_train,y_train)

In [36]:
# get metrics
# predict once and reuse the result for both the rates and the report
train_pred_knn20 = knn20.predict(X_train)

# `positive` is not passed, so confusion_matrix() uses the most frequent value
# of y_train as the positive class; the rates printed below are relative to it.
matrix, rates = confusion_matrix(y_train,train_pred_knn20,get_rates=True)

print(f'True Positive Rate: {rates["TP"]}')
print(f'False Positive Rate: {rates["FP"]}')
print(f'True Negative Rate: {rates["TN"]}')
print(f'False Negative Rate: {rates["FN"]}')
print()
print(metrics.classification_report(y_train,train_pred_knn20))

True Positive Rate: 0.8844984802431611
False Positive Rate: 0.4926829268292683
True Negative Rate: 0.5073170731707317
False Negative Rate: 0.11550151975683891

              precision    recall  f1-score   support

         0.0       0.74      0.88      0.81       329
         1.0       0.73      0.51      0.60       205

    accuracy                           0.74       534
   macro avg       0.74      0.70      0.70       534
weighted avg       0.74      0.74      0.73       534



6. [x] What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

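> As a whole, the lower the k value, the better the model performs on the train data: accuracy falls from 0.80 (k=5) to 0.76 (k=10) to 0.74 (k=20), and recall for the survived class (1.0) falls from 0.71 to 0.54 to 0.51. This is likely because a smaller k bases each prediction on fewer, closer neighbors, so the model follows the training data more tightly and therefore achieves better in-sample accuracy.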

7. [x] Which model performs best on our out-of-sample data from `validate`?

In [40]:
# knn default metrics
# predict once and reuse the result for both the rates and the report
val_pred_knn = knn.predict(X_val)

# `positive` is not passed, so confusion_matrix() uses the most frequent value
# of y_val as the positive class; the rates printed below are relative to it.
matrix, rates = confusion_matrix(y_val,val_pred_knn,get_rates=True)

print(f'True Positive Rate: {rates["TP"]}')
print(f'False Positive Rate: {rates["FP"]}')
print(f'True Negative Rate: {rates["TN"]}')
print(f'False Negative Rate: {rates["FN"]}')
print()
print(metrics.classification_report(y_val,val_pred_knn))

True Positive Rate: 0.7727272727272727
False Positive Rate: 0.4264705882352941
True Negative Rate: 0.5735294117647058
False Negative Rate: 0.22727272727272727

              precision    recall  f1-score   support

         0.0       0.75      0.77      0.76       110
         1.0       0.61      0.57      0.59        68

    accuracy                           0.70       178
   macro avg       0.68      0.67      0.67       178
weighted avg       0.69      0.70      0.69       178



In [41]:
# knn10 metrics
# predict once and reuse the result for both the rates and the report
val_pred_knn10 = knn10.predict(X_val)

# `positive` is not passed, so confusion_matrix() uses the most frequent value
# of y_val as the positive class; the rates printed below are relative to it.
matrix, rates = confusion_matrix(y_val,val_pred_knn10,get_rates=True)

print(f'True Positive Rate: {rates["TP"]}')
print(f'False Positive Rate: {rates["FP"]}')
print(f'True Negative Rate: {rates["TN"]}')
print(f'False Negative Rate: {rates["FN"]}')
print()
print(metrics.classification_report(y_val,val_pred_knn10))

True Positive Rate: 0.8090909090909091
False Positive Rate: 0.5147058823529411
True Negative Rate: 0.4852941176470588
False Negative Rate: 0.19090909090909092

              precision    recall  f1-score   support

         0.0       0.72      0.81      0.76       110
         1.0       0.61      0.49      0.54        68

    accuracy                           0.69       178
   macro avg       0.66      0.65      0.65       178
weighted avg       0.68      0.69      0.68       178



In [42]:
# knn20 metrics
# predict once and reuse the result for both the rates and the report
val_pred_knn20 = knn20.predict(X_val)

# `positive` is not passed, so confusion_matrix() uses the most frequent value
# of y_val as the positive class; the rates printed below are relative to it.
matrix, rates = confusion_matrix(y_val,val_pred_knn20,get_rates=True)

print(f'True Positive Rate: {rates["TP"]}')
print(f'False Positive Rate: {rates["FP"]}')
print(f'True Negative Rate: {rates["TN"]}')
print(f'False Negative Rate: {rates["FN"]}')
print()
print(metrics.classification_report(y_val,val_pred_knn20))

True Positive Rate: 0.8181818181818182
False Positive Rate: 0.5294117647058824
True Negative Rate: 0.47058823529411764
False Negative Rate: 0.18181818181818182

              precision    recall  f1-score   support

         0.0       0.71      0.82      0.76       110
         1.0       0.62      0.47      0.53        68

    accuracy                           0.69       178
   macro avg       0.66      0.64      0.65       178
weighted avg       0.68      0.69      0.68       178



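> Of the three models built, the first (default `n_neighbors` of 5) performs best on validate, though only slightly: its accuracy is 0.70 versus 0.69 for both k=10 and k=20. It is notable, however, that there is a drop of roughly 10 percentage points between its train (0.80) and validate (0.70) accuracy, which suggests the model is overfitting the training data.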