<h1 style="background-color:Tomato;">K-Nearest Neighbour Exercises</h1>

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

import env
import acquire

import warnings
warnings.filterwarnings("ignore")

from sklearn.neighbors import KNeighborsClassifier

from pydataset import data

### 1 / Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e., make predictions on the training sample).
Use the Titanic dataset.

In [3]:
# Load the Titanic passenger data from a CSV file in the working directory.
tati = pd.read_csv('titanic.csv')
# Preview the first rows to see which columns came through.
tati.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [5]:
# Columns that are unnecessary or superfluous for this exercise and will be dropped.
to_drop = [
    'deck',
    'Unnamed: 0',
    'age',
    'embarked',
    'passenger_id',
    'class',
]

In [6]:
# Remove the unnecessary / superfluous columns listed in `to_drop`.
# Reassigning (instead of inplace = True) and passing errors = 'ignore' lets this
# cell be re-run without a KeyError once the columns have already been removed.
tati = tati.drop(columns = to_drop, errors = 'ignore')

In [7]:
# Discard every row that still contains at least one null, then summarise what is left.
tati = tati.dropna(axis = 0, how = 'any')
tati.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     889 non-null    int64  
 1   pclass       889 non-null    int64  
 2   sex          889 non-null    object 
 3   sibsp        889 non-null    int64  
 4   parch        889 non-null    int64  
 5   fare         889 non-null    float64
 6   embark_town  889 non-null    object 
 7   alone        889 non-null    int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 62.5+ KB


In [8]:
# Baseline for survival is to die (i.e., survived == 0; the column holds integers here).
# Only a preview of the frame is shown in this cell.

tati.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,3,male,1,0,7.25,Southampton,0
1,1,1,female,1,0,71.2833,Cherbourg,0
2,1,3,female,0,0,7.925,Southampton,1
3,1,1,female,1,0,53.1,Southampton,0
4,0,3,male,0,0,8.05,Southampton,1


In [9]:
# Baseline prediction: assume nobody survived, i.e. predict 0 for every passenger.
tati['survival_baseline'] = 0


In [10]:
# Baseline accuracy: comparing the prediction to the truth gives a Boolean per row
# (True counts as 1, False as 0), so the mean is the share of rows the baseline gets right.
tati['survival_baseline'].eq(tati['survived']).mean()

0.6175478065241845

In [11]:
# One-hot encode the two text columns so the model receives numeric inputs.
# drop_first = True leaves out the first level of each column, so no dummy is redundant.

tatdum = pd.get_dummies(tati[['embark_town', 'sex']], drop_first = True)


In [12]:
# Now that we have the DF and the dummies, join the two column-wise (axis = 1)
# and assign the result back to `tati` so that it is kept.

tati = pd.concat([tati, tatdum], axis = 1)
tati.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,survival_baseline,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,male,1,0,7.25,Southampton,0,0,0,1,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,0,1,0
3,1,1,female,1,0,53.1,Southampton,0,0,0,1,0
4,0,3,male,0,0,8.05,Southampton,1,0,0,1,1


In [13]:
# The text columns are now represented by their dummies, so remove them
# for machine learning purposes (don't do this for exploration purposes).

tati = tati.drop(columns = ['embark_town', 'sex'])
tati.columns

Index(['survived', 'pclass', 'sibsp', 'parch', 'fare', 'alone',
       'survival_baseline', 'embark_town_Queenstown',
       'embark_town_Southampton', 'sex_male'],
      dtype='object')

In [14]:
# Recode the target as the string labels '0' / '1' (these are strings, not Booleans).
# Casting through int keeps this cell safe to re-run: comparing the already-recoded
# strings with True is always False, which would turn every label into '0'.
tati['survived'] = tati['survived'].astype(int).astype(str)

In [15]:
# Inspect the recoded target column.
tati['survived']

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 889, dtype: object

In [16]:
def train_val_test(df, target, seed = 23, train_size = 0.7, val_size = 0.5):
    """
    Split a DataFrame into train, validate and test subsets, stratified on the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; must contain the column named by `target`.
    target : str
        Name of the column used for stratification in both splits.
    seed : int, default 23
        Passed as random_state to both splits so the result is reproducible.
    train_size : float, default 0.7
        Share of `df` that goes to the train subset.
    val_size : float, default 0.5
        Share of the remaining rows that goes to validate; the rest becomes test.

    Returns
    -------
    tuple
        (train, val, test) DataFrames.
    """
    # First cut: train versus everything else.
    train, val_test = train_test_split(df, train_size = train_size,
                                       random_state = seed,
                                       stratify = df[target])

    # Second cut: divide the remainder into validate and test.
    val, test = train_test_split(val_test, train_size = val_size,
                                 random_state = seed,
                                 stratify = val_test[target])

    return train, val, test

## function to split the data into train, validate and test sets

In [17]:
# Split the prepared frame into three subsets, stratifying on the target column.
train, val, test  = train_val_test(tati, 'survived')

# Show the (rows, columns) shape of the train, validate and test subsets.
train.shape, val.shape, test.shape

((622, 10), (133, 10), (134, 10))

In [18]:
# Split train, val, test into feature frames (X_*) and target series (y_*).
#
# y_* holds only the target, 'survived'.
# X_* holds the model inputs: everything except the target and the
# 'survival_baseline' column. That column is a constant baseline prediction,
# not a passenger attribute, so it should not be fed to the model.

non_features = ['survived', 'survival_baseline']

X_train = train.drop(columns = non_features)
y_train = train['survived']


X_val = val.drop(columns = non_features)
y_val = val['survived']


X_test = test.drop(columns = non_features)
y_test = test['survived']

In [29]:
# Build (but do not yet fit) a KNN classifier with 5 neighbours: for a new
# datapoint it finds the 5 nearest observations and checks their class.
knn = KNeighborsClassifier(n_neighbors = 5)

In [25]:
# Train the KNN model on the training features and labels.
knn.fit(X_train, y_train)

In [26]:
# Accuracy of the model on the training sample: score() predicts from the
# features (the first argument, X_train) and compares those predictions to
# the actual labels (the second argument, y_train).

knn.score(X_train, y_train)

0.8295819935691319

In [27]:
# Accuracy on the validation sample. The gap between the train and validation
# scores is quite large (about 12 percentage points), which suggests the model
# may be overfitting the training data.
knn.score(X_val, y_val)

0.706766917293233

In [33]:
## Sweep over several neighbour counts and record the accuracy for each.

# These lists grow by one entry on every pass through the loop.
no_neighbours = []
train_accuracy = []
validation_accuracy = []

# Odd values of k only: 1, 3, 5, 7, 9, 11.
for i in range(1, 13, 2):

    # Build the classifier for this k and train it on the training sample.
    knn = KNeighborsClassifier(n_neighbors = i).fit(X_train, y_train)

    no_neighbours.append(i)

    # Accuracy on the data the model was trained on.
    train_accuracy.append(knn.score(X_train, y_train))

    # Accuracy on the held-out validation sample.
    validation_accuracy.append(knn.score(X_val, y_val))
        

### 2 / Evaluate your results using the model score, confusion matrix, and classification report.