In [2]:
# Importing libraries in order to use the different functionalities later on
import seaborn as sns
import sklearn as sk
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 

## Data pre-processing steps 

In [3]:
# Load the student dataset, discard every row that contains a missing value,
# and preview the first rows to get an overview of the available columns.
df = pd.read_csv('student-por.csv').dropna()
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [4]:
# Class balance of the target: how often each answer occurs in `romantic`
df.loc[:, 'romantic'].value_counts()

no     410
yes    239
Name: romantic, dtype: int64

In [5]:
# Qualitative (categorical) columns are expanded into one indicator column per
# category, because they hold labels rather than numbers:
#   Pstatus  -> A, T
#   internet -> no, yes
#   sex      -> F, M
Pstatus_dummies = pd.get_dummies(df['Pstatus'])
internet_dummies = pd.get_dummies(df['internet'])
sex_dummies = pd.get_dummies(df['sex'])

# Start from the quantitative columns and append all indicator columns in a
# single column-wise concatenation.
df_subset = pd.concat(
    [
        df[['age', 'traveltime', 'studytime', 'absences']],
        Pstatus_dummies,
        internet_dummies,
        sex_dummies,
    ],
    axis=1,
)

df_subset.head()

Unnamed: 0,age,traveltime,studytime,absences,A,T,no,yes,F,M
0,18,2,2,4,1,0,1,0,1,0
1,17,1,2,2,0,1,0,1,1,0
2,15,1,2,6,0,1,0,1,1,0
3,15,1,3,0,0,1,0,1,1,0
4,16,1,2,0,0,1,1,0,1,0


In [6]:
# Build the feature matrix X (independent variables) and the target y
# (dependent variable), then hold out 30% of the rows as a test set.
from sklearn.preprocessing import normalize

# Only one indicator column per dummy pair is used ('A', 'yes', 'F'); the
# other column of each pair ('T', 'no', 'M') is left out.
# NOTE(review): normalize() with its default axis rescales each row (student)
# to unit norm rather than scaling each feature column - confirm this is the
# intended preprocessing for a distance-based model.
X = normalize(
    df_subset[['age', 'traveltime', 'studytime', 'absences', 'A', 'yes', 'F']]
)
y = df['romantic']

# random_state is fixed so that the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

## K-nearest neighbour algorithm

K-nearest neighbours (KNN) is a simple supervised machine learning algorithm that can be used for both classification and regression problems. It assumes that similar things exist in close proximity: data points that lie near each other tend to have the same label. A new observation is therefore classified by looking at the labels of the k training points closest to it.

In [59]:
# Fit a K-nearest-neighbours classifier (k = 10) on the training split and
# report its score on the held-out test split.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
knn.score(X_test, y_test)

0.6256410256410256

About 63% of the students in the test set have their romantic interest predicted correctly.

In [60]:
# Predict the test split and tabulate the actual labels against the
# predicted labels.
from sklearn.metrics import confusion_matrix

y_test_pred = knn.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
cm

array([[111,  17],
       [ 56,  11]])

In [61]:
# Wrap the raw confusion-matrix counts in a labelled DataFrame.
# When confusion_matrix() is called without `labels`, its rows and columns
# follow the sorted labels that occur in y_test or y_test_pred. The axis names
# are derived the same way here, instead of inferring which class is which
# from the size of the counts (the 'no'/'yes' totals shown in cell In [4]).
class_labels = sorted(set(y_test) | set(y_test_pred))
conf_matrix = pd.DataFrame(
    cm,
    index=[f'{label} : actual' for label in class_labels],
    columns=[f'{label} : predict' for label in class_labels],
)
conf_matrix

Unnamed: 0,no : predict,yes : predict
no : actual,111,17
yes : actual,56,11


Calculating the different metrics:

$accuracy = \frac{111 + 11}{111 + 11 + 17 + 56} = \frac{122}{195} \approx 0.626$
The accuracy is exactly the same as the knn-score, because `knn.score` returns the mean accuracy on the given test data.


How many of the students who actually have no romantic interest are also predicted as 'no romantic interest'?
- So ( 'actual no predicted as no' / 'all the actual no' )

$recall = \frac{111}{111 + 17} \approx 0.87$


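How many of the students predicted as 'no romantic interest' actually have no romantic interest?
- So ( 'actual no predicted as no' / 'all the predicted no' )

$precision = \frac{111}{111 + 56} \approx 0.66$

Note: $\frac{17}{11+17} \approx 0.61$ is the share of 'yes' predictions that are actually 'no' (the false discovery rate of the 'yes' class), not the precision.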