In [1]:
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Find the Nearest Neighbor

In [2]:
data = {"Age":[22,33,28,51,25,39,54,55,50,60],"Marital":['Single','Married','Other','Other','Single','Single','Signle','Married','Married','Married'],"Income":['$46,156.98','$24,188.10','$28,787.34','$23,886.72','$47,281.44','$33,994.90','$28,716.50','$49,186.75','$46,726.50','$36,120.34'],'Risk':['Bad loss','Bad loss','Bad loss','Bad loss','Bad loss','Good risk','Good risk','Good risk','Good risk','Good risk']}
df = pd.DataFrame(data)
print df.head()
print df['Risk'].unique()

   Age      Income  Marital      Risk
0   22  $46,156.98   Single  Bad loss
1   33  $24,188.10  Married  Bad loss
2   28  $28,787.34    Other  Bad loss
3   51  $23,886.72    Other  Bad loss
4   25  $47,281.44   Single  Bad loss
['Bad loss' 'Good risk']


In [3]:
#change the categorical target variable from the Risk column into numerical
labels = df['Risk'].astype('category').cat.categories.tolist()
replace_map = {'Risk':{ k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}
df.replace(replace_map, inplace=True)

In [4]:
df

Unnamed: 0,Age,Income,Marital,Risk
0,22,"$46,156.98",Single,1
1,33,"$24,188.10",Married,1
2,28,"$28,787.34",Other,1
3,51,"$23,886.72",Other,1
4,25,"$47,281.44",Single,1
5,39,"$33,994.90",Single,2
6,54,"$28,716.50",Signle,2
7,55,"$49,186.75",Married,2
8,50,"$46,726.50",Married,2
9,60,"$36,120.34",Married,2


In [5]:
# Convert the Marital with one-hot-encoding
df = pd.get_dummies(df,columns=['Marital'],prefix=['Marital'])

In [6]:
df

Unnamed: 0,Age,Income,Risk,Marital_Married,Marital_Other,Marital_Signle,Marital_Single
0,22,"$46,156.98",1,0,0,0,1
1,33,"$24,188.10",1,1,0,0,0
2,28,"$28,787.34",1,0,1,0,0
3,51,"$23,886.72",1,0,1,0,0
4,25,"$47,281.44",1,0,0,0,1
5,39,"$33,994.90",2,0,0,0,1
6,54,"$28,716.50",2,0,0,1,0
7,55,"$49,186.75",2,1,0,0,0
8,50,"$46,726.50",2,1,0,0,0
9,60,"$36,120.34",2,1,0,0,0


In [7]:
df['Income'] = df.Income.str[1:]

In [8]:
df['Income'] = df['Income'].str.replace(',','')
df['Income'].astype(float)

0    46156.98
1    24188.10
2    28787.34
3    23886.72
4    47281.44
5    33994.90
6    28716.50
7    49186.75
8    46726.50
9    36120.34
Name: Income, dtype: float64

In [None]:
scaler = MinMaxScaler()
df[['Age','Income']] = scaler.fit_transform(df[['Age','Income']])

In [10]:
df

Unnamed: 0,Age,Income,Risk,Marital_Married,Marital_Other,Marital_Signle,Marital_Single
0,0.0,0.880246,1,0,0,0,1
1,0.289474,0.011912,1,1,0,0,0
2,0.157895,0.1937,1,0,1,0,0
3,0.763158,0.0,1,0,1,0,0
4,0.078947,0.924691,1,0,0,0,1
5,0.447368,0.399532,2,0,0,0,1
6,0.842105,0.1909,2,0,0,1,0
7,0.868421,1.0,2,1,0,0,0
8,0.736842,0.902757,2,1,0,0,0
9,1.0,0.483542,2,1,0,0,0


In [11]:
neigh = NearestNeighbors(3)
neigh.fit(df[:-1])

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=3, p=2, radius=1.0)

In [12]:
neighbors = neigh.kneighbors(df[-1:],3,return_distance=False)[0]
neighbors

array([8, 7, 1], dtype=int64)

In [13]:
df.loc[neighbors, :]

Unnamed: 0,Age,Income,Risk,Marital_Married,Marital_Other,Marital_Signle,Marital_Single
8,0.736842,0.902757,2,1,0,0,0
7,0.868421,1.0,2,1,0,0,0
1,0.289474,0.011912,1,1,0,0,0


# Predict the Risk

In [14]:
df_risk = pd.read_csv('ClassifyRisk', usecols=['age','marital_status','income','risk'])
df_risk.head()

Unnamed: 0,age,marital_status,income,risk
0,34,other,28060.7,bad loss
1,37,other,28009.34,bad loss
2,29,other,27614.6,bad loss
3,33,other,27287.18,bad loss
4,39,other,26954.06,bad loss


In [None]:
#change the categorical target variable from the Risk column into numerical
labels = df_risk['risk'].astype('category').cat.categories.tolist()
replace_map = {'risk':{ k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}
df_risk.replace(replace_map, inplace=True)

# Convert the Marital with one-hot-encoding
df_risk = pd.get_dummies(df_risk,columns=['marital_status'],prefix=['marital_status'])

#Scale the age and income variable
scaler = MinMaxScaler()
df_risk[['age','income']] = scaler.fit_transform(df_risk[['age','income']])

print df_risk.head()

In [16]:
to_be_predicted = df_risk[:1]
df_risk.drop(df_risk.index[0], inplace=True)
print to_be_predicted
print "\n"
print df_risk.head()

        age    income  risk  marital_status_married  marital_status_other  \
0  0.346939  0.202218     1                       0                     1   

   marital_status_single  
0                      0  


        age    income  risk  marital_status_married  marital_status_other  \
1  0.408163  0.201404     1                       0                     1   
2  0.244898  0.195148     1                       0                     1   
3  0.326531  0.189959     1                       0                     1   
4  0.448980  0.184680     1                       0                     1   
5  0.224490  0.173868     1                       0                     1   

   marital_status_single  
1                      0  
2                      0  
3                      0  
4                      0  
5                      0  


In [17]:
#split dataset
X = df_risk[['age','income','marital_status_married','marital_status_other','marital_status_single']]
y = df_risk[['risk']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [18]:
# k - odd number, which is a square root of the number of instances of the test set
import math
print math.sqrt(len(y_test))

7.0


In [19]:
neigh = KNeighborsClassifier(n_neighbors = 7)
neigh.fit(X_train,y_train.values.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform')

In [20]:
y_pred = neigh.predict(X_test)
y_pred

array([2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1,
       2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2,
       1, 1, 2], dtype=int64)

In [21]:
#Evaluate model
#cols - predicted, rows - actual
cm = confusion_matrix(y_test,y_pred)
print cm

[[19  4]
 [ 2 24]]


In [22]:
# Accuracy = TP+TN/TP+FP+FN+TN - the ratio of correctly predicted observation to the total observations
# Precision = TP/TP+FP -he ratio of correctly predicted positive observations to the total predicted positive observations
# Recall = TP/TP+FN - the ratio of correctly predicted positive observations to the all observations in actual class
# F1 Score = 2*(Recall * Precision) / (Recall + Precision)
print('F1 score: {}'.format(f1_score(y_test, y_pred,average=None)))
print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))

F1 score: [ 0.86363636  0.88888889]
Accuracy: 0.877551020408


In [23]:
predicted = neigh.predict(to_be_predicted[['age','income','marital_status_married','marital_status_other','marital_status_single']])
print 'Predicted {}'.format(predicted)
print 'Actual {}'.format(to_be_predicted[['risk']].values[0])

Predicted [1]
Actual [1]


In [24]:
predicted = neigh.predict([[20,2000,0,0,1]])
print 'Predicted {}'.format(predicted)

Predicted [2]


In [25]:
from sklearn.metrics import classification_report
print classification_report(y_test,neigh.predict(X_test))

              precision    recall  f1-score   support

           1       0.90      0.83      0.86        23
           2       0.86      0.92      0.89        26

   micro avg       0.88      0.88      0.88        49
   macro avg       0.88      0.87      0.88        49
weighted avg       0.88      0.88      0.88        49

