In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter

%matplotlib inline

In [22]:
data = pd.read_csv('data.csv', delimiter=',')
data.head()

Unnamed: 0,Id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1350 entries, 0 to 1349
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Id                                    1350 non-null   int64  
 1   SeriousDlqin2yrs                      1350 non-null   int64  
 2   RevolvingUtilizationOfUnsecuredLines  1350 non-null   float64
 3   age                                   1350 non-null   int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  1350 non-null   int64  
 5   DebtRatio                             1350 non-null   float64
 6   MonthlyIncome                         1094 non-null   float64
 7   NumberOfOpenCreditLinesAndLoans       1350 non-null   int64  
 8   NumberOfTimes90DaysLate               1350 non-null   int64  
 9   NumberRealEstateLoansOrLines          1350 non-null   int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  1350 non-null   int64  
 11  NumberOfDependent

In [24]:
data['MonthlyIncome'].fillna(data['MonthlyIncome'].mean(), inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1350 entries, 0 to 1349
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Id                                    1350 non-null   int64  
 1   SeriousDlqin2yrs                      1350 non-null   int64  
 2   RevolvingUtilizationOfUnsecuredLines  1350 non-null   float64
 3   age                                   1350 non-null   int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  1350 non-null   int64  
 5   DebtRatio                             1350 non-null   float64
 6   MonthlyIncome                         1350 non-null   float64
 7   NumberOfOpenCreditLinesAndLoans       1350 non-null   int64  
 8   NumberOfTimes90DaysLate               1350 non-null   int64  
 9   NumberRealEstateLoansOrLines          1350 non-null   int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  1350 non-null   int64  
 11  NumberOfDependent

In [25]:
data['DebtRatio'] = data['DebtRatio'] * data['MonthlyIncome']
data.head()

Unnamed: 0,Id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,7323.197016,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,316.878123,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,258.914887,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,118.963951,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,1584.975094,63588.0,7,0,1,0,0.0


In [26]:
def eucl_distance(x, y):
    x = np.array(x) 
    y = np.array(y)
    return np.sqrt(((x-y)**2).sum())

In [27]:
def get_neighbors(train_data, train_labels, inst, k):
    neigh = []
    for i in range(len(train_labels)):
        distance = eucl_distance(inst, train_data.iloc[[i]])
        neigh.append((i, distance, y[i]))
    neigh.sort(key=lambda n: n[1])
    neighbors = neigh[:k]
    return neighbors

In [28]:
def get_majority(neighbours):
    classes = [neighbour[2] for neighbour in neighbours]
    count = Counter(classes)
    return count.most_common()[0][0] 

In [29]:
def vote_distance_weights(neighbors):
    class_counter = Counter()
    number_of_neighbors = len(neighbors)
    for i in range(number_of_neighbors):
        dist = neighbors[i][1]
        label = neighbors[i][2]
        class_counter[label] += 1 / (dist**2 + 1)
    labels, votes = zip(*class_counter.most_common())
    winner = class_counter.most_common(1)[0][0]
    votes4winner = class_counter.most_common(1)[0][1]
    return winner, votes4winner / sum(votes)

In [31]:
X = data[['age', 'DebtRatio']].copy()
y = data['SeriousDlqin2yrs']
for i in ['age', 'DebtRatio']:
    maxv = max(X[i])
    X[i] = X[i] / maxv

In [37]:
X.iloc[[12]]

Unnamed: 0,age,DebtRatio
12,0.474227,4.6e-05


In [38]:
X.iloc[[12]]-(np.zeros(2) + [1,1])

Unnamed: 0,age,DebtRatio
12,-0.525773,-0.999954


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
y_pred = []
for i in range(len(y_test)):
    neighbors = get_neighbors(X_train, y_train, X_test.iloc[[i]], 2)
    y_pred.append(get_majority(neighbors))

In [42]:
count = np.count_nonzero(y_pred - y_test)
print(1-count/len(y_pred))

0.8740740740740741


In [43]:
np.count_nonzero(y_test)

19

In [44]:
np.count_nonzero(y_pred)

15

In [45]:
y_pred = []
for i in range(len(y_test)):
    neighbors = get_neighbors(X_train, y_train, X_test.iloc[[i]], 2)
    y_pred.append(vote_distance_weights(neighbors)[0])

In [46]:
count = np.count_nonzero(y_pred - y_test)
print(1-count/len(y_pred))

0.8740740740740741


In [47]:
np.count_nonzero(y_pred)

15