In [None]:
# Libraries
from sklearn.model_selection import train_test_split       # Split our data into train and test sets where feature variables are given as input in the method.
import pandas as pd                                        # Bring pandas data analysis to Python
from sklearn.neighbors import KNeighborsClassifier         # Used to get functionalities to calculate KNN
import  numpy as np                                        # Used to working with arrays

In [None]:
# Upload the dataset to analyse from the local machine.
# This cell only works inside Google Colab, because the google.colab package is provided by that runtime.
# `uploaded` is later indexed by file name ('donor-selected.xlsx') and the value is wrapped in io.BytesIO,
# so it is expected to map each uploaded file name to that file's raw bytes.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the uploaded Excel workbook into a pandas DataFrame.
import io

# Name the workbook is expected to be uploaded under.
DATA_FILE = 'donor-selected.xlsx'

if DATA_FILE in uploaded:
    data_key = DATA_FILE
elif len(uploaded) == 1:
    # The upload may be stored under a slightly different name (for example
    # "donor-selected (1).xlsx" when the same file is uploaded a second time).
    # With exactly one uploaded file there is no ambiguity, so use it.
    data_key = next(iter(uploaded))
else:
    # Nothing uploaded, or several files and none has the expected name:
    # fail with a message that lists what was actually received.
    raise KeyError(
        'Expected an uploaded file named {!r}; got {!r}'.format(DATA_FILE, sorted(uploaded))
    )

# `uploaded` holds the raw file bytes; BytesIO exposes them as a file-like object for read_excel.
df = pd.read_excel(io.BytesIO(uploaded[data_key]))
print(df)

    TARGET_B  DONOR_AGE  PER_CAPITA_INCOME
0          0         87              11855
1          1         79              10385
2          0         75              30855
3          0         39              16342
4          0         74              12107
..       ...        ...                ...
95         1         87              17761
96         0          7              17179
97         0         17              10244
98         0         86              37099
99         0         52              20273

[100 rows x 3 columns]


In [None]:
# K-Nearest Neighbors: split the dataset into the predictors X and the label y.
# X holds DONOR_AGE and PER_CAPITA_INCOME, y holds TARGET_B.
# Columns are selected by name rather than by position, so a reordered or
# extended sheet raises a KeyError instead of silently feeding the wrong columns.
FEATURE_COLUMNS = ['DONOR_AGE', 'PER_CAPITA_INCOME']
TARGET_COLUMN = 'TARGET_B'

X = df[FEATURE_COLUMNS].to_numpy()   # 2-D array, one row per donor, one column per feature
y = df[TARGET_COLUMN].to_numpy()     # 1-D array of labels aligned with the rows of X
print(X)
print(y)

[[   87 11855]
 [   79 10385]
 [   75 30855]
 [   39 16342]
 [   74 12107]
 [   63  6851]
 [   71  5900]
 [   79 12667]
 [   41  8132]
 [   63 11428]
 [   43  6519]
 [   87  9042]
 [   55 13841]
 [   77 12671]
 [   37 10221]
 [   73 12426]
 [   81 12208]
 [   76 13079]
 [   75 19388]
 [   69 13347]
 [   45 20210]
 [   71 29771]
 [   43 13619]
 [   87 10469]
 [   55  9807]
 [   77 24742]
 [   37 27674]
 [   52 13018]
 [   30 18212]
 [   43  9742]
 [   77  5016]
 [   67 16041]
 [   37 42851]
 [   67 12605]
 [   65 22959]
 [   77 14394]
 [   59 18262]
 [   75 17885]
 [   42 14366]
 [   72  8371]
 [   76 21808]
 [   33 14123]
 [   79 27547]
 [   58 15150]
 [   38  9591]
 [   47 13576]
 [   45 14928]
 [   56 10050]
 [   80 16023]
 [   82 10815]
 [   82 16782]
 [   64 13025]
 [   49 19249]
 [   75 32344]
 [   61 11878]
 [   58  9958]
 [   48 20091]
 [   85  9343]
 [   51  9717]
 [   59 19730]
 [   47 30842]
 [   49 17008]
 [   76  8520]
 [   80 14751]
 [   42 17493]
 [   70 19935]
 [   54 14

In [None]:
# Hold out 20% of the rows as the test set and train on the remaining 80%.
# random_state is fixed so the same rows land in each set on every run.
split_parts = train_test_split(X, y, train_size=0.8, random_state=0)
X_train, X_test, y_train, y_test = split_parts
print(X_train)  # feature rows of the training set

[[   58 15150]
 [   76  8520]
 [   39 16342]
 [   70  5381]
 [   47 13576]
 [   80 16023]
 [   71  5900]
 [   52 20273]
 [   78 10149]
 [   57 14082]
 [   47 30842]
 [   45  7839]
 [   67 18081]
 [   56 11819]
 [   64 13025]
 [   52 13018]
 [   75 19388]
 [   48 20091]
 [   80 14751]
 [   32 18663]
 [   79 10385]
 [   49 17008]
 [   79 27547]
 [   33 14123]
 [   74 12107]
 [   73 12426]
 [   76 13079]
 [   76 21808]
 [   42 14366]
 [   63  6851]
 [   67 18534]
 [   59 19730]
 [   87 11855]
 [   65 22959]
 [   30 18212]
 [   82 16782]
 [   87  9042]
 [   77 14394]
 [   87 10469]
 [   49 19249]
 [   43  6519]
 [   67 16041]
 [   54 14765]
 [   85  9343]
 [   73 21162]
 [   33 13430]
 [   37 42851]
 [   54 15080]
 [   37 10221]
 [   60 12793]
 [   69 13347]
 [   43  9742]
 [   82 10815]
 [   17 10244]
 [   86 37099]
 [   52 10580]
 [   45 20210]
 [   71 12400]
 [   63 21294]
 [   82 18534]
 [   77 24742]
 [   75 17885]
 [   86 13429]
 [   45 14928]
 [   72  8371]
 [   70 19935]
 [   51  9

In [None]:
# Feature scaling
# PER_CAPITA_INCOME is in the thousands while DONOR_AGE stays below 100, so the
# unscaled income column would dominate the distances K-NN uses to pick neighbours.
# StandardScaler standardises each column to zero mean and unit variance: most
# values end up within a few units of 0, but they are not confined to [-1, +1].
# The scaler is fitted on the training rows only and then applied to both sets,
# so the test rows do not influence the scaling parameters.
# The model also runs without this step, but the neighbours would then be chosen
# almost entirely by income.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
print(X_train)

[[-0.1716426  -0.09656192]
 [ 0.84970181 -1.09413581]
 [-1.24972837  0.08279073]
 [ 0.50925367 -1.56644116]
 [-0.79579752 -0.33339168]
 [ 1.07666723  0.03479283]
 [ 0.56599503 -1.48835053]
 [-0.51209074  0.67426328]
 [ 0.96318452 -0.84903055]
 [-0.22838396 -0.25725708]
 [-0.79579752  2.26451343]
 [-0.90928023 -1.19660155]
 [ 0.3390296   0.34444699]
 [-0.28512531 -0.59775628]
 [ 0.16880553 -0.41629714]
 [-0.51209074 -0.41735039]
 [ 0.79296045  0.54110296]
 [-0.73905616  0.6468789 ]
 [ 1.07666723 -0.15659691]
 [-1.64691786  0.43201683]
 [ 1.01992588 -0.81352113]
 [-0.68231481  0.18299951]
 [ 1.01992588  1.76873575]
 [-1.59017651 -0.25108807]
 [ 0.7362191  -0.55442276]
 [ 0.67947774 -0.50642486]
 [ 0.84970181 -0.4081721 ]
 [ 0.84970181  0.90522496]
 [-1.0795043  -0.21452541]
 [ 0.11206418 -1.34525962]
 [ 0.3390296   0.41260702]
 [-0.11490125  0.59256153]
 [ 1.47385673 -0.59233959]
 [ 0.22554689  1.0784086 ]
 [-1.76040057  0.36415773]
 [ 1.19014994  0.14899473]
 [ 1.47385673 -1.01559379]
 

In [None]:
# Fit the K-NN classifier on the training set.
# n_neighbors: how many of the nearest training rows vote on TARGET_B. An odd
#   number avoids a tied vote between the two classes 0 and 1.
# weights='uniform': each of those neighbours gets an equal vote, regardless of
#   how far it is from the point being classified.
knn = KNeighborsClassifier(
    weights='uniform',
    n_neighbors=3,
)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [None]:
# Predict the label of every held-out row and show it next to the true label.
y_pred = knn.predict(X_test)
# Report the real test-set size instead of a hard-coded count, so the message
# stays correct if the split ratio or the dataset changes.
print('Using the', len(y_test), 'test data to compare between Predicted and Actual values')
print('Predicted Values', y_pred)
print('Actual values   ', y_test)

Using the 20 test data to compare between Predicted and Actual values
Predicted Values [0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0]
Actual values    [1 1 0 0 0 0 1 0 1 1 0 1 1 0 1 0 0 0 1 0]


In [None]:
# Model evaluation: confusion matrix of the test-set predictions.
# Rows are the actual classes and columns the predicted classes (both in the
# order 0, 1), so the diagonal counts correct predictions and the off-diagonal
# cells count misclassifications.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[10  1]
 [ 7  2]]
