## Import Libraries

In [2]:
import mysql.connector
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
from tkinter import *
from PyQt6 import *

## Data Preparations

SQL

In [49]:
# connect to sql
# NOTE(review): the credentials (root user, empty password) are hardcoded here;
# consider loading them from environment variables before sharing the notebook.
connection = mysql.connector.connect(host='localhost',user='root',password='',database='iris')
query = 'SELECT * FROM iris_1'


# dealing with nulls
def filter(columns=None):
    """Turn empty strings into NULL in the ``iris_1`` table.

    One ``UPDATE`` statement is issued per column, and all of them are
    committed together once every statement has succeeded.

    NOTE: the name shadows the built-in ``filter``; it is kept so that any
    existing call to ``filter()`` keeps working.

    Parameters
    ----------
    columns : iterable of str, optional
        Column names to clean. Defaults to the columns of the module-level
        ``data`` frame, which must already exist in that case.
    """
    if columns is None:
        columns = data.columns
    # A single cursor is reused for every statement and always released.
    mycursor = connection.cursor()
    try:
        for item in columns:
            # Identifiers cannot be bound as query parameters, so quote them
            # with backticks (doubling any embedded backtick).
            quoted = '`' + str(item).replace('`', '``') + '`'
            mycursor.execute(f"UPDATE iris_1 SET {quoted} = NULL WHERE {quoted} = ''")
        connection.commit()
    finally:
        mycursor.close()


# Discover the column names without fetching any rows, so the cleanup can run
# BEFORE the data is loaded and the DataFrame reflects the cleaned table.
table_columns = pd.read_sql(f'{query} LIMIT 0', connection).columns
filter(table_columns)
# read from sql (after the empty strings have been replaced by NULL)
data = pd.read_sql(query, connection)
# copy from original data
df = data.copy()



Preparing Target

In [50]:
# inspect the distinct class labels present in the target column
df.loc[:, 'Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [51]:
# encode the target variable: each species name is replaced by an integer code
species_codes = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}
for species_name, species_code in species_codes.items():
    is_this_species = df['Species'] == species_name
    df.loc[is_this_species, 'Species'] = species_code

Drop Unnecessary Features

In [52]:
# Drop the row identifier; it carries no predictive information.
# errors='ignore' keeps the cell re-runnable: a second execution (when 'Id'
# is already gone) is a no-op instead of raising KeyError.
df = df.drop(columns='Id', errors='ignore')

In [37]:
# Display the current contents of the working DataFrame.
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


Getting Knowledge

In [53]:
# Print each column's name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   SepalLengthCm  150 non-null    object
 1   SepalWidthCm   150 non-null    object
 2   PetalLengthCm  150 non-null    object
 3   PetalWidthCm   150 non-null    object
 4   Species        150 non-null    object
dtypes: object(5)
memory usage: 6.0+ KB


Convert to Numerical

In [63]:
# Cast every column to float first, then narrow the target column
# to integer class codes.
df = df.astype(float).astype({'Species': int})

In [None]:
# Double-check the conversion: info() reports the per-column dtypes,
# and the bare `df` displays the frame itself.
df.info()
df

Getting Numerical Variables

In [66]:
# A column counts as numeric when its dtype is anything other than object.
num_vars = [name for name, kind in df.dtypes.items() if kind != 'O']
numeric_count = len(num_vars)
print(f'Variabel bertipe numerik ada : {numeric_count}')
print(f'Terdiri dari : {num_vars}')

Variabel bertipe numerik ada : 5
Terdiri dari : ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']


Getting Categorical Variables

In [67]:
# A column counts as categorical when its dtype is object.
cat_vars = [name for name, kind in df.dtypes.items() if kind == 'O']
categorical_count = len(cat_vars)
print(f'Variabel bertipe kategork ada : {categorical_count}')
print(f'Terdiri dari : {cat_vars}')

Variabel bertipe kategork ada : 0
Terdiri dari : []


## KNN

Split For KNN Testing

In [82]:
# Features are every column except the target; the target is 'Species'.
is_feature = df.columns != 'Species'
X = df.loc[:, is_feature]
y = df.loc[:, 'Species']

Create Function to Calculate Distance

In [83]:
# use p = 1 for manhattan, p = 2 for euclidean
def minkowski(a, b, p = 2):
    """Return the Minkowski distance of order ``p`` between two points.

    The distance is ``(sum(|a_i - b_i| ** p)) ** (1 / p)``, so ``p = 1``
    gives the Manhattan distance and ``p = 2`` the Euclidean distance.

    Parameters
    ----------
    a, b : sequence of numbers (list, tuple, NumPy array or pandas Series)
        Coordinates of the two points; both must have the same length.
        Values are compared by position, not by index label.
    p : int or float, default 2
        Order of the distance; must be positive.

    Returns
    -------
    float
        The distance between ``a`` and ``b``.

    Raises
    ------
    ValueError
        If ``p`` is not positive or the two points differ in shape.
    """
    if p <= 0:
        raise ValueError(f'p must be positive, got {p}')
    # Work on plain arrays so that a Series with non-integer labels is read
    # positionally instead of relying on label lookups.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.shape != b.shape:
        raise ValueError(f'points must have the same shape, got {a.shape} and {b.shape}')
    # Take the p-th root (not always the square root) of the summed powers.
    distance = np.sum(np.abs(a - b) ** p) ** (1.0 / p)
    return float(distance)
# Smoke test: distance between the first two rows of X with p = 2.
minkowski(a=X.iloc[0], b=X.iloc[1], p=2)

0.5385164807134502

Trying to Use the Distance Function

In [90]:
# define arbitrary test point
test_pt = [4.8, 2.5, 2.7, 0.7]
# Distance between the test point and every row of X. Rows are taken
# positionally from the underlying array, so this stays correct even when
# X.index is not the default 0..n-1 range (index labels are not positions).
distances = [minkowski(test_pt, row) for row in X.to_numpy()]
# store in a dataframe that keeps X's own index labels
df_dists = pd.DataFrame(data = distances, index = X.index, columns = ['dist'])
df_dists

Unnamed: 0,dist
0,1.740690
1,1.483240
2,1.646208
3,1.445683
4,1.786057
...,...
145,3.559494
146,2.996665
147,3.328663
148,3.552464


Sort Distance Measurements to Find the Points Closest to the Test Point

In [93]:
# keep the 5 rows with the smallest distance to the test point
dists_ascending = df_dists.sort_values(by = ['dist'], axis = 0)
df_nn = dists_ascending.head(5)
df_nn

Unnamed: 0,dist
98,0.583095
57,0.685565
93,0.728011
60,1.00995
79,1.24499


Use Majority Label to Predict the Label of The Test Point

In [95]:
# tally the class labels of the nearest neighbours
neighbor_labels = y[df_nn.index]
counter = Counter(neighbor_labels)
# the predicted label is the one that occurs most often among them
top_label, _votes = counter.most_common(1)[0]
top_label

1