forked from misbahulard/Machine-Learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
knn_w_random_subsampling.py
108 lines (84 loc) · 2.86 KB
/
knn_w_random_subsampling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import random
import pandas as pd
import numpy as np
import operator
import math
def euclideanDistance(inst1, inst2, length):
    """Return the Euclidean distance between two instances.

    Only positions ``0 .. length - 1`` of ``inst1`` and ``inst2`` are
    compared; anything stored after that (for example a class label) is
    ignored.
    """
    # Sum the squared differences first; the square root is taken once
    # at the very end.
    squared_sum = 0
    for idx in range(length):
        delta = inst1[idx] - inst2[idx]
        squared_sum = squared_sum + delta ** 2
    return math.sqrt(squared_sum)
def getNeighbors(training, test, k):
    """Return the ``k`` training rows closest to ``test``.

    Distances are Euclidean and computed over the first ``len(test) - 1``
    values of each row; the last value of ``test`` is treated as its label
    and is not part of the distance.

    Each returned neighbour is a *new* list holding the row's first
    ``len(test)`` values followed by its distance to ``test``. For
    five-column rows that puts the label at index 4 and the distance at
    index 5. The rows in ``training`` are never modified.

    If ``k`` is larger than the number of training rows, every row is
    returned; a ``k`` of zero or less gives an empty list.
    """
    feature_count = len(test) - 1
    row_width = feature_count + 1

    # Pair every training row with its distance, nearest first. The sort
    # is stable, so equally distant rows keep their training order.
    scored = [
        (row, euclideanDistance(row, test, feature_count)) for row in training
    ]
    scored.sort(key=operator.itemgetter(1))

    # Build fresh rows instead of writing the distance into the caller's
    # data; slicing to row_width also drops any stale trailing value.
    nearest = scored[:max(k, 0)]
    return [list(row[:row_width]) + [dist] for row, dist in nearest]
def predict(neighbors):
    """Return the class label that occurs most often among ``neighbors``.

    The label of each neighbour is read from its second-to-last position.
    When several labels share the highest count, the one encountered
    first wins.
    """
    tally = {}
    for row in neighbors:
        label = row[-2]
        tally[label] = tally.get(label, 0) + 1

    # A stable descending sort keeps first-seen labels ahead on ties.
    ranking = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    winner = ranking[0]
    return winner[0]
def calculateError(testSet, prediction):
    """Return the percentage of test rows that were predicted wrongly.

    The true label of each row is read from index 4 and compared with the
    entry of ``prediction`` at the same position. An empty ``testSet``
    yields ``0.0`` instead of dividing by zero.
    """
    total = len(testSet)
    if total == 0:
        # Nothing was tested, so there is nothing to get wrong.
        return 0.0

    correct = sum(
        1 for idx, row in enumerate(testSet) if row[4] == prediction[idx]
    )
    return ((total - correct) / float(total)) * 100.0
def getRandomSample(training, k):
    """Move ``k`` randomly chosen rows out of ``training`` and return them.

    The chosen rows are removed from ``training`` in place, so the caller
    can use what is left as the training set and the returned list as the
    test set. A ``k`` of zero or less returns an empty list.

    Raises:
        ValueError: if ``k`` is larger than ``len(training)``. The check
            runs before anything is removed, so ``training`` stays intact.
    """
    if k > len(training):
        raise ValueError(
            "cannot sample {} rows from {} training rows".format(
                k, len(training)
            )
        )

    # len(training) shrinks with every pop, so the index range is
    # recomputed for each draw.
    return [
        training.pop(random.randint(0, len(training) - 1)) for _ in range(k)
    ]
def main():
    """Run k-NN with random subsampling on ``irisdata.csv`` interactively.

    Reads the data set, asks for ``k`` and for the number of rows to hold
    out as test data, classifies every held-out row from its ``k`` nearest
    training rows, and prints the neighbours, the predicted label and the
    overall error ratio.

    Raises:
        SystemExit: if the requested sample size or ``k`` cannot be
            satisfied by the number of rows in the data set.
    """
    df = pd.read_csv('irisdata.csv')
    training = df.values.tolist()

    print("\nInput K: ")
    k = int(input())

    print("\nNumber of random sample: ")
    numOfRand = int(input())

    # Validate before touching the data: at least one row must be tested
    # and at least k rows must remain to act as neighbours.
    if not 1 <= numOfRand < len(training):
        raise SystemExit(
            "Number of random sample must be between 1 and {}".format(
                len(training) - 1
            )
        )
    remaining = len(training) - numOfRand
    if not 1 <= k <= remaining:
        raise SystemExit("K must be between 1 and {}".format(remaining))

    # The sampled rows are removed from `training` and become the test set.
    data_test = getRandomSample(training, numOfRand)

    prediction = []
    for number, sample in enumerate(data_test, start=1):
        neighbors = getNeighbors(training, sample, k)
        print("\nNeighbors data test ke-: ", number)
        for ng in neighbors:
            # Index 4 holds the class label, index 5 the distance.
            print("=> ", ng[4], " | distance: ", ng[5])

        # Classify once and reuse the result for both storing and printing.
        label = predict(neighbors)
        prediction.append(label)
        print(label)

    error = calculateError(data_test, prediction)
    print("Error ratio: ", error, "%")


if __name__ == "__main__":
    main()