-
Notifications
You must be signed in to change notification settings - Fork 0
/
kNN.m
187 lines (173 loc) · 5.65 KB
/
kNN.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
%Supervised ML algorithm - k Nearest Neighbours (kNN) with merge sorting for Octave.
%The data is divided equally for training and testing sets.
%Verification of effectiveness is in the form of a plot of draws and successfully assigned classes as a function of k (number of nearest neighbours used to classify the record).
%my dataset was CTG.xls - https://archive.ics.uci.edu/ml/datasets/Cardiotocography - i chose the NSP to be the objective variable (last column: Normal=1; Suspect=2; Pathologic=3).
%Remember to clean the data from NaNs, empty cells and headings - like in the CTGdata100.xlsx
clear all % remove old variables
%LOADING DATA FROM FILE
tic
dataraw = xlsread("CTGdata100.xlsx"); % pkg load io (In command window. Also select the directiory where you have the input file)
data = dataraw(:,1:end-2); %remove two last columns in the dataset - objective variable(s) - to be found by the algorithm
lengthR = length(data(:,1)); %rows
lengthC = length(data(1,:)); %columns
% RANDOMIZING INPUT DATA - 50/50 training to testing
permutt = randperm(lengthR);
dtki = permutt(1:round(lengthR/2)); % KnownDataIndex = Testing data
dtui= permutt(round(lengthR/2)+1:lengthR); % UnknownDataIndex = Training data
%WAITBAR FOR DISTANCES
wtbr = waitbar (0,sprintf("Calculating (distances)^2: %d ", 0));
% CALCULATING THE SQUARES OF DIFFERENCES (always a positive value)
differsq = zeros(lengthR,lengthR);
for x=1:lengthR
for i=1:lengthR
if i==x continue
elseif any (dtki == i) continue % skip dtki
elseif any (dtui == x) continue % skip dtui
endif
for j = 1:lengthC
differsq (i,x) = differsq (i,x) + (data(i,j) - data(x,j))^2;
endfor
endfor
waitbar (x/lengthR,wtbr,sprintf("Calculating (distances)^2: %d %% ", round(100*x/lengthR)));
endfor
delete(wtbr)
% CALCULATING THE DISTANCES (square root of the differsq)
Dists(lengthR,lengthR) = 0;
for i=1:lengthR
if any (dtki == i) continue % skip dtki
endif
for j=1:lengthR
Dists(i,j) = sqrt(differsq(i,j));
endfor
endfor
lengthRD = length(Dists(:,1));
lengthCD = length(Dists(1,:));
toc
tic
%INDEXES MATRIX FOR SORTING
MtrxIndxs=zeros(lengthR,lengthR);
for j=1:lengthR
MtrxIndxs(j,:) = 1:lengthR;
endfor
%WAITBAR FOR SORTING
skip = 0;
wb = waitbar (0,sprintf("sorting: %d / %d ", j-skip, lengthRD));
%MERGE SORT - BEGINNING
function A = mergeswap(A, i, j)
tmp = A(i);
A(i) = A(j);
A(j) = tmp;
end
for j=1:lengthRD
if any (dtki == j) skip++; continue % skip dtki
endif
waitbar (j/lengthRD,wb,sprintf("sorting: %d / %d ", j-skip, round(lengthRD/2)));
function [D,I] = recmerge(Dists, MtrxIndxs)
if length(Dists) == 1
D = Dists;
I = MtrxIndxs;
elseif length(Dists) == 2
if Dists(2) < Dists(1)
D = mergeswap(Dists, 1, 2);
I = mergeswap(MtrxIndxs, 1, 2);
else
D = Dists;
I = MtrxIndxs;
endif
else
mid = floor(length(Dists) / 2);
fin = length(Dists);
[A,AA]= recmerge(Dists(1:mid),MtrxIndxs(1:mid));
[B,BB] = recmerge(Dists(mid+1:end),MtrxIndxs(mid+1:end));
C = [A B];
CC = [AA BB];
a = 1;
b = mid+1;
for i = 1:fin
if a < mid+1 && (b >= fin+1 || C(a) <= C(b))
D(i) = C(a);
I(i) = CC(a);
a = a + 1;
else
D(i) = C(b);
I(i) = CC(b);
b = b + 1;
endif
endfor
endif
endfunction
[Dists(j,:),MtrxIndxs(j,:)] = recmerge(Dists(j,:),MtrxIndxs(j,:));
endfor
%MERGE SORT - OVER
delete(wb)
toc
tic
%VOTING OF THE k NEIGHBOURS (k is variable to make a plot at the end)
Vote (lengthRD,3) = 0;
for k = 1:50 %k of nearest neighbours
klist(k)=k;
clear Vote
Vote (lengthRD,3) = 0;
for j=1:lengthRD
if any (dtki == j) continue % skip dtki
endif
for i=round(lengthCD/2):round(lengthCD/2)-1+k %(k = +1:lim)
if dataraw(MtrxIndxs(j,i),end) == 1
Vote(j,1)++; %vote for option 1
elseif dataraw(MtrxIndxs(j,i),end) == 2
Vote(j,2)++; %vote for option 2
elseif dataraw(MtrxIndxs(j,i),end) == 3
Vote(j,3)++; %vote for option 3
endif
endfor
endfor
%ANY DRAWS IN THE VOTING?
draw(k)=0;
clear draw
for i=1:lengthR
maxval(i) = max(Vote(i,:));
endfor
for i=1:lengthR
sumd(i) = sum (Vote(i,:) == maxval(i));
if any (dtki == i)
draw(i)= 0;
continue % skip dtki
elseif sumd (i) >= 2;
draw(i)=1;
endif
endfor
draws(k)=length(nonzeros(draw));
drawspercent(k)=100*(draws(k)/length(dtui));
%ASSIGN THE CLASS BY THE VOTINGS
Class (lengthR,1) = 0;
for i=1:lengthR
if any (dtki == i) continue % skip dtki
endif
[maxi,col] = max(Vote(i,:));
if col == 1
Class(i)=col;
elseif col == 2
Class(i)=col;
elseif col == 3
Class(i)=col;
endif
endfor
%VERIFICATION
err = 0;
for i=1:lengthR
if any (dtki == i) continue % skip dtki
endif
if Class(i) != dataraw(i,end)
err++;
endif
endfor
%EFFECTIVENESS - PLOT
Efctv = 100*((length(dtui)-err)/length(dtui));
Efctvns(k)= Efctv;
endfor
kNNplot = plot(klist,Efctvns,".","markersize", 15, drawspercent,"*","markersize", 5,"color",'r');
grid on, grid minor
xlabel ("k (number of neighbours)");
ylabel ("[%]");
legend('correctly assigned classes','draws in voting',"location",'east');
toc