# Libraries

In [54]:
from google.colab import drive  #read files from drive
import pandas as pd # handle the data as a dataframe
import numpy as np  
from scipy.spatial.distance import euclidean
from math import log 
import matplotlib.pyplot as plt  #for any plots needed
import seaborn as sns    # to add more beauty plots
import random
random.seed(42)

from sklearn.preprocessing import LabelEncoder #encode categorical data to numerical
from sklearn.cluster import KMeans

In [55]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Data

In [56]:
# Read the gzipped file directly into a Pandas DataFrame
df = pd.read_csv('/content/drive/My Drive/Data/kddcup.data_10_percent.gz', compression='gzip', header=None)


In [57]:
test = pd.read_csv('/content/drive/My Drive/Data/corrected.gz', compression='gzip', header=None)

In [58]:
data_cols = ['duration',
 'protocol_type',
 'service',
 'flag',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'target']

In [59]:
df.columns = data_cols
test.columns = data_cols

In [60]:

df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [61]:
test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target
0,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
3,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
4,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494021 entries, 0 to 494020
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     494021 non-null  int64  
 1   protocol_type                494021 non-null  object 
 2   service                      494021 non-null  object 
 3   flag                         494021 non-null  object 
 4   src_bytes                    494021 non-null  int64  
 5   dst_bytes                    494021 non-null  int64  
 6   land                         494021 non-null  int64  
 7   wrong_fragment               494021 non-null  int64  
 8   urgent                       494021 non-null  int64  
 9   hot                          494021 non-null  int64  
 10  num_failed_logins            494021 non-null  int64  
 11  logged_in                    494021 non-null  int64  
 12  num_compromised              494021 non-null  int64  
 13 

In [63]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311029 entries, 0 to 311028
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     311029 non-null  int64  
 1   protocol_type                311029 non-null  object 
 2   service                      311029 non-null  object 
 3   flag                         311029 non-null  object 
 4   src_bytes                    311029 non-null  int64  
 5   dst_bytes                    311029 non-null  int64  
 6   land                         311029 non-null  int64  
 7   wrong_fragment               311029 non-null  int64  
 8   urgent                       311029 non-null  int64  
 9   hot                          311029 non-null  int64  
 10  num_failed_logins            311029 non-null  int64  
 11  logged_in                    311029 non-null  int64  
 12  num_compromised              311029 non-null  int64  
 13 

In [64]:
test = test[test['service'] != 'icmp']

# Data Preprocessing

In [65]:
def encoder(df):

  le = LabelEncoder()
  protocol_encoder = le.fit(df['protocol_type'])
  df['protocol_type'] = protocol_encoder.transform(df['protocol_type'])


  service_encoder = le.fit(df['service'])
  df['service'] = service_encoder.transform(df['service'])

  flag_encoder = le.fit(df['flag'])
  df['flag'] = flag_encoder.transform(df['flag'])
  
  target_encoder = le.fit(df['target'])
  df['target'] = target_encoder.transform(df['target'])

  return df

In [66]:
def drop_unnecessary_cols(df, cols):
  df.drop(columns = cols, inplace = True)
  return df
  

In [67]:
df= encoder(df)
test = encoder(test)

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494021 entries, 0 to 494020
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     494021 non-null  int64  
 1   protocol_type                494021 non-null  int64  
 2   service                      494021 non-null  int64  
 3   flag                         494021 non-null  int64  
 4   src_bytes                    494021 non-null  int64  
 5   dst_bytes                    494021 non-null  int64  
 6   land                         494021 non-null  int64  
 7   wrong_fragment               494021 non-null  int64  
 8   urgent                       494021 non-null  int64  
 9   hot                          494021 non-null  int64  
 10  num_failed_logins            494021 non-null  int64  
 11  logged_in                    494021 non-null  int64  
 12  num_compromised              494021 non-null  int64  
 13 

In [69]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 311027 entries, 0 to 311028
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     311027 non-null  int64  
 1   protocol_type                311027 non-null  int64  
 2   service                      311027 non-null  int64  
 3   flag                         311027 non-null  int64  
 4   src_bytes                    311027 non-null  int64  
 5   dst_bytes                    311027 non-null  int64  
 6   land                         311027 non-null  int64  
 7   wrong_fragment               311027 non-null  int64  
 8   urgent                       311027 non-null  int64  
 9   hot                          311027 non-null  int64  
 10  num_failed_logins            311027 non-null  int64  
 11  logged_in                    311027 non-null  int64  
 12  num_compromised              311027 non-null  int64  
 13 

In [70]:
for i in df.columns:
  print(f"feature: {i}")
  print(df[i].value_counts())
  print()

feature: duration
0        481671
1          2476
2           870
3           625
5           554
          ...  
18185         1
4283          1
2154          1
1007          1
120           1
Name: duration, Length: 2495, dtype: int64

feature: protocol_type
0    283602
1    190065
2     20354
Name: protocol_type, dtype: int64

feature: service
14    281400
45    110893
22     64293
50      9723
40      7237
       ...  
1         11
58         7
41         1
57         1
46         1
Name: service, Length: 66, dtype: int64

feature: flag
9     378440
5      87007
1      26875
4        903
2        579
10       107
6         57
7         24
3         11
8         10
0          8
Name: flag, dtype: int64

feature: src_bytes
1032     228035
0        115342
520       52774
105        7370
147        2725
          ...  
6927          1
2315          1
11898         1
12289         1
475           1
Name: src_bytes, Length: 3300, dtype: int64

feature: dst_bytes
0        408258
105      

In [71]:
#df = drop_unnecessary_cols(df, ['num_outbound_cmds', 'is_host_login', 'is_guest_login', 'land','wrong_fragment', 'root_shell', 'logged_in'])

# Knn Model

In [72]:
def KNN_model_train(df, clusters, sigma, iterations):
  
  rows = df.shape[0] #numper of samples ot cluser it
  centroids = []
  #print(df)
  train = df.to_numpy()
  #get random values to represents the initial centroids
  for i in range(clusters):
    centroids.append(train[random.randint(0, rows)])

  
  error = 100          # the error between old and new centroids 
  c = []              # save the cluster after each loop
  l = 0               # number of iteration needd to finish the cluster


  #iterate until the error between the old and new centroids < sigma
  while(error > sigma and l < iterations):
    c = [] #save the cluster of each point 

    #calculate the error between the points and each centriod
    for i in range(rows):
      err = []
      for j in range(clusters):
        err.append(euclidean(train[i], centroids[j]))
  
      c.append(np.argmin(err)) #add the right cluster to the list

    df['cluster'] = c   # save the new cluster at the data
    
    new_centroids = df.groupby('cluster').mean().to_numpy() # calculate the new centroids 
    
    #get the error bewteen the old and enw centroids
    #print(centroids[0])

    #print(new_centroids[0])
    for i in range(len(new_centroids)):
      temp = euclidean(centroids[i], new_centroids[i])
      if(i != 0):
        error = max(temp, error)
      else: 
        error = temp

      centroids[i] = new_centroids[i] #update centroids

    #remove the old clusters to starn next iteration
    df.drop(columns = ['cluster'], inplace = True) 
    
    print(f"l = {l}, Error = {error}")
    l+=1

  #save the last version of clusters
  df['cluster'] = c
  #return the data clustered, and centroids to use them in test
  return df, centroids

In [73]:
def KNN_model_test(df, centroids, n):
  rows = df.shape[0]
  c = []

  for i in range(rows):
    err = []
    for j in range(n):
      err.append(euclidean(df.iloc[i], centroids[j]))
  
    c.append(np.argmin(err))

  df['cluster'] = c

  return df


# Metrices

In [74]:
def percision(df, c):
  perc = 0
  c_perc = []
  n_t = df.shape[0]
  
  for i in range(c):
    cluster = df[df['cluster'] == i]
    n = cluster.shape[0]
    if(n != 0):
      max_k = max(cluster.value_counts())
      prc = ((max_k / n)* (n/n_t))
      perc += prc
      c_perc.append(prc)
    else:
      c_perc.append(0) 
  
  return c_perc, perc

In [75]:
def recall(df, c):
  rec = 0
  c_rec = []
  n_t = df.shape[0]
  
  for i in range(c):
    cluster = df[df['cluster'] == i]
    cluster_size = cluster.shape[0]
    if(cluster_size != 0):
      max_k = max(cluster['target'].value_counts())
      max_label = cluster['target'].value_counts().idxmax()
      total = df[df['target'] == max_label].shape[0]

      recall = max_k / total
      rec += recall * (cluster_size / n_t)
      c_rec.append(recall)
  
    else:
      c_rec.append(0)
  return c_rec, rec

In [76]:
def F1_score(percision, recall, c):
  f1 = 0
  for i in range(c):
    perc_i = percision[i]
    rec_i = recall[i]
    if(perc_i != 0 or rec_i != 0):
      f = (2 * perc_i * rec_i) / (perc_i + rec_i)
      f1 += f
  
  return f1/c

In [77]:

def Conditional_Entropy(df, c):
  H_t_c = 0
  n_t = df.shape[0]

  for i in range(c):
    cluster = df[df['cluster'] == i]
    n_i = cluster.shape[0]
    labels = cluster['target'].value_counts()
    H_c_i = 0
    for j in labels:
      H_c_i -= (j/n_i)*log(j/n_i)
    
    H_t_c += (n_i/n_t) * H_c_i

  return H_t_c

# Train & Test the model


In [95]:
x_train = df.drop(columns = ['target'])
y_train = df['target']
x_test = test.drop(columns = ['target'])
y_test = test['target']

## K = 7

### Train model

In [96]:
clustered1, centroids1 = KNN_model_train(x_train, 7, 0.1, 40)

l = 0, Error = 17928.619115823483
l = 1, Error = 276627.1686323501
l = 2, Error = 9074229.281863956
l = 3, Error = 7248073.205862684
l = 4, Error = 676770723.3015882
l = 5, Error = 2348477.572780419
l = 6, Error = 1772627.256114696
l = 7, Error = 211267.32078933262
l = 8, Error = 75800.56345399117
l = 9, Error = 211593.0480849187
l = 10, Error = 258439.99973408016
l = 11, Error = 331243.19609501865
l = 12, Error = 221039.77897126437
l = 13, Error = 104554.65569453445
l = 14, Error = 123337.00585108857
l = 15, Error = 48523.18634653835
l = 16, Error = 26045.765576200498
l = 17, Error = 5079.340303527298
l = 18, Error = 6613.9674647865395
l = 19, Error = 9456.763608721822
l = 20, Error = 26078.07956779603
l = 21, Error = 29969.132404436794
l = 22, Error = 49767.393404630144
l = 23, Error = 75942.27618896686
l = 24, Error = 62330.70489491762
l = 25, Error = 42584.86867175459
l = 26, Error = 31938.695551980767
l = 27, Error = 28174.345290952147
l = 28, Error = 2442.0129894690767
l = 29, Er

In [97]:
clustered1['target'] = y_train
train_result1 = clustered1[['target', 'cluster']]

In [98]:
train_result1

Unnamed: 0,target,cluster
0,11,2
1,11,2
2,11,2
3,11,2
4,11,2
...,...,...
494016,11,2
494017,11,2
494018,11,2
494019,11,2


In [99]:
c_perc1, perc1 = percision(train_result1, 7)


In [100]:
for i in range(7):
  print(f'cluster{i} percision: {c_perc1[i]}')

print(f'the total percision is: {perc1}')

cluster0 percision: 4.2508314423880765e-05
cluster1 percision: 3.0363081731343407e-05
cluster2 percision: 0.2169968483121163
cluster3 percision: 0.568376647956261
cluster4 percision: 2.024205448756227e-06
cluster5 percision: 0.00015586381955422948
cluster6 percision: 0.00012145232692537362
the total percision is: 0.7857257080164608


In [101]:
c_rec1, rec1 = recall(train_result1, 7)

In [102]:
for i in range(7):
  print(f'cluster{i} recall: {c_rec1[i]}')

print(f'the total recall is: {rec1}')

cluster0 recall: 0.00021587614876950596
cluster1 recall: 0.75
cluster2 recall: 1.0
cluster3 recall: 1.0
cluster4 recall: 0.0009615384615384616
cluster5 recall: 0.0007915458788215218
cluster6 recall: 0.058823529411764705
the total recall is: 0.9996263166219571


In [103]:
F1_score(c_perc1, c_rec1, 7)

0.1545778747346366

In [104]:
Conditional_Entropy(train_result1, 7)

0.47966812431270545

### test model

In [105]:
test_clustered1= KNN_model_test(x_test, centroids1, 7)

In [106]:
test_clustered1['target'] = y_test
test_result1 = test_clustered1[['target', 'cluster']]

In [177]:
values = np.sort(test_result1['cluster'].unique())

for i in values: 
  print(f"cluster number {i} is {test_result1[test_result1['cluster'] == i].value_counts().idxmax()[0]}")

cluster number 0 is 16
cluster number 1 is 16
cluster number 2 is 14
cluster number 3 is 27
cluster number 5 is 16
cluster number 6 is 16


In [108]:
c_perc1, perc1 = percision(test_result1, 7)


In [109]:
for i in range(7):
  print(f'cluster{i} percision: {c_perc1[i]}')

print(f'the total percision is: {perc1}')

cluster0 percision: 3.858185945271633e-05
cluster1 percision: 3.215154954393027e-06
cluster2 percision: 0.18648220250974995
cluster3 percision: 0.5275779916213063
cluster4 percision: 0
cluster5 percision: 0.00012539104322132805
cluster6 percision: 1.2860619817572108e-05
the total percision is: 0.7142402428085022


In [110]:
c_rec1, rec1 = recall(test_result1, 7)

In [111]:
for i in range(7):
  print(f'cluster{i} recall: {c_rec1[i]}')

print(f'the total recall is: {rec1}')

cluster0 recall: 0.00019804921522998466
cluster1 recall: 1.6504101269165386e-05
cluster2 recall: 1.0
cluster3 recall: 1.0
cluster4 recall: 0
cluster5 recall: 0.0006436599494974502
cluster6 recall: 6.601640507666154e-05
the total recall is: 0.9998039714330621


In [112]:
F1_score(c_perc1, c_rec1, 7)

0.14362628222553706

In [113]:
Conditional_Entropy(test_result1, 7)

0.7541218871700119

## K = 15

In [114]:
x_train = df.drop(columns = ['target'])
y_train = df['target']
x_test = test.drop(columns = ['target'])
y_test = test['target']

### Train model

In [115]:
clustered2, centroids2 = KNN_model_train(x_train, 15, 0.1, 30)

l = 0, Error = 65838.01171661475
l = 1, Error = 422967.9386367525
l = 2, Error = 10493211.56313255
l = 3, Error = 682392653.2297082
l = 4, Error = 2898571.0504853628
l = 5, Error = 1363152.4945983796
l = 6, Error = 951234.8639524203
l = 7, Error = 797284.929740488
l = 8, Error = 211267.32078933262
l = 9, Error = 103300.18094222891
l = 10, Error = 229304.53326072232
l = 11, Error = 249135.51847817618
l = 12, Error = 318320.07029014005
l = 13, Error = 221556.3798228289
l = 14, Error = 90144.39345401693
l = 15, Error = 104622.19560262292
l = 16, Error = 74568.95118326641
l = 17, Error = 54522.23295823478
l = 18, Error = 93948.20661657247
l = 19, Error = 84956.54764777303
l = 20, Error = 47500.63753669052
l = 21, Error = 27216.603564589055
l = 22, Error = 28174.345290952147
l = 23, Error = 29574.986837442346
l = 24, Error = 5973.58534213041
l = 25, Error = 30750.18344035303
l = 26, Error = 9870.67205652651
l = 27, Error = 6818.220593389801
l = 28, Error = 3433.4387634801183
l = 29, Error =

In [116]:
clustered2['target'] = y_train
train_result2 = clustered2[['target', 'cluster']]

In [117]:
train_result2

Unnamed: 0,target,cluster
0,11,7
1,11,7
2,11,7
3,11,7
4,11,7
...,...,...
494016,11,7
494017,11,7
494018,11,7
494019,11,7


In [118]:
c_perc2, perc2 = percision(train_result2, 15)


In [119]:
for i in range(15):
  print(f'cluster{i} percision: {c_perc2[i]}')

print(f'the total percision is: {perc2}')

cluster0 percision: 2.024205448756227e-06
cluster1 percision: 0.0012023780365611988
cluster2 percision: 0.00012145232692537362
cluster3 percision: 0.10681124891451982
cluster4 percision: 0.00011335550513034871
cluster5 percision: 0.00165782426253135
cluster6 percision: 0.0009655459990567202
cluster7 percision: 0.2169968483121163
cluster8 percision: 3.0363081731343407e-05
cluster9 percision: 0.006803354513269679
cluster10 percision: 0.002050520119590058
cluster11 percision: 0.004416816289186087
cluster12 percision: 0.018806892824394104
cluster13 percision: 3.845990352636831e-05
cluster14 percision: 0.4615532538090486
the total percision is: 0.8215703381030361


In [120]:
c_rec2, rec2 = recall(train_result2, 15)

In [121]:
for i in range(15):
  print(f'cluster{i} recall: {c_rec2[i]}')

print(f'the total recall is: {rec2}')

cluster0 recall: 0.0009615384615384616
cluster1 recall: 0.0061062110651945965
cluster2 recall: 0.058823529411764705
cluster3 recall: 0.18792335909398483
cluster4 recall: 0.0005756697300520159
cluster5 recall: 0.008419169802010733
cluster6 recall: 0.004903472522050207
cluster7 recall: 1.0
cluster8 recall: 0.75
cluster9 recall: 0.03455046361972902
cluster10 recall: 0.010413454223976644
cluster11 recall: 0.9904675442578302
cluster12 recall: 0.09550977610559427
cluster13 recall: 0.00019531651555336252
cluster14 recall: 0.8120552726236689
the total recall is: 0.7928001900515536


In [122]:
F1_score(c_perc2, c_rec2, 15)

0.07622394879576687

In [123]:
Conditional_Entropy(train_result2, 15)

0.4036096116383191

### test model

In [124]:
test_clustered2= KNN_model_test(x_test, centroids2, 15)

In [125]:
test_clustered2['target'] = y_test
test_result2 = test_clustered2[['target', 'cluster']]

In [176]:
values = np.sort(test_result2['cluster'].unique())

for i in values: 
  print(f"cluster number {i} is {test_result2[test_result2['cluster'] == i].value_counts().idxmax()[0]}")

cluster number 1 is 16
cluster number 2 is 16
cluster number 3 is 27
cluster number 4 is 16
cluster number 5 is 16
cluster number 6 is 16
cluster number 7 is 14
cluster number 8 is 16
cluster number 9 is 21
cluster number 10 is 16
cluster number 11 is 1
cluster number 12 is 16
cluster number 13 is 16
cluster number 14 is 27


In [127]:
c_perc2, perc2 = percision(test_result2, 15)


In [128]:
for i in range(15):
  print(f'cluster{i} percision: {c_perc2[i]}')

print(f'the total percision is: {perc2}')

cluster0 percision: 0
cluster1 percision: 0.000704118935012073
cluster2 percision: 1.2860619817572108e-05
cluster3 percision: 0.1763255280088224
cluster4 percision: 8.680918376861173e-05
cluster5 percision: 0.0016107926321509065
cluster6 percision: 3.858185945271633e-05
cluster7 percision: 0.18648220250974998
cluster8 percision: 3.215154954393027e-06
cluster9 percision: 0.0016300835618772646
cluster10 percision: 0.0012796316718484248
cluster11 percision: 0.00346272188588129
cluster12 percision: 0.020959595147688144
cluster13 percision: 3.858185945271633e-05
cluster14 percision: 0.35122031206293985
the total percision is: 0.7438550350934163


In [129]:
c_rec2, rec2 = recall(test_result2, 15)

In [130]:
for i in range(15):
  print(f'cluster{i} recall: {c_rec2[i]}')

print(f'the total recall is: {rec2}')

cluster0 recall: 0
cluster1 recall: 0.00361439817794722
cluster2 recall: 6.601640507666154e-05
cluster3 recall: 0.3342169893534685
cluster4 recall: 0.00044561073426746544
cluster5 recall: 0.00826855473585186
cluster6 recall: 0.00019804921522998466
cluster7 recall: 1.0
cluster8 recall: 1.6504101269165386e-05
cluster9 recall: 0.6679841897233202
cluster10 recall: 0.0065686323051278245
cluster11 recall: 0.9808743169398907
cluster12 recall: 0.10759023617368917
cluster13 recall: 0.00019804921522998466
cluster14 recall: 0.6657220688520394
the total recall is: 0.7322995745747352


In [131]:
F1_score(c_perc2, c_rec2, 15)

0.07043976925152902

In [132]:
Conditional_Entropy(test_result2, 15)

0.6753273648202184

## K = 23

In [133]:
x_train = df.drop(columns = ['target'])
y_train = df['target']
x_test = test.drop(columns = ['target'])
y_test = test['target']

### Train model

In [134]:
clustered3, centroids3 = KNN_model_train(x_train, 23, 0.1, 30)

l = 0, Error = 265507.37023355765
l = 1, Error = 9090340.030203752
l = 2, Error = 7248073.205862684
l = 3, Error = 676770723.3015882
l = 4, Error = 2912968.0664596995
l = 5, Error = 1381883.315194855
l = 6, Error = 1051732.8665031139
l = 7, Error = 797284.929740488
l = 8, Error = 211267.32078933262
l = 9, Error = 94079.13216145016
l = 10, Error = 230088.2022065172
l = 11, Error = 260824.73262650022
l = 12, Error = 318320.07029014005
l = 13, Error = 221556.3798228289
l = 14, Error = 90144.39345401693
l = 15, Error = 104622.19560262292
l = 16, Error = 74568.95118326641
l = 17, Error = 64528.921032612765
l = 18, Error = 89274.75489349461
l = 19, Error = 78751.3152892204
l = 20, Error = 46866.42088631915
l = 21, Error = 28174.345290952147
l = 22, Error = 29574.986837442346
l = 23, Error = 5973.58534213041
l = 24, Error = 30750.18344035303
l = 25, Error = 10046.978259674332
l = 26, Error = 5045.404045703749
l = 27, Error = 10572.38819722559
l = 28, Error = 23919.493380402397
l = 29, Error =

In [135]:
clustered3['target'] = y_train
train_result3 = clustered3[['target', 'cluster']]

In [136]:
train_result3

Unnamed: 0,target,cluster
0,11,18
1,11,13
2,11,10
3,11,10
4,11,10
...,...,...
494016,11,10
494017,11,10
494018,11,13
494019,11,13


In [137]:
c_perc3, perc3 = percision(train_result3, 23)


In [138]:
for i in range(23):
  print(f'cluster{i} percision: {c_perc3[i]}')

print(f'the total percision is: {perc3}')

cluster0 percision: 0.00693087945654132
cluster1 percision: 0.4611828242119262
cluster2 percision: 0.004414792083737331
cluster3 percision: 0.014130978237767221
cluster4 percision: 0.004637454683100516
cluster5 percision: 2.024205448756227e-06
cluster6 percision: 0.002647660726973145
cluster7 percision: 3.0363081731343407e-05
cluster8 percision: 3.845990352636831e-05
cluster9 percision: 0.0006902540580258734
cluster10 percision: 0.041226992374818076
cluster11 percision: 0.016418330394861756
cluster12 percision: 0.002068737968628864
cluster13 percision: 0.06428876505249778
cluster14 percision: 0.0009614975881592077
cluster15 percision: 0.00012145232692537362
cluster16 percision: 0.023877527473528455
cluster17 percision: 0.0005566564984079625
cluster18 percision: 0.01846682630900306
cluster19 percision: 0.06879262217598037
cluster20 percision: 0.21699684831211627
cluster21 percision: 0.006789185075128386
cluster22 percision: 7.894401250149285e-05
the total percision is: 0.955350076211335

In [139]:
c_rec3, rec3 = recall(train_result3, 23)

In [140]:
for i in range(23):
  print(f'cluster{i} recall: {c_rec3[i]}')

print(f'the total recall is: {rec3}')

cluster0 recall: 0.035198092066037544
cluster1 recall: 0.8114035400121087
cluster2 recall: 0.9900136177939174
cluster3 recall: 0.024861996509847217
cluster4 recall: 0.023551059849092292
cluster5 recall: 0.0009615384615384616
cluster6 recall: 0.8231592196349906
cluster7 recall: 0.75
cluster8 recall: 0.00019531651555336252
cluster9 recall: 0.003505417463352454
cluster10 recall: 0.20936902485659656
cluster11 recall: 0.08337959250806966
cluster12 recall: 0.01050597257344929
cluster13 recall: 0.32648697547235755
cluster14 recall: 0.004882912888834063
cluster15 recall: 0.058823529411764705
cluster16 recall: 0.04201004309270273
cluster17 recall: 0.002826949567219721
cluster18 recall: 0.09378276691543823
cluster19 recall: 0.12103351258947968
cluster20 recall: 1.0
cluster21 recall: 0.03447850490347252
cluster22 recall: 0.00040091284771479676
the total recall is: 0.6834430831354763


In [141]:
F1_score(c_perc3, c_rec3, 23)

0.05947413505085552

In [142]:
Conditional_Entropy(train_result3, 23)

0.16435349391916781

### test model

In [143]:
test_clustered3 = KNN_model_test(x_test, centroids3, 23)

In [144]:
test_clustered3['target'] = y_test
test_result3 = test_clustered3[['target', 'cluster']]

In [178]:
values = np.sort(test_result3['cluster'].unique())

for i in values: 
  print(f"cluster number {i} is {test_result3[test_result3['cluster'] == i].value_counts().idxmax()[0]}")

cluster number 0 is 16
cluster number 1 is 27
cluster number 2 is 1
cluster number 3 is 16
cluster number 4 is 10
cluster number 6 is 25
cluster number 7 is 16
cluster number 8 is 16
cluster number 9 is 16
cluster number 10 is 16
cluster number 11 is 27
cluster number 12 is 16
cluster number 13 is 16
cluster number 14 is 16
cluster number 15 is 16
cluster number 16 is 27
cluster number 17 is 16
cluster number 18 is 16
cluster number 19 is 27
cluster number 20 is 14
cluster number 21 is 21
cluster number 22 is 16


In [146]:
c_perc3, perc3 = percision(test_result3, 23)


In [147]:
for i in range(23):
  print(f'cluster{i} percision: {c_perc3[i]}')

print(f'the total percision is: {perc3}')

cluster0 percision: 0.006584637346596919
cluster1 percision: 0.24653165159294851
cluster2 percision: 0.0034530764210181104
cluster3 percision: 4.8227324315895405e-05
cluster4 percision: 0.016075774771965135
cluster5 percision: 0
cluster6 percision: 0.004067171017307179
cluster7 percision: 3.215154954393027e-06
cluster8 percision: 3.858185945271633e-05
cluster9 percision: 0.0005112096377484913
cluster10 percision: 0.04087426493519855
cluster11 percision: 0.10468866046999134
cluster12 percision: 0.0010449253601777337
cluster13 percision: 0.05917171178064926
cluster14 percision: 3.858185945271633e-05
cluster15 percision: 1.2860619817572108e-05
cluster16 percision: 0.0002668578612146212
cluster17 percision: 0.00036009735489201907
cluster18 percision: 0.0200111244361422
cluster19 percision: 0.17601687313320066
cluster20 percision: 0.18648220250974998
cluster21 percision: 0.0016300835618772646
cluster22 percision: 5.144247927028843e-05
the total percision is: 0.8679632314879415


In [148]:
c_rec3, rec3 = recall(test_result3, 23)

In [149]:
for i in range(23):
  print(f'cluster{i} recall: {c_rec3[i]}')

print(f'the total recall is: {rec3}')

cluster0 recall: 0.03380039939925071
cluster1 recall: 0.46728949180637575
cluster2 recall: 0.9781420765027322
cluster3 recall: 0.00024756151903748084
cluster4 recall: 1.0
cluster5 recall: 0
cluster6 recall: 0.7746478873239436
cluster7 recall: 1.6504101269165386e-05
cluster8 recall: 0.00019804921522998466
cluster9 recall: 0.0026241521017972965
cluster10 recall: 0.20981663943489956
cluster11 recall: 0.1984325770456637
cluster12 recall: 0.005363832912478751
cluster13 recall: 0.3037414797577198
cluster14 recall: 0.00019804921522998466
cluster15 recall: 6.601640507666154e-05
cluster16 recall: 0.0005058168942842691
cluster17 recall: 0.0018484593421465233
cluster18 recall: 0.10272152629928537
cluster19 recall: 0.3336319481263445
cluster20 recall: 1.0
cluster21 recall: 0.6679841897233202
cluster22 recall: 0.0002640656203066462
the total recall is: 0.5560789208082779


In [150]:
F1_score(c_perc3, c_rec3, 23)

0.0552335475663223

In [151]:
Conditional_Entropy(test_result3, 23)

0.4483244283847121

## K = 31

In [25]:
x_train = df.drop(columns = ['target'])
y_train = df['target']
x_test = test.drop(columns = ['target'])
y_test = test['target']

### Train model

In [26]:
clustered4, centroids4 = KNN_model_train(x_train, 31, 0.1, 30)

l = 0, Error = 24239.54987235169
l = 1, Error = 371163.12364392454
l = 2, Error = 10592294.692097774
l = 3, Error = 682392653.2297082
l = 4, Error = 2891368.532961162
l = 5, Error = 1197091.7264911644
l = 6, Error = 797284.929740488
l = 7, Error = 211267.32078933262
l = 8, Error = 215787.93738062153
l = 9, Error = 260175.49216074066
l = 10, Error = 337799.9490994972
l = 11, Error = 243513.8077619321
l = 12, Error = 90144.39345401693
l = 13, Error = 104622.19560262292
l = 14, Error = 74568.95118326641
l = 15, Error = 82156.58403656168
l = 16, Error = 86108.2754682526
l = 17, Error = 60857.61856676498
l = 18, Error = 30914.875991250057
l = 19, Error = 31548.366669025963
l = 20, Error = 29574.986837442346
l = 21, Error = 30750.18344035303
l = 22, Error = 10046.978259674332
l = 23, Error = 6871.100872204077
l = 24, Error = 14674.516746646113
l = 25, Error = 57113.880379903436
l = 26, Error = 44758.73055965722
l = 27, Error = 49349.07292732391
l = 28, Error = 57278.04303063824
l = 29, Error

In [27]:
clustered4['target'] = y_train
train_result4 = clustered4[['target', 'cluster']]

In [28]:
train_result4

Unnamed: 0,target,cluster
0,11,6
1,11,1
2,11,5
3,11,5
4,11,5
...,...,...
494016,11,5
494017,11,5
494018,11,5
494019,11,5


In [29]:
c_perc4, perc4 = percision(train_result4, 31)


In [30]:
for i in range(31):
  print(f'cluster{i} percision: {c_perc4[i]}')

print(f'the total percision is: {perc4}')

cluster0 percision: 0.1067889826545835
cluster1 percision: 0.04638061944735143
cluster2 percision: 0.00020039633942686648
cluster3 percision: 0.017588321144242858
cluster4 percision: 0.004416816289186088
cluster5 percision: 0.037095589053906614
cluster6 percision: 0.023515194698201087
cluster7 percision: 0.023379572933134418
cluster8 percision: 0.01994652049204386
cluster9 percision: 0.0002651709137870657
cluster10 percision: 0.0026112250288955327
cluster11 percision: 0.0010869983259820938
cluster12 percision: 0.026875375743136424
cluster13 percision: 2.024205448756227e-06
cluster14 percision: 3.0363081731343407e-05
cluster15 percision: 0.0039067165160995175
cluster16 percision: 0.04429973624603003
cluster17 percision: 0.024934162717779203
cluster18 percision: 0.014021671143534383
cluster19 percision: 3.441149262885586e-05
cluster20 percision: 0.024624459284119504
cluster21 percision: 0.024306659028664773
cluster22 percision: 0.00012145232692537362
cluster23 percision: 0.00205659273593

In [31]:
c_rec4, rec4 = recall(train_result4, 31)

In [32]:
for i in range(31):
  print(f'cluster{i} recall: {c_rec4[i]}')

print(f'the total recall is: {rec4}')

cluster0 recall: 0.1878841839096834
cluster1 recall: 0.23554143794074714
cluster2 recall: 0.0010177018441990995
cluster3 recall: 0.0893213265075351
cluster4 recall: 0.9904675442578302
cluster5 recall: 0.1883879191595222
cluster6 recall: 0.11942062953596908
cluster7 recall: 0.10774153226182591
cluster8 recall: 0.09192078432104178
cluster9 recall: 0.0013466559756573943
cluster10 recall: 0.8118313404657017
cluster11 recall: 0.005520261518534509
cluster12 recall: 0.12385145660954655
cluster13 recall: 0.0009615384615384616
cluster14 recall: 0.75
cluster15 recall: 0.019840046053578406
cluster16 recall: 0.20414921502597924
cluster17 recall: 0.11490564453689797
cluster18 recall: 0.07120828964411274
cluster19 recall: 0.0001747568823372191
cluster20 recall: 0.11347841904459846
cluster21 recall: 0.12344003782972512
cluster22 recall: 0.058823529411764705
cluster23 recall: 0.01044429367380086
cluster24 recall: 0.0002672752318098645
cluster25 recall: 0.24329996921670508
cluster26 recall: 0.010033101

In [33]:
F1_score(c_perc4, c_rec4, 31)

0.04514460865795432

In [34]:
Conditional_Entropy(train_result4, 31)

0.11657062548346328

### test model

In [35]:
test_clustered4 = KNN_model_test(x_test, centroids4, 31)

In [36]:
test_clustered4['target'] = y_test
test_result4 = test_clustered4[['target', 'cluster']]

In [179]:
values = np.sort(test_result4['cluster'].unique())

for i in values: 
  print(f"cluster number {i} is {test_result4[test_result4['cluster'] == i].value_counts().idxmax()[0]}")

cluster number 0 is 27
cluster number 1 is 16
cluster number 2 is 16
cluster number 3 is 16
cluster number 4 is 1
cluster number 5 is 16
cluster number 6 is 16
cluster number 7 is 14
cluster number 8 is 14
cluster number 9 is 16
cluster number 10 is 25
cluster number 11 is 16
cluster number 12 is 14
cluster number 14 is 16
cluster number 15 is 16
cluster number 16 is 14
cluster number 17 is 14
cluster number 18 is 16
cluster number 19 is 16
cluster number 20 is 14
cluster number 21 is 16
cluster number 22 is 16
cluster number 23 is 16
cluster number 24 is 16
cluster number 25 is 14
cluster number 26 is 21
cluster number 27 is 16
cluster number 28 is 16
cluster number 29 is 16
cluster number 30 is 27


In [38]:
c_perc4, perc4 = percision(test_result4, 31)


In [39]:
for i in range(31):
  print(f'cluster{i} percision: {c_perc4[i]}')

print(f'the total percision is: {perc4}')

cluster0 percision: 0.17623550367009938
cluster1 percision: 0.04525009082812746
cluster2 percision: 0.00013825166303890017
cluster3 percision: 0.005568648381008723
cluster4 percision: 0.00346272188588129
cluster5 percision: 0.037932398151928934
cluster6 percision: 0.02362174344992557
cluster7 percision: 0.017104624357370903
cluster8 percision: 0.0130953261292428
cluster9 percision: 2.250608468075119e-05
cluster10 percision: 0.003977146678584174
cluster11 percision: 2.8936394589537242e-05
cluster12 percision: 0.01811096785809592
cluster13 percision: 0
cluster14 percision: 3.215154954393027e-06
cluster15 percision: 0.003990007298401746
cluster16 percision: 0.024007562044452733
cluster17 percision: 0.017516164191533214
cluster18 percision: 0.014751130930755207
cluster19 percision: 3.53667044983233e-05
cluster20 percision: 0.01970246956052047
cluster21 percision: 0.04944265288865597
cluster22 percision: 1.2860619817572108e-05
cluster23 percision: 0.0012828468268028179
cluster24 percision: 

In [40]:
c_rec4, rec4 = recall(test_result4, 31)

In [41]:
for i in range(31):
  print(f'cluster{i} recall: {c_rec4[i]}')

print(f'the total recall is: {rec4}')

cluster0 recall: 0.3340463523288907
cluster1 recall: 0.23227872126223367
cluster2 recall: 0.0007096763545741116
cluster3 recall: 0.02858510339819445
cluster4 recall: 0.9808743169398907
cluster5 recall: 0.19471538677361325
cluster6 recall: 0.12125563202455811
cluster7 recall: 0.09172255650764642
cluster8 recall: 0.0702229271909105
cluster9 recall: 0.00011552870888415771
cluster10 recall: 0.7575015309246785
cluster11 recall: 0.00014853691142248848
cluster12 recall: 0.09711901518939328
cluster13 recall: 0
cluster14 recall: 1.6504101269165386e-05
cluster15 recall: 0.020481589675034245
cluster16 recall: 0.12873915966966087
cluster17 recall: 0.09392941501008603
cluster18 recall: 0.0757208166229308
cluster19 recall: 0.00018154511396081927
cluster20 recall: 0.10565335080429648
cluster21 recall: 0.2538000693172253
cluster22 recall: 6.601640507666154e-05
cluster23 recall: 0.00658513640639699
cluster24 recall: 0.0001320328101533231
cluster25 recall: 0.4106136101101705
cluster26 recall: 0.66798418

In [42]:
F1_score(c_perc4, c_rec4, 31)

0.04301150931950687

In [44]:
Conditional_Entropy(test_result4, 31)

0.3023129711592183

## K = 45

In [78]:
x_train = df.drop(columns = ['target'])
y_train = df['target']
x_test = test.drop(columns = ['target'])
y_test = test['target']

### Train model

In [79]:
clustered, centroids = KNN_model_train(x_train, 45, 0.1, 30)

l = 0, Error = 26862.042354679215
l = 1, Error = 427103.1373660644
l = 2, Error = 10528047.446459295
l = 3, Error = 682392653.2297082
l = 4, Error = 2860931.2750259945
l = 5, Error = 1247037.0092795924
l = 6, Error = 797284.929740488
l = 7, Error = 211267.32078933262
l = 8, Error = 183537.8997128994
l = 9, Error = 246554.34077918096
l = 10, Error = 335814.65475578303
l = 11, Error = 234710.05383794042
l = 12, Error = 150165.6397297771
l = 13, Error = 123337.00585108857
l = 14, Error = 74568.95118326641
l = 15, Error = 68418.63520671765
l = 16, Error = 84207.4539617598
l = 17, Error = 77287.99655443653
l = 18, Error = 44880.51664050117
l = 19, Error = 28174.345290952147
l = 20, Error = 29574.986837442346
l = 21, Error = 5973.58534213041
l = 22, Error = 30750.18344035303
l = 23, Error = 10189.686932362596
l = 24, Error = 10572.38819722559
l = 25, Error = 23919.493380402397
l = 26, Error = 59363.03162967584
l = 27, Error = 32816.87549264072
l = 28, Error = 81832.80792766697
l = 29, Error 

In [80]:
clustered['target'] = y_train
train_result = clustered[['target', 'cluster']]

In [81]:
train_result

Unnamed: 0,target,cluster
0,11,18
1,11,23
2,11,32
3,11,32
4,11,32
...,...,...
494016,11,32
494017,11,32
494018,11,38
494019,11,38


In [82]:
c_perc, perc = percision(train_result, 45)


In [50]:
for i in range(45):
  print(f'cluster{i} percision: {c_perc[i]}')

print(f'the total percision is: {perc}')

cluster0 percision: 0.00011942812147661739
cluster1 percision: 0.006076664757166193
cluster2 percision: 0.02049305596320804
cluster3 percision: 0.0033561326340378244
cluster4 percision: 0.00011335550513034871
cluster5 percision: 0.000240880448401991
cluster6 percision: 0.011533922647012982
cluster7 percision: 0.008270903463617944
cluster8 percision: 0.0026699269869094634
cluster9 percision: 0.026612229034798117
cluster10 percision: 3.0363081731343407e-05
cluster11 percision: 0.0012691768163701543
cluster12 percision: 0.023420057042109543
cluster13 percision: 0.008406525228684611
cluster14 percision: 0.0013076367198965226
cluster15 percision: 0.00010930709423283626
cluster16 percision: 0.0039269585705870805
cluster17 percision: 0.0008198032067462719
cluster18 percision: 0.0015282751138109513
cluster19 percision: 0.002613249234344289
cluster20 percision: 3.441149262885586e-05
cluster21 percision: 0.013817226393210006
cluster22 percision: 0.031247659512449878
cluster23 percision: 2.024205

In [51]:
c_rec, rec = recall(train_result, 45)

In [52]:
for i in range(45):
  print(f'cluster{i} recall: {c_rec[i]}')

print(f'the total recall is: {rec}')

cluster0 recall: 0.00021012144307133444
cluster1 recall: 0.03086000945743128
cluster2 recall: 0.10407286334011802
cluster3 recall: 0.005904768688343602
cluster4 recall: 0.05384615384615385
cluster5 recall: 0.0012232981763605336
cluster6 recall: 0.020292745468143452
cluster7 recall: 0.04200333066058101
cluster8 recall: 0.013559078106046589
cluster9 recall: 0.1351487489463188
cluster10 recall: 0.75
cluster11 recall: 0.006445445013260964
cluster12 recall: 0.04120517112432779
cluster13 recall: 0.04269207837332182
cluster14 recall: 0.006640761528814326
cluster15 recall: 0.00019231454111613661
cluster16 recall: 0.019942844219659122
cluster17 recall: 0.004163325726269043
cluster18 recall: 0.007761261539094143
cluster19 recall: 0.8124606670862178
cluster20 recall: 0.0001747568823372191
cluster21 recall: 0.07017002816669751
cluster22 recall: 0.14400052238318672
cluster23 recall: 0.0009615384615384616
cluster24 recall: 0.1204672531073044
cluster25 recall: 0.0011040279212222658
cluster26 recall: 

In [83]:
F1_score(c_perc, c_rec, 45)

0.013649659796455636

In [84]:
Conditional_Entropy(train_result, 45)

0.052186198860254066

### test model

In [85]:
test_clustered= KNN_model_test(x_test, centroids, 45)

In [86]:
test_clustered['target'] = y_test
test_result = test_clustered[['target', 'cluster']]

In [180]:
values = np.sort(test_result['cluster'].unique())

for i in values: 
  print(f"cluster number {i} is {test_result[test_result['cluster'] == i].value_counts().idxmax()[0]}")

cluster number 0 is 16
cluster number 1 is 14
cluster number 2 is 16
cluster number 3 is 21
cluster number 4 is 1
cluster number 5 is 16
cluster number 6 is 16
cluster number 7 is 14
cluster number 8 is 14
cluster number 9 is 16
cluster number 10 is 25
cluster number 11 is 16
cluster number 12 is 14
cluster number 14 is 16
cluster number 15 is 16
cluster number 16 is 14
cluster number 17 is 14
cluster number 18 is 16
cluster number 19 is 16
cluster number 20 is 14
cluster number 21 is 14
cluster number 22 is 14
cluster number 23 is 16
cluster number 24 is 16
cluster number 25 is 16
cluster number 26 is 16
cluster number 27 is 27
cluster number 28 is 16
cluster number 29 is 16
cluster number 30 is 16
cluster number 31 is 14
cluster number 32 is 16
cluster number 33 is 27
cluster number 34 is 16
cluster number 35 is 16
cluster number 36 is 27
cluster number 37 is 16
cluster number 38 is 16
cluster number 39 is 16
cluster number 40 is 16
cluster number 41 is 16
cluster number 42 is 10
clu

In [88]:
c_perc, perc = percision(test_result, 45)


In [89]:
for i in range(45):
  print(f'cluster{i} percision: {c_perc[i]}')

print(f'the total percision is: {perc}')

cluster0 percision: 1.2860619817572108e-05
cluster1 percision: 0.017850540306790084
cluster2 percision: 2.5721239635144213e-05
cluster3 percision: 0.0016300835618772648
cluster4 percision: 0.0034209248714741802
cluster5 percision: 0.04491892986782498
cluster6 percision: 0.0027553877959148243
cluster7 percision: 0.017101409202416512
cluster8 percision: 0.013696560105714295
cluster9 percision: 2.8936394589537245e-05
cluster10 percision: 0.003977146678584174
cluster11 percision: 0.002202381143759223
cluster12 percision: 0.016480884296218655
cluster13 percision: 0
cluster14 percision: 3.215154954393027e-06
cluster15 percision: 0.00016075774771965134
cluster16 percision: 0.007009037800576799
cluster17 percision: 0.016795969481749173
cluster18 percision: 0.02035193086130786
cluster19 percision: 3.53667044983233e-05
cluster20 percision: 0.016230102209776002
cluster21 percision: 0.022371048172666684
cluster22 percision: 0.02362174344992557
cluster23 percision: 0.030479668967645892
cluster24 pe

In [90]:
c_rec, rec = recall(test_result, 45)

In [91]:
for i in range(45):
  print(f'cluster{i} recall: {c_rec[i]}')

print(f'the total recall is: {rec}')

cluster0 recall: 6.601640507666154e-05
cluster1 recall: 0.09572248754331822
cluster2 recall: 0.0001320328101533231
cluster3 recall: 0.6679841897233202
cluster4 recall: 0.9690346083788707
cluster5 recall: 0.23057879883150964
cluster6 recall: 0.014144014787674738
cluster7 recall: 0.09170531542559611
cluster8 recall: 0.07344700953431837
cluster9 recall: 0.00014853691142248848
cluster10 recall: 0.7575015309246785
cluster11 recall: 0.011305309369378291
cluster12 recall: 0.08837778658988638
cluster13 recall: 0
cluster14 recall: 1.6504101269165386e-05
cluster15 recall: 0.0008252050634582694
cluster16 recall: 0.037585558869674664
cluster17 recall: 0.0900674126308167
cluster18 recall: 0.10447096103381691
cluster19 recall: 0.00018154511396081927
cluster20 recall: 0.08703298218996224
cluster21 recall: 0.11996344890605334
cluster22 recall: 0.12667022982362372
cluster23 recall: 0.15645888003168787
cluster24 recall: 0.00018154511396081927
cluster25 recall: 0.00011552870888415771
cluster26 recall: 0.

In [93]:
F1_score(c_perc, c_rec, 45)

0.030211586097324383

In [94]:
Conditional_Entropy(test_result, 45)

0.2256701340879011