<a href="https://colab.research.google.com/github/MHoseinHoushmand/Clustering_by_SLFA/blob/main/Clustering_by_SLFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import numpy as np
import pdb
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import norm
import operator
import random

In [27]:
categories = [  #Select 4 categories from fetch_20newsgroups dataset
    "alt.atheism",
    "comp.graphics",
    "sci.space",
    "rec.sport.hockey",
]

dataset = fetch_20newsgroups( #Preprocessing before using dataset
    remove=("headers", "footers", "quotes"),
    subset="all",
    categories=categories,
    shuffle=True,
    random_state=42,
)
labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = unique_labels.shape[0]
print(f"{len(dataset.data)} documents - {true_k} categories")
print(labels)

3758 documents - 4 categories
[3 2 0 ... 3 1 2]


In [28]:
# Vectorize all document as their term frequency(tfidf score)
def docs_as_tfidf(docs):
  vectorizer = TfidfVectorizer(
     max_df=0.5, #Removing terms that are used in more than 50% of articles
     min_df=5,   #Removing terms that are not used in less than 10 of articles
     stop_words="english",
     #  max_features=1000,
  )
  docs_vector = vectorizer.fit_transform(docs)
  return docs_vector.toarray()

In [29]:
population_size = 400 # Frogs number
memplex_num = 20 #define as m
memplex_size = 20 #define as n
max_iteration = 100 #Total Iteration
memplex_iteration = 10 #Iteration As local search
cluster_size = 4
docs = dataset.data
docs_vector = docs_as_tfidf(docs)
print(list(docs_vector[0]))

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [30]:
def cosin_sim(a,b):
   return cosine_similarity([a], [b])[0][0]

In [31]:
#Calculate sum of squared error(SSE) as similarity of each documents with the cluster mean in document
def SSE(cluster,doc_mean):
  size = len(cluster)
  sse=0
  for doc in cluster:
    sse += cosin_sim(doc,doc_mean)**2
  sse = sse/size
  return sse

In [32]:
#Calculate similarity between clusters
def BC(doc_means):
   BC=0
   size = len(doc_means)
   for i in range(size):
      for j in range(i+1,size):
          BC += cosin_sim(doc_means[i],doc_means[j])**2
   return BC

In [33]:
#Calculate similarity within clusters
def WC(clusters):
    WC = 0
    for cluster in clusters:
        doc_mean = np.average(cluster, axis=0)
        WC += SSE(cluster,doc_mean)
    return WC

In [34]:
def build_clusters(answer,docs_vector,cluster_size):
   clusters = []
   for i in range(cluster_size):
       clusters.append([])
   for j in range(len(answer)):
       if -1 < answer[j]:
        clusters[answer[j]].append(docs_vector[j])
   return clusters

In [35]:
def fitness(answer,docs_vector,size):
   doc_means = []
 #  pdb.runcall(build_clusters,answer,docs_vector,clusters_size)
   clusters = build_clusters(answer,docs_vector,size)
   for i in range(size):
       doc_means.append(
          np.average(clusters[i], axis=0)
       )
   wc = WC(clusters)
   bc = BC(doc_means)
   fitness = wc/bc
   return fitness

In [36]:
def cross_over(answer_a,answer_b):
    frog_size = len(answer_a)
    points = sorted(np.random.choice(np.arange(0,frog_size), size=2, replace=False))
    child1 = answer_a[:points[0]] + answer_b[points[0]:points[1]] + answer_a[points[1]:]
    child2 = answer_b[:points[0]] + answer_a[points[0]:points[1]] + answer_b[points[1]:]
 #   pdb.set_trace()
    fitness1 =  fitness(child1 ,docs_vector,cluster_size)
    fitness2 = fitness(child2 ,docs_vector,cluster_size)
    if fitness1 > fitness2:
       return child1 , fitness1
    else :
       return child2 , fitness2

In [37]:
def best_and_worst(answers):
     best =  max(answers, key=answers.get)
     worst = min(answers, key=answers.get)
     return tuple(best) , tuple(worst)


In [38]:
def global_best(memplexes):
     local_bests = {}
     for memplex in memplexes:
         local_best =  max(memplex, key=memplex.get)
         local_bests[local_best]= memplex[local_best]
     global_best = max(local_bests, key=local_bests.get)
     return global_best, local_bests[global_best]


In [39]:
def keys_to_remove(keys , dict):
   for k in keys:
      if k in dict:
          dict.pop(k)
   return dict

In [40]:
def mutation(global_best,clusters_size):
    new_ans = list(global_best)
    size = int(len(global_best)/4)
    indexes = np.random.choice(np.arange(0,len(global_best)), size=size, replace=False)
    values= [random.randint(0, 3) for _ in range(size)]
    for i in range(size):
      new_ans[indexes[i]] = values[i]
    return tuple(new_ans)

In [41]:
def Create_memplexes(population,memplex_num):
     population = dict( sorted(population.items(), key=operator.itemgetter(1), reverse=True))
     memplexes = []
     keys = list(population.keys())
     population_size = len(population)
     for i in range(memplex_num):
         memplexes.append({})
     for i in range(population_size):
         memplexes[i % memplex_num][keys[i]] = population[keys[i]]
     return memplexes

In [42]:
def create_submemplex(memplex,memplex_size, submemplex_size):
    sub_memplex = {}
    prob_list = []
    keys = []
    for i in range(memplex_size):
       for j in range(2*(memplex_size-i)):
          prob_list.append(i)
    k=0
    while(k!=submemplex_size):
       index = random.choice(prob_list)
       key = list(memplex.keys())[index]
       if key not in keys:
           sub_memplex[key] = memplex[key]
           keys.append(key)
           k+=1
    return sub_memplex

In [43]:
def shufeling(memplexes):
    output = {}
    for memplex in memplexes:
        output.update(memplex)
    return output

In [44]:
def is_in_population(answer,memplexes,submemplex):
    if answer in submemplex:
       return True
    for memplexe in memplexes:
       if (answer in memplexe):
          return True
    else:
       return False

In [45]:
import operator
def frog_leaping_search(docs_vector):
             answers=np.random.randint(0, cluster_size, size=(population_size , len(docs)))
             population = {}
             i=0
             for answer in answers:
              #  pdb.runcall(fitness,answer,docs_vector,cluster_size)
                i+=1
                population[tuple(answer)] = fitness(answer,docs_vector,cluster_size)
                print(i,population[tuple(answer)])

             for i in range(max_iteration):
            #    pdb.runcall(Create_memplexes,population, memplex_num)
                memplexes = Create_memplexes(population, memplex_num)
                population.clear()
                if (i == 1):
                    pdb.set_trace()
                for j in range(memplex_num):
                    print(i,j,len(memplexes[j]))
                    #pdb.runcall(create_submemplex,memplexes[j],memplex_size, 5)
                    sub_memplex = create_submemplex(memplexes[j],memplex_size, 5)
                    memplexes[j] =  keys_to_remove(sub_memplex.keys(),memplexes[j])
                    for k in range(memplex_iteration):
                       # pdb.runcall(best_and_worst,sub_memplex)

                   #     for m in sub_memplex:\n",
                    #        print(list(m))
                        ans_best, ans_worst = best_and_worst(sub_memplex)
                        ans_out , fitness_out = cross_over(ans_best,ans_worst)
                        while(is_in_population(ans_out,memplexes,sub_memplex)== True):
                            ans_best, ans_worst = best_and_worst(sub_memplex)
                            ans_out , fitness_out = cross_over(ans_best,ans_worst)


                        if (ans_out in sub_memplex):
                            print("errrrrrrrrrrorrrrrrrA",k)
                         #   pdb.set_trace()

                      #pdb.runcall(best_and_worst,sub_memplex)
                       # print("############")


                        if (sub_memplex[ans_worst]<fitness_out):
                            del sub_memplex[ans_worst]
                            sub_memplex[ans_out] = fitness_out
                            #sec C
                        else:
                           # pdb.runcall(global_best,memplexes)
                            g_best, g_value = global_best(memplexes)
                            ans_out , fitness_out = cross_over(g_best,ans_worst)
                            while(is_in_population(ans_out,memplexes,sub_memplex)== False):
                                ans_best, ans_worst = best_and_worst(sub_memplex)
                                ans_out , fitness_out = cross_over(ans_out,ans_worst)
                         #   if (ans_out in sub_memplex):
                          #     print("errrrrrrrrrrorrrrrrrB")
                      #         pdb.set_trace()
                            #sec D
                            ###############################################
                     #       if len(sub_memplex)< 5:
                      #            print("errrrrrrrrrrrorrrrrrrrrrrrrD")
                       #           pdb.set_trace()
                            ###############################################
                            if ((sub_memplex[ans_worst] < fitness_out) and ans_out not in sub_memplex):
                                del sub_memplex[ans_worst]
                                sub_memplex[ans_out] = fitness_out
                                #sec E
                                ###############################################
                    #            if len(sub_memplex)< 5:
                     #                 print("errrrrrrrrrrrorrrrrrrrrrrrrE")
                      #                pdb.set_trace()
                            ###############################################
                            else:
                           #     print(\"#########\")
                            #    for m in sub_memplex:
                             #        print(list(m))

                             #   pdb.runcall(mutation,g_best,cluster_size)
                                ans_out = mutation(g_best,cluster_size)

                                if (ans_out in sub_memplex):
                                   print("errrrrrrrrrrorrrrrrrC")
                                #   pdb.set_trace()
                                del sub_memplex[ans_worst]
                                fitness_out = fitness(ans_out,docs_vector,cluster_size)
                                sub_memplex[ans_out] = fitness_out
                                #sec F
                                ###############################################
                       #         if len(sub_memplex)< 5:
                        #              print("errrrrrrrrrrrorrrrrrrrrrrrrF")
                         #             pdb.set_trace()
                               ###############################################
                #     pdb.runcall(join_dicts,memplexes[j],sub_memplex)
                    memplexes[j].update(sub_memplex)
                    print(i,j,len(memplexes[j]))
                g_best, g_value = global_best(memplexes)
               # pdb.runcall(show_result,g_best)
                print(g_best)
                print(g_value)
               # pdb.set_trace()
                population = shufeling(memplexes)
                print(len(population))
             return g_best, g_value, population

In [46]:
g_best, g_value,population = frog_leaping_search(docs_vector)
true = 0
size = len(dataset.data)
print(labels)
print(list(g_best))

1 0.012207604934240819
2 0.012286196468881502
3 0.012167288715556435
4 0.012140092149264519
5 0.012133623104650196
6 0.012253496405427668
7 0.012185031665487723
8 0.01225650210979554
9 0.01212277757077452
10 0.012145326930270055
11 0.012227749199952332
12 0.012185243864489605
13 0.012146042153795323
14 0.01226944553382099
15 0.012149675179194032
16 0.012216949928180842
17 0.012253724547831961
18 0.012175262542414486
19 0.01226469847657004
20 0.012220037837408278
21 0.012197002863566593
22 0.012173445447164508
23 0.012112655980934297
24 0.01216127543175424
25 0.012152394846931978
26 0.012215439972229583
27 0.0122589739423736
28 0.012147775477071448
29 0.012150760456693421
30 0.012227198108788606
31 0.012258968527293657
32 0.012177193275446165
33 0.012122757238643266
34 0.01218311865088088
35 0.012096989144151209
36 0.01223850277813342
37 0.012247330906802455
38 0.012217269019964167
39 0.012153317532963222
40 0.01220569939261492
41 0.012219003053970761
42 0.01216902866121732
43 0.0121806

KeyboardInterrupt: ignored

In [47]:
print(1 not in [1,3,4,56])

False
