<a href="https://colab.research.google.com/github/MHoseinHoushmand/Clustering_by_SLFA/blob/main/Clustering_by_SLFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import numpy as np

import pdb
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

dataset = fetch_20newsgroups(
    remove=("headers", "footers", "quotes"),
    subset="all",
    categories=categories,
    shuffle=True,
    random_state=42,
)


labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = unique_labels.shape[0]
print(f"{len(dataset.data)} documents - {true_k} categories")

3387 documents - 4 categories


In [6]:

def docs_as_tfidf(docs):
  vectorizer = TfidfVectorizer(
     max_df=0.5,
     min_df=5,
     stop_words="english",
  )

  docs_vector = vectorizer.fit_transform(docs)
  return docs_vector.toarray()

In [7]:
population_size = 120 # Frogs number
memplex_num = 12 #define as m
memplex_size = 10 #define as n
max_iteration = 50
memplex_iteration = 8
docs = dataset.data
docs_vector = docs_as_tfidf(docs)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
def cosin_sim(a,b):
   return cosine_similarity([a], [b])[0][0]

In [9]:
from numpy.linalg import norm
def SSE(cluster,doc_mean):
  size = len(cluster)
  sse=0
  for doc in cluster:
    sse += cosin_sim(doc,doc_mean)**2
  sse = sse/size
  return sse

In [10]:
from numpy.linalg import norm
def BC(doc_means):
   BC=0
   size = len(doc_means)
   for i in range(size):
      for j in range(i+1,size):
          BC += cosin_sim(doc_means[i],doc_means[j])**2
   return BC

In [11]:
def WC(clusters):
    WC = 0
    for cluster in clusters:
        doc_mean = np.average(cluster, axis=0)
        WC += SSE(cluster,doc_mean)
    return WC

In [12]:
def build_clusters(answer,docs_vector,clusters_size):
   clusters = []
   for i in range(clusters_size):
       clusters.append([])
   for j in range(len(answer)):
       if -1 < answer[j]:
        clusters[answer[j]].append(docs_vector[j])
   return clusters

In [13]:
def fitness(answer,docs_vector,clusters_size):
   doc_means = []
 #  pdb.runcall(build_clusters,answer,docs_vector,clusters_size)
   clusters = build_clusters(answer,docs_vector,clusters_size)
   for i in range(clusters_size):
       doc_means.append(
          np.average(clusters[i], axis=0)
       )
   wc = WC(clusters)
   bc = BC(doc_means)
   fitness = wc/bc
   return fitness

In [14]:
import random
def cross_over(answer_a,answer_b):
    size = len(answer_a)
    output = []
    for i in range(size):
       choice = random.choice([0,1])
       if choice == 0:
          output.append(answer_a[i])
       else:
          output.append(answer_b[i])
    return tuple(output)

In [15]:
def best_and_worst(answers):
     best =  max(answers, key=answers.get)
     worst = min(answers, key=answers.get)
     return tuple(best) , tuple(worst)

In [16]:
def global_best(memplexes):
     local_bests = {}
     for memplex in memplexes:
         local_best =  max(memplex, key=memplex.get)
         local_bests[local_best]= memplex[local_best]
     global_best = max(local_bests, key=local_bests.get)
     return global_best, local_bests[global_best]

In [17]:
def keys_to_remove(keys , dict):
   for k in keys:
      if k in dict:
          dict.pop(k)
   return dict

In [18]:
def mutation(global_best,clusters_size):
    new_ans = list(global_best)
    size = int(len(global_best)/4)
    indexes = np.random.choice(np.arange(0,len(global_best)), size=size, replace=False)
    values= [random.randint(-1, 3) for _ in range(size)]
    for i in range(size):
      new_ans[indexes[i]] = values[i]
    return tuple(new_ans)

In [19]:
def Create_memplexes(population,memplex_num):
     memplexes = []
     keys = list(population.keys())
     population_size = len(population)
     for i in range(memplex_num):
         memplexes.append({})
     for i in range(population_size):
         memplexes[i % memplex_num][keys[i]] = population[keys[i]]
     return memplexes

In [20]:
def shufeling(memplexes):
    output = {}
    for memplex in memplexes:
        output.update(memplex)
    return output

In [24]:
import operator
def frog_leaping_search(docs_vector,cluster_size):
     answers=np.random.randint(-1, cluster_size, size=(population_size , len(dataset.data)))
     population = {}
     i=0
     for answer in answers:
       # pdb.runcall(fitness,answer,docs_vector,cluster_size)
        i+=1
        population[tuple(answer)] = fitness(answer,docs_vector,cluster_size)
        print(i,population[tuple(answer)])

     for i in range(max_iteration):
        population = dict( sorted(population.items(), key=operator.itemgetter(1), reverse=True))
   #    pdb.runcall(Create_memplexes,population, memplex_num)
        memplexes = Create_memplexes(population, memplex_num)
        population.clear()
      #  pdb.set_trace()
        for j in range(memplex_num):
            print(i,j,len(memplexes[j]))
            sub_memplex = dict(random.sample(list(memplexes[j].items()),k=5))
            memplexes[j] =  keys_to_remove(sub_memplex.keys(),memplexes[j])
            for k in range(memplex_iteration):
                #pdb.runcall(best_and_worst,sub_memplex)
                ans_best, ans_worst = best_and_worst(sub_memplex)
                ans_out = cross_over(ans_best,ans_worst)
                fitness_out = fitness(ans_out,docs_vector,cluster_size)

              # pdb.runcall(best_and_worst,sub_memplex)
               #print(population[ans_worst],population[ans_out])
                if (sub_memplex[ans_worst]<fitness_out):
                    del sub_memplex[ans_worst]
                    sub_memplex[ans_out] = fitness_out
                else:
                   # pdb.runcall(global_best,memplexes)
                    g_best, g_value = global_best(memplexes)
                    ans_out = cross_over(g_best,ans_worst)
                    fitness_out = fitness(ans_out,docs_vector,cluster_size)
                    if (sub_memplex[ans_worst] < fitness_out):
                        del sub_memplex[ans_worst]
                        sub_memplex[ans_out] = fitness_out
                    else:
                        del sub_memplex[ans_worst]
                     #   pdb.runcall(mutation,g_best,cluster_size)
                        ans_out = mutation(g_best,cluster_size)
                        fitness_out = fitness(ans_out,docs_vector,cluster_size)
                        sub_memplex[ans_out] = fitness_out
        #     pdb.runcall(join_dicts,memplexes[j],sub_memplex)
            memplexes[j].update(sub_memplex)
        g_best, g_value = global_best(memplexes)
        print(g_value)
        population = shufeling(memplexes)
     return g_best, g_value

In [26]:
g_best, g_value = frog_leaping_search(docs_vector,4)
true = 0
size = len(dataset.data)
for i in size:
   if labels[i] == g_best[i]:
      true = true +1
print(true)

1 0.013740152930011915
2 0.014112434424144832
3 0.014032940514688319
4 0.014047418071981198
5 0.014246090940651858
6 0.013937872825238597
7 0.01391743804524892
8 0.013868270777216304
9 0.014030279601807537
10 0.013862606562598523
11 0.014288091658328611
12 0.01396469887196816
13 0.01397043227520099
14 0.013996137063850594
15 0.01427795885071379
16 0.014051972436403498
17 0.014021721839893454
18 0.014038251404106828
19 0.0138446359742077
20 0.013911099610741695
21 0.014231945232574596
22 0.013890583451439566
23 0.013941571263550449
24 0.01398139735906956
25 0.013886145227611427
26 0.013945181900680805
27 0.013845319948155927
28 0.01388899875129841
29 0.01423933035881896
30 0.014262926633137305
31 0.014126949939349761
32 0.01385053077809274
33 0.013877038818497904
34 0.014133631658684689
35 0.013982544682944444
36 0.014015828497339422
37 0.014087029496031581
38 0.013889359875776743
39 0.013920944473767217
40 0.014071424765643895
41 0.014211552781349267
42 0.01410707398075283
43 0.0138269

TypeError: ignored

In [None]:
uniqueList=[[3,2,1,3],[5,3,2,1],[3,8,5,9],[2,6,4,9]]

print(random.sample(uniqueList, 2))

a= {'s':3,'4':6,'sdgfsdfg':656}
b= max(a.values())
print(b)

In [None]:
import operator
x = {'4':7,'8':2,'99':45}
x = dict( sorted(x.items(), key=operator.itemgetter(1), reverse=True))
print (x)

In [None]:

import numpy as np

# تعیین بازه مقداری

import pdb
beeeeee =46363
zee={'ddd':2352}
for i in range(1):
  # pdb.set_trace()
   array = np.random.choice(np.arange(-1, 10), size=5, replace=False)
   print(array)

ggggga=643
print('nali')

In [None]:


import random


import random

random_numbers = [random.randint(1, 100) for _ in range(10)]
print(random_numbers)
my_dict = {'a': 1.235, 'b': 2.23525235, 'c': 3.235223523, 'd': 4.235235235, 'e': 5.23525252323}

del my_dict[['a','b']]
random_objects = random.sample(list(my_dict.items()), k=3)
print(random_objects)