<a href="https://colab.research.google.com/github/MHoseinHoushmand/Clustering_by_SLFA/blob/main/Clustering_by_SLFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

import pdb
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

dataset = fetch_20newsgroups(
    remove=("headers", "footers", "quotes"),
    subset="all",
    categories=categories,
    shuffle=True,
    random_state=42,
)


labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = unique_labels.shape[0]
print(f"{len(dataset.data)} documents - {true_k} categories")

3387 documents - 4 categories


In [2]:

def docs_as_tfidf(docs):
  vectorizer = TfidfVectorizer(
     max_df=0.5,
     min_df=5,
     stop_words="english",
  )

  docs_vector = vectorizer.fit_transform(docs)
  return docs_vector.toarray()



In [3]:
population_size = 120 # Frogs number
memplex_num = 12 #define as m
memplex_size = 10 #define as n
max_iteration = 50
memplex_iteration = 8
docs = dataset.data
docs_vector = docs_as_tfidf(docs)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
def cosin_sim(a,b):
   return cosine_similarity([a], [b])[0][0]

In [5]:
from numpy.linalg import norm
def SSE(cluster,doc_mean):
  size = len(cluster)
  sse=0
  for doc in cluster:
    sse += cosin_sim(doc,doc_mean)**2
  sse = sse/size
  return sse

In [6]:
from numpy.linalg import norm
def BC(doc_means):
   BC=0
   size = len(doc_means)
   for i in range(size):
      for j in range(i+1,size):
          BC += cosin_sim(doc_means[i],doc_means[j])**2
   return BC

In [7]:
def WC(clusters):
    WC = 0
    for cluster in clusters:
        doc_mean = np.average(cluster, axis=0)
        WC += SSE(cluster,doc_mean)
    return WC


In [8]:
def build_clusters(answer,docs_vector,clusters_size):
   clusters = []
   for i in range(clusters_size):
       clusters.append([])
   for j in range(len(answer)):
       if -1 < answer[j]:
        clusters[answer[j]].append(docs_vector[j])
   return clusters

In [9]:
def fitness(answer,docs_vector,clusters_size):
   doc_means = []
 #  pdb.runcall(build_clusters,answer,docs_vector,clusters_size)
   clusters = build_clusters(answer,docs_vector,clusters_size)
   for i in range(clusters_size):
       doc_means.append(
          np.average(clusters[i], axis=0)
       )
   wc = WC(clusters)
   bc = BC(doc_means)
   fitness = wc/bc
   return fitness


In [10]:
import random
def cross_over(answer_a,answer_b):
    size = len(answer_a)
    output = np.array(())
    for i in range(size):
       choice = random.choice([0,1])
       if choice == 0:
          np.append(output,answer_a[i])
       else:
          np.append(output,answer_b[i])
    return output

In [11]:
def best_and_worst(answers):
     best =  max(answers, key=answers.get)
     worst = min(answers, key=answers.get)
     return best , worst

In [12]:
def global_best(answers):
     temp ={}
     best =  max(temp, key=temp.get)
     return best, temp[tuple(best)]

In [13]:
def mutation(global_best,clusters_size):
    new_ans = global_best
    size = len(global_best)/4
    array = np.random.choice(np.arange(-1,clusters_size), size=size, replace=False)
    for i in array:
      new_ans = new_ans[i]
    return new_ans

In [17]:
def Create_memplexes(population,memplex_num):
     memplexes = []
     population_size = len(population)
     for i in range(memplex_num):
         memplexes.append([])
     for i in range(population_size):
         memplexes[i % memplex_num].append(population[i])
     return memplexes


In [15]:
import operator
def frog_leaping_search(docs_vector,cluster_size):
     answers=np.random.randint(-1, cluster_size, size=(population_size , len(dataset.data)))
     population = {}
     i=0
     for answer in answers:
       # pdb.runcall(fitness,answer,docs_vector,cluster_size)
        i+=1
        population[tuple(answer)] = fitness(answer,docs_vector,cluster_size)
        print(i,population[tuple(answer)])
     population = dict( sorted(population.items(), key=operator.itemgetter(1), reverse=True))
     memplexes = Create_memplexes(population, memplex_num)
     for i in range(max_iteration):
         pdb.set_trace()
         for j in range(memplex_num):
            sub_memplex = random.sample(memplexes[j],memplex_size/2)
            for k in range(memplex_iteration):
               ans_best, ans_worst = best_and_worst(sub_memplex)
               ans_out = cross_over(ans_best,ans_worst)
               if (population[tuple(ans_worst)]<population[tuple(ans_out)]):
                    np.delete(sub_memplex, ans_worst)
                    np.add(sub_memplex,ans_out)
               else:
                    g_best, g_value = global_best(population)
                    ans_out = cross_over(g_best,ans_worst)
                    if (population[tuple(ans_worst)]<population[tuple(ans_out)]):
                        np.delete(sub_memplex, ans_worst)
                        np.add(sub_memplex,ans_out)
                    else:
                        ans_out = mutation(global_best,cluster_size)
                        np.delete(sub_memplex, ans_worst)
                        np.add(sub_memplex,ans_out)







In [18]:
pdb.runcall(frog_leaping_search,docs_vector,4)

> [0;32m<ipython-input-15-3a55813e6a67>[0m(3)[0;36mfrog_leaping_search[0;34m()[0m
[0;32m      1 [0;31m[0;32mimport[0m [0moperator[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      2 [0;31m[0;32mdef[0m [0mfrog_leaping_search[0m[0;34m([0m[0mdocs_vector[0m[0;34m,[0m[0mcluster_size[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 3 [0;31m     [0manswers[0m[0;34m=[0m[0mnp[0m[0;34m.[0m[0mrandom[0m[0;34m.[0m[0mrandint[0m[0;34m([0m[0;34m-[0m[0;36m1[0m[0;34m,[0m [0mcluster_size[0m[0;34m,[0m [0msize[0m[0;34m=[0m[0;34m([0m[0mpopulation_size[0m [0;34m,[0m [0mlen[0m[0;34m([0m[0mdataset[0m[0;34m.[0m[0mdata[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      4 [0;31m     [0mpopulation[0m [0;34m=[0m [0;34m{[0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      5 [0;31m     [0mi[0m[0;34m=[0m[0;36m0[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> tbreak 11
Breakpoint 2 at <ipyt

KeyError: ignored

In [None]:
uniqueList=[[3,2,1,3],[5,3,2,1],[3,8,5,9],[2,6,4,9]]

print(random.sample(uniqueList, 2))

a= {'s':3,'4':6,'sdgfsdfg':656}
b= max(a.values())
print(b)

In [None]:
import operator
x = {'4':7,'8':2,'99':45}
x = dict( sorted(x.items(), key=operator.itemgetter(1), reverse=True))
print (x)

In [None]:

import numpy as np

# تعیین بازه مقداری

import pdb
beeeeee =46363
zee={'ddd':2352}
for i in range(1):
   pdb.set_trace()
   array = np.random.choice(np.arange(-1, 10), size=5, replace=False)
   print(array)

ggggga=643
print('nali')


In [None]:
a = []
a.append([])
a.append([])
a[1].append(3)
print (a)