<a href="https://colab.research.google.com/github/MHoseinHoushmand/Clustering_by_SLFA/blob/main/Clustering_by_SLFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# The four newsgroups used as ground-truth clusters.
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

# Fetch both the train and test splits of the selected groups; headers,
# footers and quoted replies are removed from every post.
dataset = fetch_20newsgroups(
    subset="all",
    categories=categories,
    remove=("headers", "footers", "quotes"),
    shuffle=True,
    random_state=42,
)

# Ground-truth labels, the distinct label values and how many documents
# carry each of them.
labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = len(unique_labels)
print(f"{len(dataset.data)} documents - {true_k} categories")

3387 documents - 4 categories


In [2]:

def docs_as_tfidf(docs):
  """Turn raw text documents into a dense TF-IDF matrix.

  Args:
    docs: iterable of document strings.

  Returns:
    A dense 2-D numpy array with one row per document and one column per
    retained vocabulary term.
  """
  # English stop words are dropped, as are terms found in fewer than 5
  # documents (min_df) or in more than half of them (max_df).
  tfidf = TfidfVectorizer(stop_words="english", min_df=5, max_df=0.5)
  sparse_matrix = tfidf.fit_transform(docs)
  return sparse_matrix.toarray()



In [3]:
# --- Shuffled Frog Leaping Algorithm (SFLA) settings ---
memplex_num = 12   # m: number of memplexes
memplex_size = 10  # n: number of frogs in each memplex
# Total number of frogs. Derived from m * n (= 120) so the population always
# splits evenly into the memplexes when either setting is tuned.
population_size = memplex_num * memplex_size
max_iteration = 50      # global iterations of the search
memplex_iteration = 8   # local improvement steps per memplex per iteration

# Raw documents and their dense TF-IDF representation.
docs = dataset.data
docs_vector = docs_as_tfidf(docs)

In [4]:
def cosin_sim(a,b):
   """Return the cosine similarity between two 1-D vectors.

   Args:
      a, b: array-likes of equal length.

   Returns:
      dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has zero
      norm (e.g. a document whose terms were all filtered out), so that
      empty documents do not inject NaN into the fitness computation.
   """
   # np.linalg.norm is referenced through `np` so this cell does not rely
   # on the `from numpy.linalg import norm` executed in a later cell.
   denominator = np.linalg.norm(a) * np.linalg.norm(b)
   if denominator == 0:
      return 0.0
   return np.dot(a, b) / denominator

In [5]:
from numpy.linalg import norm
def SSE(cluster,docs_vector):
  """Average squared cosine similarity of a cluster's documents to the mean document.

  Args:
    cluster: sequence of document vectors belonging to one cluster.
    docs_vector: 2-D array of all document vectors; its column-wise mean
      is the reference vector every document is compared with.

  Returns:
    The mean of cosin_sim(doc, mean_doc) ** 2 over the cluster, or 0.0
    for an empty cluster (previously a ZeroDivisionError).
  """
  size = len(cluster)
  if size == 0:
    # Nothing to average; avoid dividing by zero.
    return 0.0
  # NOTE(review): the reference is the mean of the WHOLE corpus, not the
  # centroid of `cluster`. For a within-cluster measure the cluster centroid
  # may have been intended - confirm before changing.
  meandoc = np.average(docs_vector,axis=0)
  total = 0
  for doc in cluster:
    total += cosin_sim(doc,meandoc)**2
  return total/size

In [6]:
from numpy.linalg import norm
def BC(doc_means):
   """Between-cluster score: sum of squared cosine similarities over all centroid pairs.

   Args:
      doc_means: sequence of cluster mean vectors.

   Returns:
      The sum of cosin_sim(m_i, m_j) ** 2 over every unordered pair i < j;
      0 when fewer than two means are given.
   """
   count = len(doc_means)
   return sum(
      cosin_sim(doc_means[first], doc_means[second])**2
      for first in range(count)
      for second in range(first + 1, count)
   )

In [7]:
def WC(clusters,docs_vector):
    """Within-cluster score: the sum of SSE(cluster, docs_vector) over all clusters.

    Args:
        clusters: sequence of clusters, each a sequence of document vectors.
        docs_vector: 2-D array of all document vectors, forwarded to SSE.

    Returns:
        The accumulated per-cluster SSE values (0 for no clusters).
    """
    return sum(SSE(members, docs_vector) for members in clusters)


In [9]:
def build_clusters(answer,docs_vector,clusters_size):
   """Group document vectors by the cluster label assigned to each document.

   Args:
      answer: sequence where answer[d] is the cluster label of document d.
      docs_vector: indexable collection of document vectors, aligned with
         `answer`.
      clusters_size: number of clusters; valid labels are
         0 .. clusters_size - 1.

   Returns:
      A list of `clusters_size` lists; entry c holds docs_vector[d] for
      every document d labelled c, in document order. Labels outside the
      valid range (such as -1) are left out of every cluster.
   """
   clusters = [[] for _ in range(clusters_size)]
   # Walk over (document index, label) pairs. The previous version iterated
   # over the label VALUES and used them as document indices, which put the
   # wrong documents in the clusters.
   for doc_index, label in enumerate(answer):
       cluster_id = int(label)
       if cluster_id == label and 0 <= cluster_id < clusters_size:
           clusters[cluster_id].append(docs_vector[doc_index])
   return clusters

In [10]:
def fitness(answer,docs_vector,clusters_size):
   """Score a clustering as WC(clusters) / BC(cluster means); larger is better.

   Args:
      answer: sequence of cluster labels, one per document.
      docs_vector: 2-D array of document vectors aligned with `answer`.
      clusters_size: number of clusters.

   Returns:
      The within-cluster score divided by the between-cluster score.
      Special cases:
        * 0.0 if any cluster is empty - the ratio is never negative, so
          this ranks an incomplete clustering last instead of producing
          NaN / ZeroDivisionError;
        * float("inf") if the between-cluster score is 0 while the
          within-cluster score is positive, and 0.0 if both are 0.
   """
   clusters = build_clusters(answer,docs_vector,clusters_size)
   if any(len(members) == 0 for members in clusters):
       return 0.0
   doc_means = [np.average(members, axis=0) for members in clusters]
   within = WC(clusters,docs_vector)
   between = BC(doc_means)
   if between == 0:
       return float("inf") if within > 0 else 0.0
   # The original computed this ratio but never returned it, so every
   # caller received None.
   return within/between


In [11]:
import random
def cross_over(answer_a,answer_b):
    """Uniform crossover: build a child by picking each gene from one of two parents.

    Args:
        answer_a, answer_b: equal-length sequences of cluster labels.

    Returns:
        A numpy array of the same length where position i holds either
        answer_a[i] or answer_b[i], each chosen with probability 1/2.

    Raises:
        ValueError: if the parents differ in length.
    """
    size = len(answer_a)
    if len(answer_b) != size:
        raise ValueError("cross_over requires parents of equal length")
    # np.append returns a NEW array; the original discarded that result and
    # therefore always returned an empty array. Collect the genes in a list
    # and convert once.
    child = [
        answer_a[i] if random.choice([0,1]) == 0 else answer_b[i]
        for i in range(size)
    ]
    return np.array(child)

In [12]:
def best_and_worst(answers,fitness_dict):
     """Find the highest- and lowest-scoring answers of a group.

     Args:
        answers: iterable of answers (label sequences).
        fitness_dict: mapping from tuple(answer) to its fitness value.

     Returns:
        (best, worst) as numpy arrays, where best has the largest and worst
        the smallest fitness among `answers`.
     """
     scores = {}
     for candidate in answers:
        key = tuple(candidate)
        scores[key] = fitness_dict[key]
     best_key = max(scores, key=scores.get)
     worst_key = min(scores, key=scores.get)
     return np.array(best_key), np.array(worst_key)

In [13]:
def global_best(answers,fitness_dict):
     """Find the highest-scoring answer together with its fitness.

     Args:
        answers: iterable of answers (label sequences).
        fitness_dict: mapping from tuple(answer) to its fitness value.

     Returns:
        (best, value): the best answer as a numpy array and its fitness.
     """
     scores = {}
     for candidate in answers:
        key = tuple(candidate)
        scores[key] = fitness_dict[key]
     best_key = max(scores, key=scores.get)
     return np.array(best_key), scores[best_key]

In [None]:
def mutation(global_best,clusters_size):
    """Return a mutated copy of an answer with a quarter of its labels re-drawn.

    Args:
        global_best: sequence of cluster labels to start from (not modified).
        clusters_size: number of clusters; new labels are drawn uniformly
            from 0 .. clusters_size - 1.

    Returns:
        A new numpy array of the same length in which len(global_best) // 4
        distinct positions were assigned fresh random labels (a fresh label
        may coincide with the old one).
    """
    # Work on a copy so the caller's best answer is never altered.
    new_ans = np.array(global_best, copy=True)
    n_mutations = len(new_ans) // 4  # integer count; `/` yielded a float
    if n_mutations == 0:
        return new_ans
    # Choose WHICH positions to mutate (distinct indices), then draw a new
    # label for each. The original sampled label values and indexed the
    # answer with them repeatedly, collapsing it to a scalar.
    positions = np.random.choice(len(new_ans), size=n_mutations, replace=False)
    new_ans[positions] = np.random.randint(0, clusters_size, size=n_mutations)
    return new_ans

In [None]:
def Create_memplexes(population,memplex_num):
     """Deal the population round-robin into `memplex_num` memplexes.

     Args:
        population: iterable of frogs (answers). Iteration order decides
           the dealing order; iterating a dict yields its keys.
        memplex_num: number of memplexes to create (must be >= 1).

     Returns:
        A dict {0: [...], ..., memplex_num - 1: [...]} where the i-th frog
        of the iteration order is placed in memplex i % memplex_num.

     Raises:
        ValueError: if memplex_num is smaller than 1.
     """
     if memplex_num < 1:
         raise ValueError("memplex_num must be at least 1")
     memplexes = {index: [] for index in range(memplex_num)}
     # enumerate() instead of `for i in population_size` (an int is not
     # iterable) and instead of population[i], which fails when the
     # population is a dict keyed by answers.
     for position, frog in enumerate(population):
         memplexes[position % memplex_num].append(frog)
     return memplexes


In [None]:
import operator
def frog_leaping_search(docs_vector,cluster_size):
    """Cluster documents with a Shuffled Frog Leaping search.

    Every frog is an answer: one cluster label per document. The search
    repeatedly ranks the population, deals it into memplexes and, inside
    each memplex, tries to replace the worst frog of a random sub-memplex.

    Reads the module-level settings population_size, memplex_num,
    memplex_size, max_iteration and memplex_iteration, and relies on
    fitness, Create_memplexes, best_and_worst, cross_over, global_best and
    mutation defined in this notebook.

    Args:
        docs_vector: 2-D array with one row per document.
        cluster_size: number of clusters; labels are 0 .. cluster_size - 1.

    Returns:
        (best_answer, best_fitness): the best label assignment found, as a
        numpy array, and its fitness value.
    """
    n_docs = len(docs_vector)
    # Labels start at 0: build_clusters only collects 0 .. cluster_size - 1,
    # so the former lower bound of -1 left documents outside every cluster.
    answers = np.random.randint(0, cluster_size, size=(population_size, n_docs))

    # Maps tuple(answer) -> fitness for every frog currently alive.
    fitness_dict = {}
    for answer in answers:
        fitness_dict[tuple(answer)] = fitness(answer, docs_vector, cluster_size)

    sub_size = memplex_size // 2  # random.sample needs an integer count

    for iteration in range(max_iteration):
        # Shuffle step: rank all frogs best-first and deal them round-robin.
        ranked = sorted(fitness_dict, key=fitness_dict.get, reverse=True)
        memplexes = Create_memplexes(ranked, memplex_num)

        for j in range(memplex_num):
            memplex = memplexes[j]
            if len(memplex) < 2:
                continue  # no distinct best/worst pair to work with
            sub_memplex = random.sample(memplex, max(2, min(sub_size, len(memplex))))

            for step in range(memplex_iteration):
                ans_best, ans_worst = best_and_worst(sub_memplex, fitness_dict)
                worst_key = tuple(ans_worst)
                worst_value = fitness_dict[worst_key]

                # 1) Leap towards the sub-memplex best.
                ans_out = cross_over(ans_best, ans_worst)
                out_value = fitness(ans_out, docs_vector, cluster_size)
                if not worst_value < out_value:
                    # 2) Leap towards the global best.
                    g_best, g_value = global_best(fitness_dict, fitness_dict)
                    ans_out = cross_over(g_best, ans_worst)
                    out_value = fitness(ans_out, docs_vector, cluster_size)
                    if not worst_value < out_value:
                        # 3) Fall back to a mutated copy of the global best,
                        #    accepted regardless of its score.
                        ans_out = mutation(g_best, cluster_size)
                        out_value = fitness(ans_out, docs_vector, cluster_size)

                out_key = tuple(ans_out)
                if out_key in fitness_dict:
                    # Candidate duplicates a living frog; replacing the worst
                    # with it would shrink the population.
                    continue
                # Replace the worst frog with the new one.
                del fitness_dict[worst_key]
                fitness_dict[out_key] = out_value
                sub_memplex[sub_memplex.index(worst_key)] = out_key

    return global_best(fitness_dict, fitness_dict)





In [None]:
# Scratch check: random.sample draws k distinct items from a list of lists.
uniqueList = [
    [3, 2, 1, 3],
    [5, 3, 2, 1],
    [3, 8, 5, 9],
    [2, 6, 4, 9],
]

print(random.sample(uniqueList, k=2))

# Scratch check: largest value stored in a dict.
a = {'s': 3, '4': 6, 'sdgfsdfg': 656}
b = max(a.values())
print(b)

[[5, 3, 2, 1], [2, 6, 4, 9]]
656


In [None]:
import operator
# Scratch check: reorder a dict by value, largest first.
x = {'4': 7, '8': 2, '99': 45}
x = {key: x[key] for key in sorted(x, key=x.get, reverse=True)}
print(x)

{'99': 45, '4': 7, '8': 2}


In [None]:

import numpy as np

# Define the value range: np.arange(-1, 10) yields the integers -1 .. 9.


# Scratch check: draw 5 distinct values from that range (replace=False).
array = np.random.choice(np.arange(-1, 10), size=5, replace=False)

print(array)




[4 6 9 2 1]
