In [None]:
from itertools import combinations 
from collections import Counter
from sklearn import metrics
import numpy as np
import gensim

In [None]:
# Notebook-level setting for how many top-ranked words per topic to use.
# NOTE(review): none of the functions in this section read this global;
# coherenceScore takes its own `top_n_topic_words` argument (default 10), so
# this value presumably has to be passed in explicitly by the caller - verify.
top_n_topic_words = 9

def getWordCount(docs:list):
  """
  Tally how often each token occurs across every document.

  `docs` is iterated document by document and each document token by
  token; the result is a `Counter` mapping token -> total occurrences.
  """
  return Counter(token for tokens in docs for token in tokens)



def getClusterWords(docs_cleaned:list, words:list, n_concept:int, 
                    vocab_dict:dict, word_labels=None, 
                    doc_labels=None) -> list:
  """
  Return, for each cluster, its vocabulary words ranked by weight.

  Each word is weighted by (absolute corpus frequency) * (idf), and the
  words of every cluster are returned in descending order of that weight.

  Parameters
  ----------
  docs_cleaned : list
      Tokenised documents (each an iterable of tokens).
  words : list
      Words aligned index-by-index with `word_labels`; only used when
      clustering was done on word level.
  n_concept : int
      Number of clusters; every label is used as an index into a list of
      this length.
  vocab_dict
      Vocabulary object that must offer `values()`, `doc2bow()`,
      `id2token` and `token2id` (presumably a gensim `Dictionary` -
      confirm against the caller).
  word_labels : list, optional
      Cluster label per entry of `words`.
  doc_labels : list, optional
      Cluster label per entry of `docs_cleaned`. Takes precedence over
      `word_labels` when both are lists.

  Returns
  -------
  list
      `n_concept` lists of unique words, best-weighted first. If neither
      label argument is a list, a message is printed and None is returned.
  """
  use_doc_labels = isinstance(doc_labels, list)
  if not use_doc_labels and not isinstance(word_labels, list):
    print("Need to add word_labels or doc_labels")
    return

  word_counts = getWordCount(docs_cleaned)
  # a set keeps the many `in vocab` checks below at O(1) each
  vocab = set(vocab_dict.values())
  corpus = [vocab_dict.doc2bow(text) for text in docs_cleaned]
  tfidf = gensim.models.TfidfModel(corpus, id2word=vocab_dict.id2token)

  # group all in-vocabulary words by cluster
  # NOTE(review): a negative label (e.g. -1 for noise) would index from the
  # end of this list - confirm callers never pass one.
  cluster_words = [[] for _ in range(n_concept)]
  if use_doc_labels:
    # working on document level
    for doc_id, doc in enumerate(docs_cleaned):
      cluster_words[doc_labels[doc_id]].extend(w for w in doc if w in vocab)
  else:
    # working on word level
    for word_id, word in enumerate(words):
      if word in vocab:
        cluster_words[word_labels[word_id]].append(word)

  cluster_words_sorted = []
  for cluster_list in cluster_words:
    scored = []
    # dict.fromkeys drops duplicates but keeps first-seen order, so ties in
    # the stable sort below are resolved deterministically
    for w in dict.fromkeys(cluster_list):
      # a vocabulary word absent from the corpus has no idf entry; weight 0
      w_idf = tfidf.idfs.get(vocab_dict.token2id[w], 0.0)
      # Counter indexing yields 0 for unseen words instead of None
      scored.append((w, word_counts[w] * w_idf))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    # keep the ranking: do not pass the result through set()
    cluster_words_sorted.append([w for w, _ in scored])

  return cluster_words_sorted


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from math import log2, log10, log1p
import numpy as np

def coherenceScore(cluster_words_sorted:list, 
                   w2v_model=None, top_n_topic_words=10,
                   n_clusters=20):
  """
  Calculating the coherence score using word embeddings and 
  cosine similarity.

  For every topic the first `top_n_topic_words` words are taken, the
  cosine similarity of each unordered word pair is passed through
  log1p, and the results are averaged over the pairs actually formed.
  The function returns the mean of these per-topic scores.

  Parameters
  ----------
  cluster_words_sorted : list
      One word list per topic; only the leading words are used.
  w2v_model
      Object exposing `wv.get_vector(word)` (presumably a gensim
      Word2Vec model - confirm). Only accessed when a topic has at least
      two top words.
  top_n_topic_words : int
      Number of leading words per topic to compare.
  n_clusters : int
      Expected number of topics; must equal len(cluster_words_sorted).

  Returns
  -------
  Average of the per-topic scores (via np.average). Topics with fewer
  than two top words contribute a score of 0.

  Raises
  ------
  AssertionError
      If n_clusters != len(cluster_words_sorted).
  ValueError
      From math.log1p if a pair's cosine similarity is <= -1.
  """
  assert n_clusters == len(cluster_words_sorted), ("len(cluster_words_sorted) "
                                                    "!= n_clusters")
  per_topic_cs = [0 for _ in range(n_clusters)]
  for topic_id, topic_word_list in enumerate(cluster_words_sorted):

    top_words = topic_word_list[:top_n_topic_words]

    # calculate similarity for each pair of terms
    pair_scores = 0
    n_pairs = 0
    for first, second in combinations(top_words, 2):
      similarity = cosine_similarity(
          w2v_model.wv.get_vector(first).reshape(1, -1), 
          w2v_model.wv.get_vector(second).reshape(1, -1))
      # cosine_similarity returns a (1, 1) array; take the scalar explicitly
      # instead of relying on implicit array -> float conversion
      pair_scores += log1p(similarity[0, 0])
      n_pairs += 1

    # divide by the number of pairs that were actually scored; a topic may
    # hold fewer than top_n_topic_words words, and zero pairs means score 0
    if n_pairs:
      per_topic_cs[topic_id] = pair_scores / n_pairs

  # return the mean score across all topics 
  return np.average(per_topic_cs)

In [None]:
from s_dbw import S_Dbw
def s_Dbw_validity_index(data, labels_pred, centers_id):
  """
  Return the S_Dbw validity index for a clustering of `data`.

  Forwards `data`, `labels_pred` and `centers_id` to `S_Dbw` together
  with the fixed settings used throughout this notebook.
  """
  fixed_settings = dict(
      method='Halkidi',
      alg_noise='bind',
      centr='mean',
      nearest_centr=True,
      metric='euclidean',
  )
  return S_Dbw(data, labels_pred, centers_id, **fixed_settings)



In [None]:
def silhoutteCoefficient(data, labels_pred):
  """
  Return the silhouette score of `data` under the labelling `labels_pred`,
  as computed by `metrics.silhouette_score`.
  """
  score = metrics.silhouette_score(data, labels_pred)
  return score


In [None]:
def internalValidation(data, labels_pred, centers_id):
  """
  Run both internal validity measures on one clustering.

  Returns a tuple `(silhouette coefficient, S_Dbw index)`.
  """
  return (
      silhoutteCoefficient(data, labels_pred),
      s_Dbw_validity_index(data, labels_pred, centers_id),
  )
  



In [None]:
def adjustedRandIndex(labels_true, labels_pred):
  """
  Return the Adjusted Rand Index between the reference labelling
  `labels_true` and the predicted labelling `labels_pred`, as computed by
  `metrics.adjusted_rand_score`.
  """
  ari = metrics.adjusted_rand_score(labels_true, labels_pred)
  return ari


In [None]:
def fowlkesMallowsIndex(labels_true, labels_pred):
  """
  Return the Fowlkes-Mallows index between the reference labelling
  `labels_true` and the predicted labelling `labels_pred`, as computed by
  `metrics.fowlkes_mallows_score`.
  """
  fmi = metrics.fowlkes_mallows_score(labels_true, labels_pred)
  return fmi


In [None]:
def externalValidation(labels_true, labels_pred):
  """
  Run every external (ground-truth based) evaluation on one clustering.

  Returns a tuple `(adjusted Rand index, Fowlkes-Mallows index)`.
  """
  fowlkes_mallows, adjusted_rand = (
      fowlkesMallowsIndex(labels_true, labels_pred),
      adjustedRandIndex(labels_true, labels_pred),
  )
  return adjusted_rand, fowlkes_mallows
  


testing