In [None]:
import os
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.cm as cm
from evaluation import *

In [None]:
def getHistogramm(word_labels: list, xlabel: str, ylabel: str,
                  title: str, filename=None):
  """
  Draw a bar chart showing how many words carry each label.

  One bar is drawn per distinct label, in sorted label order, and every
  bar is ticked with the label it actually represents.

  Parameters:
    word_labels(list): label of each word; may be empty, in which case an
      empty (but still titled) chart is produced
    xlabel(str): name of the x-axis of the histogram
    ylabel(str): name of the y-axis of the histogram
    title(str): graph's title
    filename(str): (optional) path the figure is saved to; anything that
      is not a string is ignored

  Returns:
    (plt, fig): the pyplot module and the newly created figure
  """
  # Labels are unique keys of the Counter, so sorting the (label, count)
  # pairs orders the bars by label and keeps each count with its label.
  label_counts = sorted(Counter(word_labels).items())
  sorted_labels = [label for label, _ in label_counts]
  sorted_values = [count for _, count in label_counts]

  # Select the style before building the figure so it is already active
  # when the figure is created.
  plt.style.use('ggplot')
  fig = plt.figure(figsize=(10, 10), dpi=80)

  # Skip the bars for empty input instead of failing on the data unpacking.
  if sorted_labels:
    # Tick each bar with its own label; a positional 0..k-1 range would
    # mislabel the bars whenever the labels are not exactly 0..k-1.
    plt.bar(sorted_labels, sorted_values, align='center',
            tick_label=[str(label) for label in sorted_labels])

  plt.xlabel(xlabel)
  plt.ylabel(ylabel)
  plt.title(title)

  # if filename is passed then save result as file
  if isinstance(filename, str):
    fig.savefig(filename, bbox_inches='tight')

  return plt, fig

In [None]:
def getTSNE(n_clusters, word_labels, word_vectors: list,
            title: str, n_components=2, perplexity=24,
            verbose=1, random_state=42, filename=None):
  """
  Reduce the word vectors with t-SNE and scatter-plot them per cluster.

  Only the first two t-SNE dimensions are drawn. Words whose label is not
  one of 0 .. n_clusters-1 do not appear in the plot.

  Parameters:
    n_clusters(int): number of clusters; one color is used per cluster
    word_labels(list): predicted cluster label for each word, in the same
      order as word_vectors
    word_vectors(list): list of word vectorizations
    title(str): title of graph
    n_components(int): (optional) number of t-SNE dimensions, at least 2
    perplexity(int): (optional) perplexity passed to TSNE
    verbose(1 | 0): (optional) verbosity passed to TSNE
    random_state(int): (optional) random seed passed to TSNE
    filename(str): (optional) path the figure is saved to; anything that
      is not a string is ignored

  Returns:
    (plt, fig): the pyplot module and the newly created figure

  Raises:
    ValueError: if n_components is smaller than 2
  """
  # The plot reads columns 0 and 1 of the embedding, so fail before the
  # expensive fit rather than with an IndexError after it.
  if n_components < 2:
    raise ValueError("n_components must be at least 2 to draw a 2D plot")

  tsne = TSNE(n_components=n_components, perplexity=perplexity,
              verbose=verbose, random_state=random_state)
  tsne_train_data = tsne.fit_transform(word_vectors)

  # Collect the x / y coordinates of every cluster in a single pass over
  # the labels (instead of rescanning all labels once per cluster).
  cluster_coords = {cluster: ([], []) for cluster in range(n_clusters)}
  for i, label in enumerate(word_labels):
    coords = cluster_coords.get(label)
    if coords is not None:  # labels outside 0..n_clusters-1 are skipped
      coords[0].append(tsne_train_data[i, 0])
      coords[1].append(tsne_train_data[i, 1])

  # One evenly spaced rainbow color per cluster.
  colors = cm.rainbow(np.linspace(0, 1, n_clusters))

  # Select the style before building the figure so it is already active
  # when the figure is created.
  plt.style.use('ggplot')
  fig = plt.figure(figsize=(10, 10), dpi=80)

  for cluster, color in zip(range(n_clusters), colors):
    dim_1, dim_2 = cluster_coords[cluster]
    plt.scatter(dim_1, dim_2, color=color)

  plt.xlabel("tsne-2d-one")
  plt.ylabel("tsne-2d-two")
  plt.title(title)

  # if filename is passed then save result as file
  if isinstance(filename, str):
    fig.savefig(filename, bbox_inches='tight')

  return plt, fig

In [None]:
def getClusterLabels(docs: list, word_labels: list, index2word: list,
                     n_concepts: int, filename=None,
                     top_n_words=10):
  """
  Write or print the top words that label each cluster.

  The per-cluster word lists come from getClusterWords; the first
  top_n_words entries of each list are reported, in the order received.

  Parameters:
    docs(list): list of docs
    word_labels(list): list of predicted clusters for each word
    index2word(list): list of words (with their associated index)
    n_concepts(int): number of clusters
    filename(str): (optional) path of the txt file to save all the labels
      to (written as UTF-8); when it is not a string the labels are
      printed instead
    top_n_words(int): (optional) number of words reported for each cluster

  Returns:
    None
  """
  # NOTE(review): getClusterWords is defined outside this block. It is
  # presumably returning one ranked list of (word, ...) entries per cluster
  # (only entry[0] is read here), and the meaning of the trailing 0
  # argument is not visible from this block -- confirm against its definition.
  cluster_words_sorted = getClusterWords(docs, index2word,
                                         word_labels, n_concepts, 0)

  # Build (header, top words) once so both output paths share the logic.
  summaries = []
  for cluster_id, cluster_list in enumerate(cluster_words_sorted):
    header = ("Cluster " + str(cluster_id) +
              "'s top " + str(top_n_words) + " words: ")
    top_words = [w_c[0] for w_c in cluster_list[0:top_n_words]]
    summaries.append((header, str(top_words)))

  # if filename is passed then save result as file
  if isinstance(filename, str):
    # Explicit UTF-8 so non-ASCII words do not depend on the platform's
    # default encoding.
    with open(filename, "w", encoding="utf-8") as outfile:
      for header, words in summaries:
        # Header and word list share a line, followed by one blank line.
        outfile.write(header + words + "\n\n")
  else:
    # just print the cluster labels
    for header, words in summaries:
      print(header)
      print(words)
      print()
      print()