In [1]:
import src.glossary_terms_extractor as gtee
import src.evaluation_switch as eval_switch
import src.clustering_omega_evaluation as coe
import matplotlib.pyplot as plt
import pprint
pp = pprint.PrettyPrinter()

### Extract glossary terms and define gold list

In [2]:
# Requirements file; find_glossary_terms() reads it line by line into gte.requirements.
REQS = 'data/opencoss_reqs.txt'
# Gold glossary file; read line by line into gte.gold_list and also handed to OmegaEvaluator in evaluate().
GOLD_GLOSSAR = 'evaluation/GOLD_GLOSSAR_OPENCOSS.txt'
# Ground truth file passed as `ground_truth` to the evaluation/plotting functions below.
GROUND_TRUTH = 'evaluation/GROUND_TRUTH_OPENCOSS.csv'

In [3]:
def find_glossary_terms():
    """
    Creates a GlossaryTermsExtractor (with filter_terms=True), feeds it the lines of
    REQS as requirements and the lines of GOLD_GLOSSAR as gold list, and calls fit().

    Returns the extractor after fit() has been called.
    """
    gte = gtee.GlossaryTermsExtractor(filter_terms=True)
    # Strip only the line terminator: slicing with [:-1] would also cut the last
    # real character of a final line that has no trailing newline. The `with`
    # blocks make sure both file handles are closed again.
    with open(REQS) as reqs_file:
        gte.requirements = [line.rstrip('\n') for line in reqs_file]
    with open(GOLD_GLOSSAR) as gold_file:
        gte.gold_list = [line.rstrip('\n') for line in gold_file]
    # find glossary terms
    gte.fit()
    return gte

In [4]:
def evaluate(algorithm,
             embedding,
             ground_truth: str,
             threshold=None,
             number_of_clusters=None,
             calculate_recall=False,
             plot_ahc=False,
             verbose=False) -> list:
    """
    Evaluates the chosen embedding and clustering technique with the omega score.
    Optionally prints the recall; `plot_ahc` is forwarded to the EvaluationSwitch.

    Relies on the module-level `gte` (the extractor returned by
    find_glossary_terms()), which must exist before this function is called.

    Parameters:
        algorithm: clustering algorithm name handed to EvaluationSwitch.switch().
        embedding: embedding name handed to EvaluationSwitch.switch().
        ground_truth: ground truth file handed to OmegaEvaluator.
        threshold: forwarded unchanged to EvaluationSwitch.
        number_of_clusters: forwarded unchanged to EvaluationSwitch.
        calculate_recall: if true, prints OmegaEvaluator.recall.
        plot_ahc: forwarded unchanged to EvaluationSwitch.
        verbose: if true, prints the obtained omega index.

    Returns a two-element list [omega, obtained_clusters].
    """
    es = eval_switch.EvaluationSwitch(gte,
                                      threshold=threshold,
                                      number_of_clusters=number_of_clusters,
                                      plot_ahc=plot_ahc)

    es.switch(algorithm, embedding)
    es.fit()
    obtained_clusters = es.obtained

    oe = coe.OmegaEvaluator(
        ground_truth,
        GOLD_GLOSSAR,
        gte
    )
    if calculate_recall:
        # Format instead of concatenating: '+' raises TypeError when recall is
        # a number, while an f-string renders strings and numbers alike.
        print(f'\nRecall= {oe.recall}%\n')
    # The obtained clusters are attached only after the optional recall read,
    # same order as before.
    oe.obtained = obtained_clusters
    omega = oe.omega_index_
    if verbose:
        print(f'Omega Index for {algorithm} and {embedding} is {omega}.')
    return [omega, obtained_clusters]

In [5]:
def plot_threshold_omega(ground_truth: str,
                         max_t: int,
                         algorithm: str,
                         embedding: str) -> float:
    """
    Plots the chosen AHC distance threshold versus the obtained omega index for a certain
    threshold range based on the given ground truth.

    The tested thresholds are t / 1000 for every integer t in range(5, max_t),
    i.e. 0.005 up to (max_t - 1) / 1000. After plotting, the best omega index and
    the clustering result that produced it are printed.

    Raises ValueError if max_t is not greater than 5, because no threshold
    would be tested.

    Returns the optimal (maximal) omega value.
    """
    steps = range(5, max_t)
    if not steps:
        # Fail early with a clear message instead of showing an empty plot and
        # then failing inside max() on an empty list.
        raise ValueError(f'max_t must be greater than 5 to test at least one threshold, got {max_t}.')

    # Format the upper bound as a real number: '.{max_t - 1}' shows a wrong
    # value whenever max_t - 1 has fewer or more than three digits.
    upper_bound = f'{(max_t - 1) / 1000:.3f}'
    omegas = []
    clusters_result = []
    for done, t in enumerate(steps, start=1):
        omega, clusters = evaluate(algorithm,
                                   embedding,
                                   ground_truth,
                                   threshold=t / 1000,
                                   calculate_recall=False,
                                   verbose=False)
        omegas.append(omega)
        clusters_result.append(clusters)
        progress = round(done / len(steps) * 100, 2)
        print(f'Testing for a threshold from 0.005 to {upper_bound} ... {progress}%', end='\r')

    plt.plot([t / 1000 for t in steps], omegas)
    plt.title(
        '{0} distance threshold vs Omega score \n b/w Ideal and Obtained Clusters for {1} embeddings'.format(
            algorithm, embedding))
    plt.xlabel('Threshold')
    plt.ylabel('Omega Index')
    plt.show()

    # Locate the winner once; on ties the first maximum wins, as with max(omegas).
    best_index = max(range(len(omegas)), key=omegas.__getitem__)
    best_omega = omegas[best_index]
    print(f'The highest omega index obtained was: {best_omega} for the clustering result:\n')
    pp.pprint(clusters_result[best_index])
    return best_omega

In [6]:
def plot_n_clusters_omega(ground_truth: str,
                          max_n: int,
                          algorithm: str,
                          embedding: str) -> float:
    """
    Plots the chosen number of clusters versus the obtained omega index for a certain
    range of numbers of clusters based on the given ground truth.

    Every integer n in range(1, max_n) is tested as number of clusters. After
    plotting, the best omega index and the clustering result that produced it
    are printed.

    Raises ValueError if max_n is not greater than 1, because no number of
    clusters would be tested.

    Returns the optimal (maximal) omega value.
    """
    candidates = range(1, max_n)
    if not candidates:
        # Fail early with a clear message instead of showing an empty plot and
        # then failing inside max() on an empty list.
        raise ValueError(f'max_n must be greater than 1 to test at least one number of clusters, got {max_n}.')

    omegas = []
    clusters_result = []
    for n in candidates:
        omega, clusters = evaluate(algorithm,
                                   embedding,
                                   ground_truth,
                                   number_of_clusters=n,
                                   calculate_recall=False,
                                   verbose=False)
        omegas.append(omega)
        clusters_result.append(clusters)
        progress = round(n / len(candidates) * 100, 2)
        print(f'Testing for a number of clusters from 1 to {max_n - 1} ... {progress}%', end='\r')

    plt.plot(list(candidates), omegas)
    plt.title(
        '{0} number of clusters vs Omega score \n b/w Ideal and Obtained Clusters for {1} embeddings'.format(
            algorithm, embedding))
    plt.xlabel('Number of clusters')
    plt.ylabel('Omega Index')
    plt.show()

    # Locate the winner once; on ties the first maximum wins, as with max(omegas).
    best_index = max(range(len(omegas)), key=omegas.__getitem__)
    best_omega = omegas[best_index]
    print(f'The highest omega index obtained was: {best_omega} for the clustering result:\n')
    pp.pprint(clusters_result[best_index])
    return best_omega

In [7]:
# evaluate() reads this module-level `gte`, so this cell has to run before any evaluation cell below.
gte = find_glossary_terms()

### Find the threshold that maximizes Omega Index between *Ideal Keyword Clusters* and *Obtained* for **AHC** and **bert**

In [None]:
# AHC on bert embeddings, scored against GROUND_TRUTH; thresholds 0.005 to 0.229.
ahc_omega = plot_threshold_omega(
    ground_truth=GROUND_TRUTH,
    max_t=230,
    algorithm='agglomerative-hierarchical-clustering',
    embedding='bert',
)

### Find the threshold that maximizes Omega Index between *Ideal Paper Clusters* and *Obtained* for **AHC** and **bert**

In [None]:
# AHC on bert embeddings, scored against the paper ground truth; thresholds 0.005 to 0.229.
ahc_omega_paper = plot_threshold_omega(
    ground_truth='evaluation/GROUND_TRUTH_OPENCOSS_PAPER.csv',
    max_t=230,
    algorithm='agglomerative-hierarchical-clustering',
    embedding='bert',
)

### Find the threshold that maximizes Omega Index between *Ideal Paper Clusters* and *Obtained* for **AHC** and **Co-occurrence**

In [None]:
# AHC on co-occurrence embeddings, scored against the paper ground truth.
# Stored under its own name so it no longer overwrites the bert result kept
# in `ahc_omega_paper`.
ahc_omega_paper_coocc = plot_threshold_omega(
    ground_truth='evaluation/GROUND_TRUTH_OPENCOSS_PAPER.csv',
    max_t=300,
    algorithm='agglomerative-hierarchical-clustering',
    embedding='co-occurrence',
)

### Find the threshold that maximizes Omega Index between *Ideal Keyword Clusters* and *Obtained* for **AHC** and **Co-occurrence**

In [None]:
# AHC on co-occurrence embeddings, scored against GROUND_TRUTH; thresholds 0.005 to 0.509.
ahc_omega_coocc = plot_threshold_omega(
    ground_truth=GROUND_TRUTH,
    max_t=510,
    algorithm='agglomerative-hierarchical-clustering',
    embedding='co-occurrence',
)


### Find the number of clusters that maximizes Omega Index between *Ideal Keyword Clusters* and *Obtained* for **C-Means** and **Co-occurrence**

In [None]:
# C-Means on co-occurrence embeddings, scored against GROUND_TRUTH; 1 to 399 clusters.
c_means_omega_coocc = plot_n_clusters_omega(
    ground_truth=GROUND_TRUTH,
    max_n=400,
    algorithm='c-means',
    embedding='co-occurrence',
)

### Find the number of clusters that maximizes Omega Index between *Ideal Keyword Clusters* and *Obtained* for **C-Means** and **bert**

In [None]:
# C-Means on bert embeddings, scored against GROUND_TRUTH; 1 to 399 clusters.
# Named after the bert embedding so it neither mislabels the result nor
# overwrites the co-occurrence result kept in `c_means_omega_coocc`.
c_means_omega_bert = plot_n_clusters_omega(
    ground_truth=GROUND_TRUTH,
    max_n=400,
    algorithm='c-means',
    embedding='bert',
)

### Find the number of clusters that maximizes Omega Index between *Ideal Paper Clusters* and *Obtained* for **C-Means** and **Co-occurrence**

In [None]:
# C-Means on co-occurrence embeddings, scored against the paper ground truth.
# Stored under its own name so it no longer overwrites the result kept in
# `c_means_omega_coocc`.
c_means_omega_paper_coocc = plot_n_clusters_omega(
    ground_truth='evaluation/GROUND_TRUTH_OPENCOSS_PAPER.csv',
    max_n=400,
    algorithm='c-means',
    embedding='co-occurrence',
)

### Compute the Omega Index between *Ideal Paper Clusters* and *Obtained* for **Keyword Clustering** (OPENCOSS)

In [None]:
# Keyword clustering scored against the paper ground truth. Stored under its
# own name so the `kc_omega` assignment for GROUND_TRUTH does not overwrite it.
# evaluate() returns [omega, obtained_clusters]; only the omega index is printed.
kc_omega_paper = evaluate(
    algorithm='keyword-clustering',
    embedding=None,
    ground_truth='evaluation/GROUND_TRUTH_OPENCOSS_PAPER.csv',
)
print(kc_omega_paper[0])

In [None]:
# Keyword clustering scored against GROUND_TRUTH.
# evaluate() returns [omega, obtained_clusters]; only the omega index is printed.
kc_omega = evaluate(
    algorithm='keyword-clustering',
    embedding=None,
    ground_truth=GROUND_TRUTH,
)
print(kc_omega[0])