# Analysis of termhood

This notebook contains most parts of the analysis of termhood, i.e. the importance of a term within a corpus of a certain domain.

In [None]:
import utils  # a python module in the same dir as the notebooks

In [None]:
# Notebook-wide settings.
# Corpus to analyse: selects both the stored n-gram model and the corpus to load.
CORPUS_NAME = 'genia'
# Passed to NgramModel.load_model together with CORPUS_NAME
# (presumably identifies which stored model variant to load -- confirm).
MODEL_SPEC = '_min2_l10'
# Concepts whose frequency (counted without discontinuous concepts) is below
# this value are left out of the result table.
FREQ_THRESHOLD = 5
# Second argument to conceptstats.calculate_c_values.
C_VALUE_THRESHOLD = 2

In [None]:
import os

# move to the project root (utils.ROOT) before importing the project packages
os.chdir(utils.ROOT)

from datautils import dataio
from datautils import annotations as anno
from stats import ngramcounting

# n-gram model for the configured corpus and model specification
print('Loading n-gram model')
model = ngramcounting.NgramModel.load_model(CORPUS_NAME, MODEL_SPEC)

# 'pmc' is assembled from the CRAFT and GENIA corpora; every other corpus
# is loaded directly by its lower-cased name
if CORPUS_NAME.lower() != 'pmc':
    corpus = dataio.load_corpus(CORPUS_NAME.lower())
else:
    corpus = dataio.load_craft_corpus() + dataio.load_genia_corpus()

## Question 1: What is the termhood of concepts when the counts of discontinuous concepts (DCs) are included vs. excluded?

In [None]:
# get the gold-standard concepts of the corpus as two collections:
# cont_concepts (requested with discontinuous=False) and
# disc_concepts (requested with continuous=False)

from stats import conceptstats
cont_concepts = conceptstats.gold_standard_concepts(corpus, discontinuous=False)
disc_concepts = conceptstats.gold_standard_concepts(corpus, continuous=False)

In [None]:
from stats import conceptstats

In [None]:
# Concept counts over the corpus with discontinuous concepts switched off.
# The second call additionally sets doc_frequency=True
# (presumably counting documents rather than occurrences -- confirm).
term_freqs = conceptstats.count_concepts(corpus, discontinuous=False)
doc_freqs = conceptstats.count_concepts(corpus, discontinuous=False,
                                        doc_frequency=True)

In [None]:
# Termhood measures for the continuous concepts only, based on the counts
# that exclude discontinuous concepts.
c_values = conceptstats.calculate_c_values(cont_concepts, C_VALUE_THRESHOLD,
                                           term_freqs)
tf_idfs = conceptstats.calculate_tf_idf_values(cont_concepts, term_freqs,
                                               doc_freqs, len(corpus))


In [None]:
# Every concept that is continuous, discontinuous, or both.
all_concepts = cont_concepts.union(disc_concepts)
# Same counts as before, but without discontinuous=False, i.e. using
# count_concepts' defaults (which, judging by the variable names, include
# discontinuous concepts -- confirm against conceptstats).
term_freqs_incl_dc = conceptstats.count_concepts(corpus)
doc_freqs_incl_dc = conceptstats.count_concepts(corpus, doc_frequency=True)

In [None]:
# Termhood measures for all concepts, based on the counts that were made
# without excluding discontinuous concepts.
c_values_incl_dc = conceptstats.calculate_c_values(all_concepts, C_VALUE_THRESHOLD,
                                                   term_freqs_incl_dc)
tf_idfs_incl_dc = conceptstats.calculate_tf_idf_values(all_concepts, term_freqs_incl_dc,
                                                       doc_freqs_incl_dc, len(corpus))

In [None]:
# Per-concept scores derived from the n-gram model, for every concept
# (continuous and discontinuous alike).
glossex_values = dict(
    (candidate, conceptstats.glossex(candidate, model))
    for candidate in all_concepts
)
pmi_nl_values = dict(
    (candidate, conceptstats.length_normalized_pmi(candidate, model))
    for candidate in all_concepts
)

In [None]:
import pandas as pd

# One list per output column. The plain columns hold the statistics computed
# without discontinuous concepts (DCs); the *_INCL columns hold the statistics
# computed with DCs included.
data_dict = {'concept': [], 'freq': [], 'c-value': [], 'tf-idf': [],
             'freq_INCL': [], 'c-value_INCL': [], 'tf-idf_INCL': [],
             'glossex': [], 'pmi_nl': [], 'type': []}

for concept in all_concepts:

    # term_freqs was counted with discontinuous=False, while this loop also
    # visits concepts that come from disc_concepts. Look the count up with a
    # default of 0 so that such a concept cannot raise KeyError when
    # term_freqs is a plain dict (for a Counter the result is unchanged).
    freq_excl_dc = term_freqs.get(concept, 0)

    # NOTE(review): the cut-off is applied to the frequency *excluding* DCs,
    # so a concept whose continuous count is below FREQ_THRESHOLD is dropped
    # even if its count including DCs is high. Confirm that this is intended
    # rather than filtering on term_freqs_incl_dc.
    if freq_excl_dc < FREQ_THRESHOLD:
        continue

    data_dict['concept'].append(concept)
    data_dict['freq'].append(freq_excl_dc)
    data_dict['freq_INCL'].append(term_freqs_incl_dc[concept])

    # c_values and tf_idfs were computed for cont_concepts only; a concept
    # without an entry gets None (a missing value in the DataFrame).
    data_dict['c-value'].append(c_values.get(concept))
    data_dict['tf-idf'].append(tf_idfs.get(concept))
    data_dict['c-value_INCL'].append(c_values_incl_dc[concept])
    data_dict['tf-idf_INCL'].append(tf_idfs_incl_dc[concept])
    data_dict['glossex'].append(glossex_values[concept])
    data_dict['pmi_nl'].append(pmi_nl_values[concept])

    # Label the concept by the gold-standard set(s) it belongs to.
    is_continuous = concept in cont_concepts
    is_discontinuous = concept in disc_concepts
    if is_continuous and is_discontinuous:
        concept_type = 'both'
    elif is_continuous:
        concept_type = 'only_CC'
    elif is_discontinuous:
        concept_type = 'only_DC'
    else:
        # Not expected while all_concepts is the union of the two sets;
        # kept as a safety net.
        concept_type = 'neither'

    data_dict['type'].append(concept_type)


# One row per retained concept, columns in the order declared above.
data = pd.DataFrame(data_dict)

In [None]:
import seaborn as sns
# Distribution of the length-normalised PMI per concept type
# ('both', 'only_CC', ...), with outliers drawn (showfliers=True).
sns.boxplot(x='type', y='pmi_nl', data=data, showfliers=True)

In [None]:
# Show the table ordered by ascending length-normalised PMI.
data.sort_values(by='pmi_nl')

In [None]:
import pingouin

# Mann-Whitney U test on pmi_nl: concepts that only occur as continuous
# concepts versus concepts that occur both ways.
pingouin.mwu(
    data.loc[data['type'] == 'only_CC', 'pmi_nl'],
    data.loc[data['type'] == 'both', 'pmi_nl'],
)

In [None]:
# Distribution of the glossex score per concept type; outliers are hidden
# here (showfliers=False), unlike in the pmi_nl box plot.
sns.boxplot(x='type', y='glossex', data=data, showfliers=False)

In [None]:
import pingouin

# Mann-Whitney U test on glossex: concepts that only occur as continuous
# concepts versus concepts that occur both ways.
pingouin.mwu(
    data.loc[data['type'] == 'only_CC', 'glossex'],
    data.loc[data['type'] == 'both', 'glossex'],
)

In [None]:
# NOTE(review): this selects the interactive "notebook" matplotlib backend
# only at this point, after the box plots have already been drawn; consider
# moving it to the top of the notebook.
%matplotlib notebook
# NOTE(review): seaborn is already imported as `sns` in an earlier cell; this
# un-aliased import only adds the name `seaborn`, which no visible cell uses.
import seaborn

In [None]:
# Scatter plot with fitted regression line: frequency against C-value,
# both taken from the statistics that exclude discontinuous concepts.
sns.lmplot(x='c-value', y='freq', data=data)

In [None]:
# Scatter plot with fitted regression line: TF-IDF excluding discontinuous
# concepts (x) against TF-IDF including them (y).
sns.lmplot(x='tf-idf', y='tf-idf_INCL', data=data)

In [None]:
# Scatter plot with fitted regression line: frequency excluding discontinuous
# concepts (x) against frequency including them (y).
sns.lmplot(x='freq', y='freq_INCL', data=data)

In [None]:
# we can perform t-tests etc.

Conclusion: Most of the discontinuous concepts (DCs) that we are interested in also occur as continuous concepts (CCs), which makes them verifiable. Including them can improve the recall and ranking of term extraction.