In [None]:
from IPython.display import display
import mangoes
import nltk
import string 
import pprint 
import os 
import datetime 
import logging 

Read more about the UD English-EWT treebank used in this notebook:

https://github.com/UniversalDependencies/UD_English-EWT


In [None]:
# Minute-resolution timestamp; used below to name this run's output folder.
run_started_at = datetime.datetime.now()
date = run_started_at.strftime("%Y-%m-%d-%H-%M")

In [None]:
# Alternatively, set `date` by hand to point at the outputs of an earlier run,
# e.g.: date = "2021-03-24-15-23"

In [None]:
# Per-run output directory: <current working directory>/output/<date>.
OUTPUT_PATH = os.path.join(os.path.abspath(''), "output/{}".format(date))
if not os.path.exists(OUTPUT_PATH):
    # exist_ok=True keeps this from failing if the directory appears between
    # the existence check and the creation call.
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # Report only after the directory has actually been created.
    print("made a dir: ", OUTPUT_PATH)

In [None]:
# Display the run identifier that OUTPUT_PATH is derived from.
date

In [None]:
# Logging: append records to a "log" file inside this run's output directory.
# Note: logging.basicConfig does nothing if the root logger already has handlers.
log_settings = dict(
    level=logging.DEBUG,
    filename=f"{OUTPUT_PATH}/log",
    filemode="a+",
    format="%(asctime)s;%(levelname)s;%(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logging.basicConfig(**log_settings)

# Build Embedding 
### Corpus

In [None]:
# Path to the corpus data: either a directory or a single file
# (the commented alternatives below show both forms).
UD_English_EWT = "../../UD_English-EWT-master/corpus"
# UD_English_EWT =  "../../UD_English-EWT-master/corpus/en_ewt-ud-train.conllu"
# WIKI = "../../wikipedia_data/treebank.conllu"

In [None]:
# Record in the run log which corpus path was used.
logging.info(f"Corpus path: {UD_English_EWT}")

NOTE: When loading the dataset into `Corpus`, set `ignore_punctuation` to `False` if `DependencyBasedContext` is used to build the co-occurrence matrix.

In [None]:
# Load the corpus with the CoNLL-U reader. Punctuation is kept
# (ignore_punctuation=False) and lower=True is requested.
corpus_options = {
    "reader": mangoes.corpus.CONLLU,
    "language": "English",
    "lower": True,
    "ignore_punctuation": False,
}
corpus = mangoes.Corpus(UD_English_EWT, **corpus_options)

In [None]:
# Log corpus size: sentence count, word count and number of distinct entries in words_count.
logging.info("Corpus has {} sentences, {} words, {} unique tokens".format(corpus.nb_sentences, corpus.size, len(corpus.words_count)))

In [None]:
# Show the library's own summary of the loaded corpus.
corpus.describe()

In [None]:
# Save the corpus metadata into the run's output directory so it can be
# restored later with mangoes.Corpus.load_from_metadata(corpus_metadata).
corpus_metadata = os.path.join(OUTPUT_PATH, ".corpus")
corpus.save_metadata(corpus_metadata)
# Announce success only once the metadata has actually been written.
print("Saved Corpus")

In [None]:
# or load 
# corpus = mangoes.Corpus.load_from_metadata(corpus_metadata)

### Vocabulary
#### -- Target Words --

In [None]:
# Mark the start of target-vocabulary construction in the run log.
logging.info("Building target words...")

In [None]:
# Stop-word filter keyed on the "lemma" attribute; used for vocabularies built
# with several attributes, e.g. ("lemma", "POS").
# Assumes the NLTK "stopwords" corpus is already installed locally.
english_stopwords = nltk.corpus.stopwords.words('english')
stopwords_filter_lemma = mangoes.corpus.remove_elements(english_stopwords, attribute="lemma")

In [None]:
# Stop-word filter without an attribute; used for single-attribute vocabularies.
# Assumes the NLTK "stopwords" corpus is already installed locally.
english_stopwords = nltk.corpus.stopwords.words('english')
stopwords_filter = mangoes.corpus.remove_elements(english_stopwords)

#### Option 1: Use Lemma and POS as targetwords

In [None]:
# Target words are (lemma, POS) pairs, filtered by the list below.
# NOTE(review): the two frequency thresholds are described as in the original
# notes; confirm against the mangoes docs whether they are inclusive.
target_filters = [
    stopwords_filter_lemma,                    # drop stop words (matched on lemma)
    mangoes.corpus.remove_most_frequent(100),  # drop words with frequency above 100
    mangoes.corpus.remove_least_frequent(2),   # drop words with frequency below 2
]
target_vocabulary = corpus.create_vocabulary(attributes=("lemma", "POS"),
                                             filters=target_filters)

#### Option 2: Only keep lemma as targetwords

In [None]:
# target_vocabulary = corpus.create_vocabulary(attributes="lemma", 
#                                               filters = [ stopwords_filter, 
#                                                          mangoes.corpus.remove_most_frequent(100),
#                                                          mangoes.corpus.remove_least_frequent(2)])

#### Option 3: Use Lemma and POS as targetwords + keep only POS of "NOUN", "VERB", "ADJ"

In [None]:
# pos_filter_target = mangoes.corpus.filter_by_attribute("POS", ["NOUN", "VERB", "ADJ"])

In [None]:
# must include POS as one of attributes if POS filter is used
# target_vocabulary = corpus.create_vocabulary(attributes=("lemma","POS"), 
#                                             filters = [ stopwords_filter_lemma, 
#                                                         pos_filter_target,
#                                                         mangoes.corpus.remove_most_frequent(100),
#                                                         mangoes.corpus.remove_least_frequent(2)],)

#### Check target vocabulary

In [None]:
# Report how many entries the target vocabulary holds.
print(f"{len(target_vocabulary)} words will be used as target vocabulary")

In [None]:
# Record the target vocabulary size in the run log.
logging.info(f"Target Words: {len(target_vocabulary)}")

In [None]:
# Check all the target vocabulary.
# NOTE(review): `_index_word` is a private attribute (presumably the index -> word
# mapping) and this displays every entry; expect a long output on large corpora.
target_vocabulary._index_word

#### -- Context Words --

In [None]:
# Mark the start of context-vocabulary construction in the run log.
logging.info("Building Context words...")

#### Option 1: Only keep lemma 

In [None]:
# Context words are plain lemmas, so the attribute-less stop-word filter is used.
context_filters = [stopwords_filter]
context_vocabulary = corpus.create_vocabulary(attributes="lemma", filters=context_filters)

#### Option 2: Keep lemma and POS, restricted to POS "NOUN", "VERB" or "ADJ"

In [None]:
# pos_filter_context = mangoes.corpus.filter_by_attribute("POS",["NOUN", "VERB", "ADJ"])

In [None]:
# context_vocabulary = corpus.create_vocabulary(attributes=("lemma","POS"), 
#                                             filters = [ stopwords_filter_lemma,
#                                                         pos_filter_context])

#### Check context vocabulary

In [None]:
# Report how many entries the context vocabulary holds.
print(f"{len(context_vocabulary)} words will be used as context vocabulary")

In [None]:
# Record the context vocabulary size in the run log.
logging.info(f"Context words: {len(context_vocabulary)}")

In [None]:
# Inspect the whole context vocabulary.
# NOTE(review): `_index_word` is a private attribute (presumably the index -> word
# mapping) and this displays every entry; expect a long output on large corpora.
context_vocabulary._index_word

### Context 

#### Option 1: Treat dependency relations as undirected by setting `directed=False`

In [None]:
# NOTE: `entity` must name the same attribute(s) the context vocabulary was
# built with (here: the lemma only).
dependency_options = {
    "entity": "lemma",
    "labels": True,
    "collapse": True,
    "vocabulary": context_vocabulary,
    "directed": False,
}
dependency_context = mangoes.context.DependencyBasedContext(**dependency_options)

#### Option 2: consider only dependency relation of "advmod" or "nsubj"
Note: if `collapse` is set to `True`, the `case` relation will be included as well.

This changes the number of context words when the co-occurrence matrix is built.

In [None]:
# NOTE(review): entity=("lemma","POS") must match the attributes of context_vocabulary,
# i.e. build the context vocabulary with attributes=("lemma","POS") before enabling this.
# dependency_context = mangoes.context.DependencyBasedContext(entity=("lemma","POS"), 
#                                                             labels=True,
#                                                             collapse=True, 
#                                                             vocabulary=context_vocabulary,
#                                                             deprel_keep = ("advmod", "nsubj"),
#                                                             directed=False)

#### Option 3: Add `depth` to also consider indirect dependency relations

In [None]:
# NOTE(review): entity=("lemma","POS") must match the attributes of context_vocabulary,
# i.e. build the context vocabulary with attributes=("lemma","POS") before enabling this.
# dependency_context = mangoes.context.DependencyBasedContext(entity=("lemma","POS"), 
#                                                             labels=True,
#                                                             collapse=True, 
#                                                             vocabulary=context_vocabulary,
#                                                             directed=False,
#                                                             depth=2
#                                                             )

#### Option 4: Combining option 2 and 3

If `depth` > 1 and `deprel_keep` is specified, an indirect dependency path is kept as long as at least one of its relations is in `deprel_keep`.

E.g. if `depth=2` and `deprel_keep=("nsubj",)`, then `{'lemma': 'muslim', 'POS': 'ADJ'}/nsubj_amod` will be included as a context word, since one of its dependency relations (`nsubj`) is in `deprel_keep`.

In [None]:
# NOTE(review): entity=("lemma","POS") must match the attributes of context_vocabulary,
# i.e. build the context vocabulary with attributes=("lemma","POS") before enabling this.
# dependency_context = mangoes.context.DependencyBasedContext(entity=("lemma","POS"), 
#                                                             labels=True,
#                                                             collapse=True, 
#                                                             vocabulary=context_vocabulary,
#                                                             directed=False,
#                                                             deprel_keep=("advmod", "nsubj"),
#                                                             depth=2
#                                                             )

In [None]:
# File names used when saving the two vocabularies; each embeds the vocabulary size.
n_target_words = len(target_vocabulary)
n_context_words = len(context_vocabulary)
target_vocabulary_file_name = "vocabulary_{}_target_words".format(n_target_words)
context_vocabulary_file_name = "vocabulary_{}_context_words".format(n_context_words)

# Alternatively, set the two names above to previously saved vocabulary files.

In [None]:
# Save both vocabularies into this run's output directory under the names chosen above.
target_vocabulary.save(OUTPUT_PATH, name=target_vocabulary_file_name)
context_vocabulary.save(OUTPUT_PATH, name=context_vocabulary_file_name)

In [None]:
# load vocabs (from the same directory they were saved to, i.e. OUTPUT_PATH)
# target_vocabulary = mangoes.Vocabulary.load(OUTPUT_PATH, target_vocabulary_file_name)
# context_vocabulary = mangoes.Vocabulary.load(OUTPUT_PATH, context_vocabulary_file_name)

### Cooccurrence Matrix 

In [None]:
# Count co-occurrences between the target words and their dependency-based contexts.
coocc_count = mangoes.counting.count_cooccurrence(
    corpus, target_vocabulary, context=dependency_context
)
coocc_count.pprint(display=display)

In [None]:
# Peek at the first 20 row labels (target words) of the count matrix.
print("target words")
coocc_count.words[:20]

In [None]:
# Peek at the first 20 column labels (context words) of the count matrix.
print("context words")
coocc_count.contexts_words[:20]

In [None]:
# Matrix dimensions; presumably (number of target words, number of context words) — TODO confirm.
coocc_count.shape

In [None]:
# Pick one target word by index and list its 12 closest words in the raw count space.
i = 11
probe_word = target_vocabulary[i]
print("target word: ", probe_word)
print("close words")
closest_to_probe = coocc_count.get_closest_words(probe_word, 12)
pprint.pprint(closest_to_probe)

### Weighting 

In [None]:
# Weighting scheme (PPMI) and dimensionality reduction (SVD down to 300 dimensions)
# to apply to the co-occurrence counts.
ppmi = mangoes.weighting.PPMI()
svd = mangoes.reduction.SVD(dimensions=300)

In [None]:
# Build the embeddings from the co-occurrence counts with the chosen weighting and reduction.
embeddings = mangoes.create_representation(coocc_count, weighting=ppmi, reduction=svd)

In [None]:
# Pretty-print the embeddings through IPython's display function.
embeddings.pprint(display=display)

In [None]:
# Location for the saved embeddings inside this run's output directory;
# the name embeds the number of target words.
embedding_name = "embeddings/ppmi_svd_{}target_words_deprel".format(len(target_vocabulary))
embedding_path = os.path.join(OUTPUT_PATH, embedding_name)

In [None]:
# Save the embeddings to embedding_path.
embeddings.save(embedding_path)

In [None]:
# load embedding 
# embeddings = mangoes.Embeddings.load(embedding_path)

# Explore Embedding 

In [None]:
import pandas as pd 

### Closest Words 

#### Similarities 

In [None]:
# Distance/similarity metric names, addressable by a small integer index
# (0 -> "cityblock", 1 -> "cosine", ...). The order defines the index.
_metric_names = (
    "cityblock",
    "cosine",
    "euclidean",
    "l1",
    "l2",
    "manhattan",
    "braycurtis",
    "canberra",
    "chebyshev",
    "correlation",
    "dice",
    "hamming",
    "jaccard",
    "kulsinski",
    "mahalanobis",
    "minkowski",
    "rogerstanimoto",
    "russellrao",
    "seuclidean",
    "sokalmichener",
    "sokalsneath",
    "sqeuclidean",
    "yule",
)
sims = dict(enumerate(_metric_names))


In [None]:
# i: key into `sims` selecting the metric; nb: number of closest words to show per word.
i = 2
nb = 5

In [None]:
# For a sample of embedding words (every 10th word from index 100 to 199),
# collect the `nb` closest words under the selected metric and show them as a
# table: one row per sampled word, columns ranked 1..nb.
result = {}
for word in embeddings.words[100:200:10]:
    neighbours = embeddings.get_closest_words(word, nb=nb, metric=sims[i])
    closest = [neighbour for neighbour, _ in neighbours]
    result[word] = pd.Series(closest, index=range(1, nb + 1))
print(f"similarity measure: {sims[i]}")
print(pd.DataFrame(result).transpose())

### Analogies 

In [None]:
# Check that every word of the analogy question (and its expected answer)
# is present in the embedding's vocabulary.
question = "king queen male"
ans = "female"
for w in [*question.split(), ans]:
    try:
        embeddings.words.word_index[w]
    except KeyError:
        print(f"'{w}'  doesn't exist in embedding")
    else:
        print(f"'{w}' exists")

In [None]:
# You can resolve an analogy according to a representation using the analogy() method.
# Here, we display the results for some examples:
for analogy in [question]:
    # Resolve each analogy once and reuse the result for both scoring methods,
    # instead of running the same query twice.
    analogy_result = embeddings.analogy(analogy, 5)
    print(analogy, '->', analogy_result.using_cosadd)
    print(analogy, '->', analogy_result.using_cosmul)

### Visualization

In [None]:
# Three-panel overview of the embeddings: word distances, isotropy and a t-SNE projection.
# NOTE(review): `%matplotlib notebook` selects an interactive backend; confirm it is
# supported by the front end in use, otherwise the figure may not render.
%matplotlib notebook
import matplotlib.pyplot as plt
import mangoes.visualize
plt.figure()

# 1. distances between the words (top-left panel, polar axes)
ax = plt.subplot(221, projection='polar')
mangoes.visualize.plot_distances(embeddings, ax)

# 2. isotropy (top-right panel)
ax = plt.subplot(222)
mangoes.visualize.plot_isotropy(embeddings, ax)

# 3. t-sne (bottom panel, full width)
# No axes object is passed here, unlike the two calls above; presumably plot_tsne
# draws on the current axes created by plt.subplot — TODO confirm.
plt.subplot(212)
mangoes.visualize.plot_tsne(embeddings)

plt.show()


# Evaluation 

In [None]:
# Evaluate the embeddings on the Google and MSR analogy datasets and print the report.
import mangoes.evaluation.analogy

google_dataset, msr_dataset = (
    mangoes.evaluation.analogy.GOOGLE,
    mangoes.evaluation.analogy.MSR,
)

analogy_evaluation = mangoes.evaluation.analogy.Evaluation(
    embeddings,
    google_dataset,
    msr_dataset,
)
analogy_report = analogy_evaluation.get_report()
print(analogy_report)