## **Imports**

In [None]:
import csv
import copy
import numpy as np
import tensorflow as tf
from tqdm.auto import tqdm 
from gensim.models import Word2Vec
from scipy.spatial.distance import jensenshannon
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

# Adapt this variable to the path of the cloned repository.
# The dataset and embedding paths used further down are built from it.
path = "YourPathHere"

# For importing custom modules: put the repository's Modules folder on the
# import search path (presumably where helper_functions lives - verify).
import sys
sys.path.append(f'{path}/Modules')

Mount Google Drive:

In [None]:
# Colab-only: make Google Drive available under /content/drive.
# NOTE(review): force_remount=True presumably re-mounts even when the drive
# is already mounted - confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

## **Dataset**

In [None]:
from helper_functions import dataset_cleanup

In [None]:
# Sentence-length window and corpus location handed to the cleanup helper.
min_sent_len, max_sent_len = 10, 28
data_path = f"{path}/Dataset/news_data_preprocessed.csv"

cleaned_data, max_seq_length = dataset_cleanup(
    data_path=data_path,
    min_sent_len=min_sent_len,
    max_sent_len=max_sent_len,
)

# Keep everything but the first element of every cleaned sentence.
train_data = [sent[1:] for sent in cleaned_data]

### **Load word2vec embeddings**

In [None]:
# Load previously saved embeddings
word2vec_model = Word2Vec.load(f"{path}/Skip-Gram Embeddings/skip-gram_embeddings.model")

print("Examine the trained embeddings: ")
# Query through the KeyedVectors object (`.wv`), like the rest of this
# notebook does: the model-level `most_similar` shortcut is deprecated in
# gensim 3.x and removed in 4.0.
word2vec_model.wv.most_similar("<NUM>", topn=10)

Create data used for evaluation:

In [None]:
# Map every vocabulary token to its position in the embedding vocabulary.
vocabulary = word2vec_model.wv.index2word
word2index_dict = {token: position for position, token in enumerate(vocabulary)}

# Encode every training sentence as a list of vocabulary indices.
# A token missing from the vocabulary raises KeyError.
sent2index = [
    [word2index_dict[token] for token in sentence]
    for sentence in train_data
]

In [None]:
# Look up the vocabulary indices of the sentence-boundary markers;
# end_token is used further down to truncate the reference sentences.
start_token = word2index_dict["<Start>"]
end_token = word2index_dict["<End>"]
print(f"<Start>: {start_token}")
print(f"<End>: {end_token}")

## **Evaluation**

Prepare the reference data used for the BLEU, Self-BLEU, word frequency and Jensen-Shannon distance calculations:

In [None]:
# The last 15% of the index-encoded sentences serve as reference data.
held_out = sent2index[int(len(sent2index)*0.85):]
index_to_word = word2vec_model.wv.index2word

reference_data = []
for encoded in held_out:
    # Cut the sentence right before its first end token; sentences without
    # an end token are kept in full.
    cutoff = encoded.index(end_token) if end_token in encoded else len(encoded)
    # Translate the remaining indices back into their word strings.
    reference_data.append([index_to_word[token_id] for token_id in encoded[:cutoff]])

Avg Sentence Length:

In [None]:
# Average sentence length over (at most) the first 10000 reference sentences.
# Divide by the number of sentences actually present instead of a fixed
# 10000, so the result stays correct for smaller reference sets.
length_sample = reference_data[:10000]
total_length = sum(len(sent) for sent in length_sample)
# Guard against an empty reference set (would otherwise divide by zero).
avg_length = total_length / len(length_sample) if length_sample else 0.0
print(f"Average length of the test sentences: {avg_length} tokens")

BLEU-4:

In [None]:
# BLEU-4 of the last 10000 reference sentences (hypotheses) against the
# first 10000 reference sentences (shared reference set).
n_grams = 4
bleu_weights = tuple(1./n_grams for _ in range(n_grams))

shared_references = reference_data[:10000]
hypotheses = reference_data[-10000:]

# corpus_bleu expects one reference set per hypothesis. Every hypothesis is
# scored against the same set, so the single slice is repeated (by reference,
# not re-sliced per hypothesis) and sized to the actual number of hypotheses,
# which keeps the call valid when fewer than 10000 sentences exist.
score_bleu = corpus_bleu(
    [shared_references] * len(hypotheses),
    hypotheses,
    weights=bleu_weights,
    smoothing_function=SmoothingFunction().method1,
)
print(f"BLEU-4 Score of the test sentences: {score_bleu}")

Self BLEU-4:

In [None]:
# Self-BLEU: score every sentence of the sample against all other sentences
# of the same sample (leave-one-out reference sets).
self_bleu_pool = reference_data[:10000]

# A shallow slice-and-concatenate is enough here: the references are only
# read, so deep-copying the whole pool for each hypothesis is unnecessary.
references = [
    self_bleu_pool[:idx] + self_bleu_pool[idx + 1:]
    for idx in tqdm(range(len(self_bleu_pool)))
]

hyps = list(self_bleu_pool)

self_bleu = corpus_bleu(
    references,
    hyps,
    weights=tuple(1./n_grams for _ in range(n_grams)),
    smoothing_function=SmoothingFunction().method1,
)

print(f"Self BLEU-4 Score of the test sentences: {self_bleu}")

JS Distance and word overlap of the top 12 words:

In [None]:
def align_counts(ref, gen):
    """Align two count dictionaries over the union of their keys.

    Args:
        ref: Mapping from key to count for the reference data.
        gen: Mapping from key to count for the generated data.

    Returns:
        A tuple ``(ref_counts, gen_counts)`` of two equally long lists.
        Position ``i`` of both lists belongs to the same key; a key that is
        missing from one of the mappings contributes a count of 0 there.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order (keys of
    # `ref` first, then the new keys of `gen`), so the alignment order is
    # reproducible instead of depending on set iteration order.
    keys = list(dict.fromkeys([*ref, *gen]))

    q_ref = []
    k_gen = []
    for key in tqdm(keys):
        # .get() supplies the 0 for absent keys without a catch-all except.
        q_ref.append(ref.get(key, 0))
        k_gen.append(gen.get(key, 0))

    return q_ref, k_gen


# Frequency tables for the two halves being compared:
#   "ref" = the last 10000 reference sentences,
#   "gen" = the first 10000 reference sentences.
ref_word_freq = {}
gen_word_freq = {}

ref_sent_length = {}
gen_sent_length = {}

# Slice once; the slices have equal length even when fewer than 10000
# sentences exist, so indexing below cannot run out of range.
first_sentences = reference_data[:10000]
last_sentences = reference_data[-10000:]

for idx, sample in enumerate(tqdm(first_sentences)):
    ref_sample = last_sentences[idx]

    # Increment the respective sentence length entry for ref and gen
    ref_length = len(ref_sample)
    gen_length = len(sample)
    ref_sent_length[ref_length] = ref_sent_length.get(ref_length, 0) + 1
    gen_sent_length[gen_length] = gen_sent_length.get(gen_length, 0) + 1

    # Increment the word count of every token for ref and gen
    for token in ref_sample:
        ref_word_freq[token] = ref_word_freq.get(token, 0) + 1

    for token in sample:
        gen_word_freq[token] = gen_word_freq.get(token, 0) + 1

# Calculate Jensen-Shannon Distance (base 2) on the aligned count vectors
aligned_sent_lengths = align_counts(ref_sent_length, gen_sent_length)
jsd_sent_length = jensenshannon(aligned_sent_lengths[0], aligned_sent_lengths[1], 2)

aligned_word_counts = align_counts(ref_word_freq, gen_word_freq)
jsd_word_count = jensenshannon(aligned_word_counts[0], aligned_word_counts[1], 2)

print(f"Jensen-Shannon distance for the sentence length frequencies: {jsd_sent_length}")
print(f"Jensen-Shannon distance for the word counts: {jsd_word_count}")

# The 12 most frequent (word, count) pairs of each half. gen_word_freq was
# counted on the first 10000 sentences and ref_word_freq on the last 10000,
# so each table is printed under the label that matches it.
top_words_first = sorted(gen_word_freq.items(), key=lambda item: item[1], reverse=True)[:12]
top_words_last = sorted(ref_word_freq.items(), key=lambda item: item[1], reverse=True)[:12]
print(f"Top 12 words in the first 10000 reference sentences: {top_words_first}")
print(f"Top 12 words in the last 10000 reference sentences: {top_words_last}")