Imports:

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint

import numpy as np
import torch
from scipy.linalg import sqrtm

# Colab-only step: mount Google Drive at /content/drive.
# NOTE(review): presumably the cloned repository referenced by `path` lives on
# that drive - confirm and adapt when running outside Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Adapt this variable to the path of the cloned repository.
# Later cells append "/Dataset/..." and "/Evaluation/FID/..." to it, so give the
# repository root without a trailing slash.
path = "YourPathHere"

Load the reference data and perform necessary preprocessing steps:

In [None]:
import csv

# Read the preprocessed corpus: every CSV row becomes one sentence (a list of tokens).
with open(f"{path}/Dataset/news_data_preprocessed.csv", encoding='utf-8', newline="") as file:
    news_tokenized = [row for row in csv.reader(file)]

# Replace <, NUM, > with <number>: every "<" token is collapsed, together with
# the (up to) two tokens that follow it, into a single "<number>" token.
for sent in news_tokenized:
    collapsed = []
    pos = 0
    while pos < len(sent):
        if sent[pos] == "<":
            collapsed.append("<number>")
            pos += 3
        else:
            collapsed.append(sent[pos])
            pos += 1
    # Write back in place so each sentence stays the same list object
    sent[:] = collapsed

# Keep only sentences with 10 to 28 tokens (inclusive) and additionally drop
# every sentence that contains both a "(" token and an "hr" token.
news_cache = [
    sent for sent in news_tokenized
    if 9 < len(sent) < 29 and not ("(" in sent and "hr" in sent)
]

# Summed length of the kept sentences (as a float) and number of dropped sentences
avg = float(sum(len(sent) for sent in news_cache))
count = len(news_tokenized) - len(news_cache)

news_tokenized = news_cache

# Average length of the kept sentences, followed by the number of removed ones
print(avg/len(news_cache))
print(count)


# Wrap every sentence in the start / end markers
for sent in news_tokenized:
    sent[:0] = ["<s>"]
    sent.append("</s>")


# Length of the longest sentence, markers included (0 if there are no sentences)
idx = 0
max_length = max(map(len, news_tokenized), default=0)

print(f"Longest Sentence has {max_length} tokens.")
print(len(news_tokenized))


# Concatenate all sentences into one flat token stream
all_sents = [token for sent in news_tokenized for token in sent]


# Cut the stream into windows of 30 tokens. A window always starts at a "<s>"
# token; once a window has been taken, the scan resumes right behind it and
# skips ahead to the next "<s>", so windows never overlap.
all_sents_batched = []
cursor = 0
while cursor < len(all_sents):
    if all_sents[cursor] == "<s>":
        all_sents_batched.append(all_sents[cursor:cursor+30])
        cursor += 30
    else:
        cursor += 1

# Drop the final window, which may be cut short by the end of the stream
all_sents_batched = all_sents_batched[:-1]


# Training samples are the windows without their leading "<s>"
train_data = [window[1:] for window in all_sents_batched]

Construct reference data:

In [None]:
# The reference set is the last 15% of the training windows, each one cut off
# right before its first "</s>" (or kept whole if it contains none).
split_at = int(len(train_data)*0.85)
reference_data = []
for sent in train_data[split_at:]:
    end = sent.index("</s>") if "</s>" in sent else len(sent)
    reference_data.append(sent[:end])

Load generated sentences from a model:

In [None]:
gen_data = []

# Choose one of the following: cVAELM_InferSent, LSTMLM_InferSent, GSGAN_InferSent, LaTextGAN_InferSent, GPT-2_Small_InferSent
with open(f"{path}/Evaluation/FID/cVAELM_InferSent.csv", encoding='utf-8', newline="") as file:
    gen_data = list(csv.reader(file))

# Rename our special tokens so that they match the tokens used in GloVe
special_tokens = {"<NUM>": "<number>", "<End>": "</s>", "<Start>": "<s>"}
for sentence in gen_data:
    # In-place update keeps each sentence the same list object
    sentence[:] = [special_tokens.get(word, word) for word in sentence]

Download GloVe embeddings used in InferSent:

In [None]:
# Download and unpack the GloVe 840B / 300d word vectors.
# -p: no error if the GloVe directory already exists (cell can be re-run).
# -n: never overwrite already extracted files, so a re-run does not stop at
#     unzip's interactive "replace?" prompt.
!mkdir -p GloVe
!curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip -n GloVe/glove.840B.300d.zip -d GloVe/

In [None]:
# Download the InferSent (version 1) checkpoint into ./encoder.
# -p: no error if the directory already exists (cell can be re-run).
!mkdir -p encoder
!curl -Lo encoder/infersent1.pkl https://dl.fbaipublicfiles.com/infersent/infersent1.pkl

Load the InferSent model:

In [None]:
import sys
# Make the repository's InferSent_models module importable
sys.path.append(f'{path}/Evaluation/FID')

from InferSent_models import InferSent

# The version number selects the checkpoint file and is passed on to the model
model_version = 1
MODEL_PATH = f"encoder/infersent{model_version}.pkl"

# Settings handed to the InferSent constructor
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=model_version,
)

# Build the encoder and restore the downloaded weights
model = InferSent(params_model)
state_dict = torch.load(MODEL_PATH)
model.load_state_dict(state_dict)

In [None]:
# Version 1 reads the GloVe vectors; any other version reads the fastText vectors.
if model_version == 1:
    W2V_PATH = 'GloVe/glove.840B.300d.txt'
else:
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [None]:
# Build the encoder vocabulary from the first 10,000 reference sentences plus
# all generated sentences. tokenize=False: presumably because both inputs are
# already lists of tokens - confirm against InferSent_models.
model.build_vocab(reference_data[:10000]+gen_data, tokenize=False)
# Alternative vocabulary for comparing the first and last 10,000 reference sentences:
#model.build_vocab(reference_data[:10000]+reference_data[-10000:], tokenize=False)

In [None]:
# Put the model on the GPU when one is available, otherwise keep it on the CPU.
# A hard-coded True makes model.cuda() fail on runtimes without a GPU.
# Set use_cuda = False by hand to force CPU execution.
use_cuda = torch.cuda.is_available()
model = model.cuda() if use_cuda else model

In [None]:
# Embed the first 10,000 reference sentences
test_embeddings = model.encode(reference_data[:10000], bsize=128, tokenize=False, verbose=True)
print(f'nb sentences encoded : {len(test_embeddings)}')

# Embed all generated sentences
gen_embeddings = model.encode(gen_data, bsize=128, tokenize=False, verbose=True)
print(f'nb sentences encoded : {len(gen_embeddings)}')

In [None]:
# To calculate the InferSent distance on the reference data itself (first vs. last 10,000 sentences), uncomment:
# gen_embeddings = model.encode(reference_data[-10000:], bsize=128, tokenize=False, verbose=True)
# print('nb sentences encoded : {0}'.format(len(gen_embeddings)))

In [None]:
# Calculate the Frechet InferSent distance (FID computed on sentence embeddings)
def calculate_fid(test_embeddings, gen_embeddings, eps=1e-6):
    """Return the Frechet distance between two sets of embeddings.

    Each set is summarised by its mean vector and covariance matrix; the
    distance is ||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrt(sigma1 . sigma2)).

    Args:
        test_embeddings: array-like of shape (n_samples, dim), one embedding per row.
        gen_embeddings: array-like of shape (m_samples, dim), one embedding per row.
        eps: value added to the diagonal of both covariance matrices, used
            only if the matrix square root of their product is not finite.

    Returns:
        The distance rounded to 4 decimal places.

    Raises:
        ValueError: if an input is not 2-D or the embedding sizes differ.
    """
    test_embeddings = np.asarray(test_embeddings)
    gen_embeddings = np.asarray(gen_embeddings)
    if test_embeddings.ndim != 2 or gen_embeddings.ndim != 2:
        raise ValueError("embeddings must be 2-D with shape (n_samples, dim)")
    if test_embeddings.shape[1] != gen_embeddings.shape[1]:
        raise ValueError(
            "embedding sizes differ: "
            f"{test_embeddings.shape[1]} vs {gen_embeddings.shape[1]}"
        )

    # Calculate mean and covariance statistics. np.cov returns a 0-d array for
    # one-dimensional embeddings, so force a proper (dim, dim) matrix.
    mu1 = np.mean(test_embeddings, axis=0)
    sigma1 = np.atleast_2d(np.cov(test_embeddings, rowvar=False))
    mu2 = np.mean(gen_embeddings, axis=0)
    sigma2 = np.atleast_2d(np.cov(gen_embeddings, rowvar=False))

    # Calculate sum squared difference between means
    diff = np.sum((mu1 - mu2)**2.0)

    # Calculate sqrt of product between cov
    square_root = sqrtm(sigma1.dot(sigma2))

    # A singular product can yield inf/nan entries; retry with a small ridge
    # on both covariance matrices in that case only.
    if not np.isfinite(square_root).all():
        offset = np.eye(sigma1.shape[0]) * eps
        square_root = sqrtm((sigma1 + offset).dot(sigma2 + offset))

    # Check and correct imaginary numbers from sqrt
    if np.iscomplexobj(square_root):
        square_root = square_root.real

    # Calculate score
    frechet_infersent_dist = diff + np.trace(sigma1 + sigma2 - 2.0 * square_root)

    return round(frechet_infersent_dist, 4)

In [None]:
# Distance between the reference and the generated embeddings (shown as the cell output)
calculate_fid(test_embeddings, gen_embeddings)