In [35]:
# https://github.com/cbowdon/doc2vec-pytorch/blob/master/doc2vec.ipynb
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam  # ilenic uses Adam, but gensim uses plain SGD
import numpy as np
import pandas as pd
from collections import Counter
import altair as alt
from tqdm import tqdm, trange
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA


import nltk
from nltk.tokenize import word_tokenize

## Loading Data

In [36]:
# Read train claims
with open('../data/train-claims.json', 'r') as f:
    claims = json.load(f)

In [37]:
len(claims)

1228

In [38]:
# Read evidence
with open('../data/evidence.json', 'r') as f:
    evidences = json.load(f)
evidences = {i: str.lower(j) for i,j in evidences.items()}

In [39]:
len(evidences)

1208827

In [40]:
evidences['evidence-442946']

'at very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.'

## Prepare the corpus

In [41]:
# Collect all texts from claims
corpus = {}
for id, claim in claims.items():
    corpus[id] = str.strip(str.lower(claim['claim_text']))  # Add claim text

for id, evidence in evidences.items():
    corpus[id] = str.strip(evidence)  # Add evidence text

In [42]:
def tokenize_text(df):
    #df["tokens"] = df.text.apply(lambda x: [token.text.strip() for token in word_tokenize(x) if token.text.isalnum()])
    #df['tokens'] = df['text'].apply(lambda x: [token for token in word_tokenize(x) if token.isalnum()])
    df['tokens'] = df['text'].apply(lambda x: [token for token in word_tokenize(x) if token.isalpha()])
    df["length"] = df.tokens.apply(len)
    return df

In [43]:
# Convert the list of documents into a pandas DataFrame
df = pd.DataFrame.from_dict(corpus, orient='index', columns=['text'])

In [44]:
df = tokenize_text(df)
df

Unnamed: 0,text,tokens,length
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17
...,...,...,...
evidence-1208822,also on the property is a contributing garage ...,"[also, on, the, property, is, a, contributing,...",9
evidence-1208823,| class = ``fn org'' | fyrde | | | | 6110 | | ...,"[class, fn, org, fyrde, volda]",5
evidence-1208824,"dragon storm (game), a role-playing game and c...","[dragon, storm, game, a, game, and, collectibl...",9
evidence-1208825,it states that the zeriuani ``which is so grea...,"[it, states, that, the, zeriuani, which, is, s...",46


In [45]:
class Vocab:
    def __init__(self, all_tokens, min_count=2):
        self.min_count = min_count
        # Only includes the word when has more than 1 ocurrence
        self.freqs = {t:n for t, n in Counter(all_tokens).items() if n >= min_count}
        self.words = sorted(self.freqs.keys())
        self.word2idx = {w: i for i, w in enumerate(self.words)}

In [51]:
# Create the vocabulary
vocab = Vocab([tok for tokens in df.tokens for tok in tokens], min_count=2)
print(f"Dataset comprises {len(df)} documents and {len(vocab.words)} unique words (over the limit of {vocab.min_count} occurrences)")

Dataset comprises 1210055 documents and 232159 unique words (over the limit of 2 occurrences)


In [52]:
def clean_tokens(df, vocab):
    df["length"] = df.tokens.apply(len)
    df["clean_tokens"] = df.tokens.apply(lambda x: [t for t in x if t in vocab.freqs.keys()])
    df["clean_length"] = df.clean_tokens.apply(len)
    return df

In [53]:
df = clean_tokens(df, vocab)
df[:5]

Unnamed: 0,text,tokens,length,clean_tokens,clean_length
claim-1937,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",22,"[not, only, is, there, no, scientific, evidenc...",22
claim-126,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",16,"[el, niño, drove, record, highs, in, global, t...",16
claim-2510,"in 1946, pdo switched to a cool phase.","[in, pdo, switched, to, a, cool, phase]",7,"[in, pdo, switched, to, a, cool, phase]",7
claim-2021,weather channel co-founder john coleman provid...,"[weather, channel, john, coleman, provided, ev...",15,"[weather, channel, john, coleman, provided, ev...",15
claim-2449,"""january 2008 capped a 12 month period of glob...","[january, capped, a, month, period, of, global...",17,"[january, capped, a, month, period, of, global...",17


## Train the Doc2Vec model

In [49]:
class NegativeSampling(nn.Module):
    def __init__(self):
        super(NegativeSampling, self).__init__()
        self.log_sigmoid = nn.LogSigmoid()

    def forward(self, scores):
        batch_size = scores.shape[0]
        positive = self.log_sigmoid(scores[:,0])
        negatives = torch.sum(self.log_sigmoid(-scores[:,1:]), dim=1)
        return -torch.sum(positive + negatives) / batch_size  # average for batch

loss = NegativeSampling()

In [54]:
class NoiseDistribution:
    def __init__(self, vocab):
        self.probs = np.array([vocab.freqs[w] for w in vocab.words])
        # A unigram distribution raised to the 3/4rd power, as proposed by T. Mikolov et al. in Distributed Representations of Words and Phrases and their Compositionality
        self.probs = np.power(self.probs, 0.75)
        self.probs /= np.sum(self.probs)
    def sample(self, n):
        "Returns the indices of n words randomly sampled from the vocabulary."
        return np.random.choice(a=self.probs.shape[0], size=n, p=self.probs)
        
noise = NoiseDistribution(vocab)

In [66]:
context_size=5
n_negative_samples=5

doc = df[:1]
for i in range(context_size, len(doc.clean_tokens) - context_size):
    print(i)

In [55]:
def example_generator(df, context_size, noise, n_negative_samples, vocab):
    for doc_id, doc in df.iterrows():
        for i in range(context_size, len(doc.clean_tokens) - context_size):
            positive_sample = vocab.word2idx[doc.clean_tokens[i]]
            sample_ids = noise.sample(n_negative_samples).tolist()
            # Fix a wee bug - ensure negative samples don't accidentally include the positive
            sample_ids = [sample_id if sample_id != positive_sample else -1 for sample_id in sample_ids]
            sample_ids.insert(0, positive_sample)
            context = doc.clean_tokens[i - context_size:i] + doc.clean_tokens[i + 1:i + context_size + 1]
            context_ids = [vocab.word2idx[w] for w in context]
            yield {"doc_ids": torch.tensor(doc_id),  # we use plural here because it will be batched
                   "sample_ids": torch.tensor(sample_ids), 
                   "context_ids": torch.tensor(context_ids)}
            
examples = example_generator(df, context_size=5, noise=noise, n_negative_samples=5, vocab=vocab)

In [57]:
class NCEDataset(Dataset):
    def __init__(self, examples):
        self.examples = list(examples)
    def __len__(self):
        return len(self.examples)
    def __getitem__(self, index):
        return self.examples[index]
    
dataset = NCEDataset(examples)
dataloader = DataLoader(dataset, batch_size=2, drop_last=True, shuffle=True)

TypeError: new(): invalid data type 'str'

In [None]:
def describe_batch(batch, vocab):
    results = []
    for doc_id, context_ids, sample_ids in zip(batch["doc_ids"], batch["context_ids"], batch["sample_ids"]):
        context = [vocab.words[i] for i in context_ids]
        context.insert(len(context_ids) // 2, "____")
        samples = [vocab.words[i] for i in sample_ids]
        result = {"doc_id": doc_id,
                  "context": " ".join(context), 
                  "context_ids": context_ids, 
                  "samples": samples, 
                  "sample_ids": sample_ids}
        results.append(result)
    return results

describe_batch(next(iter(dataloader)), vocab)

In [None]:
class DistributedMemory(nn.Module):
    def __init__(self, vec_dim, n_docs, n_words):
        super(DistributedMemory, self).__init__()
        self.paragraph_matrix = nn.Parameter(torch.randn(n_docs, vec_dim))
        self.word_matrix = nn.Parameter(torch.randn(n_words, vec_dim))
        self.outputs = nn.Parameter(torch.zeros(vec_dim, n_words))
    
    def forward(self, doc_ids, context_ids, sample_ids):
                                                                               # first add doc ids to context word ids to make the inputs
        inputs = torch.add(self.paragraph_matrix[doc_ids,:],                   # (batch_size, vec_dim)
                           torch.sum(self.word_matrix[context_ids,:], dim=1))  # (batch_size, 2x context, vec_dim) -> sum to (batch_size, vec_dim)
                                                                               #
                                                                               # select the subset of the output layer for the NCE test
        outputs = self.outputs[:,sample_ids]                                   # (vec_dim, batch_size, n_negative_samples + 1)
                                                                               #
        return torch.bmm(inputs.unsqueeze(dim=1),                              # then multiply with some munging to make the tensor shapes line up 
                         outputs.permute(1, 0, 2)).squeeze()                   # -> (batch_size, n_negative_samples + 1)

model = DistributedMemory(vec_dim=50,
                          n_docs=len(df),
                          n_words=len(vocab.words))

In [None]:
with torch.no_grad():
    logits = model.forward(**next(iter(dataloader)))
logits

In [None]:
def train(model, dataloader, epochs=40, lr=1e-3):
    optimizer = Adam(model.parameters(), lr=lr)
    training_losses = []
    try:
        for epoch in trange(epochs, desc="Epochs"):
            epoch_losses = []
            for batch in dataloader:
                model.zero_grad()
                logits = model.forward(**batch)
                batch_loss = loss(logits)
                epoch_losses.append(batch_loss.item())
                batch_loss.backward()
                optimizer.step()
            training_losses.append(np.mean(epoch_losses))
    except KeyboardInterrupt:
        print(f"Interrupted on epoch {epoch}!")
    finally:
        return training_losses

In [None]:
training_losses = train(model, dataloader, epochs=40, lr=1e-3)

In [None]:
df_loss = pd.DataFrame(enumerate(training_losses), columns=["epoch", "training_loss"])
alt.Chart(df_loss).mark_bar().encode(alt.X("epoch"), alt.Y("training_loss", scale=alt.Scale(type="log")))

In [None]:
def most_similar(paragraph_matrix, docs_df, index, n=None):
    pm = normalize(paragraph_matrix, norm="l2")  # in a smarter implementation we would cache this somewhere
    sims = np.dot(pm, pm[index,:])
    df = pd.DataFrame(enumerate(sims), columns=["doc_id", "similarity"])
    n = n if n is not None else len(sims)
    return df.merge(docs_df[["text"]].reset_index(drop=True), left_index=True, right_index=True).sort_values(by="similarity", ascending=False)[:n]

In [None]:
most_similar(model.paragraph_matrix.data, df, 1, n=10)