In [36]:
# https://github.com/cbowdon/doc2vec-pytorch/blob/master/doc2vec.ipynb
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam  # ilenic uses Adam, but gensim uses plain SGD
import numpy as np
import pandas as pd
import spacy
from collections import Counter
import altair as alt
import numpy as np
from tqdm import tqdm, trange
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA


## Loading Data

In [37]:
# Read train claims
with open('../data/train-claims.json', 'r') as f:
    claims = json.load(f)

In [3]:
len(claims)

1228

In [4]:
print(claims['claim-1937']['claim_text'])
print(claims['claim-1937']['evidences'])

Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
['evidence-442946', 'evidence-1194317', 'evidence-12171']


In [38]:
# Read evidence
with open('../data/evidence.json', 'r') as f:
    evidences = json.load(f)
evidences = {i: str.lower(j) for i,j in evidences.items()}

In [6]:
len(evidences)

1208827

In [7]:
evidences['evidence-442946']

'at very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.'

In [39]:
# Calculate vocabulary size
# Collect all texts from claims and evidences
corpus = []
for claim in claims.values():
    corpus.append(str.strip(str.lower(claim['claim_text'])))  # Add claim text

#for evidence_id, evidence_text in evidences.items():
#    corpus.append(str.strip(evidence_text))  # Add evidence text

In [40]:
nlp = spacy.load("en_core_web_sm")

In [41]:
def tokenize_text(df):
    df["tokens"] = df.text.apply(lambda x: [token.text.strip() for token in nlp(x) if token.text.isalnum()])
    return df

In [42]:
# Convert the list of documents into a pandas DataFrame
df = pd.DataFrame(corpus, columns=['text'])

In [43]:
example_df = tokenize_text(df)
example_df

Unnamed: 0,text,tokens
0,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc..."
1,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t..."
2,"in 1946, pdo switched to a cool phase.","[in, 1946, pdo, switched, to, a, cool, phase]"
3,weather channel co-founder john coleman provid...,"[weather, channel, co, founder, john, coleman,..."
4,"""january 2008 capped a 12 month period of glob...","[january, 2008, capped, a, 12, month, period, ..."
...,...,...
1223,climate scientists say that aspects of the cas...,"[climate, scientists, say, that, aspects, of, ..."
1224,"in its 5th assessment report in 2013, the ipcc...","[in, its, 5th, assessment, report, in, 2013, t..."
1225,"since the mid 1970s, global temperatures have ...","[since, the, mid, 1970s, global, temperatures,..."
1226,but abnormal temperature spikes in february an...,"[but, abnormal, temperature, spikes, in, febru..."


In [44]:
class Vocab:
    def __init__(self, all_tokens, min_count=2):
        self.min_count = min_count
        self.freqs = {t:n for t, n in Counter(all_tokens).items() if n >= min_count}
        self.words = sorted(self.freqs.keys())
        self.word2idx = {w: i for i, w in enumerate(self.words)}

In [45]:
vocab = Vocab([tok for tokens in example_df.tokens for tok in tokens], min_count=1)
print(f"Dataset comprises {len(example_df)} documents and {len(vocab.words)} unique words (over the limit of {vocab.min_count} occurrences)")

Dataset comprises 1228 documents and 3868 unique words (over the limit of 1 occurrences)


In [46]:
def clean_tokens(df, vocab):
    df["length"] = df.tokens.apply(len)
    df["clean_tokens"] = df.tokens.apply(lambda x: [t for t in x if t in vocab.freqs.keys()])
    df["clean_length"] = df.clean_tokens.apply(len)
    return df

In [47]:
example_df = clean_tokens(example_df, vocab)
example_df[:5]

Unnamed: 0,text,tokens,length,clean_tokens,clean_length
0,not only is there no scientific evidence that ...,"[not, only, is, there, no, scientific, evidenc...",24,"[not, only, is, there, no, scientific, evidenc...",24
1,el niño drove record highs in global temperatu...,"[el, niño, drove, record, highs, in, global, t...",18,"[el, niño, drove, record, highs, in, global, t...",18
2,"in 1946, pdo switched to a cool phase.","[in, 1946, pdo, switched, to, a, cool, phase]",8,"[in, 1946, pdo, switched, to, a, cool, phase]",8
3,weather channel co-founder john coleman provid...,"[weather, channel, co, founder, john, coleman,...",17,"[weather, channel, co, founder, john, coleman,...",17
4,"""january 2008 capped a 12 month period of glob...","[january, 2008, capped, a, 12, month, period, ...",19,"[january, 2008, capped, a, 12, month, period, ...",19


In [48]:
class NegativeSampling(nn.Module):
    def __init__(self):
        super(NegativeSampling, self).__init__()
        self.log_sigmoid = nn.LogSigmoid()
    def forward(self, scores):
        batch_size = scores.shape[0]
        n_negative_samples = scores.shape[1] - 1   # TODO average or sum the negative samples? Summing seems to be correct by the paper
        positive = self.log_sigmoid(scores[:,0])
        negatives = torch.sum(self.log_sigmoid(-scores[:,1:]), dim=1)
        return -torch.sum(positive + negatives) / batch_size  # average for batch

loss = NegativeSampling()

In [49]:
data = [[[1, -1, -1, -1]],  # this dummy data uses -1 to 1, but the real model is unconstrained
        [[0.5, -1, -1, -1]],
        [[0, -1, -1, -1]],
        [[0, 0, 0, 0]],
        [[0, 0, 0, 1]],
        [[0, 1, 1, 1]],
        [[0.5, 1, 1, 1]],
        [[1, 1, 1, 1]]]

loss_df = pd.DataFrame(data, columns=["scores"])
loss_df["loss"] = loss_df.scores.apply(lambda x: loss(torch.FloatTensor([x])))

loss_df

Unnamed: 0,scores,loss
0,"[1, -1, -1, -1]",tensor(1.2530)
1,"[0.5, -1, -1, -1]",tensor(1.4139)
2,"[0, -1, -1, -1]",tensor(1.6329)
3,"[0, 0, 0, 0]",tensor(2.7726)
4,"[0, 0, 0, 1]",tensor(3.3927)
5,"[0, 1, 1, 1]",tensor(4.6329)
6,"[0.5, 1, 1, 1]",tensor(4.4139)
7,"[1, 1, 1, 1]",tensor(4.2530)


In [50]:
data = pd.DataFrame(zip(np.arange(0,1,0.01), np.power(np.arange(0,1,0.01), 0.75)), columns=["x", "y"])
alt.Chart(data, title="x^0.75").mark_line().encode(x="x", y="y")

In [51]:
class NoiseDistribution:
    def __init__(self, vocab):
        self.probs = np.array([vocab.freqs[w] for w in vocab.words])
        self.probs = np.power(self.probs, 0.75)
        self.probs /= np.sum(self.probs)
    def sample(self, n):
        "Returns the indices of n words randomly sampled from the vocabulary."
        return np.random.choice(a=self.probs.shape[0], size=n, p=self.probs)
        
noise = NoiseDistribution(vocab)

In [52]:
def example_generator(df, context_size, noise, n_negative_samples, vocab):
    for doc_id, doc in df.iterrows():
        for i in range(context_size, len(doc.clean_tokens) - context_size):
            positive_sample = vocab.word2idx[doc.clean_tokens[i]]
            sample_ids = noise.sample(n_negative_samples).tolist()
            # Fix a wee bug - ensure negative samples don't accidentally include the positive
            sample_ids = [sample_id if sample_id != positive_sample else -1 for sample_id in sample_ids]
            sample_ids.insert(0, positive_sample)                
            context = doc.clean_tokens[i - context_size:i] + doc.clean_tokens[i + 1:i + context_size + 1]
            context_ids = [vocab.word2idx[w] for w in context]
            yield {"doc_ids": torch.tensor(doc_id),  # we use plural here because it will be batched
                   "sample_ids": torch.tensor(sample_ids), 
                   "context_ids": torch.tensor(context_ids)}
            
examples = example_generator(example_df, context_size=5, noise=noise, n_negative_samples=5, vocab=vocab)

In [53]:
class NCEDataset(Dataset):
    def __init__(self, examples):
        self.examples = list(examples)  # just naively evaluate the whole damn thing - suboptimal!
    def __len__(self):
        return len(self.examples)
    def __getitem__(self, index):
        return self.examples[index]
    
dataset = NCEDataset(examples)
dataloader = DataLoader(dataset, batch_size=2, drop_last=True, shuffle=True)  # TODO bigger batch size when not dummy data

In [54]:
def describe_batch(batch, vocab):
    results = []
    for doc_id, context_ids, sample_ids in zip(batch["doc_ids"], batch["context_ids"], batch["sample_ids"]):
        context = [vocab.words[i] for i in context_ids]
        context.insert(len(context_ids) // 2, "____")
        samples = [vocab.words[i] for i in sample_ids]
        result = {"doc_id": doc_id,
                  "context": " ".join(context), 
                  "context_ids": context_ids, 
                  "samples": samples, 
                  "sample_ids": sample_ids}
        results.append(result)
    return results

describe_batch(next(iter(dataloader)), vocab)

[{'doc_id': tensor(1227),
  'context': 'a dielectric material such as ____ creates radio frequency heating at',
  'context_ids': tensor([ 177, 1055, 2170, 3361,  396,  920, 2803, 1513, 1691,  411]),
  'samples': ['water', 'a', 'them', 'is', 'certainly', 'periods'],
  'sample_ids': tensor([3754,  177, 3478, 1916,  658, 2564])},
 {'doc_id': tensor(469),
  'context': 'climate change which is sort ____ the voice of the consensus',
  'context_ids': tensor([ 715,  665, 3797, 1916, 3234, 3476, 3724, 2433, 3476,  812]),
  'samples': ['of', 'and', 'five', 'reduction', 'casts', 'be'],
  'sample_ids': tensor([2433,  341, 1448, 2863,  634,  463])}]

In [55]:
class DistributedMemory(nn.Module):
    def __init__(self, vec_dim, n_docs, n_words):
        super(DistributedMemory, self).__init__()
        self.paragraph_matrix = nn.Parameter(torch.randn(n_docs, vec_dim))
        self.word_matrix = nn.Parameter(torch.randn(n_words, vec_dim))
        self.outputs = nn.Parameter(torch.zeros(vec_dim, n_words))
    
    def forward(self, doc_ids, context_ids, sample_ids):
                                                                               # first add doc ids to context word ids to make the inputs
        inputs = torch.add(self.paragraph_matrix[doc_ids,:],                   # (batch_size, vec_dim)
                           torch.sum(self.word_matrix[context_ids,:], dim=1))  # (batch_size, 2x context, vec_dim) -> sum to (batch_size, vec_dim)
                                                                               #
                                                                               # select the subset of the output layer for the NCE test
        outputs = self.outputs[:,sample_ids]                                   # (vec_dim, batch_size, n_negative_samples + 1)
                                                                               #
        return torch.bmm(inputs.unsqueeze(dim=1),                              # then multiply with some munging to make the tensor shapes line up 
                         outputs.permute(1, 0, 2)).squeeze()                   # -> (batch_size, n_negative_samples + 1)

model = DistributedMemory(vec_dim=50,
                          n_docs=len(example_df),
                          n_words=len(vocab.words))

In [56]:
with torch.no_grad():
    logits = model.forward(**next(iter(dataloader)))
logits

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [57]:
def train(model, dataloader, epochs=40, lr=1e-3):
    optimizer = Adam(model.parameters(), lr=lr)
    training_losses = []
    try:
        for epoch in trange(epochs, desc="Epochs"):
            epoch_losses = []
            for batch in dataloader:
                model.zero_grad()
                logits = model.forward(**batch)
                batch_loss = loss(logits)
                epoch_losses.append(batch_loss.item())
                batch_loss.backward()
                optimizer.step()
            training_losses.append(np.mean(epoch_losses))
    except KeyboardInterrupt:
        print(f"Interrupted on epoch {epoch}!")
    finally:
        return training_losses

In [58]:
training_losses = train(model, dataloader, epochs=40, lr=1e-3)

Epochs: 100%|██████████| 40/40 [59:25<00:00, 89.15s/it]


In [59]:
df_loss = pd.DataFrame(enumerate(training_losses), columns=["epoch", "training_loss"])
alt.Chart(df_loss).mark_bar().encode(alt.X("epoch"), alt.Y("training_loss", scale=alt.Scale(type="log")))

In [60]:
def most_similar(paragraph_matrix, docs_df, index, n=None):
    pm = normalize(paragraph_matrix, norm="l2")  # in a smarter implementation we would cache this somewhere
    sims = np.dot(pm, pm[index,:])
    df = pd.DataFrame(enumerate(sims), columns=["doc_id", "similarity"])
    n = n if n is not None else len(sims)
    return df.merge(docs_df[["text"]].reset_index(drop=True), left_index=True, right_index=True).sort_values(by="similarity", ascending=False)[:n]

In [61]:
most_similar(model.paragraph_matrix.data, example_df, 1, n=10)

Unnamed: 0,doc_id,similarity,text
1,1,1.0,el niño drove record highs in global temperatu...
804,804,0.489355,the satellite sensors show less warming in the...
672,672,0.480265,the iris hypothesis has not withstood the test...
420,420,0.469281,the ­atmospheric residency time of carbon diox...
274,274,0.467659,after the 9/11 terrorist attacks grounded comm...
683,683,0.462144,"over the last decade, heatwaves are five times..."
805,805,0.455915,whether antarctic mass loss keeps worsening de...
334,334,0.451902,ben santer could not have and did not single-h...
1080,1080,0.443797,the most recent survey of climate scientists s...
834,834,0.441108,“climate economists see a positive externality...


In [62]:
def pca_2d(paragraph_matrix, groups):
    pca = PCA(n_components=2)
    reduced_dims = pca.fit_transform(paragraph_matrix)
    print(f"2-component PCA, explains {sum(pca.explained_variance_):.2f}% of variance")
    df = pd.DataFrame(reduced_dims, columns=["x", "y"])
    df["group"] = groups
    return df

example_2d = pca_2d(model.paragraph_matrix.data, ["0","1","2","3"])
alt.Chart(example_2d).mark_point().encode(x="x", y="y", color="group")

2-component PCA, explains 7.53% of variance


ValueError: Length of values (4) does not match length of index (1228)