In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import re

In [2]:
class Doc2VecModel(nn.Module):
    """Predict a target word from a document vector plus its context words.

    Each document and each vocabulary word owns a learned vector of size
    ``vector_size``. The context word vectors are averaged, blended with the
    document vector, and passed through a linear layer that scores every
    word in the vocabulary.

    Args:
        num_docs: Number of distinct documents (rows of the document table).
        vocab_size: Number of distinct words (rows of the word table and
            width of the output logits).
        vector_size: Dimensionality of the word and document vectors.
    """

    def __init__(self, num_docs, vocab_size, vector_size):
        super().__init__()
        self.num_docs = num_docs
        self.vocab_size = vocab_size
        self.vector_size = vector_size

        # Embeddings for words and documents
        self.word_embeddings = nn.Embedding(vocab_size, vector_size)
        self.doc_embeddings = nn.Embedding(num_docs, vector_size)

        # Decoder layer to predict the words
        self.decoder = nn.Linear(vector_size, vocab_size)

    def forward(self, doc_ids, context_word_ids):
        """Compute vocabulary logits for a batch.

        Args:
            doc_ids: Integer tensor of shape ``(batch,)`` with document ids.
            context_word_ids: Integer tensor of context word ids, either
                ``(batch, window)`` or ``(batch,)`` when every sample carries
                a single context word.

        Returns:
            Float tensor of shape ``(batch, vocab_size)`` with unnormalised
            scores for the target word.
        """
        # A 1-D batch holds one context word per sample. Without an explicit
        # window axis, the mean over dim=1 below would collapse the embedding
        # dimension instead of averaging over context words.
        if context_word_ids.dim() == 1:
            context_word_ids = context_word_ids.unsqueeze(1)

        # Average the context word embeddings over the window axis
        word_embeds = self.word_embeddings(context_word_ids)
        context_vector = word_embeds.mean(dim=1)

        # Blend in the document vector with equal weight
        doc_embeds = self.doc_embeddings(doc_ids)
        combined_vector = (context_vector + doc_embeds) / 2

        # Predict words
        return self.decoder(combined_vector)

def train(model, data_loader, epochs, learning_rate=0.01):
    """Train ``model`` with plain SGD and cross-entropy loss.

    Prints the mean batch loss once per epoch.

    Args:
        model: Module called as ``model(doc_ids, context_word_ids)`` that
            returns logits accepted by ``nn.CrossEntropyLoss``.
        data_loader: Iterable yielding ``(doc_ids, context_word_ids,
            target_word_ids)`` batches. It is iterated once per epoch.
        epochs: Number of passes over ``data_loader``.
        learning_rate: SGD step size.

    Returns:
        None.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches as they are processed rather than calling len():
        # this works for iterables without __len__ and avoids dividing by
        # zero when the loader yields nothing.
        num_batches = 0
        for doc_ids, context_word_ids, target_word_ids in data_loader:
            optimizer.zero_grad()
            logits = model(doc_ids, context_word_ids)
            loss = criterion(logits, target_word_ids)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # With no batches the mean loss is undefined; report NaN.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")

## Loading Data

In [3]:
# Read train claims: a JSON object keyed by claim id.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('../data/train-claims.json', 'r', encoding='utf-8') as f:
    claims = json.load(f)

In [4]:
# Number of claims loaded from the training file
len(claims)

1228

In [5]:
# Spot-check one claim: its text and the ids of the evidence passages linked to it
print(claims['claim-1937']['claim_text'])
print(claims['claim-1937']['evidences'])

Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
['evidence-442946', 'evidence-1194317', 'evidence-12171']


In [6]:
# Read evidence: a JSON object mapping evidence id to passage text.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('../data/evidence.json', 'r', encoding='utf-8') as f:
    evidences = json.load(f)
# Lower-case every passage so it matches the lower-cased claim text used later
evidences = {evidence_id: text.lower() for evidence_id, text in evidences.items()}

In [7]:
# Number of evidence passages loaded
len(evidences)

1208827

In [8]:
# Look up one passage referenced by claim-1937 (text was lower-cased when loaded)
evidences['evidence-442946']

'at very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.'

In [9]:
# Assemble the full text corpus used for the vocabulary and for training.
# Claims come first (lower-cased here), followed by every evidence passage,
# so a document's position in `corpus` serves as its document id.
corpus = [str.lower(record['claim_text']) for record in claims.values()]
corpus.extend(evidences.values())

In [10]:
# Tokenizer: torchtext's built-in 'basic_english' tokenizer
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Lazily produce the token list of each text in ``data_iter``."""
    yield from map(tokenizer, data_iter)


# Vocabulary built from every token that occurs in the corpus
vocab = build_vocab_from_iterator(yield_tokens(corpus))

# Report the number of entries in the vocabulary
print(f"The vocabulary size is: {len(vocab)}")

1210055lines [00:18, 66415.19lines/s]


The vocabulary size is: 730578


In [11]:
# Map every document to its sequence of vocabulary indices
tokenized_corpus = [[vocab[token] for token in tokenizer(doc)] for doc in corpus]

# Context window radius: 2 words to the left and 2 to the right of the target
window_size = 2


def _iter_training_triples(docs, radius):
    """Yield ``(doc_id, context_word_id, target_word_id)`` for every neighbour pair.

    For each position in each document, one triple is produced per
    neighbouring word within ``radius`` positions (the target itself is
    skipped). Triples are emitted in document order, then target position,
    then neighbour position.
    """
    for doc_index, token_ids in enumerate(docs):
        n_tokens = len(token_ids)
        for center in range(n_tokens):
            lo = max(0, center - radius)
            hi = min(n_tokens, center + radius + 1)
            for neighbour in range(lo, hi):
                if neighbour != center:
                    yield doc_index, token_ids[neighbour], token_ids[center]


# Collect the three aligned columns of the training set
doc_ids, context_word_ids, target_word_ids = [], [], []
for triple_doc, triple_context, triple_target in _iter_training_triples(tokenized_corpus, window_size):
    doc_ids.append(triple_doc)
    context_word_ids.append(triple_context)
    target_word_ids.append(triple_target)

# Convert the columns to PyTorch tensors
doc_ids = torch.tensor(doc_ids)
context_word_ids = torch.tensor(context_word_ids)
target_word_ids = torch.tensor(target_word_ids)

In [12]:
# Create the model
# The document table needs one row per document. `doc_ids` holds one entry
# per training pair, so its length would hugely over-allocate the embedding;
# the ids themselves are positions in `tokenized_corpus`.
num_docs = len(tokenized_corpus)
vocab_size = len(vocab)
vector_size = 50

model = Doc2VecModel(num_docs, vocab_size, vector_size)

In [13]:
# Bundle the three aligned tensors so each sample is a
# (document id, context word id, target word id) triple
dataset = TensorDataset(doc_ids, context_word_ids, target_word_ids)
# Serve the samples one per batch, in shuffled order
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [54]:
# Train the model for 10 passes over the data; train() prints the mean loss per epoch.
# NOTE(review): the loader is built with batch_size=1 and train() takes one
# optimizer step per batch, so every sample gets its own step. Expect this
# to be slow on the full corpus.
train(model, data_loader, epochs=10)

In [None]:
# To get document vectors: one row of the document embedding table per document id.
# detach() gives a view without gradient tracking and is the supported
# replacement for the legacy `.data` accessor.
document_vectors = model.doc_embeddings.weight.detach()
print(document_vectors)