In [16]:
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

from collections import Counter, defaultdict

import numpy as np
from tqdm.notebook import tqdm
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [18]:
def process_text(corpus):
  """process text"""
  stop_word = set(stopwords.words('english'))
  processed_corpus = []

  for doc in corpus:
    tokens = word_tokenize(doc.lower())
    tokens = [re.sub(r'\W+', '', word) for word in tokens]
    tokens = [word for word in tokens if word not in stop_word and word]
    processed_corpus.append(tokens)
  return processed_corpus


def compute_cooccrance_matrix(corpus, window_size=10):
  """prepare cooccurance matriz"""
  vocab = Counter([word for sentence in corpus for word in sentence])
  word_to_id = {j: i for i, j in enumerate(vocab.keys())}
  id_to_word = {i: j for i, j in enumerate(vocab.keys())}
  vocab_size = len(vocab)

  cooccurance_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)
  counter_dict = defaultdict(dict)


  for sentence in corpus:
    for word in sentence:
      word_idx = word_to_id[word]

      for i in range(max(0, word_idx - window_size), min(word_idx + window_size + 1, len(sentence))):
        if i != word_idx:
          context_word_idx = word_to_id[sentence[i]]
          cooccurance_matrix[word_idx, context_word_idx] += 1

  return cooccurance_matrix, word_to_id, id_to_word


class Glove(nn.Module):
  def __init__(self, vocab_size, embedding_dim):
    super().__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim

    self.w_embedding = nn.Embedding(vocab_size, embedding_dim)
    self.context_embedding = nn.Embedding(vocab_size, embedding_dim)
    self.word_bias = nn.Embedding(vocab_size, 1)
    self.context_bias = nn.Embedding(vocab_size, 1)

    # Initialize weights

    nn.init.xavier_uniform_(self.w_embedding.weight)
    nn.init.xavier_uniform_(self.context_embedding.weight)


  def forward(self, word_indices, context_indices, cooccurance_matrix):
    word_vecs = self.w_embedding(word_indices)
    context_vecs = self.context_embedding(context_indices)
    word_bias_vecs = self.word_bias(word_indices).squeeze()
    context_bias_vecs = self.context_bias(context_indices).squeeze()


    log_cooccurance = torch.log(cooccurance_matrix + 1) # avoid log(0)
    prediction = torch.sum(word_vecs * context_vecs, dim=1) + word_bias_vecs + context_bias_vecs
    loss = (prediction - log_cooccurance)**2
    return torch.mean(loss)

In [45]:
def train_glove(cooccurance_matrix, word_to_id, embedding_dim=50, epochs=1000, lr=0.05):

  """Setup training"""
  vocab_size = len(word_to_id)
  glove_model = Glove(vocab_size, embedding_dim)
  optimizer = torch.optim.Adam(glove_model.parameters(), lr=lr)
  word_indices, context_indices, cooccurance_counts = [], [], []

  ### Flatten co-occurance-matrix for training
  for i in range(cooccurance_matrix.shape[0]):
    for j in range(cooccurance_matrix.shape[1]):
      if cooccurance_matrix[i, j] > 0:
        word_indices.append(i)
        context_indices.append(j)
        cooccurance_counts.append(cooccurance_matrix[i, j])

  word_indices = torch.tensor(word_indices, dtype=torch.long)
  context_indices = torch.tensor(context_indices, dtype=torch.long)
  cooccurance_counts = torch.tensor(cooccurance_counts, dtype=torch.float32)

  for epoch in range(epochs):
    optimizer.zero_grad()
    loss = glove_model(word_indices, context_indices, cooccurance_counts)
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

  return glove_model

In [4]:
!wget https://hazidesaratcollege.ac.in/library/uploads/85jkr_harrypotter_1.pdf

--2025-01-27 18:02:16--  https://hazidesaratcollege.ac.in/library/uploads/85jkr_harrypotter_1.pdf
Resolving hazidesaratcollege.ac.in (hazidesaratcollege.ac.in)... 139.59.43.125
Connecting to hazidesaratcollege.ac.in (hazidesaratcollege.ac.in)|139.59.43.125|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1001500 (978K) [application/pdf]
Saving to: ‘85jkr_harrypotter_1.pdf.1’


2025-01-27 18:02:18 (1.02 MB/s) - ‘85jkr_harrypotter_1.pdf.1’ saved [1001500/1001500]



In [5]:
!pip install pdfplumber --quiet

In [6]:
import pdfplumber
with pdfplumber.open("/content/85jkr_harrypotter_1.pdf") as pdf:
  texts = []
  for page in pdf.pages:
    texts.append(page.extract_text())

In [46]:
processed_text = process_text(texts)
cooccurance_matrix, word_to_id, id_to_word = compute_cooccrance_matrix(processed_text)
glove = train_glove(cooccurance_matrix, word_to_id)

Epoch: 1, Loss: 3.565089702606201
Epoch: 2, Loss: 3.2887158393859863
Epoch: 3, Loss: 2.823871612548828
Epoch: 4, Loss: 2.257408618927002
Epoch: 5, Loss: 1.6120216846466064
Epoch: 6, Loss: 1.03743314743042
Epoch: 7, Loss: 0.6824626326560974
Epoch: 8, Loss: 0.5425599217414856
Epoch: 9, Loss: 0.5047336220741272
Epoch: 10, Loss: 0.4732012450695038
Epoch: 11, Loss: 0.42093250155448914
Epoch: 12, Loss: 0.36361125111579895
Epoch: 13, Loss: 0.31699874997138977
Epoch: 14, Loss: 0.28454601764678955
Epoch: 15, Loss: 0.26298701763153076
Epoch: 16, Loss: 0.24708954989910126
Epoch: 17, Loss: 0.2318907082080841
Epoch: 18, Loss: 0.21441584825515747
Epoch: 19, Loss: 0.1943833976984024
Epoch: 20, Loss: 0.17356030642986298
Epoch: 21, Loss: 0.15439677238464355
Epoch: 22, Loss: 0.1387893706560135
Epoch: 23, Loss: 0.12732234597206116
Epoch: 24, Loss: 0.11900654435157776
Epoch: 25, Loss: 0.11177616566419601
Epoch: 26, Loss: 0.10380594432353973
Epoch: 27, Loss: 0.09472904354333878
Epoch: 28, Loss: 0.085639260

In [47]:
print(len(word_to_id))

6000


In [48]:
word_embedding = glove.w_embedding.weight.data.numpy()

In [49]:
def find_similar(word, top=5):
  word_idx = word_to_id[word]
  word_vec = word_embedding[word_idx]

  cosine_sim = np.dot(word_embedding, word_vec) / (np.linalg.norm(word_embedding, axis=1) * np.linalg.norm(word_vec))
  similar_words_idx = np.argsort(cosine_sim)[::-1][:top]
  similar_words = [id_to_word[idx] for idx in similar_words_idx]
  return similar_words

In [55]:
find_similar("harry")

['harry', 'one', 'years', 'nt', 'another']