In [1]:
import os

folder_path = r'/kaggle/input/got-books'

data=''
for filename in os.listdir(folder_path):
  filepath = os.path.join(folder_path, filename)
  if os.path.isfile(filepath) and filename.endswith('.txt'):
    with open(filepath, 'r', encoding='latin') as file_ref:
      data += file_ref.read()

print(f'Length: {len(data)}\n')
print(f'Sample: \n{data[:1000]}')

Length: 9778333

Sample: 
Version History: 
2.0 - Reedited 4/25/10 by maelstrom385 


A FEAST FOR CROWS
Book Four: A Song of Ice and Fire 
George R.R. Martin 
PROLOGUE 
Dragons, said Mollander. He snatched a withered apple off the ground and tossed it 
hand to hand. 
Throw the apple, urged Alleras the Sphinx. He slipped an arrow from his quiver and nocked 
it to his bowstring. 
I should like to see a dragon. Roone was the youngest of them, a chunky boy still two years 
shy of manhood. I should like that very much. 
And I should like to sleep with Roseys arms around me, Pate thought. He shifted restlessly on 
the bench. By the morrow the girl could well be his. I will take her far from Oldtown, across the 
narrow sea to one of the Free Cities. There were no maesters there, no one to accuse him. 
He could hear Emmas laughter coming through a shuttered window overhead, mingled with 
the deeper voice of the man she was entertaining. She was the oldest of the serving wenches at 
t

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

def lowercase_and_filter(text):
    text=text.lower()
    cleaned_text = re.sub(r'<[^>]*>', '', text)
    cleaned_text = re.sub(r'\S+@\S+', '', cleaned_text)
    cleaned_text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', cleaned_text)
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    cleaned_text = re.sub(r'\n', ' ', cleaned_text)
    cleaned_text = re.sub(r'\t', ' ', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

def tokenize(sen):
    return word_tokenize(sen)

def stopwords_removal(sen):
    stop_words = set(stopwords.words('english'))
    return [word for word in sen if word not in stop_words]

def lemmatize(sen):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in sen]

def preprocess_text(text):
    cleaned_text = lowercase_and_filter(text)
    tokenized_text = tokenize(cleaned_text)
    text_without_stopwords = stopwords_removal(tokenized_text)
    lemmatized_text = lemmatize(text_without_stopwords)
    return lemmatized_text

preprocessed_data = preprocess_text(data)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/w

In [4]:
print(preprocessed_data[:100])

['version', 'history', '20', 'reedited', '42510', 'maelstrom385', 'feast', 'crow', 'book', 'four', 'song', 'ice', 'fire', 'george', 'rr', 'martin', 'prologue', 'dragon', 'said', 'mollander', 'snatched', 'withered', 'apple', 'ground', 'tossed', 'hand', 'hand', 'throw', 'apple', 'urged', 'alleras', 'sphinx', 'slipped', 'arrow', 'quiver', 'nocked', 'bowstring', 'like', 'see', 'dragon', 'roone', 'youngest', 'chunky', 'boy', 'still', 'two', 'year', 'shy', 'manhood', 'like', 'much', 'like', 'sleep', 'roseys', 'arm', 'around', 'pate', 'thought', 'shifted', 'restlessly', 'bench', 'morrow', 'girl', 'could', 'well', 'take', 'far', 'oldtown', 'across', 'narrow', 'sea', 'one', 'free', 'city', 'maesters', 'one', 'accuse', 'could', 'hear', 'emmas', 'laughter', 'coming', 'shuttered', 'window', 'overhead', 'mingled', 'deeper', 'voice', 'man', 'entertaining', 'oldest', 'serving', 'wench', 'quill', 'tankard', 'forty', 'day', 'still', 'pretty', 'fleshy']


In [5]:
def get_train_data(data, window_size):
    train_data = []
    for i in range(window_size, len(data) - window_size):
        context = [data[j] for j in range(i - window_size, i + window_size + 1) if j != i]
        target = data[i]
        train_data.append((context, target))
    return train_data

# SKIPGRAM

# CBOW

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np

class LoadDataset(Dataset):
    def __init__(self, train_data, word_to_idx):
        self.train_data = train_data
        self.word_to_idx = word_to_idx

    def __len__(self):
        return len(self.train_data)

    def __getitem__(self, idx):
        context, target = self.train_data[idx]
        context_idxs = [self.word_to_idx[word] for word in context]
        target_idx = self.word_to_idx[target]
        return torch.tensor(context_idxs), torch.tensor(target_idx)

class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_idxs):
        embedded_context = self.embedding(context_idxs).mean(dim=1)
        output = self.output(embedded_context)
        return output

def main(data, embedding_dim=100, lr=0.01, epochs=10, batch_size=32, window_size=2):
    vocab = list(set(data))
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    vocab_size = len(vocab)

    train_data = get_train_data(data, window_size)

    model = CBOW(vocab_size, embedding_dim)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    dataset = LoadDataset(train_data, word_to_idx)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    for epoch in range(epochs):
        total_loss = 0
        for context_idxs, target_idx in dataloader:
            optimizer.zero_grad()
            output = model(context_idxs)
            loss = loss_fn(output, target_idx)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch + 1}, Average Loss: {avg_loss}')

    return model, word_to_idx, vocab

model, word_to_idx, idx_to_word = main(preprocessed_data[:100000])


Epoch 1, Average Loss: 8.159978136291503
Epoch 2, Average Loss: 6.069935362548828
Epoch 3, Average Loss: 4.646448402481079
Epoch 4, Average Loss: 3.8009480448532105
Epoch 5, Average Loss: 3.3166903911209107
Epoch 6, Average Loss: 3.005242641143799
Epoch 7, Average Loss: 2.7972702291107177
Epoch 8, Average Loss: 2.6344286576652527
Epoch 9, Average Loss: 2.512125694103241
Epoch 10, Average Loss: 2.4133790214920046


In [8]:
import time

def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    similarity = dot_product / (norm_vector1 * norm_vector2)
    return similarity

def get_word_vector(word, model, word_to_idx):
    word_idx = word_to_idx[word]
    return model.embedding(torch.tensor(word_idx)).detach().numpy()

In [17]:
def search(query, model, word_to_idx, idx_to_word, n):
    words = query.split()
    if len(words) != 3:
        print("Invalid query format. Please provide three words separated by '+' and '-'.")
        return

    # Extracting vectors for each word
    word1_vector = get_word_vector(words[0], model, word_to_idx)
    word2_vector = get_word_vector(words[1], model, word_to_idx)
    word3_vector = get_word_vector(words[2], model, word_to_idx)

    # Calculating the vector arithmetic: king - male + female
    query_vector = word1_vector - word2_vector + word3_vector

    # Finding the most similar word to the query vector
    similarities = []
    for idx, vec in enumerate(model.embedding.weight.detach().numpy()):
        sim = cosine_similarity([query_vector], [vec])[0][0]
        similarities.append((idx_to_word[idx], sim))
    similarities.sort(key=lambda x: x[1], reverse=True)

    # Displaying the top 'n' similar words
    print(f'\n\nTop {n} similar words for the query "{query}"')
    for w, sim in similarities[:n]:
        time.sleep(0.5)
        print(f"Word: {w}, Similarity: {sim}")

In [21]:
# A - B + C format
search("person human animal", model, word_to_idx, idx_to_word, 5)



Top 5 similar words for the query "person human animal"
Word: animal, Similarity: 0.6417559385299683
Word: person, Similarity: 0.44666817784309387
Word: ala, Similarity: 0.35783103108406067
Word: buzz, Similarity: 0.3319343328475952
Word: beesbury, Similarity: 0.33158421516418457
