In [43]:
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from pprint import pprint
import re
import pickle
import os
import numpy as np
from google.colab import drive
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE

## Importing Dataset (War and Peace - Leo Tolstoy)

In [44]:
!wget https://cs.stanford.edu/people/karpathy/char-rnn/warpeace_input.txt -O text1.txt

--2024-10-28 19:43:55--  https://cs.stanford.edu/people/karpathy/char-rnn/warpeace_input.txt
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3258246 (3.1M) [text/plain]
Saving to: ‘text1.txt’


2024-10-28 19:43:55 (7.09 MB/s) - ‘text1.txt’ saved [3258246/3258246]



In [45]:
!head text1.txt

"Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by that
Antichrist--I really believe he is Antichrist--I will have nothing more
to do with you and you are no longer my friend, no longer my 'faithful
slave,' as you call yourself! But how do you do? I see I have frightened
you--sit down and tell me all the news."

It was in July, 1805, and the speaker was the well-known Anna Pavlovna
Scherer, maid of honor and favorite of the Empress Marya Fedorovna. With


In [46]:
with open('text1.txt','r') as file:
  text1=file.read()      # converting file to string
print(len(text1))

3196213


## Data Cleaning and Preprocessing

In [47]:
def clean_text(text):

  text=text.lower()
  text = re.sub(r'\.{3,}', '.', text)
  text = re.sub(r'\n\s*\n', ' ' + '.' * 5 + ' ', text)
  text=re.sub(r'(\w)\n(\w)',r'\1 \2',text)
  text=re.sub(r'[^a-zA-Z0-9 \'\.]',' ',text)
  text=re.sub(r'[\']','',text)
  text = text.replace('\n', ' ')
  cleaned_text = re.sub(r'\s+', ' ', text).strip()

  return cleaned_text

In [48]:
clean_text1=clean_text(text1)
clean_text1[:3000]

'well prince so genoa and lucca are now just family estates of the buonapartes. but i warn you if you dont tell me that this means war if you still try to defend the infamies and horrors perpetrated by that antichrist i really believe he is antichrist i will have nothing more to do with you and you are no longer my friend no longer my faithful slave as you call yourself but how do you do i see i have frightened you sit down and tell me all the news. ..... it was in july 1805 and the speaker was the well known anna pavlovna scherer maid of honor and favorite of the empress marya fedorovna. with these words she greeted prince vasili kuragin a man of high rank and importance who was the first to arrive at her reception. anna pavlovna had had a cough for some days. she was as she said suffering from la grippe grippe being then a new word in st. petersburg used only by the elite. ..... all her invitations without exception written in french and delivered by a scarlet liveried footman that m

#### Tokenization

In [49]:
def paragraph_processing(text, context_len):

  context_padding = '.' * context_len
  paragraphs=text.split(".....")
  processed_paragraphs = [context_padding + para.strip() for para in paragraphs]

  return processed_paragraphs


In [50]:
context_len=5
paragraphs_txt1=paragraph_processing(clean_text1,context_len)
print(paragraphs_txt1)



In [51]:
def tokenization(paragraphs_txt,context_len):
  tokens=[]
  for para in paragraphs_txt:
    para_tokens = re.findall(r'\b\w+\b|\.{' + str(context_len) + r'}|[.]', para)
    para_tokens = [token for token in para_tokens if token != '.' * context_len]
    tokens.extend(para_tokens)
  return tokens

In [52]:
tokens_txt1=tokenization(paragraphs_txt1,context_len)

In [53]:
print(tokens_txt1)



#### Creating Word Vocabulary and mappings to/from integer indices

In [54]:
def create_vocab(tokens):
    token_to_index = {
      '.': 0,
      ' ': 1,
    }
    unique_tokens = sorted(list(set(token for token in tokens if token not in token_to_index)))
    token_to_index.update({token: idx + 2 for idx, token in enumerate(unique_tokens)})
    index_to_token = {idx: token for token, idx in token_to_index.items()}

    return token_to_index, index_to_token, unique_tokens


In [55]:
token_to_index1, index_to_token1, unique_tokens1 = create_vocab(tokens_txt1)

In [56]:
print(len(unique_tokens1))
print(len(list(token_to_index1.items())))
print(len(list(index_to_token1.items())))
print(unique_tokens1[:100])
print(list(token_to_index1.items())[:100])
print(list(index_to_token1.items())[:100])

17831
17833
17833
['000', '1', '10', '100', '102', '11', '110', '120', '130', '13th', '140', '15', '150', '15y', '160', '17', '178', '1789', '1797', '17th', '18', '1805', '1806', '1807', '1808', '1809', '1810', '1811', '1812', '1813', '1815', '1820', '18th', '1st', '2', '20', '217', '22', '22nd', '23', '23rd', '24', '24th', '25', '25th', '26', '27', '27th', '3', '30', '31', '3rd', '4', '40', '4th', '4x', '5', '50', '6', '60', '62', '666', '671', '6th', '7', '70', '700', '7th', '8', '80', '800', '86th', '8th', '9', '90', '9th', 'a', 'aah', 'ab', 'aback', 'abacus', 'abandon', 'abandoned', 'abandoning', 'abandonment', 'abandons', 'abasement', 'abashed', 'abate', 'abbe', 'abbes', 'abbreviations', 'abc', 'abdicate', 'abdomen', 'abdomens', 'abduction', 'abductors', 'abhorrence', 'ability']
[('.', 0), (' ', 1), ('000', 2), ('1', 3), ('10', 4), ('100', 5), ('102', 6), ('11', 7), ('110', 8), ('120', 9), ('130', 10), ('13th', 11), ('140', 12), ('15', 13), ('150', 14), ('15y', 15), ('160', 16), (

## Creating X,y Datasets

In [57]:
def create_X_y(paragraphs, token_to_index, index_to_token, context_len):
    X = []
    y = []
    for para in paragraphs:
      para_tokens = re.findall(r'\b\w+\b|[.]', para)
      if len(para_tokens) <= context_len:
        continue
      for i in range(len(para_tokens) - context_len):
          input_context = [token_to_index[token] for token in para_tokens[i:i + context_len]]
          output_word = token_to_index[para_tokens[i + context_len]]
          X.append(input_context)
          y.append(output_word)

          print(' '.join(index_to_token[i] for i in input_context),' ------> ',index_to_token[output_word])

    print('Training Samples No. : ', len(X))
    print('Training outputs no. : ', len(y))

    X = torch.tensor(X)
    y = torch.tensor(y)

    return X, y

In [None]:
X1, y1= create_X_y(paragraphs_txt1, token_to_index1, index_to_token1, context_len)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
lets be quick . boris  ------>  come
be quick . boris come  ------>  here
quick . boris come here  ------>  said
. boris come here said  ------>  natasha
boris come here said natasha  ------>  .
come here said natasha .  ------>  but
here said natasha . but  ------>  where
said natasha . but where  ------>  is
natasha . but where is  ------>  sonya
. . . . .  ------>  she
. . . . she  ------>  looked
. . . she looked  ------>  round
. . she looked round  ------>  and
. she looked round and  ------>  seeing
she looked round and seeing  ------>  that
looked round and seeing that  ------>  her
round and seeing that her  ------>  friend
and seeing that her friend  ------>  was
seeing that her friend was  ------>  not
that her friend was not  ------>  in
her friend was not in  ------>  the
friend was not in the  ------>  room
was not in the room  ------>  ran
not in the room ran  ------>  to
in the room ran to  ------>  look
t

In [None]:
subset_size_1 = len(X1) // 7
X1_subset = X1[:subset_size_1]
y1_subset = y1[:subset_size_1]

print(len(X1_subset))
print(X1_subset[:10])
print(len(y1_subset))
print(y1_subset[:10])

## Embedding and Model Training

In [None]:
emb_dim=128
hidden_layer_size=1024

In [None]:
class NextTokenGen(nn.Module):
  def __init__(self, context_len, vocab_size, emb_dim, hidden_layer_size):
    super(NextTokenGen,self).__init__()
    self.context_len = context_len
    self.emb_dim = emb_dim
    self.embed=nn.Embedding(vocab_size,emb_dim)
    self.layer0=nn.Linear(context_len*emb_dim, hidden_layer_size)
    self.layer1=nn.Linear(hidden_layer_size, vocab_size)

  def forward(self, X, activation=None):
    X=self.embed(X)
    X=X.view(X.shape[0],self.context_len*self.emb_dim)
    if activation=='relu':
      X=F.relu(self.layer0(X))
    elif activation=='tanh':
      X=torch.tanh(self.layer0(X))
    else:
      X=self.layer0(X)

    X=self.layer1(X)

    return X

In [None]:
text_gen1 = NextTokenGen(context_len,len(list(token_to_index1.items())),emb_dim,hidden_layer_size)

In [None]:
def model_training(model, batch_size, epoch_no, learn_rate, X, y, act_fn):

  loss_fn=nn.CrossEntropyLoss()
  optimizer=torch.optim.AdamW(model.parameters(), lr=learn_rate)

  for epoch in range(epoch_no):
    epoch_loss=0.0

    for i in range(0,X.shape[0],batch_size):
      optimizer.zero_grad()
      X_batch=X[i:i+batch_size]
      y_batch=y[i:i+batch_size]
      y_pred=model(X_batch, activation=act_fn)
      loss=loss_fn(y_pred,y_batch)
      loss.backward()
      optimizer.step()

      epoch_loss+=loss.item()

    epoch_loss = epoch_loss / (X.shape[0] // batch_size)

    if epoch%10==0:
      print(f"Epoch-{epoch} loss: {epoch_loss:.4f}")

In [None]:
model_training(text_gen1,200,41,0.005,X1_subset,y1_subset,'tanh')

## Saving The Model Using Pickle To Drive

In [None]:
def save_model_to_drive(model, model_name: str):

  drive.mount('/content/drive')
  os.makedirs('/content/drive/MyDrive/checkpoints', exist_ok=True)
  print(os.listdir('/content/drive/MyDrive/checkpoints'))

  model_path = f'/content/drive/MyDrive/checkpoints/{model_name}.pkl'

  with open(model_path, 'wb') as f:
      pickle.dump(model, f)

  print(f'Model saved to {model_path}')


In [None]:
def load_model_from_drive(model_name: str):

  drive.mount('/content/drive')
  with open(f'/content/drive/MyDrive/checkpoints/{model_name}.pkl', 'rb') as f:
    model_loaded = pickle.load(f)

  print('Model loaded successfully!')
  return model_loaded

In [None]:
save_model_to_drive(text_gen1, 'emb128_context5_tanh')

In [None]:
text_gen1_loaded=load_model_from_drive('emb128_context5_tanh')

## Visualization of Embeddings using t-SNE

##### For visualization we are considering some nouns, pronouns, adverbs, verbs, synonyms, antonyms, etc.

In [None]:
tokens_to_plot=[
                  'prince','lucca','pavlovna','anna','europe','crusades', #Names
                  'who','where','when','what','which','why', # interrogative words
                  'a', 'an', 'the', # articles
                  'in', 'on', 'of', 'over', 'under', 'out', # prepositions
                  'i','you','he','she','they', 'it', # pronouns
                  # 'hot', 'cold', 'long', 'short', 'up', 'down', # antonyms
                  'warn', 'caution', 'frightened', 'scared', 'importance','value', # synonymns
                  # 'inevitably', 'urgently', 'apparently', 'constantly' # Adverbs
                ]

In [None]:
def plot_embeddings(tokens_to_plot, token_to_index, index_to_token, model):

  embeds=np.array(model.embed(torch.tensor([token_to_index[token] for token in tokens_to_plot])).detach().numpy())

  tsne = TSNE(n_components=2, perplexity=20, random_state=0)
  embeds_2d = tsne.fit_transform(embeds)

  plt.figure(figsize=(10, 10))
  plt.scatter(embeds_2d[:, 0], embeds_2d[:, 1])

  for i, token in enumerate(tokens_to_plot):
      plt.annotate(token, (embeds_2d[i, 0], embeds_2d[i, 1]))

  plt.title("t-SNE Visualization of Word Embeddings")
  plt.xlabel("t-SNE Component 1")
  plt.ylabel("t-SNE Component 2")
  plt.show()

In [None]:
plot_embeddings(tokens_to_plot, token_to_index1, index_to_token1, text_gen1_loaded)

## next K words prediction

In [None]:
def get_embedding(word, vocab_words, embeddings):
    if word in vocab_words:
        idx = vocab_words.index(word)
        return embeddings[idx].reshape(1, -1)
    return np.mean(embeddings, axis=0).reshape(1, -1)

In [None]:
def find_closest_word(word, vocab_words, embeddings):
    if word in vocab_words:
        return word

    word_embedding = get_embedding(word, vocab_words, embeddings)

    similarities = cosine_similarity(word_embedding, np.array([embeddings]).reshape(-1,1))
    closest_idx = np.argmax(similarities)
    closest_word = vocab_words[closest_idx]
    return closest_word


In [None]:
# def predict_next_k_words(model, token_to_index, index_to_token, context, k):

#     prompt_tokens = re.findall(r'\b\w+\b|[.]', context)
#     context_tokens=[]
#     for token in prompt_tokens:
#         if token in list(token_to_index.keys()):
#             context_tokens.append(token)
#         else:
#             context_tokens.append(find_closest_word(token, list(token_to_index.keys()), np.array(list(token_to_index.values()))))

#     context_indices = [token_to_index.get(word, token_to_index[' ']) for word in context_tokens]

#     if len(context_indices) < context_len:
#         context_indices = [1] * (context_len - len(context_indices)) + context_indices
#     else:
#         context_indices = context_indices[-context_len:]

#     predicted_words = []

#     model.eval()
#     with torch.no_grad():
#         for _ in range(k):
#             context_tensor = torch.tensor(context_indices, dtype=torch.int64).unsqueeze(0)
#             context_tensor=context_tensor.reshape(1,-1)
#             output = model(context_tensor)
#             next_word_index = torch.argmax(output, dim=1).item()
#             next_word = index_to_token[next_word_index]
#             predicted_words.append(next_word)
#             context_indices.append(next_word_index)
#             context_indices = context_indices[-context_len:]

#     return ' '.join(predicted_words)


In [None]:
def predict_next_k_words(context, k):
    context_tokens = re.findall(r'\b\w+\b|[.]', context)
    context_indices = [token_to_index1.get(word, token_to_index1[' ']) for word in context_tokens]

    # Trim or pad the context to fit the required length
    if len(context_indices) < context_len:
        context_indices = [1] * (context_len - len(context_indices)) + context_indices
    else:
        context_indices = context_indices[-context_len:]

    predicted_words = []

    text_gen1_loaded.eval()  # Set model to eval mode for inference
    with torch.no_grad():
        for _ in range(k):
            # Convert context to tensor and pass through model
            context_tensor = torch.tensor(context_indices, dtype=torch.int64).unsqueeze(0)
            print((context_tensor))
            print(context_tensor.shape)
            output = text_gen1_loaded(context_tensor)

            # Get predicted word index and corresponding word
            next_word_index = torch.argmax(output, dim=1).item()
            next_word = index_to_token1[next_word_index]

            # Add predicted word to results
            predicted_words.append(next_word)

            # Update context with new word and adjust to maintain context length
            context_indices.append(next_word_index)
            context_indices = context_indices[-context_len:]  # Keep only last `context_len` tokens

    return ' '.join(predicted_words)


In [None]:
context = "this entire ordeal of assignment is taking too much time and all i want is "
k = 100
predicted_text = predict_next_k_words(context, k)
print(predicted_text)
