In [None]:
# Toy training corpus: a list of English text snippets (one string per entry).
# The same list feeds both the Word2Vec demo and the next-word predictor below.
corpus = ["Virat Kohli is married to the famous Bollywood actress Anushka Sharma, who is one of the biggest stars of the Hindi film industry",
          "Virat and Anushka started dated in 2013, after which the media gave them the name Virushka.",
          "The couple kept their relationship really private until they suddenly got married on 11 December 2017, in a beautiful private function in Florence, Italy.",
          "Attended by only very close family members and friends, their wedding photos and happy relationships set new goals for couple goals in India.",
          "Virat has cited that Anushka made him more emotionally strong and mature. They are together and rank among the most popular and followed couples in the country.",
          "Virat and Anushka have turned parents to their first child, Vamika Kohli, born on January 11, 2021. Kohli has mentioned that Vamika, his daughter's name, is another name for Goddess Durga.",
          "The couple welcomed their second baby, Akaay Kohli, on February 15, 2024.",
          "His National Honours: 2013 – Arjuna Award, second highest sporting honour. 2017 – Padma Shri, India's fourth highest civilian award. 2018 – Major Dhyan Chand Khel Ratna Award, India's highest sporting honour.",
          "Test records: Highest Number of Wins as Captain of India: 40 wins in 68 matches, Four Test double hundreds in four successive series",
          "ODI records: scored the most ODI centuries (50), He has the record of most ODI centuries while chasing (27), Virat has most number of ODI centuries in "]

# create `Word2Vec` model

In [None]:
from gensim.models import Word2Vec
import matplotlib.pyplot as plt

In [None]:
# Tokenise every sentence on whitespace.
# `split()` with no argument collapses runs of whitespace and ignores
# leading/trailing spaces, so no empty-string token is produced (the last
# corpus entry ends with a space, which `split(" ")` turns into a '' token).
sent_tokens = [sent.split() for sent in corpus]

In [None]:
# Train a Word2Vec model on the tokenised sentences.
#
# gensim's `Word2Vec` has no `max_count` argument (passing it raises a
# TypeError). To drop words that occur more than MAX_WORD_COUNT times, a
# `trim_rule` callback is used instead; it is consulted while the vocabulary
# is built.
from gensim.utils import RULE_DEFAULT, RULE_DISCARD

MAX_WORD_COUNT = 20  # words seen more often than this are left out of the vocabulary


def drop_frequent_words(word, count, min_count):
    """Vocabulary trim rule: discard words occurring more than MAX_WORD_COUNT times.

    Returns RULE_DEFAULT for every other word, so the usual `min_count`
    threshold still decides whether it is kept.
    """
    return RULE_DISCARD if count > MAX_WORD_COUNT else RULE_DEFAULT


model = Word2Vec(
    sent_tokens,
    min_count=1,     # keep words that appear only once (the corpus is tiny)
    vector_size=50,  # every word is embedded as a 50-dimensional vector
    trim_rule=drop_frequent_words,
)

In [None]:
# Vocabulary learned by the model: the list of words it has a vector for
keyed_vectors = model.wv
vocab = keyed_vectors.index_to_key

In [None]:
# Display the embedding vector the model learned for the word "Virat"
model.wv["Virat"]

In [None]:
# # The raw vectors can only be scatter-plotted directly when vector_size is 2; with more dimensions this plot shows just the first two components.
# plt.figure(figsize=(10,5))
# for word in vocab:
#     vec = model.wv[word]
#     plt.scatter(vec[0], vec[1])
#     plt.annotate(word, (vec[0], vec[1]))

# create `next Word Predictor` model

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

### create dataset

In [None]:
# Merge the corpus into one long text.
# The sentences are joined with a space so that the last word of one sentence
# is not fused with the first word of the next (e.g. "industryVirat"), which
# would otherwise add bogus words to the vocabulary.
text = " ".join(corpus)
text

"Virat Kohli is married to the famous Bollywood actress Anushka Sharma, who is one of the biggest stars of the Hindi film industryVirat and Anushka started dated in 2013, after which the media gave them the name Virushka.The couple kept their relationship really private until they suddenly got married on 11 December 2017, in a beautiful private function in Florence, Italy.Attended by only very close family members and friends, their wedding photos and happy relationships set new goals for couple goals in India.Virat has cited that Anushka made him more emotionally strong and mature. They are together and rank among the most popular and followed couples in the country.Virat and Anushka have turned parents to their first child, Vamika Kohli, born on January 11, 2021. Kohli has mentioned that Vamika, his daughter's name, is another name for Goddess Durga.The couple welcomed their second baby, Akaay Kohli, on February 15, 2024.His National Honours: 2013 – Arjuna Award, second highest sport

In [None]:
### Give every unique word of the text an integer id, plus the reverse lookup.
# The vocabulary is sorted before numbering: iterating a bare set of strings
# can yield a different order on every kernel start, which would make the
# word ids (and anything saved with them) non-reproducible.
word2idx = {word: i for i, word in enumerate(sorted(set(text.split())))}
idx2word = {i: word for word, i in word2idx.items()}

In [None]:
# Virat Kohli is married to the famous Bollywood actress Anushka Sharma

# create custom dataset in pytorch
class CustomDataset(Dataset):
    """Sliding-window next-word dataset over a tokenised text.

    Sample ``i`` is the pair ``(sequence, target)`` where ``sequence`` holds the
    ids of the ``seq_len`` words starting at position ``i`` and ``target`` is
    the id of the word that follows them.

    Args:
        text: sequence of word tokens (e.g. ``"some text".split()``).
        word2idx: mapping from word to integer id; a missing word raises
            ``KeyError`` when the sample containing it is requested.
        seq_len: number of context words per sample,
            e.g. 5 --> the famous Bollywood actress Anushka <Sharma>
    """

    def __init__(self, text, word2idx, seq_len):
        self.text = text
        self.word2idx = word2idx
        self.seq_len = seq_len

    def __len__(self):
        # Each sample needs seq_len context words plus one target word.
        # Clamp at 0 so a text that is too short gives an empty dataset
        # instead of a negative length (which makes len() raise).
        return max(0, len(self.text) - self.seq_len)

    def __getitem__(self, index):
        """Return ``(sequence, target)`` as int64 tensors for sample ``index``.

        Negative indices count from the end; an out-of-range index raises
        ``IndexError``.
        """
        size = len(self)
        if index < 0:
            # Without this, a negative start would produce an empty or
            # truncated slice paired with an unrelated target.
            index += size
        if not 0 <= index < size:
            raise IndexError("CustomDataset index out of range")

        stop = index + self.seq_len
        sequence = [self.word2idx[word] for word in self.text[index:stop]]  # ex., [1:6]
        target = self.word2idx[self.text[stop]]  # word right after the window

        return (
            torch.tensor(sequence, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

In [None]:
SEQ_LEN = 5  # number of context words used to predict the next word
dataset = CustomDataset(text=text.split(), word2idx=word2idx, seq_len=SEQ_LEN)

In [None]:
# Inspect one sample: (tensor of context word ids, tensor with the next word's id)
dataset[5]

(tensor([122,  43,  17,  87, 140]), tensor(10))

In [None]:
# Decode sample 5 back into words to sanity-check the dataset.
# The ids are read from the dataset itself rather than pasted from an earlier
# output, so the check stays valid when the vocabulary numbering changes.
sequence, target = dataset[5]
for i in [*sequence.tolist(), target.item()]:
    print(idx2word[i], end=" ")

the famous Bollywood actress Anushka Sharma, 

In [None]:
BATCH_SIZE = 32  # samples per training batch
# Serve the samples in shuffled mini-batches
data_loader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

### create RNN model

In [None]:
class VanillaRNN(nn.Module):
    """Next-word classifier: embedding -> single-layer RNN -> linear layer.

    Args:
        vocab_size: number of distinct word ids (size of the embedding table
            and of the output layer).
        embed_size: dimensionality of the word embeddings.
        hidden_size: dimensionality of the RNN hidden state.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()

        # Trainable lookup table word id -> dense vector (same role as word2vec)
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h0=None):
        """Score every vocabulary word as the continuation of each sequence.

        Args:
            x: integer tensor of word ids, batch dimension first.
            h0: optional initial hidden state; when omitted, ``nn.RNN`` starts
                from an all-zero state.

        Returns:
            ``(output, h)``: ``output`` holds one row of vocabulary scores per
            sequence, ``h`` is the final hidden state returned by the RNN.
        """
        embed = self.embed(x)
        out, h = self.rnn(embed, h0)
        # Only the RNN output at the last time step feeds the classifier
        output = self.fc(out[:, -1, :])
        return output, h

In [None]:
# One output class per vocabulary word; 128-d embeddings, 256-d hidden state
rnn_model = VanillaRNN(vocab_size=len(word2idx), embed_size=128, hidden_size=256)

In [None]:
LEARNING_RATE = 0.001

# Cross-entropy over the vocabulary scores, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(params=rnn_model.parameters(), lr=LEARNING_RATE)

### train rnn model

In [None]:
NUM_EPOCHS = 50  # single source of truth for the loop and the progress message

rnn_model.train()

for epoc in range(NUM_EPOCHS):
    total_loss = 0  # sum of the batch losses seen in this epoch
    for inputs, labels in data_loader:

        optimiser.zero_grad()
        # Fresh all-zero hidden state for every batch. Its shape is taken from
        # the model and the actual batch (the last batch may be smaller), so it
        # cannot drift out of sync with the model's hidden size.
        h0 = torch.zeros(
            rnn_model.rnn.num_layers, inputs.size(0), rnn_model.rnn.hidden_size
        )
        output, _ = rnn_model(inputs, h0)
        loss = criterion(output, labels)
        loss.backward()
        optimiser.step()

        total_loss += loss.item()

    print(f"epoc: {epoc+1}/{NUM_EPOCHS}, total loss is: {total_loss}")

epoc: 1/20, total loss is: 35.70972299575806
epoc: 2/20, total loss is: 31.013436317443848
epoc: 3/20, total loss is: 27.161025047302246
epoc: 4/20, total loss is: 23.376649856567383
epoc: 5/20, total loss is: 19.565773963928223
epoc: 6/20, total loss is: 15.811407327651978
epoc: 7/20, total loss is: 12.272490501403809
epoc: 8/20, total loss is: 9.116433143615723
epoc: 9/20, total loss is: 6.532524406909943
epoc: 10/20, total loss is: 4.61534583568573
epoc: 11/20, total loss is: 3.259540766477585
epoc: 12/20, total loss is: 2.4033148884773254
epoc: 13/20, total loss is: 1.8083640336990356
epoc: 14/20, total loss is: 1.4163614362478256
epoc: 15/20, total loss is: 1.152532160282135
epoc: 16/20, total loss is: 0.9572903662919998
epoc: 17/20, total loss is: 0.8174900934100151
epoc: 18/20, total loss is: 0.7080075889825821
epoc: 19/20, total loss is: 0.6238533556461334
epoc: 20/20, total loss is: 0.5507866814732552
epoc: 21/20, total loss is: 0.49309200048446655
epoc: 22/20, total loss is: 

### Test model

In [None]:
# Use the last 15 words of the text as the prompt, encoded as word ids
prompt_words = text.split()[-15:]
unknown = torch.tensor([word2idx[word] for word in prompt_words])
unknown

tensor([ 22, 148, 142,  84, 157, 109,  53,  32,  85, 148, 102,  22, 142,  84,
        114])

In [None]:
# Predict the word that follows the prompt.
rnn_model.eval()  # inference mode (the training cell left the model in train mode)
with torch.no_grad():  # no gradients needed, so skip building the autograd graph
    # Batch of one sequence; hidden-state shape is read from the model
    h0 = torch.zeros(rnn_model.rnn.num_layers, 1, rnn_model.rnn.hidden_size)
    out, _ = rnn_model(unknown.unsqueeze(0), h0)
# Highest-scoring vocabulary id -> word
pred_word = idx2word[out.argmax(dim=-1).item()]

In [None]:
# Show the prompt followed by the predicted next word
prompt_tail = text.split()[-15:]
print(" ".join(prompt_tail), " ", pred_word)

of most ODI centuries while chasing (27), Virat has most number of ODI centuries in   2013,
