## Downloading The Data 

In [None]:
!pip install kaggle --quiet

In [None]:
from google.colab import files
files.upload()

In [None]:
!rm -f ~/.kaggle            # remove if it was a file before
!mkdir -p ~/.kaggle         # make directory
!cp kaggle.json ~/.kaggle/  # copy your uploaded kaggle.json
!chmod 600 ~/.kaggle/kaggle.json  # secure permissions

In [None]:
!kaggle datasets download -d dorianlazar/medium-articles-dataset -p /content

In [None]:
!unzip -o /content/medium-articles-dataset.zip -d /content/medium_data

## Importing The Necessary Libraries

In [None]:
import pandas as pd
import torch
import  torch.nn as nn
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from torch.utils.data import Dataset, DataLoader


## Data Preprocessing

In [None]:
df = pd.read_csv("/content/medium_data/medium_data.csv")
df.head()

> I only need the title column for this project. I will just preprocess the data; I am not going to do any data cleaning.

In [None]:
df.info()

In [None]:
document = "\n".join(df["title"].dropna().astype(str))

In [None]:
document

In [None]:
len(document)

In [None]:
nltk.download("punkt_tab")

In [None]:
tokens = word_tokenize(document.lower())

In [None]:
tokens[:6]

In [None]:
len(tokens)

In [None]:
# Token -> integer index lookup. Index 0 is reserved for the '<unk>'
# placeholder; every distinct token then receives the next free index in
# order of first appearance in `tokens`.
vocab = {'<unk>' : 0}
for token in dict.fromkeys(tokens):
  vocab.setdefault(token, len(vocab))

len(vocab)

In [None]:
# Show the ten most frequent tokens. Count `tokens` here: `vocab` maps each
# token to its index, so Counter(vocab) would rank tokens by index instead of
# by how often they occur.
Counter(tokens).most_common(10)

In [None]:
input_sequences = document.split('\n')

In [None]:
input_sequences[:8]

In [None]:

def text_to_indices(sentence, vocab):
  """Map each token of `sentence` to its integer index in `vocab`.

  Args:
    sentence: Iterable of tokens.
    vocab: Mapping from token to index. Its '<unk>' entry is used for any
      token that is not a key of the mapping.

  Returns:
    A list with one index per token, in the same order as `sentence`.
  """
  return [
      vocab[token] if token in vocab else vocab['<unk>']
      for token in sentence
  ]

In [None]:
# Lower-case and tokenize every title, then convert its tokens to vocab indices.
input_numerical_sentences = [
  text_to_indices(word_tokenize(sentence.lower()), vocab)
  for sentence in input_sequences
]

In [None]:
input_numerical_sentences[:10]

In [None]:
len(input_numerical_sentences)

In [None]:
# Expand each sentence into all of its prefixes of length >= 2, e.g.
# [a, b, c] -> [a, b], [a, b, c]. Sentences shorter than two tokens
# contribute nothing.
training_sequence = [
  sentence[:end]
  for sentence in input_numerical_sentences
  for end in range(2, len(sentence) + 1)
]


In [None]:
training_sequence[:10]

In [None]:
len(training_sequence)

In [None]:
# Length of every training prefix; the maximum is displayed as the cell output.
len_list = list(map(len, training_sequence))

max(len_list)

In [None]:
len(training_sequence[0])

In [None]:
# Left-pad every training prefix with zeros so that all sequences share the
# length of the longest one. The longest length is computed once up front
# instead of re-scanning `len_list` for every sequence.
# NOTE: the pad value 0 is also the index assigned to '<unk>' in `vocab`.
max_len = max(len_list, default=0)
padded_training_sequence = [
  [0] * (max_len - len(sequence)) + sequence
  for sequence in training_sequence
]

In [None]:
len(padded_training_sequence[0])

In [None]:
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [None]:
padded_training_sequence[:3]

In [None]:
# Model inputs: every column of each padded sequence except the last one.
X = padded_training_sequence[:, :-1]
# Targets: the last column of each padded sequence (the token to predict).
y = padded_training_sequence[:,-1]

In [None]:
X

In [None]:
y

## Dataset & Data Loader

In [None]:
class CustomDataset(Dataset):
  """Pairs each row of `X` with the entry of `y` at the same position."""

  def __init__(self, X, y):
    """Store the inputs `X` and the targets `y` without copying them."""
    self.X, self.y = X, y

  def __len__(self):
    """Return the number of samples, i.e. the size of `X` along its first axis."""
    return len(self.X)

  def __getitem__(self, idx):
    """Return the `(input, target)` pair stored at position `idx`."""
    features = self.X[idx]
    target = self.y[idx]
    return features, target

In [None]:
dataset = CustomDataset(X,y)

In [None]:
len(dataset)

In [None]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

## Implementing The LSTM Model

In [None]:

class LSTMModel(nn.Module):
  """Next-token classifier: embedding -> single-layer LSTM -> linear layer.

  Args:
    vocab_size: Number of rows in the embedding table and number of scores
      produced for each input sequence.
    embedding_dim: Size of each token embedding. Defaults to 100.
    hidden_dim: Size of the LSTM hidden state. Defaults to 150.
  """

  def __init__(self, vocab_size, embedding_dim=100, hidden_dim=150):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Return one score per vocabulary entry for each sequence in `x`.

    Args:
      x: Integer token indices with the batch dimension first
        (the LSTM is built with `batch_first=True`).

    Returns:
      The linear layer applied to the LSTM's final hidden state.
    """
    embedded = self.embedding(x)
    _, (final_hidden_state, _) = self.lstm(embedded)
    # The final hidden state carries a leading layer axis; the LSTM has a
    # single layer, so taking the last entry drops that axis.
    return self.fc(final_hidden_state[-1])

In [None]:
model = LSTMModel(len(vocab))

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model.to(device)

In [None]:
epochs = 100
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Training The Model

In [None]:
# Run `epochs` full passes over `dataloader`, printing the summed batch loss
# after each pass.
for epoch in range(epochs):
  total_loss = 0

  for batch_x, batch_y in dataloader:
    # Move the batch to the same device as the model.
    batch_x = batch_x.to(device)
    batch_y = batch_y.to(device)

    # Clear gradients left over from the previous step before the new pass.
    optimizer.zero_grad()
    logits = model(batch_x)
    loss = criterion(logits, batch_y)

    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")

## Testing The Model

In [None]:
import time

def prediction(model, vocab, text, max_length=51):
    """Predict the most likely next token for `text`.

    The text is lower-cased, tokenized, mapped to vocabulary indices and
    left-padded with zeros up to `max_length` before being passed to the model.

    Args:
        model: Module that maps a `(1, length)` tensor of token indices to a
            `(1, len(vocab))` tensor of scores.
        vocab: Mapping from token to index; its key order must match the
            index order (as it does when indices are assigned on insertion).
        text: Prompt whose continuation should be predicted.
        max_length: Length the index sequence is left-padded to. Inputs that
            are already longer are passed through without padding.

    Returns:
        The vocabulary token with the highest score.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    tokenized_text = word_tokenize(text.lower())
    numerical_text = text_to_indices(tokenized_text, vocab)

    # A negative pad count yields an empty list, so over-long inputs are
    # left unchanged.
    padding = [0] * (max_length - len(numerical_text))
    padded_text = torch.tensor(padding + numerical_text,
                               dtype=torch.long).unsqueeze(0).to(device)

    # Inference only: switch to eval mode and skip autograd bookkeeping, then
    # restore whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(padded_text)
    finally:
        model.train(was_training)

    _, index = torch.max(output, dim=1)

    # Convert the one-element tensor to a plain int before indexing.
    predicted_token = list(vocab)[index.item()]
    return predicted_token

In [None]:
# Greedy generation: repeatedly predict the next token and append it to the
# prompt, printing each token as soon as it is produced.
num_tokens = 20
input_text = "A Step-by-Step Implementation of"

print(input_text, end=" ")

for _ in range(num_tokens):
    next_word = prediction(model, vocab, input_text)
    print(next_word, end=" ", flush=True)
    input_text = f"{input_text} {next_word}"
    # Short pause between tokens so the output appears word by word.
    time.sleep(0.5)
print()

In [None]:
# Save the model weights and the vocabulary mapping to "checkpoint.pth".
torch.save({
    "model_state": model.state_dict(),
    "vocab": vocab,
    # NOTE(review): 51 is the same pad length hard-coded in prediction();
    # presumably it equals the width of X — confirm before reusing.
    "max_length": 51
}, "checkpoint.pth")
