## Downloading The Data 

In [1]:
!pip install kaggle --quiet

In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"junaidio","key":"501b6112c0040dbe0263fb0a5da8ced2"}'}

In [3]:
!rm -f ~/.kaggle            # remove if it was a file before
!mkdir -p ~/.kaggle         # make directory
!cp kaggle.json ~/.kaggle/  # copy your uploaded kaggle.json
!chmod 600 ~/.kaggle/kaggle.json  # secure permissions

In [4]:
!kaggle datasets download -d dorianlazar/medium-articles-dataset -p /content

Dataset URL: https://www.kaggle.com/datasets/dorianlazar/medium-articles-dataset
License(s): CC0-1.0
Downloading medium-articles-dataset.zip to /content
 99% 1.32G/1.33G [00:08<00:00, 146MB/s]
100% 1.33G/1.33G [00:08<00:00, 176MB/s]


In [5]:
!unzip -o /content/medium-articles-dataset.zip -d /content/medium_data

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/medium_data/images/2249.jpeg  
  inflating: /content/medium_data/images/225.png  
  inflating: /content/medium_data/images/2250.jpeg  
  inflating: /content/medium_data/images/2251.jpg  
  inflating: /content/medium_data/images/2252.jpeg  
  inflating: /content/medium_data/images/2253.jpeg  
  inflating: /content/medium_data/images/2254.jpg  
  inflating: /content/medium_data/images/2255.jpg  
  inflating: /content/medium_data/images/2256.png  
  inflating: /content/medium_data/images/2257.jpeg  
  inflating: /content/medium_data/images/2258.jpeg  
  inflating: /content/medium_data/images/2259.jpg  
  inflating: /content/medium_data/images/226.jpeg  
  inflating: /content/medium_data/images/2260.jpeg  
  inflating: /content/medium_data/images/2261.jpeg  
  inflating: /content/medium_data/images/2262.jpg  
  inflating: /content/medium_data/images/2263.jpg  
  inflating: /content/medium_data/images/226

## Imported The Necessary Libraries

In [6]:
import pandas as pd
import torch
import  torch.nn as nn
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from torch.utils.data import Dataset, DataLoader


## Data Preprocessing

In [7]:
df = pd.read_csv("/content/medium_data/medium_data.csv")
df.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


> I only need the title columns for for this project and i will just preprocess the data i am not going to do any  data cleaning

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6508 entries, 0 to 6507
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            6508 non-null   int64 
 1   url           6508 non-null   object
 2   title         6508 non-null   object
 3   subtitle      3479 non-null   object
 4   image         6361 non-null   object
 5   claps         6508 non-null   int64 
 6   responses     6508 non-null   object
 7   reading_time  6508 non-null   int64 
 8   publication   6508 non-null   object
 9   date          6508 non-null   object
dtypes: int64(3), object(7)
memory usage: 508.6+ KB


In [9]:
document = "\n".join(df["title"].dropna().astype(str))

In [10]:
document



In [11]:
len(document)

335365

In [12]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [13]:
tokens = word_tokenize(document.lower())

In [14]:
tokens[:6]

['a', 'beginner', '’', 's', 'guide', 'to']

In [15]:
len(tokens)

61970

In [16]:
vocab = {'<unk>' : 0}
for token in Counter(tokens):
  if token not in vocab:
    vocab[token] = len(vocab)

len(vocab)

8347

In [17]:
Counter(vocab).most_common(10)

[('consumption', 8346),
 ('mass', 8345),
 ('himself', 8344),
 ('donald', 8343),
 ('catalyst', 8342),
 ('innovator', 8341),
 ('greenhouse', 8340),
 ('penny', 8339),
 ('……', 8338),
 ('forced', 8337)]

In [18]:
input_sequences = document.split('\n')

In [19]:
input_sequences[:8]

['A Beginner’s Guide to Word Embedding with Gensim Word2Vec\xa0Model',
 'Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric',
 'How to Use ggplot2 in\xa0Python',
 'Databricks: How to Save Files in CSV on Your Local\xa0Computer',
 'A Step-by-Step Implementation of Gradient Descent and Backpropagation',
 'An Easy Introduction to SQL for Data Scientists',
 'Hypothesis testing visualized',
 'Introduction to Latent Matrix Factorization Recommender Systems']

In [20]:

def text_to_indices(sentence, vocab):

  numerical_sentence = []

  for token in sentence:
    if token in vocab:
      numerical_sentence.append(vocab[token])
    else:
      numerical_sentence.append(vocab['<unk>'])

  return numerical_sentence

In [21]:
input_numerical_sentences = []

for sentence in input_sequences:
  input_numerical_sentences.append(text_to_indices(word_tokenize(sentence.lower()), vocab))

In [23]:
input_numerical_sentences[:10]

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 [13, 14, 15, 16, 9, 17, 18, 17, 19],
 [20, 6, 21, 22, 23, 24],
 [25, 26, 20, 6, 27, 28, 23, 29, 30, 31, 32, 33],
 [1, 34, 35, 36, 37, 38, 39, 40],
 [41, 42, 43, 6, 44, 45, 46, 47],
 [48, 49, 50],
 [43, 6, 51, 52, 53, 54, 55],
 [56, 57, 58, 59, 60, 61, 62, 63, 64],
 [65, 66, 67, 12, 68, 69, 42, 64]]

In [24]:
len(input_numerical_sentences)

6508

In [25]:
training_sequence = []
for sentence in input_numerical_sentences:

  for i in range(1, len(sentence)):
    training_sequence.append(sentence[:i+1])


In [26]:
training_sequence[:10]

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5, 6],
 [1, 2, 3, 4, 5, 6, 7],
 [1, 2, 3, 4, 5, 6, 7, 8],
 [1, 2, 3, 4, 5, 6, 7, 8, 9],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

In [27]:
len(training_sequence)

55467

In [28]:
len_list = []
for sequence in training_sequence:
  len_list.append(len(sequence))

max(len_list)

51

In [29]:
len(training_sequence[0])

2

In [30]:
padded_training_sequence = []
for sequence in training_sequence:

  padded_training_sequence.append([0]*(max(len_list) - len(sequence)) + sequence)

In [31]:
len(padded_training_sequence[0])

51

In [32]:
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [33]:
padded_training_sequence[:3]

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 2],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 2, 3],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         2, 3, 4]])

In [34]:
X = padded_training_sequence[:, :-1]
y = padded_training_sequence[:,-1]

In [35]:
X

tensor([[   0,    0,    0,  ...,    0,    0,    1],
        [   0,    0,    0,  ...,    0,    1,    2],
        [   0,    0,    0,  ...,    1,    2,    3],
        ...,
        [   0,    0,    0,  ...,  677,    1,  551],
        [   0,    0,    0,  ...,    1,  551,  303],
        [   0,    0,    0,  ...,  551,  303, 2870]])

In [36]:
y

tensor([   2,    3,    4,  ...,  303, 2870, 2403])

## Dataset & Data Loader

In [37]:
class CustomDataset(Dataset):

  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __len__(self):
    return self.X.shape[0]

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

In [38]:
dataset = CustomDataset(X,y)

In [39]:
len(dataset)

55467

In [40]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

## Implimenting The LSTM Model

In [41]:

class LSTMModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, 100)
    self.lstm = nn.LSTM(100, 150, batch_first=True)
    self.fc = nn.Linear(150, vocab_size)

  def forward(self, x):
    embedded = self.embedding(x)
    intermediate_hidden_states, (final_hidden_state, final_cell_state) = self.lstm(embedded)
    output = self.fc(final_hidden_state.squeeze(0))
    return output

In [42]:
model = LSTMModel(len(vocab))

In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [44]:
model.to(device)

LSTMModel(
  (embedding): Embedding(8347, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=8347, bias=True)
)

In [45]:
epochs = 100
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Training The Model

In [46]:
for epoch in range(epochs):
  total_loss = 0
  for batch_x, batch_y in dataloader:

    batch_x, batch_y = batch_x.to(device), batch_y.to(device)
    optimizer.zero_grad()
    output = model(batch_x)
    loss = criterion(output, batch_y)
    loss.backward()
    optimizer.step()
    total_loss = total_loss + loss.item()

  print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 11016.3098
Epoch: 2, Loss: 9176.0848
Epoch: 3, Loss: 8035.9793
Epoch: 4, Loss: 7002.3894
Epoch: 5, Loss: 6071.0006
Epoch: 6, Loss: 5239.4070
Epoch: 7, Loss: 4530.9285
Epoch: 8, Loss: 3939.8066
Epoch: 9, Loss: 3441.6544
Epoch: 10, Loss: 3019.9767
Epoch: 11, Loss: 2665.0263
Epoch: 12, Loss: 2365.6522
Epoch: 13, Loss: 2106.2772
Epoch: 14, Loss: 1893.5748
Epoch: 15, Loss: 1715.6836
Epoch: 16, Loss: 1567.6132
Epoch: 17, Loss: 1443.0953
Epoch: 18, Loss: 1342.9446
Epoch: 19, Loss: 1257.8796
Epoch: 20, Loss: 1195.2730
Epoch: 21, Loss: 1140.1050
Epoch: 22, Loss: 1097.6618
Epoch: 23, Loss: 1064.1649
Epoch: 24, Loss: 1037.3698
Epoch: 25, Loss: 1013.4893
Epoch: 26, Loss: 996.5336
Epoch: 27, Loss: 981.1009
Epoch: 28, Loss: 968.4590
Epoch: 29, Loss: 958.6989
Epoch: 30, Loss: 946.8393
Epoch: 31, Loss: 942.9676
Epoch: 32, Loss: 934.4135
Epoch: 33, Loss: 927.6547
Epoch: 34, Loss: 925.0030
Epoch: 35, Loss: 917.3795
Epoch: 36, Loss: 910.4663
Epoch: 37, Loss: 907.5795
Epoch: 38, Loss: 907.

## Testing The Model

In [47]:
import time

def prediction(model, vocab, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    tokenized_text = word_tokenize(text.lower())
    numerical_text = text_to_indices(tokenized_text, vocab)

    padded_text = torch.tensor([0] * (51 - len(numerical_text)) + numerical_text,
                                dtype=torch.long).unsqueeze(0).to(device)

    output = model(padded_text)
    _, index = torch.max(output, dim=1)

    predicted_token = list(vocab.keys())[index]
    return predicted_token

In [54]:
num_tokens = 20
input_text = "A Step-by-Step Implementation of"

print(input_text, end=" ")

for i in range(num_tokens):
    next_word = prediction(model, vocab, input_text)
    print(next_word, end=" ", flush=True)
    input_text += " " + next_word
    time.sleep(0.5)
print()

A Step-by-Step Implementation of gradient descent and backpropagation math exploratory career habit guide to code ( part 1 ) approach — made easy now 


In [55]:
torch.save({
    "model_state": model.state_dict(),
    "vocab": vocab,
    "max_length": 51
}, "checkpoint.pth")
