## Importing necessary libraries

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

## Reading cvs file

In [4]:
df = pd.read_csv("medium_data.csv")
df.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


In [5]:
print("Number of records: ", df.shape[0])
print("Number of fields: ", df.shape[1])

Number of records:  6508
Number of fields:  10


In [6]:
text = df['title']
text

0       A Beginner’s Guide to Word Embedding with Gens...
1       Hands-on Graph Neural Networks with PyTorch & ...
2                            How to Use ggplot2 in Python
3       Databricks: How to Save Files in CSV on Your L...
4       A Step-by-Step Implementation of Gradient Desc...
                              ...                        
6503    “We” vs “I” — How Should You Talk About Yourse...
6504                     How Donald Trump Markets Himself
6505        Content and Marketing Beyond Mass Consumption
6506    5 Questions All Copywriters Should Ask Clients...
6507               How To Write a Good Business Blog Post
Name: title, Length: 6508, dtype: object

## Removal of unwanted characters

In [7]:
df['title'] = df['title'].apply(lambda x: x.replace(u'\xa0',u' '))
df['title'] = df['title'].apply(lambda x: x.replace('\u200a',' ')) #removing unwanted characters from title column
df['title'][0]

'A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model'

## Tokenizer object created

In [8]:
tokenizer = Tokenizer() #tokenizer object created
tokenizer.fit_on_texts(text)
total_words = len(tokenizer.word_index) + 1 #gives no. of unique words(vocab size) ,+1 used for special token(resevred for padding or unknown word)

In [9]:
tokenizer.word_index #each word is assigned with some index

{'to': 1,
 'the': 2,
 'strong': 3,
 'a': 4,
 'how': 5,
 'of': 6,
 'and': 7,
 'markup': 8,
 'your': 9,
 'in': 10,
 'for': 11,
 'you': 12,
 'is': 13,
 'with': 14,
 'class': 15,
 'h3': 16,
 'why': 17,
 'data': 18,
 'i': 19,
 'what': 20,
 'on': 21,
 'from': 22,
 'an': 23,
 'learning': 24,
 'can': 25,
 'are': 26,
 'my': 27,
 'be': 28,
 'using': 29,
 'do': 30,
 'ux': 31,
 'design': 32,
 'not': 33,
 'when': 34,
 'writing': 35,
 'that': 36,
 'we': 37,
 'about': 38,
 '5': 39,
 'machine': 40,
 'make': 41,
 'it': 42,
 'should': 43,
 'as': 44,
 'need': 45,
 'ai': 46,
 '3': 47,
 'more': 48,
 'don’t': 49,
 'life': 50,
 'marketing': 51,
 'or': 52,
 'will': 53,
 'have': 54,
 'ways': 55,
 'get': 56,
 'time': 57,
 'at': 58,
 'up': 59,
 'guide': 60,
 'science': 61,
 'use': 62,
 'by': 63,
 'write': 64,
 'business': 65,
 'new': 66,
 'python': 67,
 'if': 68,
 'deep': 69,
 'self': 70,
 'best': 71,
 'first': 72,
 'into': 73,
 'top': 74,
 'tips': 75,
 'things': 76,
 'stop': 77,
 'analysis': 78,
 'intelligence'

In [10]:
total_words #total unique words

10970

## Input Sequences

In [11]:
input_sequences = []
for line in text: #iterating over each line in text
    token_list = tokenizer.texts_to_sequences([line])[0] #converting line into a seq of integers using the tokenizer
    #print(token_list)

    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# print(input_sequences)
print("Total input sequences: ", len(input_sequences))
input_sequences

Total input sequences:  43439


[[4, 565],
 [4, 565, 60],
 [4, 565, 60, 1],
 [4, 565, 60, 1, 434],
 [4, 565, 60, 1, 434, 1309],
 [4, 565, 60, 1, 434, 1309, 14],
 [4, 565, 60, 1, 434, 1309, 14, 3507],
 [4, 565, 60, 1, 434, 1309, 14, 3507, 3508],
 [3509, 21],
 [3509, 21, 782],
 [3509, 21, 782, 111],
 [3509, 21, 782, 111, 157],
 [3509, 21, 782, 111, 157, 14],
 [3509, 21, 782, 111, 157, 14, 477],
 [3509, 21, 782, 111, 157, 14, 477, 477],
 [3509, 21, 782, 111, 157, 14, 477, 477, 1650],
 [5, 1],
 [5, 1, 62],
 [5, 1, 62, 3510],
 [5, 1, 62, 3510, 192],
 [3511, 5],
 [3511, 5, 1],
 [3511, 5, 1, 231],
 [3511, 5, 1, 231, 1073],
 [3511, 5, 1, 231, 1073, 10],
 [3511, 5, 1, 231, 1073, 10, 2216],
 [3511, 5, 1, 231, 1073, 10, 2216, 21],
 [3511, 5, 1, 231, 1073, 10, 2216, 21, 9],
 [3511, 5, 1, 231, 1073, 10, 2216, 21, 9, 3512],
 [4, 169],
 [4, 169, 63],
 [4, 169, 63, 169],
 [4, 169, 63, 169, 398],
 [4, 169, 63, 169, 398, 6],
 [4, 169, 63, 169, 398, 6, 3513],
 [4, 169, 63, 169, 398, 6, 3513, 2217],
 [4, 169, 63, 169, 398, 6, 3513, 2217

In [12]:
# pad sequences
#find length of longest sequence in input_sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')) #pre for adding padding to beginning of each sequence
input_sequences[1]#second sequence after padding
max_sequence_len

38

In [13]:
# create features and label
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
ys.shape

(43439, 10970)

## Layerwise implementation

In [14]:
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_len-1), #excludes the target word
    Bidirectional(LSTM(150)),
    Dense(total_words, activation='softmax')
])


In [15]:
opt = Adam(learning_rate = 0.004)
model.compile(optimizer = opt, loss= 'categorical_crossentropy', metrics = ['accuracy'])

In [16]:
model.fit(xs, ys, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b937b533e80>

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 37, 100)           1097000   
                                                                 
 bidirectional (Bidirection  (None, 300)               301200    
 al)                                                             
                                                                 
 dense (Dense)               (None, 10970)             3301970   
                                                                 
Total params: 4700170 (17.93 MB)
Trainable params: 4700170 (17.93 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Testing with some words

In [19]:
seed_text = input("Please enter some words")
next_words = 5

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted_probs = model.predict(token_list)
    predicted_word = tokenizer.index_word[np.argmax(predicted_probs)]
    seed_text += " " + predicted_word

print("Next predicted words:", seed_text)

Please enter some wordsA Step
Next predicted words: A Step by step implementation of gradient
