Importing libraries

In [None]:
import pandas as pd
import numpy as np
from scipy.special import softmax
import matplotlib.pyplot as plt
import sklearn
import random
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

Reading in data

In [None]:
# Load the raw dataset from the file named "data" (path relative to the working directory).
data = pd.read_csv("data")

In [None]:
data.head(3)

In [None]:
data.info()

Keeping only the news column

In [None]:
news= data["News"].values

In [None]:
# Pull the "News" column out of the DataFrame; `.values` does not return a
# Python list, which the isinstance check below makes explicit.
news= data["News"].values
isinstance(news, list)

False

In [None]:
# Materialise the column values as a plain Python list, element by element in order.
news_list = list(news)
news = news_list
isinstance(news, list)

True

Preprocessing: only keep most used words

In [None]:
# Vocabulary cap: passed to the Tokenizer as `num_words`, and also reused as the
# integer id of the END-of-sentence marker.
MAX_VOCAB = 9999

In [None]:
# Based on word frequency: only the most frequent words (ids below MAX_VOCAB)
# are kept when texts are converted to sequences; everything else maps to 'UNK'.
# `lower` is a boolean flag. The previous string "True" only worked because any
# non-empty string is truthy (the string "False" would have lower-cased as well).
tokenizer = Tokenizer(num_words = MAX_VOCAB,
                      filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      oov_token = 'UNK',
                      lower = True
                      )

In [None]:
tokenizer.fit_on_texts(news)

In [None]:
seq = tokenizer.texts_to_sequences(news)

In [None]:
len(seq) == len(news)

True

In [None]:
seq[0]

In [None]:
tokenizer.word_index["hi"]

8168

In [None]:
# Restrict the tokenizer's vocabulary to ids strictly below MAX_VOCAB, then
# register the two sentence-boundary markers: START -> 0 and END -> MAX_VOCAB.
word_index = {
    word: idx
    for word, idx in tokenizer.word_index.items()
    if idx < MAX_VOCAB
}
word_index.update({"START": 0, "END": MAX_VOCAB})

In [None]:
word_index["START"]

0

In [None]:
word_index["economy"]

419

In [None]:
index_word = { v : k for k,v in word_index.items()}

In [None]:
index_word[0]

'START'

Shuffle the sequences so that their order is random

In [None]:
# Shuffle in place with a dedicated, seeded generator so that the sentence
# order (and everything sampled from it later) is reproducible across runs.
# A private Random instance leaves the global `random` state untouched.
random.Random(2021).shuffle(seq)

In [None]:
# Wrap every token sequence with the START id (0) in front and the END id
# (MAX_VOCAB) at the back.
sequences = [[0] + sequence + [MAX_VOCAB] for sequence in seq]

In [None]:
sequences[0]

[0, 1, 540, 2127, 289, 1480, 36, 2536, 247, 1, 8810, 420, 1, 9999]

Let's create the transition matrix

In [None]:
# Concatenate all wrapped sentences into one long stream of token ids,
# keeping sentence order and token order.
sequences_flat_list = [item for sublist in sequences for item in sublist]

In [None]:
sequences_flat_list[0:100]

In [None]:
def transition_matrix(V, sequence):
    """Estimate first-order transition probabilities from a token stream.

    Args:
        V: largest token id; the result is a (V+1) x (V+1) nested list.
        sequence: indexable sequence of token ids (expected to be integers
            in 0..V), read as consecutive (current, next) pairs.

    Returns:
        A list of rows where entry [i][j] is the fraction of occurrences of
        token i that were immediately followed by token j. Rows for tokens
        that never appear as a "current" token are left as all zeros.
    """
    size = V + 1
    counts = [[0] * size for _ in range(size)]

    # Tally every adjacent pair in the stream.
    for pos in range(len(sequence) - 1):
        counts[sequence[pos]][sequence[pos + 1]] += 1

    # Normalise each non-empty row in place so that it sums to one.
    for row in counts:
        total = sum(row)
        if total > 0:
            for col in range(size):
                row[col] = row[col] / total

    return counts

In [None]:
Q = transition_matrix(MAX_VOCAB, sequences_flat_list)

In [None]:
Q = np.array(Q)

Checks on transition matrix

In [None]:
Q.shape

(10000, 10000)

In [None]:
np.sum(Q[1])

1.0

In [None]:
np.sum(Q[0:10000,])

10000.000000000013

In [None]:
probabilities = np.sum(Q, axis = 1)
print(len(probabilities)-sum((probabilities >0.99)))

0


In [None]:
random.seed(2021)
true_sentences = random.sample(sequences,5)

In [None]:
true_sentences

In [None]:
def likelihood_test(sentences, T_matrix):
    """Score sentences by their length-normalised log-likelihood.

    Args:
        sentences: list of token-id sequences.
        T_matrix: 2-D array indexed as T_matrix[current, next] holding
            transition probabilities.

    Returns:
        A list with one value per sentence: the sum of the log transition
        probabilities of its consecutive token pairs, divided by the number
        of tokens in the sentence. Transitions with probability zero
        contribute log(10**-18) instead of minus infinity.
    """
    # Penalty used for transitions that have zero probability.
    floor_log = np.log(10**(-18))

    scores = []
    for tokens in sentences:
        total = 0
        for pos in range(len(tokens) - 1):
            prob = T_matrix[tokens[pos], tokens[pos + 1]]
            total += np.log(prob) if prob != 0 else floor_log
        scores.append(total / len(tokens))

    return scores

In [None]:
true_likelihood = likelihood_test(true_sentences,Q)
print(true_likelihood)

[-3.863439835548537, -3.7136936630690256, -4.305679400513897, -3.8139928552872067, -4.833726249194242]


Average log-likelihood for true sentences

In [None]:
average_loglikelihood = np.mean(true_likelihood)
average_loglikelihood

-4.106106400722582

In [None]:
def max_min(sequences):
    """Return (shortest, longest) length found among the given sequences.

    Note the return order is (min, max) despite the function's name.
    Raises ValueError if `sequences` is empty.
    """
    sizes = list(map(len, sequences))
    longest = max(sizes)
    shortest = min(sizes)
    return shortest, longest

max_min(seq)

(2, 65)

Create fake sentences as a check

In [None]:
fake_sentences = []

# Use the length range of the real (unwrapped) token sequences instead of
# hard-coding it, so the fake sentences stay comparable if the data changes.
min_length, max_length = max_min(seq)

for _ in range(5):
    length_random = random.randint(min_length, max_length)
    # Real word ids run from 1 to MAX_VOCAB-1 inclusive (0 is START, MAX_VOCAB
    # is END). range() excludes its stop value, so the stop must be MAX_VOCAB;
    # the previous MAX_VOCAB-1 silently left out the last valid id.
    sentence_fake = random.sample(range(1, MAX_VOCAB), length_random)
    fake_sentences.append(sentence_fake)

In [None]:
len(fake_sentences[0])

In [None]:
# Wrap the fake sentences with the same START (0) / END (MAX_VOCAB) markers
# used for the real ones so that both are scored on equal terms.
fake_sequences_comparison = [[0] + sentence + [MAX_VOCAB] for sentence in fake_sentences]

In [None]:
fake_sequences_comparison[0]

In [None]:
fake_likelihood = likelihood_test(fake_sequences_comparison,Q)
print(fake_likelihood)

[-40.719399539263165, -36.84136148790473, -39.66672401607064, -40.76708033497658, -39.830874670944084]


Average log-likelihood for fake sentences

In [None]:
average_loglikelihood_fake = np.mean(fake_likelihood)
average_loglikelihood_fake

-39.56508800983185

In [None]:
def create_matrix(sequences, V):
  """Build an indicator matrix with one row per element of `sequences`.

  Args:
    sequences: a sequence whose elements are column indices; each element is
      either a single integer or a list of integers.
    V: number of columns of the result.

  Returns:
    A float array of shape (len(sequences), V) that is zero everywhere except
    for ones at the column(s) named by each row's element.
  """
  encoded = np.zeros((len(sequences), V))
  for row in range(len(sequences)):
    encoded[row, sequences[row]] = 1.
  return encoded

For computational efficiency, we train for only one epoch here
(in practice the number of epochs should be larger)

In [None]:
D = 8
learning_rate = 0.0001
epochs = 1

In [None]:
# Draw the initial weights uniformly from [0, 1) with a dedicated, seeded
# generator so that training runs are reproducible. Shapes are unchanged:
# W_1 maps token id -> D hidden units, W_2 maps D hidden units -> token scores.
weight_rng = np.random.RandomState(2021)
W_1 = weight_rng.rand(MAX_VOCAB+2, D)
W_2 = weight_rng.rand(D, MAX_VOCAB+2)

In [None]:
def new_sgd(sentence, learning_rate, W_1, W_2):
  """Run one SGD step of the tanh/softmax next-token model on one sentence.

  Each token (except the last) is used to predict the token that follows it.

  Args:
    sentence: list of integer token ids.
    learning_rate: step size applied to both gradients.
    W_1: input-to-hidden weight array, indexed by token id along its rows.
    W_2: hidden-to-output weight array.

  Returns:
    (W_1, W_2, cost): the two weight arrays, updated in place and returned
    for convenience, and the categorical cross-entropy summed over all
    positions of the sentence, computed from the predictions made before
    the update.
  """
  # One-hot rows for every token; the width must match the weight shapes.
  sentence_matrix = create_matrix(sentence, MAX_VOCAB+2)
  feature_matrix = sentence_matrix[:-1, :]   # inputs: all tokens but the last
  target_matrix = sentence_matrix[1:, :]     # targets: the token that follows

  # Multiplying a one-hot row by W_1 just selects a row, so index directly.
  hidden_matrix = np.tanh(W_1[sentence[:-1]])
  # The softmax has to be taken per position (axis=1). scipy's default
  # axis=None normalises over the whole matrix, so rows would not be
  # probability distributions and the gradients below would be wrong.
  prediction_matrix = softmax(hidden_matrix.dot(W_2), axis=1)

  # Gradients
  error_matrix = prediction_matrix - target_matrix
  gW_2 = hidden_matrix.T.dot(error_matrix)
  # d tanh(x)/dx = 1 - tanh(x)^2, applied element-wise to the back-propagated error.
  gW_1 = feature_matrix.T.dot(error_matrix.dot(W_2.T) * (1 - hidden_matrix**2))

  W_1 -= learning_rate*gW_1
  W_2 -= learning_rate*gW_2

  cost = tf.keras.losses.categorical_crossentropy(target_matrix, prediction_matrix).numpy()

  return W_1, W_2, cost.sum()


In [None]:
full_costs = []

# Honour the `epochs` hyperparameter (it was previously defined but ignored):
# each epoch reshuffles the sentences and performs one SGD step per sentence.
# With epochs = 1 this is exactly one shuffled pass over the data.
for epoch in range(epochs):
  random.shuffle(sequences)

  for sentence in sequences:
    W_1, W_2, tmp_cost = new_sgd(sentence, learning_rate, W_1, W_2)
    full_costs.append(tmp_cost)

Exponentially weighted moving average, with a smoothing factor of 0.01

In [None]:
ewma = pd.Series(full_costs).ewm(alpha=0.01).mean()

In [None]:
plotting_data = pd.DataFrame({'Loss': full_costs, "EWMA Loss": ewma, 'Batch': range(1, len(full_costs)+1)})
plotting_data.head()

Assessing the cost curve as a function of the number of batches

In [None]:
sns.lineplot(x = 'Batch', y = 'EWMA Loss', data = plotting_data, color = "coral").set_title('EWMA Loss')