<a href="https://colab.research.google.com/github/MuyembeM/MachineLearning/blob/main/DataSubsampler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [90]:
# dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

from tqdm import tqdm
from itertools import chain
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import *
from tensorflow.keras.models import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [91]:
# fix the seed for better reproducibility
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [10]:
# make tqdm work with pandas
tqdm.pandas()

In [None]:
# load the dataset and preview
data = pd.read_pickle('dataframe_extractive.pkl')
data.head()

In [None]:
# story length distribution
d1_sent_count = data.groupby('story_id').size().reset_index(name='count_sentences')
d1_sent_count.head()

In [None]:
# median of the story lengths
d1_sent_count['count_sentences'].median()

In [None]:
# filter the stories that have a length < 30
list_stories_subset = list(d1_sent_count[d1_sent_count['count_sentences'] < 30]['story_id'])
len(list_stories_subset)

In [None]:
# now subset 50000 stories
# NOTE: random.sample draws without replacement and raises ValueError when
# fewer than SUBSET story ids passed the length filter above
SUBSET = 50000
list_stories_subset = random.sample(list_stories_subset, SUBSET)
# sanity check: number of distinct ids in the sample
len(np.unique(list_stories_subset))

In [None]:
# split training:validation:testing data 3:1:1
# derive the boundaries from the number of sampled ids rather than
# hard-coding them, so the 3:1:1 ratio holds for any subset size
n_stories = len(list_stories_subset)
n_train = n_stories * 3 // 5
n_valid = n_stories // 5

train_ids = list_stories_subset[:n_train]
validation_ids = list_stories_subset[n_train:n_train + n_valid]
test_ids = list_stories_subset[n_train + n_valid:]

# take explicit copies: later cells add columns to these frames, and writing
# to a filtered view of `data` triggers pandas' SettingWithCopyWarning
train = data[data['story_id'].isin(train_ids)].copy()
valid = data[data['story_id'].isin(validation_ids)].copy()
test = data[data['story_id'].isin(test_ids)].copy()

train.shape, valid.shape, test.shape

In [None]:
# story length distribution plot in the training dataset
train = train.sort_values(['story_id','sent_id'])
d2_sent_count = train.groupby('story_id').size().reset_index(name='count_sentences')
sns.displot(d2_sent_count['count_sentences'])
plt.show()

In [None]:
# descriptive statistics of story length in the training dataset
d2_sent_count['count_sentences'].describe()

In [None]:
# note the maximum story length from the training dataset
# to be used for preprocessing
max_len_story = d2_sent_count['count_sentences'].max()
max_len_story

In [None]:
# total number of unique sentences in the training dataset
sent_ls = set(train['sentence'].tolist())
len(sent_ls)

In [None]:
# unique labels in the training dataset
# 0: sentence is not a summary candidate
# 1: sentence is a summary candidate
# sorted() pins the order of the labels, so the label -> index mapping built
# from `tags` does not depend on set iteration order
tags = sorted(set(train['label_sent'].values))
n_tags = len(tags)
n_tags

In [None]:
# encoding the sentence labels and reversing them
tag2idx  = {t: i + 1 for i, t in enumerate(tags)}
tag2idx['PAD'] = 0
idx2tag = {i: s for s, i in tag2idx.items()}
print(tag2idx)

In [29]:
def add_word_lengths(df):
    """Add tokenised-sentence columns to ``df`` in place and return it.

    Two columns are written: ``word_lis`` holds the whitespace-split tokens
    of ``sentence`` and ``word_len`` holds the number of those tokens.

    Args:
        df: DataFrame with a string ``sentence`` column.

    Returns:
        The same DataFrame object, now carrying ``word_len`` and ``word_lis``.
    """
    sentences = df['sentence']

    # progress_apply only exists once tqdm.pandas() has been registered;
    # fall back to the plain apply so the function works either way
    apply_fn = getattr(sentences, 'progress_apply', sentences.apply)

    # split every sentence once and derive the length from the token list
    tokens = apply_fn(lambda text: text.split())
    df['word_len'] = tokens.map(len)
    df['word_lis'] = tokens
    return df

In [None]:
train = add_word_lengths(train)
valid = add_word_lengths(valid)
test = add_word_lengths(test)

train.head()

In [None]:
# sentence length distribution per story from the training dataset
sns.displot(train['word_len'])
plt.show()

In [None]:
# descriptive statistics of sentence length distribution per story
# from the training dataset
train['word_len'].describe()

In [None]:
# size of the training vocabulary: flatten the per-sentence token lists
# lazily and keep each distinct word once
word_ls = set(chain.from_iterable(train['word_lis']))
n_words = len(word_ls)
print(n_words)

In [35]:
# mapping the words to integers (tokenization) because machines don't
# understand raw text
# iterate the vocabulary in sorted order: set iteration order for strings
# changes between interpreter sessions, which would give every run a
# different word -> index mapping
word2idx = {c: i + 2 for i, c in enumerate(sorted(word_ls))}
# index 1 is reserved for unknown words and index 0 for padding
word2idx['UNK'] = 1
word2idx['PAD'] = 0

# reversing this dictionary as this would be needed for post-processing
idx2word = {i: s for s, i in word2idx.items()}

In [36]:
def append_sent_labels(df):
    """Add a ``sent_lab`` column of ``(sentence, label_sent)`` tuples.

    The column is written to ``df`` in place.

    Args:
        df: DataFrame with ``sentence`` and ``label_sent`` columns.

    Returns:
        The same DataFrame object, now carrying ``sent_lab``.
    """
    # zip the two columns directly; a row-wise DataFrame.apply builds a
    # Series for every row and is far slower on large frames
    df['sent_lab'] = list(zip(df['sentence'], df['label_sent']))
    return df

In [None]:
train = append_sent_labels(train)
valid = append_sent_labels(valid)
test = append_sent_labels(test)

train.head()

In [None]:
# sanity check the unique story ids in each of the splits
train['story_id'].nunique(), valid['story_id'].nunique(), test['story_id'].nunique()

In [39]:
# we can play with this number but for now we will settle with 40
MAX_LEN_WORD = 40

def represent_stories(df):
    """Encode every story in ``df`` as a fixed-size grid of word indices.

    Reads the notebook-level globals ``word2idx``, ``max_len_story`` and
    ``MAX_LEN_WORD``. Stories longer than ``max_len_story`` sentences and
    sentences longer than ``MAX_LEN_WORD`` words are truncated; shorter ones
    are padded with ``word2idx['PAD']``. Words missing from ``word2idx`` are
    encoded as ``word2idx['UNK']``.

    Args:
        df: DataFrame with a ``story_id`` column and a ``sent_lab`` column
            holding ``(sentence, label)`` tuples.

    Returns:
        Tuple ``(X_word, stories_labels)``. ``X_word`` is a float array of
        shape ``(n_stories, max_len_story, MAX_LEN_WORD)``;
        ``stories_labels`` is a list with one list of ``sent_lab`` tuples per
        story, in order of first appearance of the story id.
    """
    # group once instead of filtering the whole frame for every story id;
    # sort=False keeps the stories in order of first appearance
    stories_labels = [
        list(group)
        for _, group in df.groupby('story_id', sort=False)['sent_lab']
    ]

    pad_idx = word2idx['PAD']
    unk_idx = word2idx['UNK']

    # start from an all-padding array of shape
    # (nb_stories, max_story_length, max_sentence_length) and overwrite only
    # the positions that hold real words
    X_word = np.full(
        (len(stories_labels), max_len_story, MAX_LEN_WORD),
        pad_idx,
        dtype=np.float64,
    )

    for idx, story in tqdm(enumerate(stories_labels), total=len(stories_labels)):
        # upper bound on the number of sentences kept per story
        for i, sent_lab in enumerate(story[:max_len_story]):
            sentence = sent_lab[0]
            if not isinstance(sentence, str):
                # non-text entries stay fully padded
                continue

            # upper bound on the number of words kept per sentence
            words = sentence.split()[:MAX_LEN_WORD]
            if words:
                # out-of-vocabulary words fall back to the UNK index rather
                # than None, which would end up as NaN in the array
                X_word[idx, i, :len(words)] = [
                    word2idx.get(word, unk_idx) for word in words
                ]

    return (X_word, stories_labels)

In [None]:
# preprocess the training corpus
X_train, stories_labels_train = represent_stories(train)
X_train.shape

In [None]:
# preprocess the validation set
X_valid, stories_labels_valid = represent_stories(valid)
X_valid.shape

In [42]:
def prepare_labels(story_labels):
    """Encode and pad the sentence labels of every story.

    Reads the notebook-level globals ``tag2idx`` and ``max_len_story``.

    Args:
        story_labels: list of stories, each a list of tuples whose second
            element is the sentence label.

    Returns:
        Array of label indices reshaped to ``(-1, max_len_story, 1)``.
    """
    # translate each sentence label into its integer class id
    encoded = []
    for story in story_labels:
        encoded.append([tag2idx[pair[1]] for pair in story])

    # pad / cut at the end so every story has exactly max_len_story labels
    padded = pad_sequences(
        sequences=encoded,
        maxlen=max_len_story,
        padding='post',
        truncating='post',
        value=tag2idx["PAD"],
    )

    # add a trailing singleton axis
    return padded.reshape(-1, max_len_story, 1)

In [43]:
# preprocess the train and validation labels
train_labels = prepare_labels(stories_labels_train)
valid_labels = prepare_labels(stories_labels_valid)

train_labels.shape, valid_labels.shape

((30000, 1, 1), (10000, 1, 1))

In [None]:
# retrieve the GloVe 100d embeddings
!wget https://github.com/MuyembeM/MachineLearning/blob/main/downloads/glove.6B.100d.txt?raw=true

In [45]:
# using pre-trained word embeddings in the keras model
def get_embedding_matrix(word_index, embedding_path, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text vector file.

    Every non-empty line of the file is expected to hold a word followed by
    its vector components, separated by whitespace.

    Args:
        word_index: mapping from word to integer row index.
        embedding_path: path to the UTF-8 encoded embedding text file.
        embedding_dim: number of components per vector.

    Returns:
        Array of shape ``(num_rows, embedding_dim)``, where ``num_rows``
        covers both ``len(word_index)`` and the largest index in
        ``word_index``. Rows of words missing from the file are all zeros.

    Raises:
        ValueError: if a vector needed for ``word_index`` cannot be parsed or
            does not have ``embedding_dim`` components.
    """
    # size by the largest index as well as the entry count, so index maps
    # that start at 1 or have gaps do not overflow the matrix
    largest_index = max(word_index.values(), default=-1)
    num_words = max(len(word_index), largest_index + 1)
    embedding_matrix = np.zeros((num_words, embedding_dim))

    # explicit encoding: the platform default is not UTF-8 everywhere
    with open(embedding_path, encoding='utf-8') as f:
        for line in f:  # every line contains word followed by the vector value
            values = line.split()
            if not values:
                continue

            # only parse vectors for words we actually need instead of
            # holding the whole embedding file in memory
            row = word_index.get(values[0])
            if row is None:
                continue

            embedding_matrix[row] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

In [49]:
# construct the model
EMBEDDING_PATH = 'glove.6B.100d.txt?raw=true'
EMBEDDING_DIM = 100

def get_baseline_model():
    """Build the (uncompiled) sentence-tagging baseline model.

    Reads the notebook-level globals ``max_len_story``, ``MAX_LEN_WORD``,
    ``n_words``, ``n_tags``, ``word2idx``, ``EMBEDDING_PATH`` and
    ``EMBEDDING_DIM``. The embedding file is re-read on every call.

    Architecture: per-sentence word embedding -> per-sentence Conv1D and
    global average pooling -> bidirectional LSTM over the sentences ->
    per-sentence softmax over ``n_tags + 1`` classes (the extra class is the
    padding label).

    Returns:
        A ``tf.keras.Model`` taking input of shape
        ``(max_len_story, MAX_LEN_WORD)`` per sample.
    """
    # reference the layers through tf.keras explicitly: the only star imports
    # visible in this notebook come from tensorflow.keras.models, which is
    # not where the layer classes live
    layers = tf.keras.layers

    word_in = layers.Input(shape=(max_len_story, MAX_LEN_WORD,))

    # initialise the embedding with the pre-trained vectors; it stays trainable
    embedding_matrix = get_embedding_matrix(word2idx, EMBEDDING_PATH, EMBEDDING_DIM)
    emb_word = layers.TimeDistributed(
        layers.Embedding(
            input_dim=(n_words + 2),  # vocabulary + PAD + UNK
            output_dim=EMBEDDING_DIM,
            input_length=MAX_LEN_WORD,
            weights=[embedding_matrix],
            trainable=True,
        )
    )(word_in)

    # one feature vector per sentence
    conv_layer = layers.TimeDistributed(
        layers.Conv1D(128, 3, activation='relu')
    )(emb_word)
    conv_layer = layers.TimeDistributed(layers.GlobalAveragePooling1D())(conv_layer)

    # contextualise the sentences within their story
    main_lstm = layers.Bidirectional(
        layers.LSTM(units=32, return_sequences=True)
    )(conv_layer)
    out = layers.TimeDistributed(
        layers.Dense(n_tags + 1, activation="softmax")
    )(main_lstm)

    return tf.keras.Model([word_in], out)


In [None]:
# model summary
model = get_baseline_model()
model.summary()

In [None]:
# plot the model architecture
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
# install wandb
!pip install wandb

In [None]:
# import wandb and authenticate it
import wandb
from wandb.keras import WandbCallback

wandb.login()

In [55]:
# set up an early stopping callback to prevent overfitting
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

In [None]:
# initialize wandb
wandb.init(entity='mkmuyembe', project='text-summarizer')

In [None]:
# reinitialize, compile, and train the model
# a fresh instance is built here, separate from the one created for the
# summary / architecture-plot cells
model = get_baseline_model()
# sparse_categorical_crossentropy consumes the integer class ids directly
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# `es` monitors val_loss with patience 2 and restores the best weights, so
# training may stop before the 10 epochs are used up
model.fit([X_train],
     train_labels,
     validation_data=([X_valid], valid_labels),
     batch_size=64,
     epochs=10,
     callbacks=[es, WandbCallback()],
     verbose=1)

In [61]:
# serialize model for later use
model.save('my_model.keras')

In [63]:
def represent_single_story(df, story_id):
    """Encode one story as a fixed-size grid of word indices.

    Reads the notebook-level globals ``word2idx``, ``max_len_story`` and
    ``MAX_LEN_WORD``. The story is truncated to ``max_len_story`` sentences
    and each sentence to ``MAX_LEN_WORD`` words; the remainder is padded with
    ``word2idx['PAD']``. Words missing from ``word2idx`` are encoded as
    ``word2idx['UNK']``.

    Args:
        df: DataFrame with a ``story_id`` column and a ``sent_lab`` column
            holding ``(sentence, label)`` tuples.
        story_id: id of the story to encode.

    Returns:
        Tuple ``(X_word, stories_labels)``. ``X_word`` is a float array of
        shape ``(1, max_len_story, MAX_LEN_WORD)``; ``stories_labels`` is a
        one-element list holding the story's ``sent_lab`` tuples. An unknown
        ``story_id`` yields an all-padding array and ``[[]]``.
    """
    # get all the sentences with respect to the given story id
    story = list(df[df['story_id'] == story_id]['sent_lab'])
    stories_labels = [story]

    pad_idx = word2idx['PAD']
    unk_idx = word2idx['UNK']

    # start from an all-padding array of shape
    # (1, max_story_length, max_sentence_length) and overwrite only the
    # positions that hold real words
    X_word = np.full((1, max_len_story, MAX_LEN_WORD), pad_idx, dtype=np.float64)

    # upper bound on the number of sentences kept for the story
    for i, sent_lab in enumerate(story[:max_len_story]):
        sentence = sent_lab[0]
        if not isinstance(sentence, str):
            # non-text entries stay fully padded
            continue

        # upper bound on the number of words kept per sentence
        words = sentence.split()[:MAX_LEN_WORD]
        if words:
            # out-of-vocabulary words fall back to the UNK index rather than
            # None, which would end up as NaN in the array
            X_word[0, i, :len(words)] = [
                word2idx.get(word, unk_idx) for word in words
            ]

    return (X_word, stories_labels)

In [None]:
# get a random story id from the test set
random_test_story_id = np.random.choice(np.unique(test['story_id'].tolist()), 1)[0]
random_test_story_id

In [None]:
# preprocess the story
X_word_test_single_story, story_labels_single = represent_single_story(test, random_test_story_id)
X_word_test_single_story.shape

In [None]:
# preprocess the labels associated with the story
y_test_single = prepare_labels(story_labels_single)
y_test_single.shape

In [None]:
# run the model to predict on the preprocessed story and take an argmax
# along the last dimension
summary_predicted = model.predict(X_word_test_single_story).argmax(axis=-1)
summary_predicted.shape

In [None]:
# let's manually see the predictions and compare them with the ground truth labels
print(summary_predicted)
print(y_test_single.squeeze(axis=-1))

In [None]:
# get the positions predicted as "summary candidate" and use them to query
# the preprocessed test story
# look the class id of label 1 up in tag2idx instead of hard-coding it, so
# the selection stays correct if the label encoding changes
summary_class_idx = tag2idx[1]
idx = np.where(summary_predicted == summary_class_idx)
summary_predicted = X_word_test_single_story[idx]

In [None]:
decoded_predictions = []

# turn each selected sentence back into text
for encoded_sentence in summary_predicted:
    words = []
    for token in encoded_sentence:
        # index 0 is padding
        if token == 0:
            continue
        # drop positions that do not map to any word
        word = idx2word.get(token)
        if word is not None:
            words.append(word)

    decoded_predictions.append(' '.join(words))

In [None]:
# preview
print(f'Predicted summaries of story id {random_test_story_id}')
decoded_predictions

In [None]:
# let's now see the original summary candidates
print(f'Original summaries of story id {random_test_story_id}')
test.query('story_id == @random_test_story_id & label_sent==1')['sentence'].tolist()

In [None]:
# first preprocess the entire test set
X_test, stories_labels_test = represent_stories(test)
X_test.shape

In [None]:
test_labels = prepare_labels(stories_labels_test)
test_labels = test_labels.squeeze(-1) # to make it compatible for the evaluation metrics
test_labels.shape

In [None]:
# retrieve the predictions from all the test data points and take argmax along the last dimension
test_predictions = model.predict(X_test).argmax(axis=-1)
test_predictions.shape

In [None]:
# evaluation metrics
from sklearn.metrics import precision_score, recall_score , f1_score

In [None]:
# per-story F1 under the three averaging schemes, in the order
# macro, micro, weighted
per_story_scores = [
    [
        f1_score(test_labels[i], test_predictions[i], average=scheme)
        for scheme in ('macro', 'micro', 'weighted')
    ]
    for i in tqdm(range(0, len(test_labels)))
]

f1_macro = [scores[0] for scores in per_story_scores]
f1_micro = [scores[1] for scores in per_story_scores]
f1_weight = [scores[2] for scores in per_story_scores]

In [None]:
# create a dataframe from the numbers
df_metric = pd.DataFrame()
df_metric['f1_macro'] =  f1_macro
df_metric['f1_micro'] =  f1_micro
df_metric['f1_weight'] = f1_weight

df_metric.head()

In [None]:
train.to_csv('train.csv', index=False)
valid.to_csv('valid.csv', index=False)
test.to_csv('test.csv', index=False)

In [None]:
import pickle

# persist both vocabularies next to the model; the context managers make
# sure the files are flushed and closed once the dump is done
with open('word_dictionary.pkl', 'wb') as f:
    pickle.dump(word2idx, f)

with open('inverse_word_dictionary.pkl', 'wb') as f:
    pickle.dump(idx2word, f)