<a href="https://colab.research.google.com/github/MuyembeM/MachineLearning/blob/main/TextPreprocessor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# dependencies
from tqdm.notebook import tqdm
from pickle import load
import pandas as pd
import numpy as np
import string

In [5]:
# load stories and summaries' list
stories = load(open('stories.pkl','rb'))
print('Loaded Stories %d' % len(stories))

Loaded Stories 92579


In [6]:
# clean a list of lines
def clean_lines(lines):
  cleaned = list()
  # prepare a translation table to remove punctuation
  table = str.maketrans('', '', string.punctuation)
  for line in lines:
    # strip source cnn if it exists
    index = line.find('(CNN) -- ')
    if index > -1:
      line = line[index+len('(CNN)'):]
    # tokenize on white space
    line = line.split()
    # convert to lower case
    line = [word.lower() for word in line]
    # remove punctuation from each token
    line = [w.translate(table) for w in line]
    # remove tokens with numbers in them
    line = [word for word in line if word.isalpha()]
    # store as string
    cleaned.append(''.join(line))
  # remove empty strings
  cleaned = [c for c in cleaned if len(c) > 0]
  return cleaned

In [7]:
# clean the stories and summaries
for example in tqdm(stories):
  example['story'] = clean_lines(example['story'].split('\\n'))
  example['highlights'] = clean_lines(example['highlights'])

  0%|          | 0/92579 [00:00<?, ?it/s]

In [8]:
# install the Rouge module for calculating the Rouge scores
!pip install -q Rouge

In [9]:
# import the Rouge module and instantiate it
from rouge import Rouge
rouge = Rouge()

In [10]:
# utility for calculating Rouge score between pairs of sentences
def get_rouge_fl(references, sentence):
  score_ls = []
  for ans in references:
    scores = rouge.get_scores(ans, sentence)
    score_ls.append(scores[0]['rouge-l']['f'])
  return max(score_ls)

In [12]:
def get_list_ans_each_story(story_inp, references_inp):
  scr = []
  hyp = []
  # iterate through each sentence of a given story
  for i in range(0, len(story_inp)):
    # calculate Rouge score between the current sentence and the provided
    #(abstractive summaries)
    hypothesis = story_inp[i]
    scores = get_rouge_fl(references_inp, hypothesis)

    # track sentences iterated and store their scores
    hyp.append(hypothesis)
    scr.append(scores)
  # convert to NumPy array
  hyp1 = np.array(hyp)
  # sort the scores to get the indices
  scr1 = np.array(scr)
  scr2 = np.sort(scr)[::-1]
  ind  = np.argsort(scr)[::-1]
  # take top 5
  ind1 = ind[0:5]
  list_ref = list(hyp1[ind1])
  return list_ref, scr2[0:5]

In [13]:
dict_id_summary = {}
dict_id_score = {}
# iterate through each story
for s_id in tqdm(range(0, len(stories))):
  # story inputs (each sentence of a story)
  story_inp = stories[s_id]['story']
  # reference inputs (abstractive summaries)
  references_inp = stories[s_id]['highlights']
  # get the list of references and scores
  list_ref, list_score = get_list_ans_each_story(story_inp, references_inp)
  # store the results in the dictionaries
  dict_id_summary[s_id] = list_ref
  dict_id_score[s_id] = list_score

  0%|          | 0/92579 [00:00<?, ?it/s]

In [14]:
story_id = []
label_sent = []
sent_id = []
list_sent = []
# iterate through each story
for i in tqdm(range(0, len(stories))):
  # list of references for the story
  list_ref = dict_id_summary[i]
  # iterate through each sentence of the current story
  for j, story in enumerate(stories[i]['story']):
    # check if the story is in the list reference
    ind =  int(story in list_ref)
    # append the indicator as the labels
    label_sent.append(ind)
    # 1. append the sentences per story\n
    list_sent.append(story)
    # 2. append the sent_ids
    sent_id.append(j)
    # 3. append the story_id
    story_id.append(i)

  0%|          | 0/92579 [00:00<?, ?it/s]

In [15]:
# create the dataframe
df_story_summary = pd.DataFrame()
df_story_summary['story_id'] = story_id
df_story_summary['sent_id'] = sent_id
df_story_summary['sentence'] = list_sent
df_story_summary['label_sent'] = label_sent

In [16]:
# preview the dataframe
df_story_summary.head()

Unnamed: 0,story_id,sent_id,sentence,label_sent
0,0,0,justdaysaftersuggestinglawmakersmightnotdebate...,1
1,1,0,mexicanauthoritieshavearrestedareputedseniorme...,1
2,2,0,acaliforniarepublicanofficialhasapologizedfors...,1
3,3,0,itsnotoverinsteubenvilleohioalthoughajudgehasf...,1
4,4,0,libyanauthoritieshavemademorearrestsinconnecti...,1


In [17]:
# serialize
df_story_summary.to_pickle('dataframe_extractive.pkl')