# Dissertation

# Data Wrangling

## Importing and installing the necessary libraries

In [None]:
# Install the sentence-transformers package, which provides the SentenceTransformer class imported below.
# NOTE(review): the version is not pinned; the recorded run resolved to sentence-transformers 2.0.0 —
# consider pinning it for reproducibility.
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 12.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 51.6 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.0.15-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 2.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.4 MB/s 
Collecting huggingface-hub
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |██████████████

In [None]:
# Download the large English spaCy model that is loaded later with spacy.load("en_core_web_lg")
!python -m spacy download en_core_web_lg
# Restart the session after this cell finishes
# (presumably so the freshly installed model package becomes importable — TODO confirm)

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.3 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180943 sha256=62e1ecd5c8d46d0d8dcbd9c84a2b2e5f58b5dde61773c0333c5301b1313dce26
  Stored in directory: /tmp/pip-ephem-wheel-cache-nvq19sh1/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
# Importing necessary libraries
from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle 
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

## Importing Personal Drive to access the data

In [None]:
# Mount Google Drive to this Notebook instance.
# The dataset read later in this notebook lives under /content/drive/MyDrive/Data.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Splitting each description and TLDR into list of sentences 

The descriptions and TLDRs are parsed with spaCy's large English model (`en_core_web_lg`).
They are then split into sentences using spaCy's `.sents` attribute. Because spaCy's token objects carry a lot of memory overhead, each sentence is converted to a plain string before it is embedded.

BERT is sophisticated enough to parse meaning from the nuances of language, so preprocessing steps such as stop-word removal, stemming and lower-casing are purposely skipped.


In [None]:
# Loading the dataset into a pandas dataframe
# (the cells below read its 'document_text' and 'summary_text' columns)
# NOTE(review): this was described as the clean dataset, but the file name ends in "_raw" — confirm which it is
file = '/content/drive/MyDrive/Data/SummaDevDocs_raw.csv'
df = pd.read_csv(file)

In [None]:
# Using spaCy to split text into sentences, clean up short sentences and embed using BERT via the sentence-transformer package
### Helper function

# Lazily-populated cache for the default models. They are loaded at most once,
# and only if a caller actually relies on the defaults. Previously both models
# were loaded as default-argument values, i.e. every time this function was
# defined, even when the caller supplied its own nlp / embedder.
_DEFAULT_MODELS = {}


def _get_default_nlp():
    '''Returns the default spaCy pipeline, loading it on first use'''
    if "nlp" not in _DEFAULT_MODELS:
        _DEFAULT_MODELS["nlp"] = spacy.load("en_core_web_lg")
    return _DEFAULT_MODELS["nlp"]


def _get_default_embedder():
    '''Returns the default sentence embedder, loading it on first use'''
    if "embedder" not in _DEFAULT_MODELS:
        _DEFAULT_MODELS["embedder"] = SentenceTransformer('distilbert-base-nli-mean-tokens')
    return _DEFAULT_MODELS["embedder"]


def text_to_sent_list(text, 
                      nlp = None, 
                      embedder = None,
                      min_len=0):
  
    ''' Returns cleaned article sentences and BERT sentence embeddings

    Parameters
    ----------
    text : the string to split into sentences
    nlp : callable turning `text` into an object exposing `.sents`;
          defaults to spaCy's "en_core_web_lg" pipeline (loaded on first use)
    embedder : object with an `encode(list_of_str, ...)` method;
               defaults to SentenceTransformer('distilbert-base-nli-mean-tokens')
               (loaded on first use)
    min_len : a sentence is kept only if its length (len() of the sentence
              span, before conversion to a string) is strictly greater than this

    Returns
    -------
    (sents_clean, sents_embedding) : the kept sentences as a list of strings and
    a numpy array holding the embedder's output for those sentences
    '''

    if nlp is None:
        nlp = _get_default_nlp()
    if embedder is None:
        embedder = _get_default_embedder()

    #convert to list of sentences
    doc = nlp(text)
    sents = list(doc.sents)
    #remove short sentences by threshold (keep plain strings only)
    sents_clean = [sentence.text for sentence in sents if len(sentence) > min_len]
    #remove entries whose text is empty
    sents_clean = [sentence for sentence in sents_clean if len(sentence) != 0]
    #embed sentences (default uses BERT SentenceTransformer)
    #ask the embedder for numpy output directly: wrapping a tensor in np.array()
    #fails when the tensor lives on a GPU
    sents_embedding = np.asarray(embedder.encode(sents_clean, convert_to_numpy=True))

    return sents_clean, sents_embedding

# Instantiate the spaCy pipeline and the sentence embedder a single time so every row reuses them
nlp = spacy.load("en_core_web_lg")
embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Row-wise worker: sentence-split one string and embed the resulting sentences
f = lambda text: text_to_sent_list(text, nlp=nlp, embedder=embedder, min_len=0)

# (source column, sentence-list column, embedding column), handled in order:
# the article TEXT first, then the article SUMMARY
for raw_col, clean_col, embedding_col in (
        ('document_text', 'text_clean', 'text_embedding'),
        ('summary_text', 'summary_clean', 'summary_embedding')):
    # each entry is a (sentence list, embedding array) pair
    s_interim_tuple = df[raw_col].apply(f)
    df[clean_col] = s_interim_tuple.apply(lambda pair: pair[0])
    df[embedding_col] = s_interim_tuple.apply(lambda pair: pair[1])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=690.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3968.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=550.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=122.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=265486777.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=53.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466081.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=450.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=190.0, style=ProgressStyle(description_…




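A cosine-similarity matrix is calculated between the article sentences and the summary sentences: each entry is the similarity between one article sentence and one summary sentence. For every summary sentence, the article sentence with the highest similarity is labelled 1; all other article sentences are labelled 0. scikit-learn's built-in cosine similarity function is used.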

In [None]:
# Calculate the cosine-similarity matrix between each summary sentence and the article sentences
#helper function
def find_sim_single_summary(summary_sentence_embed, doc_emedding):
    '''returns, for each summary sentence, the index of the document sentence with the highest cosine similarity'''
    # rows follow doc_emedding, columns follow summary_sentence_embed
    similarity = cosine_similarity(doc_emedding, summary_sentence_embed)
    # column-wise argmax -> one document-sentence index per summary sentence
    return similarity.argmax(axis=0)

#main function
def label_sent_in_summary(s_text, s_summary):
    '''returns index list and binary target labels in an array

    Parameters
    ----------
    s_text : sequence (e.g. pandas Series or list) with one 2-D embedding array
             per document, one row per document sentence
    s_summary : sequence of the same length with one 2-D embedding array per
                summary, one row per summary sentence

    Returns
    -------
    idx_list : list with, per document, the sorted array of document-sentence
               indices that best match each summary sentence
    labels : list with, per document, a float array of length
             (number of document sentences) holding 1 at those indices, 0 elsewhere

    Raises
    ------
    ValueError : if s_text and s_summary do not contain the same number of entries
    '''
    # Pair documents and summaries by POSITION. Indexing a pandas Series with
    # s[j] is label-based, so it breaks (or silently misaligns) as soon as the
    # index is not the default 0..n-1, e.g. after filtering rows.
    doc_embeddings = list(s_text)
    summary_embeddings = list(s_summary)
    if len(doc_embeddings) != len(summary_embeddings):
        raise ValueError(
            "s_text and s_summary must have the same length, got "
            f"{len(doc_embeddings)} and {len(summary_embeddings)}"
        )

    idx_list = []
    labels = []
    for doc_embed, summary_embed in zip(doc_embeddings, summary_embeddings):
        #calc idx for most similar
        idx = np.sort(find_sim_single_summary(summary_embed, doc_embed))
        #initialize zeros, then flag the selected sentences
        doc_labels = np.zeros(doc_embed.shape[0])
        doc_labels[idx] = 1
        idx_list.append(idx)
        labels.append(doc_labels)

    return idx_list, labels

# Label every article sentence: 1 when it is the best match for some summary sentence, 0 otherwise
idx_list, labels = label_sent_in_summary(df['text_embedding'], df['summary_embedding'])

# Keep the per-document label arrays alongside the rest of the data
df['labels'] = labels

In [None]:
# View the data (now including the 'labels' column added above)
df

Unnamed: 0,document_text,summary_text,text_clean,text_embedding,summary_clean,summary_embedding,labels
0,The training improved women’s knowledge on the...,The training improved women’s knowledge on the...,[The training improved women’s knowledge on th...,"[[0.76747984, -0.18944956, 0.51285803, -0.0211...",[The training improved women’s knowledge on th...,"[[0.76747984, -0.18944956, 0.51285803, -0.0211...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Illegal oil refining in the Niger Delta is inc...,CEHRD in an effort to create awareness on the ...,[Illegal oil refining in the Niger Delta is in...,"[[-0.34167996, -0.6055787, -0.20679495, -1.148...",[CEHRD in an effort to create awareness on the...,"[[-0.436482, -0.07113252, -0.18081762, -0.4864...","[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,CEHRD successfully set-up 4 formal and 4 infor...,CEHRD set-up the environmental clubs with the ...,[CEHRD successfully set-up 4 formal and 4 info...,"[[-0.4896432, -1.2085572, 1.0374498, 0.0213696...",[CEHRD set-up the environmental clubs with the...,"[[-0.15335679, -0.2943071, 0.58692193, -1.1263...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,PROJECT NAME : Delivering Accelerated Family P...,Delivering Accelerated Family Planning in Paki...,"[PROJECT NAME :, Delivering Accelerated Family...","[[-0.23698464, 0.15983887, -0.07119872, -1.200...",[Delivering Accelerated Family Planning in Pak...,"[[-0.50266284, -1.2923898, 0.42068344, -1.2567...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Road traffic injuries are world's eighth leadi...,The Cardiff Trauma Pack Research and Develop...,[Road traffic injuries are world's eighth lead...,"[[-0.466682, -1.1917696, 0.99453795, -0.957597...",[ The Cardiff Trauma Pack Research and Develo...,"[[-0.4679709, -0.43502185, 0.83883774, -1.4754...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...
2980,Metta has been responding to the humanitarian ...,"ProjectGoal: To improve the condition of 2,854...",[Metta has been responding to the humanitarian...,"[[-0.6875753, -1.0828757, 0.32236812, -1.33987...","[ProjectGoal:, To improve the condition of 2,8...","[[-0.5820974, 0.02791187, 0.2928526, -1.098855...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ..."
2981,"Destined Women is local not for profit\, non-r...",GOAL: Contribute towards changing the socioeco...,"[Destined Women is local not for profit\,, non...","[[-0.91105515, -0.780988, -0.013189635, -0.504...",[GOAL: Contribute towards changing the socioec...,"[[0.06080835, -0.2223898, 0.48705336, -1.54801...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2982,The project will empower secondary school stud...,The project will empower secondary school stud...,[The project will empower secondary school stu...,"[[0.2446245, -0.58844894, 1.0700818, -0.348946...",[The project will empower secondary school stu...,"[[0.31522802, -0.6220454, 1.084553, -0.3928656...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2983,The project has been specifically designed to ...,The project has been specifically designed to ...,[The project has been specifically designed to...,"[[0.22308932, -0.042834148, 0.5815844, -0.4814...",[The project has been specifically designed to...,"[[0.24488513, -0.04303686, 0.55787843, -0.5100...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


## Adding doc_label manually

In [None]:
# Computing the number of sentences in every single document:
# the length of each entry in df['labels'] (one label per sentence)
num_sentences = [len(sentence_labels) for sentence_labels in df['labels']]

# Creating the list of document numbers, 0 .. (number of documents - 1).
# The count is derived from the data instead of being hard-coded (it used to be
# 2985), so this cell keeps working if the dataset changes size.
document_number = list(range(len(num_sentences)))

# Building a list of lists: for each document, its document number repeated once
# per sentence. This allows to track back each sentence to its original document.
# A comprehension is used so that no loop variable leaks out of this cell; in
# particular the `labels` list created by the labelling step is not overwritten.
doc_label = [[doc_id] * n_sent for doc_id, n_sent in zip(document_number, num_sentences)]

# Adding the document labels as a column in the main dataframe 
df['doc_label'] = doc_label

In [None]:
# View the data with the updated column ('doc_label')
df

Unnamed: 0,document_text,summary_text,text_clean,text_embedding,summary_clean,summary_embedding,labels,doc_label
0,The training improved women’s knowledge on the...,The training improved women’s knowledge on the...,[The training improved women’s knowledge on th...,"[[0.76747984, -0.18944956, 0.51285803, -0.0211...",[The training improved women’s knowledge on th...,"[[0.76747984, -0.18944956, 0.51285803, -0.0211...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Illegal oil refining in the Niger Delta is inc...,CEHRD in an effort to create awareness on the ...,[Illegal oil refining in the Niger Delta is in...,"[[-0.34167996, -0.6055787, -0.20679495, -1.148...",[CEHRD in an effort to create awareness on the...,"[[-0.436482, -0.07113252, -0.18081762, -0.4864...","[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,CEHRD successfully set-up 4 formal and 4 infor...,CEHRD set-up the environmental clubs with the ...,[CEHRD successfully set-up 4 formal and 4 info...,"[[-0.4896432, -1.2085572, 1.0374498, 0.0213696...",[CEHRD set-up the environmental clubs with the...,"[[-0.15335679, -0.2943071, 0.58692193, -1.1263...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
3,PROJECT NAME : Delivering Accelerated Family P...,Delivering Accelerated Family Planning in Paki...,"[PROJECT NAME :, Delivering Accelerated Family...","[[-0.23698464, 0.15983887, -0.07119872, -1.200...",[Delivering Accelerated Family Planning in Pak...,"[[-0.50266284, -1.2923898, 0.42068344, -1.2567...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,Road traffic injuries are world's eighth leadi...,The Cardiff Trauma Pack Research and Develop...,[Road traffic injuries are world's eighth lead...,"[[-0.466682, -1.1917696, 0.99453795, -0.957597...",[ The Cardiff Trauma Pack Research and Develo...,"[[-0.4679709, -0.43502185, 0.83883774, -1.4754...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
...,...,...,...,...,...,...,...,...
2980,Metta has been responding to the humanitarian ...,"ProjectGoal: To improve the condition of 2,854...",[Metta has been responding to the humanitarian...,"[[-0.6875753, -1.0828757, 0.32236812, -1.33987...","[ProjectGoal:, To improve the condition of 2,8...","[[-0.5820974, 0.02791187, 0.2928526, -1.098855...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...","[2980, 2980, 2980, 2980, 2980, 2980, 2980, 298..."
2981,"Destined Women is local not for profit\, non-r...",GOAL: Contribute towards changing the socioeco...,"[Destined Women is local not for profit\,, non...","[[-0.91105515, -0.780988, -0.013189635, -0.504...",[GOAL: Contribute towards changing the socioec...,"[[0.06080835, -0.2223898, 0.48705336, -1.54801...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2981, 2981, 2981, 2981, 2981, 2981, 2981, 298..."
2982,The project will empower secondary school stud...,The project will empower secondary school stud...,[The project will empower secondary school stu...,"[[0.2446245, -0.58844894, 1.0700818, -0.348946...",[The project will empower secondary school stu...,"[[0.31522802, -0.6220454, 1.084553, -0.3928656...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[2982, 2982, 2982, 2982, 2982, 2982, 2982]"
2983,The project has been specifically designed to ...,The project has been specifically designed to ...,[The project has been specifically designed to...,"[[0.22308932, -0.042834148, 0.5815844, -0.4814...",[The project has been specifically designed to...,"[[0.24488513, -0.04303686, 0.55787843, -0.5100...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[2983, 2983, 2983, 2983, 2983, 2983]"


In [None]:
# Exporting the file as a pickle
# NOTE: the file name is relative, so the pickle is written to the current working
# directory rather than to the mounted Drive folder the CSV was read from.
# NOTE(review): "preprocesssed" is spelled with three s's; any code that reads this
# file must use the same name — confirm before renaming.
output_file = 'SummaDevDocs_preprocesssed.pickle'

# Binary write mode is required by pickle
with open(output_file, 'wb') as handle:                                     
    pickle.dump(df, handle)