In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations_with_replacement

In [2]:
from pathlib import Path as pathl
import os, sys
import numpy as np
#append relevant file paths
new_path = pathl('.')
parent = new_path.resolve().parent
sys.path.append(str(parent))

In [3]:
from pdf_parser import pipeline

In [4]:
import spacy
import pickle
nlp = spacy.load('en_core_web_sm')
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~\n'

In [21]:
current_dir = os.walk(sys.path[-1] + '/Data/downloaded_files')
files = []
for file in current_dir:
    files.append(file[-1])
files = sum(files, [])

In [54]:
data_path = parent / 'Data/downloaded_files'
def get_text(filename, parser='pypdf'):
    'Return text from a filename'
    pdf_file = data_path / filename
    if parser == 'pypdf':
        text_dict = pipeline(filepath = str(pdf_file), parser=parser)
        text = list(text_dict.values())
        text = sum(text, [])
        text = [sentence.strip() for sentence in text]
        text = ' '.join(text)
        return text.split('.')
    elif parser == 'textract':
        text = pipeline(filepath = str(pdf_file), parser=parser)
        tab_pattern = '(\\\\[tr])*(\\\\(x0c))*'
        new_lines = []
        for line in text:
            new_line = re.sub(tab_pattern, '',line)
            new_line = new_line.strip()
            new_lines.append(new_line)
        return new_lines
    
def lemmatizer(text):
    'Lemmatizes text'
    doc = nlp.pipe(text)
    lemmatized = []
    for sentence in doc:
        sent = []
        for word in sentence:
            if str(word) in punctuation:
                continue
            lemma = word.lemma_.strip() 
            sent.append(lemma)
        
        lemmatized.append(' '.join(sent))
    return lemmatized

In [82]:
vectorizer = TfidfVectorizer(min_df=1)
corpus = []
counter = 0
file_name = []
for file in files:
    try:
        if file[-3:] == 'pdf':
            text = get_text(file, parser='textract')
            lemmatized = lemmatizer(text)
            if len(lemmatized) > 1:
                print(f'Success: {file}')
                file_name.append(file)
                corpus.append(' '.join(lemmatized))
            else:
                print(f'failed: {file}')
    except:
        print(f'failed: {file}. Not pdf')

Success: 115_HRModernSlaveryStatement.pdf
Success: 110_Slavery-Statement.pdf
Success: 128_2018-Preliminary-Modern-Slavery-Statement_FINAL.pdf
Success: 159_HUGO_BOSS_Statement_on_Xinjiang.pdf
Success: 5_FY20-Modern-Slavery-Statement-.pdf
Success: 92_modern-slavery-statements-london.pdf
Success: 135_modern-slavery-2019-annual-statement_20200325134455_72980.pdf
Success: 96_15756.pdf
Success: 14_GOZ-FY20-Modern-Slavery-Statement.pdf
Success: 107_ALDI-Australia-FY19-Modern-Slavery-Statement.pdf
Success: 11_Modern-Slavery-Statement-11-02-2021.pdf
Success: 100_IBM_Modern_Slavery_Act_Transparency_Statement_2020.pdf
Success: 164_Origin_2020_Modern_Slavery_Statement.pdf
failed: 85_Modern-Slavery-Capability-Statement-Strategic-Development-Group-digital.pdf
Success: 178_TISC-Consultation-Response_FINAL_160919.pdf
Success: 27_Accenture-Modern-Slavery-Act-Statement-Final.pdf
Success: 149_11.2-telecom-plus-plc-modern-slavery-statement-fy2021-03.2021.pdf
Success: 76_modern-slavery-report.pdf
Success: 

In [83]:
corpus = [i for i in corpus if len(i) > 1]

In [84]:
model = vectorizer.fit_transform(corpus)
dense = model.todense()

In [85]:
similarity = [cosine_similarity(model_1, model_2) for model_1, model_2 in list(combinations_with_replacement(dense, 2))]
similarity = [similarity[i][0][0] for i in range(len(similarity))]

In [86]:
similarity = np.array(similarity)
similarity = np.round(similarity, 2)
similarity_of_pdfs = list(zip(list(combinations_with_replacement(range(0,len(corpus)), 2)), similarity))
len(similarity_of_pdfs)

6555

In [87]:
import pandas as pd

In [88]:
df_list = []
for (text1, text2), similarity in similarity_of_pdfs:
    df_list.append((text1, text2, similarity))

In [89]:
df = pd.DataFrame(df_list, columns=['text_1', 'text_2', 'similarity'])

In [103]:
flipped = df[['text_2', 'text_1', 'similarity']]
flipped.rename(columns={'text_2': 'text_1', 'text_1': 'text_2'}, inplace=True)

In [104]:
combined_df = flipped.append(df)
combined_df = combined_df.drop_duplicates()

In [105]:
sim_scores_for_models = combined_df.groupby('text_1')['similarity'].mean()

In [106]:
similarity_scores = sim_scores_for_models.sort_values()

In [107]:
similarity_scores

text_1
67     0.322719
104    0.325351
43     0.331404
20     0.332982
35     0.335088
         ...   
77     0.663772
28     0.671491
98     0.678246
61     0.678246
90     0.682193
Name: similarity, Length: 114, dtype: float64

In [113]:
print('The least 10 relevant statements are the following: \n')
for i in similarity_scores[:10].index:
    print(files[i])

The least 10 relevant statements are the following: 

38_Modern-Day-Slavery-Statement.pdf
30_Modern_Slavery_Statement.pdf
170_l0Spi_the-rights-and-responsibilities-of-the-modern-univ_IQbqP.pdf
74_Modern_Slavery_Statement_FINAL_09112020.pdf
19_2019_Santos_Modern_Slavery_Statement.pdf
167_cisco-modern-slavery-statement.pdf
12_Modern-Slavery-Statement.pdf
53_avaya-uk-modern-slavery-act-transparency-statement.pdf
72_wcx-modern-slavery-statement.pdf
75_Modern-Slavery-Statement.pdf


In [114]:
print('The most 10 relevant statements are the following: \n')
for i in similarity_scores[-10:].index:
    print(files[i])

The most 10 relevant statements are the following: 

16_1191-modern-slavery-statement.pdf
115_HRModernSlaveryStatement.pdf
80_modern-slavery-health-check.pdf
143_ihg-modern-slavery-statement-2019-final.pdf
86_australia-modern-slavery-statement.pdf
77_Intega-2021-Modern-Slavery-Statement.pdf
125_aviva-modern-day-slavery-statement-2018.pdf
87_Modern_Slavery_Act_Statement.pdf
104_Hollard-Modern-Slavery-Statement-2019-20.pdf
9_modern-slavery-statement.pdf
