In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations_with_replacement

In [2]:
from pathlib import Path as pathl
import os, sys
import numpy as np
#append relevant file paths
new_path = pathl('.')
parent = new_path.resolve().parent
sys.path.append(str(parent))

In [3]:
from pdf_parser import pipeline

In [4]:
import spacy
import pickle
nlp = spacy.load('en_core_web_sm')
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~\n'

In [5]:
current_dir = os.walk(sys.path[-1] + '/Data/downloaded_files')
files = []
for file in current_dir:
    files.append(file[-1])
files = sum(files, [])

In [11]:
data_path = parent / 'Data/downloaded_files'
def get_text(filename, parser='pypdf'):
    'Return text from a filename'
    pdf_file = data_path / filename
    if parser == 'pypdf':
        text_dict = pipeline(filepath = str(pdf_file), parser=parser)
        text = list(text_dict.values())
        text = sum(text, [])
        text = [sentence.strip() for sentence in text]
        text = ' '.join(text)
        return text
    elif parser == 'textract':
        text = pipeline(filepath = str(pdf_file), parser=parser)
        return text
    
def lemmatizer(text):
    'Lemmatizes text'
    doc = nlp.pipe(text)
    lemmatized = []
    for sentence in doc:
        sent = []
        for word in sentence:
            if str(word) in punctuation:
                continue
            lemma = word.lemma_.strip() 
            sent.append(lemma)
        
        lemmatized.append(' '.join(sent))
    return lemmatized

In [12]:
get_text(files[0], parser='textract')

["b'Modern Slavery Statement Introduction This statement sets out Lovehoney Group Ltd and all Group Companies actions to understand all potential modern slavery risks related to its business and to put in place steps that are aimed at ensuring that there is no slavery or human trafficking in its own business and its supply chains",
 'This statement relates to actions and activities during the financial year 1st April 2018 to 31st March 2019',
 'As the UKs biggest online adult retailer, Lovehoney recognises that it has a responsibility to take a robust approach to slavery and human trafficking',
 'The Organisation is absolutely committed to preventing slavery and human trafficking in its corporate activities, and to ensuring that its supply chains are free from slavery and human trafficking',
 'Organisation Structure Lovehoney are the sexual happiness people, and we make a fun, fulfilling sex life available to everyone',
 'Were the UKs biggest online adult retailer and 1 in 3 sex toys s

In [52]:
vectorizer = TfidfVectorizer(min_df=1)
corpus = []
counter = 0
file_name = []
for file in files[:10]:
    if file[-3:] == 'pdf':
        try:
            text = get_text(file)
            text = text.split('.')
            lemmatized = lemmatizer(text)
            if len(lemmatized) > 1:
                print(f'Success: {file}')
                file_name.append(file)
                corpus.append(' '.join(lemmatized))
        except:
            print(f'Failed: {counter}, name: {file}')
            counter+=1

Failed: 0, name: 115_HRModernSlaveryStatement.pdf
Failed: 1, name: 110_Slavery-Statement.pdf
Failed: 2, name: 128_2018-Preliminary-Modern-Slavery-Statement_FINAL.pdf
Failed: 3, name: 159_HUGO_BOSS_Statement_on_Xinjiang.pdf
Failed: 4, name: 5_FY20-Modern-Slavery-Statement-.pdf
Failed: 5, name: 92_modern-slavery-statements-london.pdf
Failed: 6, name: 135_modern-slavery-2019-annual-statement_20200325134455_72980.pdf
Failed: 7, name: 96_15756.pdf
Failed: 8, name: 14_GOZ-FY20-Modern-Slavery-Statement.pdf
Failed: 9, name: 107_ALDI-Australia-FY19-Modern-Slavery-Statement.pdf


In [31]:
len(file_name)

60

In [32]:
corpus = [i for i in corpus if len(i) > 1]

In [33]:
model = vectorizer.fit_transform(corpus)
dense = model.todense()

In [34]:
similarity = [cosine_similarity(model_1, model_2) for model_1, model_2 in list(combinations_with_replacement(dense, 2))]
similarity = [similarity[i][0][0] for i in range(len(similarity))]

In [35]:
similarity = np.array(similarity)
similarity = np.round(similarity, 2)
similarity_of_pdfs = list(zip(list(combinations_with_replacement(range(0,len(corpus)), 2)), similarity))
len(similarity_of_pdfs)

1830

In [36]:
import pandas as pd

In [37]:
df_list = []
for (text1, text2), similarity in similarity_of_pdfs:
    df_list.append((text1, text2, similarity))

In [38]:
df = pd.DataFrame(df_list, columns=['text_1', 'text_2', 'similarity'])

In [39]:
flipped = df[['text_2', 'text_1', 'similarity']]
flipped.rename(columns={'text_2': 'text_1', 'text_1': 'text_2'}, inplace=True)

In [40]:
combined_df = flipped.append(df)

In [41]:
sim_scores_for_models = combined_df.groupby('text_1')['similarity'].mean()

In [42]:
similarity_scores = sim_scores_for_models.sort_values()

In [49]:
file_name[18]

'19_2019_Santos_Modern_Slavery_Statement.pdf'

In [48]:
corpus[18]

'be responsible for deliver the strategic direction and goal  approve by the Board the CEO be responsible for instill a culture that align with the company ™ s value  10  Modern Slavery Statement 2019  stage 2 end 2019 stage 3  2021 Management Systems Risk Management Procurement and Supply Chain Human Resources Recruitment Customers and Stakeholders Actions take to assess and address risk Modern Slavery Statement 2019  17  Year 2 2020 Monitoring Year 3 2021 e˚ectiveness review and monitor modern slavery action road map Modern Slavery Statement 2019  19  20   Modern Slavery statement 2019'

In [43]:
similarity_scores

text_1
10    0.038361
36    0.127705
18    0.238197
20    0.248197
11    0.270164
51    0.274098
23    0.299344
6     0.304426
55    0.310492
40    0.317541
38    0.323443
3     0.340328
14    0.345902
58    0.350000
43    0.358689
8     0.362459
25    0.367377
2     0.377541
52    0.379836
13    0.388525
41    0.390164
4     0.390164
24    0.397869
42    0.411967
9     0.421475
17    0.422131
27    0.424590
33    0.427705
28    0.430820
56    0.434754
15    0.436393
37    0.436557
21    0.440820
53    0.447869
54    0.449508
45    0.458852
47    0.460492
57    0.460820
35    0.461475
5     0.469016
34    0.483607
1     0.490000
12    0.490328
30    0.492131
7     0.492295
29    0.494262
50    0.502787
22    0.503443
0     0.506066
32    0.506721
26    0.513607
39    0.513607
48    0.528852
59    0.533115
19    0.533279
49    0.543115
46    0.552787
16    0.553279
31    0.560492
44    0.574754
Name: similarity, dtype: float64