In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [81]:
import os, sys
from pathlib import Path as pathl
from pdf_parser import pipeline

In [95]:
import spacy
import pickle
# spaCy pipeline shared by the notebook; `lemmatizer` calls `nlp.pipe` on it
# to tokenize text and read each token's lemma.
nlp = spacy.load('en_core_web_sm')
# Characters that `lemmatizer` drops: the ASCII punctuation set plus newline.
# The backslash is spelled `\\` because `\]` is not a valid escape sequence
# in a normal string literal; the resulting string value is the same.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'

In [96]:
# Make the directory one level above the working directory importable
new_path = pathl('.')
parent = new_path.resolve().parent
sys.path.extend([str(parent)])

In [87]:
# Walk <parent>/Data (sub-folders included). The folder is derived from
# `parent` directly instead of `sys.path[-1]`, which is only the right
# directory while nothing else has been appended to sys.path.
current_dir = os.walk(parent / 'Data')
# os.walk yields (dirpath, dirnames, filenames) tuples; keep every file name,
# flattened into a single list in one pass.
files = [name for _, _, names in current_dir for name in names]

In [42]:
# Data folder next to the notebook's directory; `get_text` resolves file names against it.
data_path = parent / 'Data'
def get_text(filename):
    """Return the text extracted from a file in the data folder as one string.

    Parameters
    ----------
    filename : str
        Name of the file, resolved relative to ``data_path``.

    Returns
    -------
    str
        Every string returned by ``pipeline`` for the file, stripped of
        surrounding whitespace and joined with single spaces.
    """
    pdf_file = data_path / filename
    text_dict = pipeline(filepath=str(pdf_file))
    # NOTE(review): assumes `pipeline` returns a dict whose values are
    # sequences of strings — confirm against pdf_parser.
    # Flatten and strip in a single pass rather than concatenating the
    # per-key lists one by one.
    sentences = [
        sentence.strip()
        for chunk in text_dict.values()
        for sentence in chunk
    ]
    return ' '.join(sentences)

def lemmatizer(text):
    """Lemmatize every string in ``text``.

    The strings are streamed through ``nlp.pipe``. Tokens whose text is found
    in ``punctuation`` are dropped; the whitespace-stripped lemmas of the
    remaining tokens are joined with single spaces.

    Returns a list with one lemmatized string per processed document.
    """
    lemmatized = []
    for doc in nlp.pipe(text):
        lemmas = [
            token.lemma_.strip()
            for token in doc
            if str(token) not in punctuation
        ]
        lemmatized.append(' '.join(lemmas))
    return lemmatized

In [70]:
vectorizer = TfidfVectorizer(min_df=1)
# One lemmatized document per PDF listed in `files`
corpus = []
for file in files:
    # Skip anything whose name does not end in "pdf"
    if not file.endswith('pdf'):
        continue
    # Split on '.' so the lemmatizer receives one chunk per rough sentence
    text = get_text(file).split('.')
    lemmatized = lemmatizer(text)
    corpus.append(' '.join(lemmatized))

In [71]:
# Fit the TF-IDF vectorizer on the corpus and transform it in one step;
# the cells below index `model` by row to compare documents.
model = vectorizer.fit_transform(corpus)
# Dense copy of the same matrix.
# NOTE(review): `dense` is not used in the cells visible here.
dense = model.todense()

In [72]:
# Cosine similarity between the TF-IDF rows at index 0 and index 1 of `model`
cosine_similarity(model[0], model[1])

array([[0.62116493]])

In [73]:
# Cosine similarity between the TF-IDF rows at index 0 and index 2 of `model`
cosine_similarity(model[0], model[2])

array([[0.64637026]])

In [74]:
# Cosine similarity between the TF-IDF rows at index 1 and index 2 of `model`
cosine_similarity(model[1], model[2])

array([[0.64236199]])

In [198]:
# Example URL pointing at a PDF.
# NOTE(review): `test` is not referenced by any cell visible here — confirm it is still needed.
test = 'https://www.qantas.com/content/dam/qantas/pdfs/about-us/corporate-governance/modern-slavery-and-human-trafficking-statement.pdf'

In [226]:
# A run of non-"/" characters ending in ".pdf", i.e. the file-name part of a URL.
# Raw string so `\.` reaches the regex engine as written instead of being an
# invalid string escape.
pattern = r'[^/]+(\.pdf)'
# NOTE(review): `re` and `url_list` are not defined in the cells shown here —
# confirm they are set up earlier in the notebook.
match = re.search(pattern, url_list[0])

In [233]:
# Group 0 is the whole matched text
match[0]

'modern-slavery-and-human-trafficking-statement.pdf'

In [234]:
import urllib.request

In [236]:
# Last three characters of the matched text (the check `download_file` applies to file names)
match[0][-3:]

'pdf'

In [238]:
def download_file(download_url, filename):
    """Download ``download_url`` and write the response body to disk.

    Parameters
    ----------
    download_url : str
        URL handed to ``urllib.request.urlopen``.
    filename : str
        Destination path. ``".pdf"`` is appended unless the name already
        ends in ``pdf``.
    """
    target = filename if filename.endswith('pdf') else filename + ".pdf"
    # Read the whole body before touching the disk so a failed download does
    # not leave an empty file behind; `with` closes the connection either way.
    with urllib.request.urlopen(download_url) as response:
        payload = response.read()
    # `with` guarantees the file handle is closed even if the write fails.
    with open(target, 'wb') as file:
        file.write(payload)

# A run of non-"/" characters ending in ".pdf" is used as the local file name.
# Raw string so `\.` is not treated as an (invalid) string escape.
pattern = r'[^/]+(\.pdf)'
# Only the first URL is downloaded (`[:1]`).
for url in url_list[:1]:
    match = re.search(pattern, url)
    if match is None:
        # No ".pdf" in the URL, so there is no file name to save it under.
        continue
    title = match[0]
    download_file(url, title)

In [250]:
# Greedy match of everything up to and including the last ".pdf" in the string.
# Raw string so `\.` is not treated as an (invalid) string escape.
pattern = r'.*(\.pdf)'
match = re.search(pattern, url_list[3])

In [251]:
# Whole matched text: the URL up to and including ".pdf"
match[0]

'https://www.orica.com/ArticleDocuments/311/20200712_Orica_Modern_Slavery_Statement.pdf'

In [252]:
# Inspect the entry at index 8 of `url_list`
url_list[8]

'https://modernslaveryregister.gov.au/statements/file/cba2a76f-4097-458d-8195-11f4c56aedb7/'

In [258]:
all_links = url_list
# Raw string so `\.` is not treated as an (invalid) string escape.
pattern = r'.*(\.pdf)'
# Keep only the links in which the pattern matches, i.e. that contain ".pdf".
relevant_links = [link for link in all_links if re.search(pattern, link)]

In [270]:
from bs4 import BeautifulSoup

In [273]:
import requests
all_links = []
query = 'modern slavery statements pdf'
query = query.replace(' ', '+')
# Target URL sits between "/url?q=" and "&sa" in each result's href.
# Raw string so the regex escapes are not treated as string escapes.
href_pattern = re.compile(r'/url\?q=(.*)&sa')
# Three result pages: start offsets 0, 10 and 20.
for page_number in range(0, 30, 10):
    # The stray "'" that used to trail this URL has been removed.
    url = f"https://google.com/search?q={query}&start={page_number}&end={page_number+10}"
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    resp = requests.get(url, timeout=30)
    soup = BeautifulSoup(resp.content, "html.parser")
    # NOTE(review): 'ZINbbc' is presumably the CSS class of a result container
    # in the returned markup — confirm it still matches.
    result = soup.find_all('div', attrs={'class': 'ZINbbc'})
    links = []
    for block in result:
        if "url" not in str(block):
            continue
        anchor = block.find('a', href=True)
        if anchor is None:
            # No link inside this container; nothing to extract.
            continue
        found = href_pattern.search(str(anchor['href']))
        # In rare cases the href does not have the expected shape; skip those.
        if found is not None:
            links.append(found.group(1))
    # One list of links per page; flattened in a later cell.
    all_links.append(links)

In [275]:
# Links collected for the first page requested (start offset 0)
all_links[0]

['https://www.qantas.com/content/dam/qantas/pdfs/about-us/corporate-governance/modern-slavery-and-human-trafficking-statement.pdf',
 'https://www.orica.com/ArticleDocuments/311/20200712_Orica_Modern_Slavery_Statement.pdf.aspx',
 'https://www.fmgl.com.au/docs/default-source/default-document-library/fmg_modern-slavery-voluntary-statement.pdf',
 'https://www.goodman.com/-/media/Files/Sites/Global/Who-we-are/corporate-governance/Statements/Modern-Slavery-Statement.pdf%3Fla%3Den%26hash%3DBEB9C6B244ECCB79A6E45B76C2242CB2FF32F684',
 'https://www.suncorpgroup.com.au/uploads/FY20-Modern-Slavery-Statement-.pdf',
 'https://investors.sunrice.com.au/FormBuilder/_Resource/_module/2weQNICYSUy13FE_jxQXvg/file/sustainability-reports/SunRice_Modern_Slavery_Statement.pdf',
 'https://ampstek.com/ampstek-modernslavery.pdf',
 'https://www.bhp.com/-/media/documents/investors/annual-reports/2020/200915_bhpmodernslaverystatement2020.pdf%3Fla%3Den',
 'https://modernslaveryregister.gov.au/statements/file/cba2a76

In [276]:
# Flatten the per-page lists into a single list of links. Entries that are
# already strings are kept as they are, so re-running this cell on an
# already-flat list leaves it unchanged instead of raising.
all_links = [
    link
    for page_links in all_links
    for link in ([page_links] if isinstance(page_links, str) else page_links)
]

In [277]:
# Show the flattened list of links
all_links

['https://www.qantas.com/content/dam/qantas/pdfs/about-us/corporate-governance/modern-slavery-and-human-trafficking-statement.pdf',
 'https://www.orica.com/ArticleDocuments/311/20200712_Orica_Modern_Slavery_Statement.pdf.aspx',
 'https://www.fmgl.com.au/docs/default-source/default-document-library/fmg_modern-slavery-voluntary-statement.pdf',
 'https://www.goodman.com/-/media/Files/Sites/Global/Who-we-are/corporate-governance/Statements/Modern-Slavery-Statement.pdf%3Fla%3Den%26hash%3DBEB9C6B244ECCB79A6E45B76C2242CB2FF32F684',
 'https://www.suncorpgroup.com.au/uploads/FY20-Modern-Slavery-Statement-.pdf',
 'https://investors.sunrice.com.au/FormBuilder/_Resource/_module/2weQNICYSUy13FE_jxQXvg/file/sustainability-reports/SunRice_Modern_Slavery_Statement.pdf',
 'https://ampstek.com/ampstek-modernslavery.pdf',
 'https://www.bhp.com/-/media/documents/investors/annual-reports/2020/200915_bhpmodernslaverystatement2020.pdf%3Fla%3Den',
 'https://modernslaveryregister.gov.au/statements/file/cba2a76

In [288]:
# Raw string so `\.` is not treated as an (invalid) string escape.
pattern = r'.*(\.pdf)'
# Keep only the links in which the pattern matches, i.e. that contain ".pdf".
relevant_links = [link for link in all_links if re.search(pattern, link)]

In [291]:
# A run of non-"/" characters ending in ".pdf" (the file-name part of a URL).
# Raw string so `\.` is not treated as an (invalid) string escape.
pattern = r'[^/]+(\.pdf)'
count = 0

# Print the links that passed the ".pdf" filter
print(relevant_links)

['https://www.qantas.com/content/dam/qantas/pdfs/about-us/corporate-governance/modern-slavery-and-human-trafficking-statement.pdf', 'https://www.orica.com/ArticleDocuments/311/20200712_Orica_Modern_Slavery_Statement.pdf.aspx', 'https://www.fmgl.com.au/docs/default-source/default-document-library/fmg_modern-slavery-voluntary-statement.pdf', 'https://www.goodman.com/-/media/Files/Sites/Global/Who-we-are/corporate-governance/Statements/Modern-Slavery-Statement.pdf%3Fla%3Den%26hash%3DBEB9C6B244ECCB79A6E45B76C2242CB2FF32F684', 'https://www.suncorpgroup.com.au/uploads/FY20-Modern-Slavery-Statement-.pdf', 'https://investors.sunrice.com.au/FormBuilder/_Resource/_module/2weQNICYSUy13FE_jxQXvg/file/sustainability-reports/SunRice_Modern_Slavery_Statement.pdf', 'https://ampstek.com/ampstek-modernslavery.pdf', 'https://www.bhp.com/-/media/documents/investors/annual-reports/2020/200915_bhpmodernslaverystatement2020.pdf%3Fla%3Den', 'https://www.anz.com.au/content/dam/anzcomau/documents/pdf/aboutus/mo