In [1]:
import PyPDF2 as ppdf2
import PyPDF4 as ppdf4
import pdfrw
import pdfminer.high_level
import requests
from pprint import pprint
from bs4 import BeautifulSoup
from zipfile import ZipFile
from tqdm.notebook import tqdm
import os
import random
from pdfminer.layout import LAParams
from io import StringIO
import shutil
import json
import pandas as pd
import re

DATA_PATH = "../data"

# Retrieving the documents

In [3]:
# scrape the etendering search result website for tender ids
search_results_url = "https://etendering.ted.europa.eu/cft/cft-search.html?_caList=1&_procedureTypeForthcoming=1&_procedureTypeOngoing=1&caList=67&closingDateFrom=&closingDateTo=&confirm=Search&procedureTypeForthcoming=&procedureTypeOngoing=&startDateFrom=&startDateTo=&status=&text=&maxResults=250"
res = requests.get(search_results_url)
soup = BeautifulSoup(res.content, 'html.parser')

table = soup.find(id="row")
tbody = table.find("tbody")
rows = tbody.find_all("tr")

cftIds = []
for row in rows:
    cftId = row.find_all("td")[1].find("a")["href"].split("cftId")[-1][1:]
    cftIds.append(cftId)
print(len(cftIds), " cftIds found!")

128  cftIds found!


In [54]:
# get all the zip files containing tender documents
base_zip_url = "https://etendering.ted.europa.eu/document/archive-download.html?cftId=[CFTID]&lngIso=en"

for i, cftId in tqdm(enumerate(cftIds), total=len(cftIds)):
    url = base_zip_url.replace("[CFTID]", cftId)
    
    with open(DATA_PATH + "/ZIP/" + cftId + ".zip", "wb") as f:
        f.write(requests.get(url).content)
        
print(i, " zips downloaded!")

  0%|          | 0/128 [00:00<?, ?it/s]

127  zips downloaded!


In [3]:
def contains_annex_1(name):
    """
    Determines whether filename contains any variation of Annex 1
    """
    
    for s in ["Annex I", "Annex 01", "Annex 1", "ANNEX I"]:
        res = name.find(s)

        if res != -1:
            next_char = name[res + len(s)]
            if next_char in [' ', '_', '-'] and "Appendix" not in name:
                return True
    return False

def contains_annex_2(name):
    """
    Determines whether filename contains any variation of Annex 2
    """
    
    for s in ["Annex II", "Annex 02", "Annex 2", "ANNEX II"]:
        res = name.find(s)

        if res != -1:
            next_char = name[res + len(s)]
            if next_char in [' ', '_', '-'] and "Appendix" not in name:
                return True
    return False

# extract the relevant documents
counter = 0
for i, cftId in enumerate(cftIds):
    with ZipFile(DATA_PATH + "/ZIP/" + cftId + ".zip", "r") as f:
        namelist = f.namelist()
        
        found_names_A1 = []
        found_names_A2 = []
        for name in namelist:
            if "Replaced versions" in name:
                # subfolder containing oder versions, ignore
                continue
            
            if contains_annex_2(name) and name[-3:] == "pdf":
                found_names_A2.append(name)
            elif contains_annex_1(name) and name[-3:] == "pdf":
                found_names_A1.append(name)
                
        if len(found_names_A1 + found_names_A2) > 0:
            # if there are multiple docs, choose whichever was found first
            name = (found_names_A2 + found_names_A1)[0]
            
            # extract and rename relevant pdfs
            f.extract(name, path=DATA_PATH + "/PDF/")
            os.rename(DATA_PATH + "/PDF/" + name, DATA_PATH + "/PDF/" + cftId + " - " + name)
            counter += 1
        
        if len(found_names_A1 + found_names_A2) == 0: print(f"No Annex 2 or Annex 1 found in {cftId}.zip!")
        if len(found_names_A2) > 1: print(f"Multiple annex 2 found in {cftId}.zip!")
        if len(found_names_A1) > 1: print(f"Multiple annex 1 found in {cftId}.zip!")
print(f"{counter} pdf's extracted")

# Extracting text

### Extracting full text

In [None]:
# skip because its not the right document
skip = ["2800 - EN-Annex II_declaration on honour_OP_529_2017_RS.pdf", "3095 - EN-Annex II MarInt Standard Reply Form_1187.pdf",
        "7317 - EN-Annex II - Declaration on Honour - FSA I.pdf",
        "7952 - EN-Annex II - Personnel requirements.pdf"]
# skip because they are scanned documents and can't easily be processed
skip += ["1811 - EN-Annex 1 - Tender Specifications.pdf", "2189 - EN-Annex II ToR part B and part A.pdf"]

# save the dates of all documents
date_dict = {}
for row in rows:
    cftId = row.find_all("td")[1].find("a")["href"].split("cftId")[-1][1:]
    date_dict[cftId] = row.find(id="cft.search.start_date").text.replace("/", "-")

# create dataframe
data = pd.DataFrame(columns=[
    "filename",
    "date",
    "full_text",
    "selected_sections"
])

# fill dataset with full text and date
for pdf_filename in tqdm(os.listdir(DATA_PATH + "/PDF")):
    if pdf_filename in skip:
        continue
        
    # load the full text of a given tender document
    pdf_path = DATA_PATH + '/PDF/' + pdf_filename
    full_text = pdfminer.high_level.extract_text(pdf_path)
    cftId = pdf_filename.split(" -")[0]
    
    # add document to dataset
    data = data.append({
        "filename" : pdf_filename,
        "date" : date_dict[cftId],
        "full_text" : full_text,
        "selected_sections" : None
    }, ignore_index=True)

In [77]:
data.to_pickle(DATA_PATH + "/processed/dataset_TENDOC.pkl")

In [40]:
data = pd.read_pickle(DATA_PATH + "/processed/dataset_TENDOC.pkl")

### Extracting contents by section

In [4]:
def get_content_from_section(section, full_text, section_matches):
    """
    Returns the contents of a section given the section, full text and section indices.
    """
    
    for i, (s, m) in enumerate(section_matches):
        if s == section:
            # starting index
            current_index = m
            
            # ending index
            if i != len(section_matches) - 1:
                next_index = section_matches[i + 1][1]
            else:
                # the section to find is the last section
                next_index = len(full_text)
            return full_text[current_index:next_index]
    return None

In [144]:
# load the dictionary containing the section names of the documents
with open(DATA_PATH + "/JSON/section_names.json", "r", encoding='utf-8') as f:
    section_names = json.load(f)

# save the contents of all sections of all pdfs to a dict
pdf_contents = {}
n = 0
for pdf_filename in tqdm(os.listdir(DATA_PATH + "/PDF")):
    if pdf_filename in skip:
        continue
        
    # get the full text of a document
    full_text = data[data["filename"] == pdf_filename]["full_text"].iloc[0]

    # find the indices of the section headers in the full text
    section_matches = []
    section_strings = section_names["files"][pdf_filename]["sections"]
    for section in list(section_strings):
        # find index of section header
        if section_strings[section] is None:
            # if section header can be found using regex, use regex
            regex = "(\d|I|V)\.?\s+?[WORD]\s+?\n".replace("[WORD]", re.escape(section))
            matches = list(re.finditer(regex, full_text))
            
            if len(matches) == 0:
                n += 1
                print(n, "- Cannot find any matches for: ", section, " in ", pdf_filename, "using regex")
                continue

            if len(matches) > 1:
                # assumption: if section title is found multiple times,
                # the first one is in the table of contents
                match = matches[1]
            else:
                match = matches[0]
                
            match_i = match.span()[0]
        else:
            # use fallback if regex doesn't work
            search_string = section_strings[section]
            match_i = full_text.find(search_string)
            
            if match_i == -1:
                n += 1
                print(n, "- Cannot find any matches for: ", search_string, " in ", pdf_filename, "using str.find")
                continue

        section_matches.append((section, match_i))
    
    # find the content of all sections
    sections_content = {}
    for section in section_names["files"][pdf_filename]["sections"]:
        content = get_content_from_section(section, full_text, section_matches)
        sections_content[section] = content
        
    # save the section contents per file
    pdf_contents[pdf_filename] = sections_content

  0%|          | 0/126 [00:00<?, ?it/s]

In [123]:
with open(DATA_PATH + "/JSON/section_names.json", "r", encoding='utf-8') as f:
    section_names = json.load(f)

In [143]:
with open(DATA_PATH + "/JSON/section_names.json", "w", encoding='utf-8') as f:
    json.dump(d, f, indent=4)

In [141]:
d = {"files":{}}
for file in section_names["files"]:
    if file[0] == "_":
        d["files"][file] = section_names["files"][file]
    else:
        new_d = {"sections" : section_names["files"][file], "selected_sections" : ["s"]}
        d["files"][file] = new_d

In [139]:
pprint(d)

{'files': {'1347 - EN-Annex II - ToR.pdf': {'sections': {'Appendices': None,
                                                         'Background': None,
                                                         'Description of the current and target situation': None,
                                                         'General Requirements': None,
                                                         'Implementation of the Contract': None,
                                                         'Key Requirements': None,
                                                         'Objectives': None,
                                                         'Offer': None,
                                                         'Scope': None,
                                                         'Stakeholders': None,
                                                         'Terms and Definitions': None},
                                            'selected_sections': ['s']},
   

                                                                      'Contact person': None,
                                                                      'Contract implementation': None,
                                                                      'Contractual information': None,
                                                                      'Dress code and a code of conduct of interim personnel': None,
                                                                      'Protection of personal data': None,
                                                                      'Rights and entitlements': None,
                                                                      'Scope of the contract': None,
                                                                      'Temporary absence of the interim personnel and replacement procedures': '7.  '
                                                                                                          

In [131]:
pprint(section_names)

{'files': {'1347 - EN-Annex II - ToR.pdf': {'Appendices': None,
                                            'Background': None,
                                            'Description of the current and target situation': None,
                                            'General Requirements': None,
                                            'Implementation of the Contract': None,
                                            'Key Requirements': None,
                                            'Objectives': None,
                                            'Offer': None,
                                            'Scope': None,
                                            'Stakeholders': None,
                                            'Terms and Definitions': None},
           '1415 - EN-Annex II_Terms of Reference_OP_87_2016_RS.pdf': {'Appendices': None,
                                                                       'General Information': None,
                            

                                                                 'Form and content of the application': None,
                                                                 'General Information on Frontex': None,
                                                                 'Legal form to be taken by the group of economic operators to whom the contract is to be awarded:': 'I.4  '
                                                                                                                                                                     'Legal  '
                                                                                                                                                                     'form  '
                                                                                                                                                                     'to  '
                                                                                           