In [19]:
import PyPDF2 as ppdf2
import PyPDF4 as ppdf4
import pdfrw
import pdfminer.high_level
import requests
from pprint import pprint
from bs4 import BeautifulSoup
from zipfile import ZipFile
from tqdm.notebook import tqdm
import os
import random
from pdfminer.layout import LAParams
from io import StringIO
import shutil
import json
import pandas as pd
import re

DATA_PATH = "../data"

# Retrieving the documents

In [3]:
# scrape the etendering search result website for tender ids
search_results_url = "https://etendering.ted.europa.eu/cft/cft-search.html?_caList=1&_procedureTypeForthcoming=1&_procedureTypeOngoing=1&caList=67&closingDateFrom=&closingDateTo=&confirm=Search&procedureTypeForthcoming=&procedureTypeOngoing=&startDateFrom=&startDateTo=&status=&text=&maxResults=250"
res = requests.get(search_results_url)
soup = BeautifulSoup(res.content, 'html.parser')

table = soup.find(id="row")
tbody = table.find("tbody")
rows = tbody.find_all("tr")

cftIds = []
for row in rows:
    cftId = row.find_all("td")[1].find("a")["href"].split("cftId")[-1][1:]
    cftIds.append(cftId)
print(len(cftIds), " cftIds found!")

128  cftIds found!


In [54]:
# get all the zip files containing tender documents
base_zip_url = "https://etendering.ted.europa.eu/document/archive-download.html?cftId=[CFTID]&lngIso=en"

for i, cftId in tqdm(enumerate(cftIds), total=len(cftIds)):
    url = base_zip_url.replace("[CFTID]", cftId)
    
    with open(DATA_PATH + "/ZIP/" + cftId + ".zip", "wb") as f:
        f.write(requests.get(url).content)
        
print(i, " zips downloaded!")

  0%|          | 0/128 [00:00<?, ?it/s]

127  zips downloaded!


In [3]:
def contains_annex_1(name):
    for s in ["Annex I", "Annex 01", "Annex 1", "ANNEX I"]:
        res = name.find(s)

        if res != -1:
            next_char = name[res + len(s)]
            if next_char in [' ', '_', '-'] and "Appendix" not in name:
                return True
    return False

In [4]:
def contains_annex_2(name):
    for s in ["Annex II", "Annex 02", "Annex 2", "ANNEX II"]:
        res = name.find(s)

        if res != -1:
            next_char = name[res + len(s)]
            if next_char in [' ', '_', '-'] and "Appendix" not in name:
                return True
    return False

In [10]:
counter = 0
for i, cftId in enumerate(cftIds):
    with ZipFile(DATA_PATH + "/ZIP/" + cftId + ".zip", "r") as f:
        namelist = f.namelist()
        
        found_names_A1 = []
        found_names_A2 = []
        for name in namelist:
            if "Replaced versions" in name:
                # subfolder containing oder versions, ignore
                continue
            
            if contains_annex_2(name) and name[-3:] == "pdf":
                found_names_A2.append(name)
            elif contains_annex_1(name) and name[-3:] == "pdf":
                found_names_A1.append(name)
                
        if len(found_names_A1 + found_names_A2) > 0:
            # if there are multiple docs, choose whichever was found first
            name = (found_names_A2 + found_names_A1)[0]
            
            # extract and rename relevant pdfs
            f.extract(name, path=DATA_PATH + "/PDF/")
            os.rename(DATA_PATH + "/PDF/" + name, DATA_PATH + "/PDF/" + cftId + " - " + name)
            counter += 1
        
        if len(found_names_A1 + found_names_A2) == 0: print(f"No Annex 2 or Annex 1 found in {cftId}.zip!")
        if len(found_names_A2) > 1: print(f"Multiple annex 2 found in {cftId}.zip!")
        if len(found_names_A1) > 1: print(f"Multiple annex 1 found in {cftId}.zip!")
print(f"{counter} pdf's extracted")

Multiple annex 2 found in 7863.zip!
Multiple annex 1 found in 7191.zip!
Multiple annex 2 found in 6954.zip!
Multiple annex 2 found in 6428.zip!
Multiple annex 2 found in 4030.zip!
Multiple annex 2 found in 3751.zip!
No Annex 2 or Annex 1 found in 3294.zip!
Multiple annex 2 found in 3058.zip!
Multiple annex 1 found in 3058.zip!
Multiple annex 2 found in 3034.zip!
Multiple annex 1 found in 3034.zip!
Multiple annex 2 found in 2947.zip!
Multiple annex 1 found in 2947.zip!
Multiple annex 1 found in 2662.zip!
Multiple annex 2 found in 2570.zip!
No Annex 2 or Annex 1 found in 1775.zip!
Multiple annex 2 found in 1556.zip!
Multiple annex 2 found in 1415.zip!
Multiple annex 1 found in 1347.zip!
Multiple annex 1 found in 2799.zip!
126 pdf's extracted
TODO: Check for both Annex 2 and 1 before reporting missing document.


# Extracting text

In [17]:
def get_content_from_section(section, full_text, section_matches):
    for i, (s, m) in enumerate(section_matches):
        if s == section:
            # starting index
            current_index = m.span()[1]
            
            # ending index
            if current_index != len(section_matches) - 1:
                next_index = section_matches[i + 1][1].span()[0]
            else:
                # the section to find is the last section
                next_index = len(full_text)
            return full_text[current_index:next_index]
    return None

In [60]:
# load the dictionary containing the section names of the documents
with open(DATA_PATH + "/JSON/section_names.json", "r") as f:
    section_names = json.load(f)

# skip because its not the right document
skip = ["2800 - EN-Annex II_declaration on honour_OP_529_2017_RS.pdf", "3095 - EN-Annex II MarInt Standard Reply Form_1187.pdf",
        "7317 - EN-Annex II - Declaration on Honour - FSA I.pdf",
        "7952 - EN-Annex II - Personnel requirements.pdf"]
# skip because they are scanned documents and can't easily be processed
skip += ["1811 - EN-Annex 1 - Tender Specifications.pdf", "2189 - EN-Annex II ToR part B and part A.pdf"]

In [61]:
print(f"Documents with sections listed: {len(section_names['files']) / (len(os.listdir(DATA_PATH + '/PDF'))-2)*100:.2f}%")

Documents with sections listed: 99.19%


In [7]:
# create dataset, load in the full text of all tender documents
data = pd.DataFrame(columns=[
    "filename",
    "full_text",
    "selected_sections"
])

for pdf_filename in tqdm(os.listdir(DATA_PATH + "/PDF")):
    if pdf_filename in skip:
        continue
    # load the full text of a given tender document
    pdf_path = DATA_PATH + '/PDF/' + pdf_filename
    full_text = pdfminer.high_level.extract_text(pdf_path)
    
    data = data.append({
        "filename" : pdf_filename,
        "full_text" : full_text,
        "selected_sections" : None
    }, ignore_index=True)

  0%|          | 0/126 [00:00<?, ?it/s]

In [12]:
data.to_pickle(DATA_PATH + "/processed/dataset_TENDOC.pkl")

# Testing/indev:

In [62]:
# for now, this is just checking if all sections can be found in 
n = 0
for pdf_filename in tqdm(os.listdir(DATA_PATH + "/PDF")):
    if pdf_filename in skip:
        continue
    # load the full text of a given tender document
#     pdf_filename = "1773 - EN-Annex II - Terms of Reference.pdf"
    pdf_path = DATA_PATH + '/PDF/' + pdf_filename
#     full_text = pdfminer.high_level.extract_text(pdf_path)
    full_text = data[data["filename"] == pdf_filename]["full_text"].iloc[0]

    # find the section headers in the full text
    section_matches = []
    for section in section_names["files"][pdf_filename]:
        regex = "(\d|I|V)\.?\s+?[WORD]\s+?\n".replace("[WORD]", re.escape(section))
        matches = list(re.finditer(regex, full_text))

        if len(matches) == 0:
            n += 1
            print(n, "- Cannot find any matches for: ", section, " in ", pdf_filename)
            continue

        if len(matches) > 1:
            match = matches[1]
        else:
            match = matches[0]

        section_matches.append((section, match))

  0%|          | 0/126 [00:00<?, ?it/s]

1 - Cannot find any matches for:  Legal form to be taken by the group of economic operators to whom the contract is to be awarded:  in  2323 - EN-Annex I - Additiona Information to Candidates_159.pdf
2 - Cannot find any matches for:  Other particular conditions  in  2323 - EN-Annex I - Additiona Information to Candidates_159.pdf
3 - Cannot find any matches for:  LIST OF ABBREVIATIONS  in  2958 - EN-Annex II_841_Terms of reference.pdf
4 - Cannot find any matches for:  FRONTEXâ€™ STRATEGIC OBJECTIVES IN THE FIELD OF TRAINING AND EDUCATION OF BORDER GUARDS  in  2958 - EN-Annex II_841_Terms of reference.pdf
5 - Cannot find any matches for:  FRONTEX TRAINING  in  2958 - EN-Annex II_841_Terms of reference.pdf
6 - Cannot find any matches for:  REQUESTED SERVICES  in  2958 - EN-Annex II_841_Terms of reference.pdf
7 - Cannot find any matches for:  Contractorâ€™s liability and additional information  in  2998 - EN-Annex II  Terms of reference.pdf
8 - Cannot find any matches for:  Equipment to be

In [351]:
pdf_path = DATA_PATH + '/PDF/' + "1972 - EN-Annex II - ToR - eSignature-2_001.pdf"
full_text = pdfminer.high_level.extract_text(pdf_path)
regex = "(\d|I|V)\.?\s+?[WORD]\s+?\n".replace("[WORD]", re.escape("Current Situation"))
matches = list(re.finditer(regex, full_text))
matches

[<_sre.SRE_Match object; span=(20255, 20279), match='1. \n\nCurrent Situation \n'>]

In [349]:
i = full_text[970+20:].find("Description of the current and target situation")+970+20
print(i)
print(repr(full_text[i-200:i+200]))

20280
'and \n\ndeveloping workflows integrated with the solution. \n\n●  3  software developers (Frontex staff or contractors) who will be using the delivered API. \n\n \n\n\x0cTOR \n\n \n\n5. \n\n5.1. \n\nCurrent Situation \n\nDescription of the current and target situation \n\nP a g e  | 9 \n\nCurrently Frontex processes 120 000 documents a year of different types and volumes, versions included \n(for example: 60 Administrative'


In [323]:
s = "3 \n\nLicenses and intellectual property rights (IPR) \n\n"
print(repr(re.escape(s)))
list(re.finditer("Licenses and intellectual property rights", s))

'3\\ \\\n\\\nLicenses\\ and\\ intellectual\\ property\\ rights\\ \\(IPR\\)\\ \\\n\\\n'


[<_sre.SRE_Match object; span=(4, 45), match='Licenses and intellectual property rights'>]

In [261]:
print(get_content_from_section("TECHNICAL DESCRIPTION", full_text, section_matches))


1. OBJECTIVES 

The objective of the contract is provision of conference services - organization, contracting and payment for the 
services related to the events organized by Frontex outside of Poland.  
 
Events  in  Poland  are  covered  by  a  separate  service  contract  not  linked  with  this  call  for  tenders,  however,  if 
required, Frontex reserves the right to include Poland in the area of subject of the contract. 

2. DESCRIPTION OF THE TASKS  
  
The  specific  tasks  required  under  the  contract  are:  market  research,  obtaining  offers  from  potential  sub-
contractors, negotiating best conditions of offers, preparation of comparison of offers, contracting of services and 
supplies  on  behalf  of  Frontex,  covering  of  payment  of  advances  for  the  services  and  final  payments  after  the 
events.  The  costs  covered  will  be  afterwards  reimbursed  by  Frontex  on  the  basis  of  an  invoice  issued  by  the 
contractor given that the costs are in li