In [181]:
import PyPDF2 as ppdf2
import PyPDF4 as ppdf4
import pdfrw
import pdfminer.high_level
import requests
from pprint import pprint
from bs4 import BeautifulSoup
from zipfile import ZipFile
from tqdm.notebook import tqdm
import os
import random
from pdfminer.layout import LAParams
from io import StringIO
import shutil
import json

DATA_PATH = "../data"

# Retrieving the documents

In [2]:
# scrape the etendering search result website for tender ids
search_results_url = "https://etendering.ted.europa.eu/cft/cft-search.html?_caList=1&_procedureTypeForthcoming=1&_procedureTypeOngoing=1&caList=67&closingDateFrom=&closingDateTo=&confirm=Search&procedureTypeForthcoming=&procedureTypeOngoing=&startDateFrom=&startDateTo=&status=&text=&maxResults=250"
res = requests.get(search_results_url)
soup = BeautifulSoup(res.content, 'html.parser')

table = soup.find(id="row")
tbody = table.find("tbody")
rows = tbody.find_all("tr")

cftIds = []
for row in rows:
    cftId = row.find_all("td")[1].find("a")["href"].split("cftId")[-1][1:]
    cftIds.append(cftId)
print(len(cftIds), " cftIds found!")

128  cftIds found!


In [54]:
# get all the zip files containing tender documents
base_zip_url = "https://etendering.ted.europa.eu/document/archive-download.html?cftId=[CFTID]&lngIso=en"

for i, cftId in tqdm(enumerate(cftIds), total=len(cftIds)):
    url = base_zip_url.replace("[CFTID]", cftId)
    
    with open(DATA_PATH + "/ZIP/" + cftId + ".zip", "wb") as f:
        f.write(requests.get(url).content)
        
print(i, " zips downloaded!")

  0%|          | 0/128 [00:00<?, ?it/s]

127  zips downloaded!


In [3]:
def contains_annex_1(name):
    for s in ["Annex I", "Annex 01", "Annex 1", "ANNEX I"]:
        res = name.find(s)

        if res != -1:
            next_char = name[res + len(s)]
            if next_char in [' ', '_', '-'] and "Appendix" not in name:
                return True
    return False

In [4]:
def contains_annex_2(name):
    for s in ["Annex II", "Annex 02", "Annex 2", "ANNEX II"]:
        res = name.find(s)

        if res != -1:
            next_char = name[res + len(s)]
            if next_char in [' ', '_', '-'] and "Appendix" not in name:
                return True
    return False

In [132]:
contains_annex_1("EN-Annex I Tender specifications.pdf")

True

In [10]:
counter = 0
for i, cftId in enumerate(cftIds):
    with ZipFile(DATA_PATH + "/ZIP/" + cftId + ".zip", "r") as f:
        namelist = f.namelist()
        
        found_names_A1 = []
        found_names_A2 = []
        for name in namelist:
            if "Replaced versions" in name:
                # subfolder containing oder versions, ignore
                continue
            
            if contains_annex_2(name) and name[-3:] == "pdf":
                found_names_A2.append(name)
            elif contains_annex_1(name) and name[-3:] == "pdf":
                found_names_A1.append(name)
                
        if len(found_names_A1 + found_names_A2) > 0:
            # if there are multiple docs, choose whichever was found first
            name = (found_names_A2 + found_names_A1)[0]
            
            # extract and rename relevant pdfs
            f.extract(name, path=DATA_PATH + "/PDF/")
            os.rename(DATA_PATH + "/PDF/" + name, DATA_PATH + "/PDF/" + cftId + " - " + name)
            counter += 1
        
        if len(found_names_A1 + found_names_A2) == 0: print(f"No Annex 2 or Annex 1 found in {cftId}.zip!")
        if len(found_names_A2) > 1: print(f"Multiple annex 2 found in {cftId}.zip!")
        if len(found_names_A1) > 1: print(f"Multiple annex 1 found in {cftId}.zip!")
print(f"{counter} pdf's extracted")

print("TODO: Check for both Annex 2 and 1 before reporting missing document.")

Multiple annex 2 found in 7863.zip!
Multiple annex 1 found in 7191.zip!
Multiple annex 2 found in 6954.zip!
Multiple annex 2 found in 6428.zip!
Multiple annex 2 found in 4030.zip!
Multiple annex 2 found in 3751.zip!
No Annex 2 or Annex 1 found in 3294.zip!
Multiple annex 2 found in 3058.zip!
Multiple annex 1 found in 3058.zip!
Multiple annex 2 found in 3034.zip!
Multiple annex 1 found in 3034.zip!
Multiple annex 2 found in 2947.zip!
Multiple annex 1 found in 2947.zip!
Multiple annex 1 found in 2662.zip!
Multiple annex 2 found in 2570.zip!
No Annex 2 or Annex 1 found in 1775.zip!
Multiple annex 2 found in 1556.zip!
Multiple annex 2 found in 1415.zip!
Multiple annex 1 found in 1347.zip!
Multiple annex 1 found in 2799.zip!
126 pdf's extracted
TODO: Check for both Annex 2 and 1 before reporting missing document.


# Extracting text

In [96]:
random.choice(os.listdir(DATA_PATH + '/PDF/'))

'2945 - EN-Annex I - Tender Specifications.pdf'

In [102]:
# pdf_filename = random.choice(os.listdir(DATA_PATH + '/PDF/'))
for pdf_filename in tqdm(os.listdir(DATA_PATH + '/PDF/')):
    pdf_path = DATA_PATH + '/PDF/' + pdf_filename
    full_text = pdfminer.high_level.extract_text(pdf_path)

    p2_1_1_onwards = full_text[full_text.find("II.1.1"):]

    if p2_1_1_onwards != -1:
    #     print(p2_1_1_onwards)
        pass
    else:
        print(f"Subject of tender not found in {pdf_filename}")

  0%|          | 0/125 [00:00<?, ?it/s]

In [39]:
import re
pdf_filename = "5360 - EN-Annex II - Terms of Reference OP308.pdf"
pdf_path = DATA_PATH + '/PDF/' + pdf_filename
full_text = pdfminer.high_level.extract_text(pdf_path)

In [47]:
regex = "\d\.?\s+?[WORD]\s+\n".replace("[WORD]", "Specific Requirements")
matches = list(re.finditer(regex, full_text))
matches

[<_sre.SRE_Match object; span=(807, 834), match='4.  Specific Requirements \n'>,
 <_sre.SRE_Match object; span=(10942, 10972), match='4. \n\nSpecific Requirements  \n\n'>]

In [31]:
full_text

' \n\n \n\n \n\n \n\nAnnex II to the Invitation to Tender \n\n \n\nFrontex/OP/720/2018/RS \n\nTerms of Reference \nDesign, Transition & Operation  \n\nProvision  of  Nautical  Charts  Web  Map  Service \n\n \n\nFrontex - European Border and Coast Guard Agency \n\nwww.frontex.europa.eu | Pl. Europejski 6, 00-844 Warsaw, Poland | Tel. +48 22 205 95 00 | Fax +48 22 205 95 01 \n\n\x0c1. \n\nBackground Information \n\n1.1.  Situation Monitoring and supporting IT systems \n\nOne of Frontex’ principle tasks is situation monitoring. For that purpose Frontex uses a set of IT systems \nthat allow to compile and maintain the situational picture. These systems are built on the Microsoft and \nESRI technology stack and their main applications are based on the ESRI Silverlight API and JavaScript. These \nsystems are used within the EU Member States (MS) and Schengen Associate Countries (SAC) border guard \ncommunity.   \n \n\n1.2. \n\nScope \n\nThe envisaged solution is an externally hosted nautical

In [46]:
match = matches[1]
print("Match: ", repr(match.group()))
print("="*80)
print("Context:")
window_size = 100
print(full_text[match.span()[0]-window_size:match.span()[1]+window_size])

Match:  'Specific Requirements  \n\n'
Context:
reactive services 

Annex II – Terms of Reference

to Data Centre equipment and systems   

 

4. 

Specific Requirements  

This section describes the maintenance and support services to be delivered to Frontex and defines t


In [144]:
pdf_filename = "1456 - EN-Annex II TOR-FWC-SPServices2016-19.pdf"
pdf_path = DATA_PATH + '/PDF/' + pdf_filename

# extract the full text string
full_text = pdfminer.high_level.extract_text(pdf_path)

# convert pdf to html using pdfminer
output_string = StringIO()
with open(pdf_path, 'rb') as fin:
    # get number of pages in pdf
    pdf = ppdf2.PdfFileReader(fin)
    n_pages = pdf.getNumPages()
    
    # convert everything but the coverpage to html
    pdfminer.high_level.extract_text_to_fp(fin,
                                           output_string,
                                           laparams=LAParams(),
                                           output_type='html',
                                           codec=None,
                                           page_numbers=list(range(1, n_pages)))
    
    # read html using beautifulsoup
    output_string.seek(0)
    soup = BeautifulSoup(output_string.read(), 'html.parser')

In [129]:
with open(DATA_PATH + "/test.html", "w", encoding='utf-8') as f:
    f.write(str(soup))

In [124]:
def isNotNumbering(string):
    return len("".join([c for c in text if not c.isdigit() and c != "."])) != 0

# find all section headers
sections = []
for tag in soup.find_all('span'):
    styles = tag.attrs['style'].split(";")
    for style in styles:
        if "font-size" in style.strip():
            font_size = int(style.split(":")[1].split("px")[0])
            text = tag.getText().strip()
            if font_size >= 14 and len(text) != 0 and isNotNumbering(text):
                if text not in sections: sections.append(text)
                    
for s in sections:
    print(s)

Terms and Definitions
Scope
General Requirements
Specific Requirements
Objectives
Background
Stakeholders
Context
Implementation of FWC
The tender
Evaluation
Appendices


In [154]:
match = match_dict["Scope"][2]
print("Match: ", repr(match.group()))
print("="*80)
print("Context:")
window_size = 100
print(full_text[match.span()[0]-window_size:match.span()[1]+window_size])

Match:  '8 \n\nScope \n'
Context:
coming from 3rd parties. 

Ref. FRONTEX/OP/131/2016/AH 

30/57 

TOR 

 

 

FWC 

 

 

 

 

 

78 

Scope 

3rd Level Support shall be provided for all software components including server and client 
softwa


# ===========================================================

In [250]:
# load the dictionary containing the section names of the documents
with open(DATA_PATH + "/JSON/section_names.json", "r") as f:
    section_names = json.load(f)

In [251]:
print(f"Documents with sections listed: {len(section_names['files']) / len(os.listdir(DATA_PATH + '/PDF'))*100:.2f}%")

Documents with sections listed: 4.76%


In [252]:
# load the full text of a given tender document
pdf_filename = "1773 - EN-Annex II - Terms of Reference.pdf"
pdf_path = DATA_PATH + '/PDF/' + pdf_filename
full_text = pdfminer.high_level.extract_text(pdf_path)

In [256]:
# find the section headers in the full text
section_matches = []
for section in section_names["files"][pdf_filename]:
    regex = "(\d|I|V)\.?\s+?[WORD]\s+?\n".replace("[WORD]", section)
    matches = list(re.finditer(regex, full_text))
    
    if len(matches) == 0:
        print("Cannot find any matches for: ", section, " in ", pdf_filename)
        continue
    
    if len(matches) > 1:
        match = matches[1]
    else:
        match = matches[0]

    section_matches.append((section, match))

In [248]:
def get_content_from_section(section):
    for i, (s, m) in enumerate(section_matches):
        if s == section:
            # starting index
            current_index = m.span()[1]
            
            # ending index
            if current_index != len(section_matches) - 1:
                next_index = section_matches[i + 1][1].span()[0]
            else:
                # the section to find is the last section
                next_index = len(full_text)
            return full_text[current_index:next_index]
    return None

In [258]:
print(get_content_from_section("TECHNICAL DESCRIPTION"))


1. OBJECTIVES 

The objective of the contract is provision of conference services - organization, contracting and payment for the 
services related to the events organized by Frontex outside of Poland.  
 
Events  in  Poland  are  covered  by  a  separate  service  contract  not  linked  with  this  call  for  tenders,  however,  if 
required, Frontex reserves the right to include Poland in the area of subject of the contract. 

2. DESCRIPTION OF THE TASKS  
  
The  specific  tasks  required  under  the  contract  are:  market  research,  obtaining  offers  from  potential  sub-
contractors, negotiating best conditions of offers, preparation of comparison of offers, contracting of services and 
supplies  on  behalf  of  Frontex,  covering  of  payment  of  advances  for  the  services  and  final  payments  after  the 
events.  The  costs  covered  will  be  afterwards  reimbursed  by  Frontex  on  the  basis  of  an  invoice  issued  by  the 
contractor given that the costs are in li