In [27]:
from mair_tools.pdf_processing import parse
import glob
import spacy
from collections import Counter
from tqdm import tqdm

In [2]:
pdfs = []
for filename in glob.glob('../regulations/*.pdf'):
    try:
        pdf = parse(filename)
        pdfs.append(pdf)
    except Exception as e:
        print(f"Error parsing ({filename}):", e)



Error parsing (../regulations/us-algorithmic_accountability_act.pdf): file has not been decrypted


In [3]:
len(pdfs)

8

In [4]:
text = pdfs[1].full_text
print(text)

4.5.2016 
EN     
Official Journal of  the European Union 
L 119/1 
I 
(Legislative acts) 
REGULATIONS 
REGULATION (EU) 2016/679 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL 
of 27 April 2016 
on the protection of natural persons with regard to the processing of personal data and on the free 
movement of such data, and repealing Directive 95/46/EC (General Data Protection Regulation) 
THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION, 
(Text with EEA relevance) 
Having regard to the Treaty on the Functioning of  the European Union, and in particular Article 16 thereof, 
Having regard to the proposal from the European Commission, 
After  transmission of  the draft legislative act to the national parliaments, 
Having regard to the opinion of the European Economic and Social Committee  (1), 
Having regard to the opinion of the Committee of  the Regions  (2), 
Acting in accordance with the ordinary legislative procedure  (3), 
Whereas: 
(1)  
(2)  
The  protection  of  natural

In [5]:
text = text.replace('\n', '')

In [6]:
en = spacy.load('en_core_web_lg') # if not working run `python -m spacy download en_core_web_lg` 

In [7]:
doc=en(text)

In [8]:
spacy.displacy.render(doc, style='ent', jupyter=True)

### Filtering and counting labels

In [11]:
entities = [ent for ent in doc.ents if ent.label_ not in {'CARDINAL', 'DATE', 'TIME', 'ORDINAL', 'MONEY', 'PERCENT'}]
labels = Counter([ent.label_ for ent in entities])
labels

Counter({'ORG': 347,
         'NORP': 35,
         'LAW': 327,
         'EVENT': 5,
         'PRODUCT': 16,
         'GPE': 14,
         'FAC': 2,
         'PERSON': 6,
         'QUANTITY': 1})

### Checking organizations

In [14]:
[ent for ent in entities if ent.label_=='ORG']

[EN     Official Journal of  the European Union L,
 EU,
 95/46/EC,
 General Data Protection Regulation,
 THE COUNCIL OF THE EUROPEAN UNION,
 EEA,
 the European Commission,
 the European Economic and Social Committee,
 the Committee of  the Regions  (2,
 EC,
 the European Parliament of,
 the Official Journal,
 the Official Journal,
 the European Parliament,
 EC,
 the European Parliament,
 the Council of 24,
 23.11.1995,
 Union,
 EC,
 Directive 95/46/EC,
 EC,
 EN     Official Journal of  the European Union L,
 EC,
 EC,
 EC,
 EC,
 EC,
 the European Parliament,
 the Council of 18,
 Community,
 EU,
 EU,
 EC,
 Union,
 EU,
 the European Parliament,
 JHA,
 this Official Journal,
 the European Parliament,
 EN     Official Journal of  the European Union L,
 Union,
 State,
 the European Union,
 the ‘Court of Justice,
 the European Court of Human Rights,
 EN     ,
 EC,
 EC,
 the European Parliament,
 the Council of 16 December 2008,
 Community,
 EN     ,
 EN     Official Journal of  the European U

## Global analysis

In [18]:
text = pdfs[2].full_text
text = text.replace('\n', '')
doc=en(text)

In [21]:
entities = [
    ent
    for ent in doc.ents
    if ent.label_ not in {"CARDINAL", "DATE", "TIME", "ORDINAL", "MONEY", "PERCENT"}
]
labels = Counter([ent.label_ for ent in entities])
labels

Counter({'LAW': 17, 'ORG': 6, 'PERSON': 5, 'GPE': 2})

In [24]:
[ent for ent in entities if ent.label_=='ORG']

[State, SEC, SEC, lik eness, U.S.C., SEC]

In [28]:
docs =[]
for pdf in tqdm(pdfs):
    text = pdf.full_text.replace('\n','')
    doc = en(text)
    docs.append(doc)

100%|██████████| 8/8 [00:19<00:00,  2.41s/it]


In [31]:
entities = [
    ent
    for doc in docs
    for ent in doc.ents
    if ent.label_ not in {"CARDINAL", "DATE", "TIME", "ORDINAL", "MONEY", "PERCENT"}
]
labels = Counter([ent.label_ for ent in entities])
labels

Counter({'ORG': 1149,
         'PERSON': 102,
         'WORK_OF_ART': 16,
         'LAW': 482,
         'PRODUCT': 52,
         'QUANTITY': 6,
         'NORP': 40,
         'GPE': 61,
         'EVENT': 5,
         'FAC': 3,
         'LOC': 1,
         'LANGUAGE': 1})