In [1]:
import re
import json
import pandas as pd
from tqdm.auto import tqdm
import pathlib

import spacy
from spacy.tokens import Doc
from spacy.language import Language

from emfdscore.load_mfds import emfd
from emfdscore.scoring import score_docs

In [2]:
files = list(pathlib.Path("D:/data").glob("*/text_files*/*.txt"))

In [3]:
query_results = []
result_files = list(pathlib.Path("queries/sparql_query_results").glob("*moral*.json"))
for query_results_file in tqdm(result_files):
    df = pd.DataFrame(json.load(open(query_results_file))["results"]["bindings"])
    df = df.assign(domain=re.search(r"query_results_\d+_([^\d]+)",query_results_file.stem).group(1))
    query_results.append(df)

  0%|          | 0/130 [00:00<?, ?it/s]

In [4]:
query_results = pd.concat(query_results,axis=0,ignore_index=True)

In [5]:
query_results = query_results.map(lambda x: x["value"] if type(x) == dict else x)
query_results["cellarIds"] = query_results["cellarURIs"].str.split("/").str[-1]
query_results["workTypes"] = query_results["workTypes"].str.split("#").str[-1]
query_results["subject_ids"] = query_results["subject_ids"].str.split("|").apply(lambda x: [x.split("/")[-1] for x in x])
query_results = query_results.loc[:, ["cellarIds", "workTypes", "subjects", "dates", "domain"]]

In [6]:
# documents have multiple subjects -> we can have duplicates from moral & nonmoral domains
remove = []
for i in tqdm(query_results[query_results.duplicated(subset=["cellarIds"])].index):
    duplicates = query_results[query_results["cellarIds"] == query_results.at[i, "cellarIds"]].index
    query_results.at[duplicates[0], "subjects"] += "|" + query_results.at[duplicates[1], "subjects"]
    query_results.at[duplicates[0], "domain"] = "moral|nonmoral"
    remove.append(duplicates[1])

  0%|          | 0/5843 [00:00<?, ?it/s]

In [7]:
query_results = query_results.drop(remove).reset_index(drop=True)

In [8]:
# the EU was formerly established on 1 November 1993
# we therefore filter out all documents that were published before 1994
query_results = query_results[(pd.to_datetime(query_results["dates"]).dt.year >= 1994)].reset_index(drop=True)

In [9]:
query_results = query_results.assign(text="")
query_results = query_results.set_index("cellarIds")
for file in tqdm(files):
    if file.stem in query_results.index:
        query_results.at[file.stem, "text"] = file.read_text(encoding="utf-8")

  0%|          | 0/51468 [00:00<?, ?it/s]

In [10]:
query_results = query_results[query_results["text"] != ""].reset_index()

In [11]:
query_results["domain"].value_counts()

domain
moral             22256
nonmoral          14451
moral|nonmoral     5428
Name: count, dtype: int64

In [12]:
query_results["workTypes"].value_counts()

workTypes
act_preparatory            24719
decision                    6744
regulation                  5758
agreement_international     2747
regulation_implementing     1243
decision_implementing        914
treaty                        10
Name: count, dtype: int64

In [13]:
from transformers import AutoTokenizer

In [14]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [15]:
from joblib import Parallel, delayed

In [16]:
def tokenize_batch(batch):
    return tokenizer(batch, max_length=1024, truncation=True, add_special_tokens=False)

tokenized_texts = Parallel(n_jobs=4)(
    delayed(tokenize_batch)(query_results["text"][i:i + 16].tolist())
    for i in tqdm(range(0, len(query_results), 16))
)

  0%|          | 0/2634 [00:00<?, ?it/s]

In [17]:
query_results["n_tokens"] = [
    len(tokenized_texts[j]["input_ids"][i])
    for j in range(len(tokenized_texts))
    for i in range(len(tokenized_texts[j]["input_ids"]))
]

In [18]:
query_results["span_end"] = [
    tokenized_texts[j].token_to_chars(i, len(tokenized_texts[j].tokens(i))-2).end 
    for j in range(len(tokenized_texts))
    for i in range(len(tokenized_texts[j]["input_ids"]))
]

In [19]:
query_results = query_results[query_results["n_tokens"] > 512].reset_index(drop=True)

In [20]:
nlp = spacy.load('en_core_web_sm', disable=["tagger", "attribute_ruler", "lemmatizer"])
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")

shorter_texts = query_results.apply(lambda row: row["text"][:row["span_end"]*2], axis=1)
query_results = query_results.assign(sentence_end=0)
for i, doc in tqdm(enumerate(nlp.pipe(shorter_texts, n_process=12, batch_size=16)), total=len(query_results)):
    for sent in doc.sents:
        if sent.end_char >= query_results.at[i,"span_end"]:
            query_results.at[i, "sentence_end"] = sent.end_char
            break

  0%|          | 0/37379 [00:00<?, ?it/s]

In [21]:
(query_results["sentence_end"] == 0).sum()

0

In [22]:
query_results["text"] = query_results.apply(lambda row: row["text"][:row["sentence_end"]], axis=1)

In [23]:
query_results = query_results[~query_results["text"].str.lower().str.contains("table of contents")].reset_index(drop=True)

In [24]:
query_results.to_csv("D:/data/eu_documents.csv", index=False)

In [16]:
query_results = pd.read_csv("D:/data/eu_documents.csv")

In [17]:
scores = score_docs(query_results["text"].to_frame().rename(columns={"text":0}),"emfd","all","bow","sentiment", n_processes=12, batch_size=16)

  0%|          | 0/35330 [00:00<?, ?it/s]

In [18]:
data = pd.concat([query_results.reset_index(drop=True).iloc[:,:-2],scores], axis=1)

In [19]:
cols = [c for c in data.columns if "_p" in c]

In [26]:
nlp = spacy.load("en_core_web_lg")
keep = []
mandatory = []
docs = []
for c in cols:
    ids = []
    for i in list(data.sort_values(by=c, ascending=False).head(35).index):
        doc = nlp(data.at[i, "text"])
        if len([d for d in docs if doc.similarity(d) > 0.99]) == 0:
            ids.append(i)
            docs.append(doc)
    keep += ids
    mandatory += ids[:2]

In [27]:
filtered_data = data.loc[list(set(keep)), list(data.columns[:6])]
filtered_data = filtered_data.assign(mandatory=False)
filtered_data.loc[mandatory, "mandatory"] = True

In [28]:
print("Mandatory docs: ",filtered_data["mandatory"].sum())
print("Total docs: ",len(filtered_data))

Mandatory docs:  10
Total docs:  114


In [29]:
prev_mandatory = pd.read_csv("mandatory_ids.csv", header=None)
print([i for i in prev_mandatory[0].tolist() if i not in filtered_data[filtered_data["mandatory"]]["cellarIds"].tolist()])
print([i for i in filtered_data[filtered_data["mandatory"]]["cellarIds"].tolist() if i not in prev_mandatory[0].tolist()])

[]
[]


In [70]:
filtered_data.to_csv("annotation_docs.csv")

In [2]:
filtered_data = pd.read_csv("annotation_docs.csv", index_col=0)

In [3]:
filtered_data[filtered_data["mandatory"]]

Unnamed: 0,cellarIds,workTypes,subjects,dates,domain,text,mandatory
21549,961c99ef-815b-4695-adc6-1c7028c2d13a,act_preparatory,racism|xenophobia|extreme right,1997-02-20,moral|nonmoral,Avis juridique important\n\nResolution on raci...,True
7255,ac6d079a-7fe3-4f81-a76c-b70d3dbad941,act_preparatory,corruption|penalty,2012-07-11,moral,\n\nProposal for a DIRECTIVE OF THE EUROPEAN P...,True
23137,00617d3b-726c-4a01-8a8f-a5404353e11a,decision,health control|tropical disease,1994-09-20,moral,Avis juridique important\n\n94/622/EC: Commiss...,True
9902,5828bf9a-13cb-4472-be25-d5a379a3ced3,regulation,fraud,2010-05-28,moral,29.5.2010 EN Official Journal of the European ...,True
18643,47183bf4-fae6-4275-a085-5510d4cb5408,act_preparatory,cruel and degrading treatment|human rights,2002-12-30,moral,Avis juridique important\n\nProposal for a Cou...,True
1781,00db531f-76e7-11ed-9887-01aa75ed71a1,act_preparatory,equal treatment|fundamental rights|gender equa...,2022-12-07,moral,"EUROPEAN COMMISSION\n\nBrussels, 7.12.2022\n\n...",True
2334,02beb5e4-6cb0-11ed-9887-01aa75ed71a1,act_preparatory,equal treatment|gender equality|position of wo...,2020-03-05,moral,EXECUTIVE SUMMARY\n\nThis document evaluates t...,True
22844,42211802-427b-4748-9df0-3eb13c84549d,decision,health insurance|maternity benefit|social secu...,1995-04-07,moral,Avis juridique important\n\n95/419/EC: Workers...,True
15177,5f529d2c-d739-4899-afd5-ff9089329d59,act_preparatory,exploitation of resources,2006-04-21,moral,21.4.2006 EN Official Journal of the European ...,True
31740,ed3fed03-5a14-4fc9-8524-3bdc4a339519,act_preparatory,Community certification|EC conformity marking,2003-12-12,nonmoral,Avis juridique important\n\nList of notified b...,True


In [71]:
filtered_data["subjects"].str.split("|", expand=True).stack().value_counts().head(10)

human rights                18
waste management             9
environmental protection     9
help for victims             7
technical standard           6
equal treatment              6
sexual discrimination        5
waste recycling              5
gender equality              4
violence                     4
Name: count, dtype: int64

In [77]:
filtered_data["domain"].value_counts()

domain
moral             83
nonmoral          20
moral|nonmoral    11
Name: count, dtype: int64

In [72]:
filtered_data[filtered_data["mandatory"]]["cellarIds"].to_csv("mandatory_ids.csv", index=False, header=False)
filtered_data[~filtered_data["mandatory"]]["cellarIds"].to_csv("non_mandatory_ids.csv", index=False, header=False)

In [73]:
emfd_df = pd.DataFrame(emfd).T
highlights = emfd_df[(emfd_df > emfd_df.mean()+emfd_df.std()).iloc[:,:5].any(axis=1)].index.tolist()

In [74]:
@Language.component("highlight_tokens")
def highlight_tokens(doc):
    return Doc(doc.vocab, [f"<b>{token.text}</b>" if token.text.lower() in highlights else token.text for token in doc], spaces=[token.whitespace_ for token in doc])

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
nlp.add_pipe("highlight_tokens")

<function __main__.highlight_tokens(doc)>

In [75]:
highlighted_texts = nlp.pipe(filtered_data["text"])
filtered_data["text"] = [doc.text for doc in tqdm(highlighted_texts, total=len(filtered_data))]

  0%|          | 0/114 [00:00<?, ?it/s]

In [76]:
for i, row in filtered_data.iterrows():
    row.to_json(f"D:/data/documents/{row["cellarIds"]}.json")