In [None]:
import pandas as pd
from mair.pdf_parsing import parse
from glob import glob
from tqdm import tqdm
import os
import json
import spacy
from collections import defaultdict
from spacy import displacy
from mair.data_loading import load_legal_documents, load_legal_documents_metadata
from mair.doc_ids import legal_doc_path_to_id
import joblib
import numpy as np
from collections import Counter
import mair.coreference_resulution #importing to set spacy's extensions
import types
from dataclasses import dataclass

In [None]:
@dataclass
class EmptyToken:
    """Null-object stand-in for a spaCy token when a modal has no ancestor.

    It exposes only the attributes the notebook reads from a real token:
    ``lemma_``, ``children`` and ``len()``. The attributes are intentionally
    left un-annotated so ``@dataclass`` does not turn them into init fields.
    """

    # Empty lemma, so `.lemma_.lower()` yields "" downstream.
    lemma_ = ""
    # Immutable empty sequence: a class-level list would be shared by every
    # instance and could be mutated by accident. Callers only iterate it.
    children = ()

    def __len__(self):
        # Zero length is how callers tell the placeholder from a real token.
        return 0


# Shared sentinel used as the default when a token has no ancestors.
EMPTY_TOKEN = EmptyToken()

In [None]:
# English pipeline used by get_subjects_from_noun_phrase to re-parse
# coreference text.
nlp=spacy.load('en_core_web_sm')
# nlp.max_length=3000000
# Registers tqdm's pandas integration; no progress_* call is visible in the
# cells below, so this may be a leftover.
tqdm.pandas()
# All data paths below ("data/processed/...") are relative to this directory.
# NOTE(review): a relative chdir is not idempotent; re-running this cell moves
# up two more levels. Run it exactly once per kernel.
os.chdir('../..')

In [None]:
# Canonical labels for the three modal categories recognised in this notebook.
COULD = "can"
SHOULD = "shall"
MUST = "must"

# Lower-cased modal lemma -> canonical category label. The grouping below
# expands to one entry per lemma, in the order the groups are listed.
MODAL_VERBS_MAPPING = {
    lemma: category
    for category, lemmas in (
        (COULD, ("can", "could", "may", "might")),
        (SHOULD, ("shall", "should")),
        (MUST, ("must",)),
    )
    for lemma in lemmas
}

In [None]:
# Load the previously parsed legal texts from the intermediate cache.
# Presumably a mapping of raw text path -> parsed spaCy Doc (the keys are later
# stored as "raw_text_path") -- TODO confirm against the producing notebook.
# NOTE(review): joblib.load unpickles its input; only load files you trust.
r = joblib.load("data/processed/intermediate/parsed_legal_texts.joblib")

In [None]:
# Wrap the loaded documents in a one-column frame ("doc") indexed by the
# original keys, then keep that column as the Series the extraction loop walks.
df = (
    pd.Series(r, name="doc")
    .reset_index()
    .set_index("index")
)
docs = df["doc"]

In [None]:
def get_all_conjucted_tokens(token):
    """Return the direct children of ``token`` attached with the ``conj`` label.

    Only immediate children are inspected; conjuncts hanging further down a
    chain are not followed. The result is a new list in child order.
    """
    return [dependent for dependent in token.children if dependent.dep_ == "conj"]

def _children_with_dep(token, dep):
    """Return the direct children of ``token`` whose ``dep_`` equals ``dep``."""
    return [child for child in token.children if child.dep_ == dep]


def find_subjects(verb_token, modal_token):
    """Collect the subjects governing a modal construction.

    Args:
        verb_token: the token the modal attaches to (its first ancestor).
        modal_token: the modal verb token itself.

    Returns:
        A tuple ``(subject, passive_subject, csubj)`` of token lists:
        ``nsubj`` children, ``nsubjpass`` children, and the ``nsubj`` children
        found inside any ``csubj`` child of ``verb_token``.

    Lookup order for the active and passive subjects:
        1. direct children of ``verb_token``;
        2. if none and ``verb_token`` is itself a ``conj``, children of its head;
        3. if still none, children of ``modal_token``.
    When an active subject is found, tokens conjoined to the first one are
    appended to the list.
    """
    # Every lookup reads from the `verb_token` argument rather than from a
    # same-named notebook global, so the function is self-contained.
    subject = _children_with_dep(verb_token, "nsubj")
    passive_subject = _children_with_dep(verb_token, "nsubjpass")
    csubj = [
        inner
        for clause in _children_with_dep(verb_token, "csubj")
        for inner in _children_with_dep(clause, "nsubj")
    ]

    if not subject and not passive_subject and verb_token.dep_ == "conj":
        # A conjoined verb borrows the subject of the verb it is attached to.
        head = verb_token.head
        subject = _children_with_dep(head, "nsubj")
        passive_subject = _children_with_dep(head, "nsubjpass")

    if not subject and not passive_subject:
        # Last resort: the parser may have hung the subject on the modal.
        subject = _children_with_dep(modal_token, "nsubj")
        passive_subject = _children_with_dep(modal_token, "nsubjpass")

    if subject:
        # Add subjects coordinated with the first one.
        subject += get_all_conjucted_tokens(subject[0])

    return subject, passive_subject, csubj

In [None]:
# Walk every token of every parsed document and record each modal verb with
# the verb it attaches to, that verb's subjects, its negation, and whether the
# enclosing sentence is a question.
results = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for doc_path, doc in docs.items():
    for token in doc:
        modal = token.lemma_.lower()
        modal_category = MODAL_VERBS_MAPPING.get(modal)
        if not modal_category:
            continue  # not one of the modal lemmas we track

        # The first ancestor is the token the modal hangs from; a modal with
        # no ancestor falls back to the zero-length placeholder.
        verb = next(token.ancestors, EMPTY_TOKEN)
        if len(verb) != 0:
            subject, passive_subject, clausal_subject = find_subjects(verb, token)
            negated = any(c.dep_ == "neg" for c in verb.children)
        else:
            subject, passive_subject, clausal_subject = [], [], []
            negated = False

        # A question mark may be followed by one trailing token, so the last
        # two tokens are checked. The length guard keeps a one-token sentence
        # from raising IndexError on the [-2] lookup.
        sentence = token.sent
        is_question = sentence[-1].norm_ == "?" or (
            len(sentence) > 1 and sentence[-2].norm_ == "?"
        )

        results.append(
            {
                "modal": modal_category,
                "sent": sentence,
                "raw_text_path": doc_path,
                "verb": verb,
                "subject": subject,
                "passiveSubject": passive_subject,
                "clausalSubject": clausal_subject,
                "token": token,
                "isQuestion": is_question,
                "negated": negated,
            }
        )

In [None]:
# One row per modal occurrence; the columns are the keys of the dicts collected
# in `results`. The cells still hold spaCy objects at this point.
result_df = pd.DataFrame(results)

In [None]:
def get_coref_text(tokens):
    """Return the first coreference entry of the leading token, or ``""``.

    Only the first element of ``tokens`` is considered, and only when it is a
    pronoun (``pos_ == "PRON"``) with a non-empty ``_.corefs`` extension value.
    Every other case, including an empty ``tokens``, yields the empty string.
    """
    if len(tokens) > 0:
        head = tokens[0]
        antecedents = head._.corefs
        if len(antecedents) > 0 and head.pos_ == "PRON":
            return antecedents[0]
    return ""


def get_subjects_from_noun_phrase(text):
    """Parse ``text`` and return its root token plus the root's ``conj`` children.

    Returns an empty list when ``text`` is the empty string or when the parse
    has no usable ``ROOT`` token.
    """
    if text == "":
        return []
    parsed = nlp(text)
    # The first token labelled ROOT is treated as the head of the phrase.
    root = next((tok for tok in parsed if tok.dep_ == "ROOT"), None)
    if not root:
        return []
    return [root, *get_all_conjucted_tokens(root)]


# Resolve pronoun subjects: take the coreference text of the first subject
# token, then re-parse that text to get its head token and conjuncts.
coref_texts = result_df["subject"].apply(get_coref_text)
result_df["subjectCorefText"] = coref_texts
result_df["subjectCoref"] = coref_texts.apply(get_subjects_from_noun_phrase)

In [None]:
def final_subjects(subjects, coref_subjects):
    """Return lower-cased subject lemmas, substituting coreferents for pronouns.

    Non-pronoun tokens in ``subjects`` contribute their lemma, in order.
    Pronoun tokens are dropped; if at least one was seen, the lemmas of
    ``coref_subjects`` are appended once, after the direct lemmas.
    """
    pronoun_seen = False
    lemmas = []
    for candidate in subjects:
        if candidate.pos_ != "PRON":
            lemmas.append(candidate.lemma_.lower())
            continue
        pronoun_seen = True

    if not pronoun_seen:
        return lemmas
    return lemmas + [referent.lemma_.lower() for referent in coref_subjects]

In [None]:
# Flatten the spaCy objects in result_df into plain values, one column each.
final_df = pd.DataFrame(
    {
        "id": result_df["raw_text_path"].apply(legal_doc_path_to_id),
        "verb": result_df["verb"].apply(lambda t: t.lemma_.lower()),
        "subject": result_df.apply(
            lambda row: final_subjects(row["subject"], row["subjectCoref"]),
            axis=1,
        ),
        "sent": result_df["sent"].apply(lambda sent: sent.text.replace("\n", " ")),
        "modal": result_df["modal"],
        "isQuestion": result_df["isQuestion"],
        "negated": result_df["negated"],
    }
)

# Attach document metadata on "id", then turn each subject list into one row
# per subject.
df_meta = load_legal_documents_metadata()
final_df = final_df.merge(df_meta, on="id").explode("subject")

In [None]:
# Display the merged, exploded table for a visual check before exporting it.
final_df

In [None]:
# Write the table to disk; the path is relative to the current working directory.
# NOTE(review): to_csv also writes the index, which repeats for rows produced
# by explode() -- confirm that column is wanted in the CSV.
final_df.to_csv('data/processed/deontics.csv')

In [None]:
# Show the dependency parse of the first modal sentence inline.
# displacy.serve() starts a blocking web server, which would stall a
# "Restart & Run All" at this cell; displacy.render() draws into the notebook.
if len(result_df) > 0:
    displacy.render(result_df["sent"].iloc[0], style="dep")

### Pronoun distribution

In [None]:
# Count which pronouns appear as the first subject of a modal construction.
# `.str[0]` takes the first element of each subject list; rows whose list is
# empty become missing and are dropped.
first_subjects = result_df["subject"].str[0].dropna()
is_pronoun = first_subjects.apply(lambda tok: tok.pos_ == "PRON")
subjs = first_subjects[is_pronoun]
Counter(tok.orth_.lower() for tok in subjs)