In [None]:
import pandas as pd
from mair.pdf_parsing import parse
from glob import glob
from tqdm import tqdm
import os
import json
import spacy
from collections import defaultdict
from spacy import displacy
from mair.data_loading import load_legal_documents, load_legal_documents_metadata
from mair.doc_ids import legal_doc_path_to_id

In [None]:
# Load the small English spaCy pipeline and raise its per-text length cap
# so that long documents can be parsed in one call.
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 3_000_000

# Register tqdm's pandas integration.
tqdm.pandas()

# NOTE: this is a relative chdir, so re-running the cell moves up two more
# directory levels each time. Run it exactly once per kernel session.
os.chdir("../..")

In [None]:
# Lower-cased modal lemmas that are searched for in the parsed documents.
# "will" and "would" are currently left out of the list.
MODAL_VERBS = ["can", "could", "may", "might", "shall", "should", "must"]

In [None]:
# Load the raw legal documents, then run the spaCy pipeline over each one.
# The parsed documents are stored under the same keys as `texts`.
texts = load_legal_documents()
results = {
    key: nlp(raw_text)
    for key, raw_text in tqdm(texts.items())
}

In [None]:
# Wrap the parsed documents in a one-column frame whose index is named
# "index", and keep a handle on the "doc" column for iteration.
df = (
    pd.Series(results, name="doc")
    .reset_index()
    .set_index("index")
)
docs = df["doc"]

In [None]:
def find_subjects(verb_token, modal_token):
    """Collect the subject tokens attached to a modal construction.

    Subjects are looked up among the dependency children of ``verb_token``.
    If it has neither an active nor a passive subject and is itself a
    conjunct (``dep_ == "conj"``), the children of its head are searched
    instead. If that still yields nothing, the children of ``modal_token``
    are searched.

    Args:
        verb_token: Token governing the modal. Must expose ``children``,
            ``dep_`` and ``head``.
        modal_token: The modal token itself. Must expose ``children``.

    Returns:
        A 3-tuple of lists ``(subject, passive_subject, csubj)``:
        children labelled ``nsubj``, children labelled ``nsubjpass``, and the
        ``nsubj`` children of every ``csubj`` child of ``verb_token``.
        The clausal list never takes part in the fallback search.
    """

    def children_with_dep(token, dep):
        # All direct dependency children of `token` carrying label `dep`.
        return [child for child in token.children if child.dep_ == dep]

    # Only the arguments are consulted here; the function must not depend on
    # whatever `verb` happens to be bound to in the notebook namespace.
    subject = children_with_dep(verb_token, "nsubj")
    passive_subject = children_with_dep(verb_token, "nsubjpass")
    csubj = [
        inner
        for clause in children_with_dep(verb_token, "csubj")
        for inner in children_with_dep(clause, "nsubj")
    ]

    # First fallback: a conjoined verb borrows the subject of its head.
    if not subject and not passive_subject and verb_token.dep_ == "conj":
        head = verb_token.head
        subject = children_with_dep(head, "nsubj")
        passive_subject = children_with_dep(head, "nsubjpass")

    # Second fallback: the subject may hang off the modal itself.
    if not subject and not passive_subject:
        subject = children_with_dep(modal_token, "nsubj")
        passive_subject = children_with_dep(modal_token, "nsubjpass")

    return subject, passive_subject, csubj

In [None]:
# Build one record per modal token found in any parsed document.
results = []
# `Series.items()` replaces `iteritems()`, which pandas 2.0 removed.
for doc_id, doc in docs.items():
    for token in doc:
        modal = token.lemma_.lower()
        if modal not in MODAL_VERBS:
            continue

        # First syntactic ancestor of the modal, or "" when it has none.
        # The "" sentinel is kept because later cells filter on
        # `.str.len() == 0`.
        verb = next(token.ancestors, "")
        if len(verb) != 0:
            subject, passive_subject, clausal_subject = find_subjects(verb, token)
        else:
            # All three fields are reset so that no value from a previous
            # token leaks into this record.
            subject = passive_subject = clausal_subject = ""

        sentence = token.sent
        # A "?" as the last or second-to-last token marks a question.
        # Slicing avoids indexing past the start of a one-token sentence.
        is_question = any(tok.norm_ == "?" for tok in sentence[-2:])

        results.append(
            {
                "modal": modal,
                "sent": sentence,
                "doc_id": doc_id,
                "verb": verb,
                "subject": subject,
                "passiveSubject": passive_subject,
                "clausalSubject": clausal_subject,
                "token": token,
                "isQuestion": is_question,
            }
        )

In [None]:
# Turn the accumulated per-modal records into a DataFrame (one row per record).
result_df = pd.DataFrame(results)

In [None]:
# Keep only the rows where a governing verb was found ...
has_verb = result_df["verb"].str.len() != 0
df2 = result_df[has_verb]

# ... and, of those, the rows where no subject of any kind was found.
lacks_subject = df2["subject"].str.len() == 0
lacks_passive = df2["passiveSubject"].str.len() == 0
lacks_clausal = df2["clausalSubject"].str.len() == 0
df3 = df2[lacks_subject & lacks_passive & lacks_clausal]

In [None]:
# Inspect one subject-less example: the fifth row of `df3`.
row = df3.iloc[4]
print(row["sent"])
row

In [None]:
# Flatten the spaCy objects into plain values for export.
# NOTE: this overwrites columns in place, so the cell cannot be re-run
# without rebuilding `result_df` first.
def _sentence_to_text(sent):
    """Return the sentence text with newlines replaced by spaces."""
    return sent.text.replace('\n', ' ')


result_df['sent'] = result_df['sent'].apply(_sentence_to_text)

# Keep only the first element of each subject list.
for column in ('subject', 'passiveSubject', 'clausalSubject'):
    result_df[column] = result_df[column].str[0]

result_df['id'] = result_df['doc_id'].apply(legal_doc_path_to_id)

In [None]:
# Attach document-level metadata to every modal record via the shared "id"
# column, using pandas' default join type.
df_meta = load_legal_documents_metadata()
results = pd.merge(result_df, df_meta, on="id")

In [None]:
# Write the merged table to disk; the path is relative to the current
# working directory.
results.to_csv('data/results.csv')

In [None]:
# Walk through the documents so that `doc` is left bound to the last one
# for the inspection cells that follow. The loop needs a body to be valid
# Python, and `items()` replaces `iteritems()`, which pandas 2.0 removed.
for doc_id, doc in docs.items():
    pass


In [None]:
# Number of sentences in the current document.
sum(1 for _ in doc.sents)

In [None]:
# Fifth sentence from the end of the current document.
[*doc.sents][-5]

In [None]:
# Draw the dependency parse inline. `displacy.serve` would start a blocking
# web server inside the kernel and stall every cell after this one.
displacy.render(row.sent, style="dep")

In [None]:
# `result_df.sent` holds plain strings once the flattening cell has run, and
# displaCy only accepts Doc/Span objects, so the sentence is re-parsed first.
# Rendering inline also avoids the blocking server that `displacy.serve` starts.
displacy.render(nlp(result_df.sent.iloc[0]), style="dep")

In [None]:
# Sentence belonging to the third record.
result_df["sent"].iloc[2]

In [None]:
# Full record (all columns) of the third row of `result_df`.
result_df.iloc[2]

In [None]:
# `result_df.sent` holds plain strings once the flattening cell has run, and
# displaCy only accepts Doc/Span objects, so the sentence is re-parsed first.
# Rendering inline also avoids the blocking server that `displacy.serve` starts.
displacy.render(nlp(result_df.iloc[2].sent), style="dep")