In [1]:
! pip install python-docx
! pip install spacy_annotator



In [2]:

import ast
from os import walk

import pandas as pd
import spacy
import spacy_annotator
from docx import Document
from tqdm.notebook import tqdm

In [12]:
def annotate_by_sentence2(courts, nlp):
    """Tag every token of ``courts`` sentence by sentence.

    ``courts`` is first passed through ``nlp`` to obtain its sentences
    (``.sents``); the text of each sentence is then passed through ``nlp``
    again on its own, and each resulting token is recorded together with its
    ``ent_iob_`` and ``ent_type_`` attributes.

    Args:
        courts: The text to annotate.
        nlp: A callable returning a document that exposes ``.sents`` and is
            iterable over tokens (e.g. a loaded spaCy pipeline).

    Returns:
        A list holding exactly one element: the list of sentences, where each
        sentence is a list of ``(token, ent_iob_, ent_type_)`` tuples.
    """
    # Materialise the sentence texts up front, before any sentence is re-parsed.
    sentence_texts = [str(span) for span in nlp(courts).sents]

    # Re-run the pipeline on each sentence in isolation and collect its tokens.
    tagged_sentences = [
        [(tok, tok.ent_iob_, tok.ent_type_) for tok in nlp(text)]
        for text in sentence_texts
    ]

    # Callers index the result with [0], so keep the single-element outer list.
    return [tagged_sentences]

In [9]:

## Do the processing for Jan's dataset

# Read the annotated rows; the path is relative to the notebook's working directory.
data = pd.read_csv("../file002.csv")
# Load model
nlp = spacy.load("en_core_web_sm")

# Quick look at what was loaded: number of rows and the first few rows.
print(len(data))
print(data.head())

135
   Unnamed: 0                                               Text  \
0           0                                      THIRD SECTION   
1           1                 CASE OF BUCEA AND BUCIA v. ROMANIA   
2           2                         (Application no. 32185/04)   
3           3                                           JUDGMENT   
4           4  This version was rectified on 16 December 2014...   

                                                Word  POS  \
0                               ['THIRD', 'SECTION']  NaN   
1  ['CASE', 'OF', 'BUCEA', 'AND', 'BUCIA', 'v.', ...  NaN   
2   ['(', 'Application', 'no', '.', '32185/04', ')']  NaN   
3                                       ['JUDGMENT']  NaN   
4  ['This', 'version', 'was', 'rectified', 'on', ...  NaN   

                                                 Tag  Word_idx  \
0                             ['B-COURT', 'L-COURT']       NaN   
1  ['O', 'O', 'O', 'O', 'B-CASE', 'I-CASE', 'L-CA...       NaN   
2  ['O', 'O', 'B-APPLI

In [75]:
# spaCy's annotation: for every row of `data`, one flat list of tags
# ("O" or "<IOB>-<entity type>") covering all sentences of that row.
spacy_df = []
# Iterate and label for each sentence as we go
for index, row in data.iterrows():
    # `Word` stores the repr of a Python list of tokens (e.g. "['THIRD', 'SECTION']").
    # Parse it properly instead of stripping brackets/quotes by hand, which
    # breaks as soon as a token contains a quote character or an escape.
    words = ast.literal_eval(row['Word'])
    par = " ".join(words)
    print(par)

    # annotate_by_sentence2 wraps its result in a one-element list.
    s = annotate_by_sentence2(par, nlp)[0]

    sub_tags = []
    for r in s:
        for _token, iob, ent_type in r:
            sub_tags.append("O" if iob == "O" else f"{iob}-{ent_type}")
    # Print once per row (printing inside the sentence loop repeated the
    # growing list once per sentence).
    print(sub_tags)
    spacy_df.append(sub_tags)

THIRD SECTION
['B-ORG', 'I-ORG']
CASE OF BUCEA AND BUCIA v. ROMANIA
['O', 'O', 'O', 'O', 'B-ORG', 'O', 'O']
( Application no . 32185/04 )
['O', 'O', 'O', 'O', 'B-CARDINAL', 'O']
JUDGMENT
['O']
This version was rectified on 16 December 2014 under Rule 81 of the Rules of Court
['O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'B-LAW', 'I-LAW', 'O', 'O', 'O', 'O', 'O']
STRASBOURG
['B-ORG']
1 July 2014
['O', 'B-DATE', 'I-DATE']
FINAL
['O']
01/10/2014
['O']
This judgment has become final under Article 44 § 2 of the Convention . It may be subject to editorial revision .
['O', 'O', 'O', 'O', 'O', 'O', 'B-LAW', 'I-LAW', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-LAW', 'I-LAW', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
In the case of Bucea and Bucia v. Romania ,
['O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-PERSON', 'O', 'B-GPE', 'O']
The European Court of Human Rights ( Third Section ) , sitting as a Chamber compomsed of :
['B-ORG', 'I-ORG', 

In [76]:
# Every row of `data` must have produced exactly one tag list in `spacy_df`;
# the merge step pairs them up row by row.
print(len(spacy_df))
print(len(data))
assert len(spacy_df) == len(data), "spacy_df and data are out of sync; re-run the tagging cell"

135
135


In [78]:
# Now apply the regex labels on top of spaCy's tags:
# every non-"O" label from the `Tag` column wins; positions labelled "O"
# are filled with spaCy's tag at the same token position.
# NOTE(review): the `Tag` labels shown in this notebook use an "L-" prefix for
# the last token of a span, while spaCy's tags only use B/I/O, so the merged
# sequences mix two tagging schemes — confirm this is acceptable downstream.
regex_words = []
# `spacy_df` was filled in iteration order, so pair rows by position rather
# than by index label (the two only coincide for a default 0..n-1 index).
for position, (index, row) in enumerate(data.iterrows()):
    # `Tag` stores the repr of a Python list of labels; parse it instead of
    # stripping brackets/quotes by hand.
    row_labels = ast.literal_eval(row['Tag'])
    sub_spacy_row = spacy_df[position]

    if len(sub_spacy_row) != len(row_labels):
        # The positional pairing below is only reliable when both
        # tokenisations have the same length; report rows where they differ.
        print(
            f"Row {index}: {len(row_labels)} labels vs "
            f"{len(sub_spacy_row)} spaCy tokens; tags may be misaligned"
        )

    sub_regex = []
    # Process the tags one by one
    for sub_index, label in enumerate(row_labels):
        if label != "O":
            sub_regex.append(label)
        elif sub_index < len(sub_spacy_row):
            # O may not hold anything interesting, so take spaCy's tag instead
            sub_regex.append(sub_spacy_row[sub_index])
        else:
            # spaCy produced fewer tokens than there are labels: nothing to borrow.
            sub_regex.append("O")

    regex_words.append(sub_regex)

In [79]:
# Show the merged tag sequence of every row, one row per line.
for merged_tags in regex_words:
    print(merged_tags)

['B-COURT', 'L-COURT']
['O', 'O', 'O', 'O', 'B-CASE', 'I-CASE', 'L-CASE']
['O', 'O', 'B-APPLICATION', 'I-APPLICATION', 'L-APPLICATION', 'O']
['O']
['O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'L-DATE', 'O', 'B-LAW', 'I-LAW', 'O', 'O', 'O', 'O', 'O']
['B-ORG']
['O', 'B-DATE', 'L-DATE']
['O']
['O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-ARTICLE', 'L-ARTICLE', 'B-PARAGRAPH', 'L-PARAGRAPH', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-CASE', 'I-CASE', 'L-CASE', 'O']
['B-ORG', 'B-COURT', 'I-COURT', 'I-COURT', 'I-COURT', 'L-COURT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O']
['B-JUDGE', 'L-JUDGE', 'O', 'O', 'O', 'B-JUDGE', 'L-JUDGE', 'O', 'B-JUDGE', 'I-JUDGE', 'L-JUDGE', 'O', 'B-JUDGE', 'I-JUDGE', 'I-JUDGE', 'L-JUDGE', 'O', 'B-JUDGE', 'L-JUDGE', 'O', 'B-JUDGE', 'I-JUDGE', 'L-JUDGE', 'O', 'B-JUDGE', 'I-JUDGE', 'L-JUDGE', 'O', 'O', 'O', 'O', 'B-REGISTRAR', 'L-REGISTRAR', 'O', 'B-LOC', 'I-LOC', 'O']
['O', 'O', 'O', 'O', 'O', '

In [81]:
# Re-inspect the source frame: first rows, then the column index.
print(data.head(), data.columns, sep="\n")

   Unnamed: 0                                               Text  \
0           0                                      THIRD SECTION   
1           1                 CASE OF BUCEA AND BUCIA v. ROMANIA   
2           2                         (Application no. 32185/04)   
3           3                                           JUDGMENT   
4           4  This version was rectified on 16 December 2014...   

                                                Word  POS  \
0                               ['THIRD', 'SECTION']  NaN   
1  ['CASE', 'OF', 'BUCEA', 'AND', 'BUCIA', 'v.', ...  NaN   
2   ['(', 'Application', 'no', '.', '32185/04', ')']  NaN   
3                                       ['JUDGMENT']  NaN   
4  ['This', 'version', 'was', 'rectified', 'on', ...  NaN   

                                                 Tag  Word_idx  \
0                             ['B-COURT', 'L-COURT']       NaN   
1  ['O', 'O', 'O', 'O', 'B-CASE', 'I-CASE', 'L-CA...       NaN   
2  ['O', 'O', 'B-APPLICATI

In [95]:
# Assemble one output record per row: the original Text / Word / POS values
# plus the merged tag sequence. Iterating the columns in parallel pairs the
# values by position, so this does not depend on `data` having a 0..n-1 index.
df = [
    [text, word, pos, tags]
    for text, word, pos, tags in zip(
        data['Text'], data['Word'], data['POS'], regex_words
    )
]


In [99]:
# Turn the assembled records into a frame, preview it, and persist it.
output_columns = ["Text", "Word", "POS", "Tag"]
regex_df = pd.DataFrame(df, columns=output_columns)
print(regex_df.head())
# NOTE(review): the row index is written as well (to_csv default), so it comes
# back as an unnamed first column when the file is re-read — confirm that
# downstream readers expect this.
regex_df.to_csv("../file_regex001.csv")

                                                Text  \
0                                      THIRD SECTION   
1                 CASE OF BUCEA AND BUCIA v. ROMANIA   
2                         (Application no. 32185/04)   
3                                           JUDGMENT   
4  This version was rectified on 16 December 2014...   

                                                Word  POS  \
0                               ['THIRD', 'SECTION']  NaN   
1  ['CASE', 'OF', 'BUCEA', 'AND', 'BUCIA', 'v.', ...  NaN   
2   ['(', 'Application', 'no', '.', '32185/04', ')']  NaN   
3                                       ['JUDGMENT']  NaN   
4  ['This', 'version', 'was', 'rectified', 'on', ...  NaN   

                                                 Tag  
0                                 [B-COURT, L-COURT]  
1               [O, O, O, O, B-CASE, I-CASE, L-CASE]  
2  [O, O, B-APPLICATION, I-APPLICATION, L-APPLICA...  
3                                                [O]  
4  [O, O, O, O, O, B-