In [82]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import re

In [83]:
# load json
import json
# Read the annotated corpus. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('NER_TRAIN_JUDGEMENT.json') as f:
    df = json.load(f)
# number of records loaded
len(df)

9435

In [84]:
# top-level fields of the first record
df[0].keys()

dict_keys(['id', 'annotations', 'data', 'meta'])

In [85]:
# 'data' field of the third record; in the recorded output it holds the raw sentence under 'text'
df[2]['data']

{'text': ' \n5.2 CW3 Mr Vijay Mishra , Deputy Manager, HDFC Bank, Noida, UP has deposed that complainant had a current account with HDFC Bank in the year 2004\xad2005.'}

In [86]:
# Dump every labelled span of the first record's first annotation set.
first_record_spans = df[0]['annotations'][0]['result']
for i in first_record_spans:
    print(i)

{'value': {'start': 90, 'end': 103, 'text': 'Hongkong Bank', 'labels': ['ORG']}, 'id': 'C8HPTIM1', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}
{'value': {'start': 267, 'end': 278, 'text': 'Rahul & Co.', 'labels': ['ORG']}, 'id': 'KOWE3RAM', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}


In [87]:
# Look up the first span once, then slice the sentence with its offsets
# to confirm that they select the annotated entity text.
first_span = df[0]['annotations'][0]['result'][0]['value']
start, end = first_span['start'], first_span['end']
df[0]['data']["text"][start:end]

'Hongkong Bank'

In [88]:
# Scratch cell: build the BIO tag string for each annotated span of the first record.
text = df[0]['data']["text"]
for i in df[0]['annotations'][-1]['result']:
    # character offsets of the span (read here but not used anywhere in this cell)
    start = i['value']['start']
    end = i['value']['end']
    label = i['value']['labels'][0]
    # NOTE(review): this rebinds `text` from the sentence string to the span's
    # whitespace-split words, so after the loop `text` is the word list of the last span.
    text = i['value']['text'].split()
    # "B_<label>" followed by one " I_<label>" for every additional word of the span
    new_text = "B"+"_"+label+ (" " +"I"+"_"+label)*(len(text)-1)
    # TODO: substituting the span inside the sentence is not implemented; `new_text` is unused in this cell

In [89]:
# In the recorded run `text` is the word list left behind by the loop in the previous cell
# (not the sentence string), so this prints at most the first five words of the last span.
print(text[:5])

['Rahul', '&', 'Co.']


In [90]:
# Re-read the first record's sentence and split it on single spaces only:
# newlines stay attached to their token (see '\n\n(7)' in the recorded output).
text = df[0]['data']["text"]
tokens = text.split(" ")
print(tokens)

['\n\n(7)', 'On', 'specific', 'query', 'by', 'the', 'Bench', 'about', 'an', 'entry', 'of', 'Rs.', '1,31,37,500', 'on', 'deposit', 'side', 'of', 'Hongkong', 'Bank', 'account', 'of', 'which', 'a', 'photo', 'copy', 'is', 'appearing', 'at', 'p.', '40', 'of', "assessee's", 'paper', 'book,', 'learned', 'authorised', 'representative', 'submitted', 'that', 'it', 'was', 'related', 'to', 'loan', 'from', 'broker,', 'Rahul', '&', 'Co.', 'on', 'the', 'basis', 'of', 'his', 'submission', 'a', 'necessary', 'mark', 'is', 'put', 'by', 'us', 'on', 'that', 'photo', 'copy.']


In [91]:
def BIO_Encoder(text,annotations):
    """Return one BIO tag per space-separated token of ``text``.

    Parameters
    ----------
    text : str
        Raw sentence. It is tokenised with ``text.split(" ")`` so that the
        character offsets of every token can be reconstructed exactly.
    annotations : iterable of dict
        Each item must provide ``item['value']['start']`` and
        ``item['value']['end']`` (character offsets into ``text``, end
        exclusive) and ``item['value']['labels']``; only the first label is used.

    Returns
    -------
    list of str
        ``len(text.split(" "))`` tags: ``"B_<label>"`` for the first token that
        overlaps a span, ``"I_<label>"`` for the following overlapping tokens
        and ``"O"`` everywhere else. When spans overlap, the annotation that
        comes later in ``annotations`` wins.

    Notes
    -----
    A token is tagged as soon as it *overlaps* the span. Requiring the token
    to lie fully inside the span would silently drop tokens that carry
    attached punctuation (``"Bank,"``, ``"(HDFC"``), and the entity would lose
    its ``B_`` tag or disappear altogether.
    """
    tokens = text.split(" ")

    # Rebuild the [start, end) character offsets of every token.
    token_positions = []
    cursor = 0
    for token in tokens:
        token_end = cursor + len(token)
        token_positions.append((cursor, token_end))
        cursor = token_end + 1  # +1 skips the single space consumed by split(" ")

    bio_labels = ["O"] * len(tokens)

    for annotation in annotations:
        value = annotation['value']
        span_start = value['start']
        span_end = value['end']
        label = value['labels'][0]

        if span_end <= span_start:
            # An empty span cannot cover any token.
            continue

        inside = False  # becomes True once the B_ tag of this span has been emitted
        for idx, (token_start, token_end) in enumerate(token_positions):
            if token_start >= span_end:
                # Offsets only grow, so no later token can overlap this span.
                break
            if token_end <= span_start:
                continue
            bio_labels[idx] = f"I_{label}" if inside else f"B_{label}"
            inside = True
    return bio_labels

In [92]:
# print(token_positions)
# print(bio_labels)

In [93]:
# split data randomly into train (85%) and test (15%) with a fixed seed.
# NOTE: the split is NOT stratified, because no `stratify=` argument is passed.
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.15, random_state=42)
len(df_train), len(df_test)

(8019, 1416)

In [94]:
def _encode_split(records):
    """Map each record id to its sentence and per-token BIO labels.

    Parameters
    ----------
    records : iterable of dict
        Records providing ``record['id']``, ``record['data']['text']`` and
        ``record['annotations'][-1]['result']`` (the last annotation set is used).

    Returns
    -------
    dict
        ``{record_id: {"text": sentence, "labels": BIO_Encoder(sentence, spans)}}``,
        in the iteration order of ``records``.
    """
    encoded = {}
    for record in records:
        text = record['data']["text"]
        annotations = record['annotations'][-1]['result']
        encoded[record['id']] = {"text": text, "labels": BIO_Encoder(text, annotations)}
    return encoded


# One helper for both splits replaces two copy-pasted loops and no longer
# rebinds the builtin `id` in the notebook namespace.
new_df_train = _encode_split(df_train)
new_df_test = _encode_split(df_test)

In [95]:
# persist both encoded splits as JSON, one file per split
for out_path, payload in (('train.json', new_df_train), ('test.json', new_df_test)):
    with open(out_path, 'w') as f:
        json.dump(payload, f)

In [96]:
# Preview the first three encoded training examples.
c = 2  # countdown: how many more entries to show after the current one
for i, sample in new_df_train.items():
    print(sample['text'])
    print(sample['labels'])

    if c == 0:
        break
    c -= 1
    

Therefore, while interpreting statutory provisions, the courts should keep in mind the objectives or purpose for which statute has been enacted.
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
The petitioner in W.P.No.15821 of 2008 was never considered for appointment under the National Rural Employment Guarantee Scheme either through Employment Exchange sponsorship or by Outsourcing Agencies.
['O', 'O', 'O', 'B_CASE_NUMBER', 'I_CASE_NUMBER', 'I_CASE_NUMBER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_ORG', 'I_ORG', 'I_ORG', 'I_ORG', 'I_ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
The factum of accident, allegation of rash and negligent driving causing death of Sukendra Pal Singh were denied.
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_OTHER_PERSON', 'I_OTHER_PERSON', 'I_OTHER_PERSON', 'O', 'O']


In [78]:
# Spot check with hard-coded character offsets on the second training record; in the
# recorded output the slice 18:38 yields the case-number mention 'W.P.No.15821 of 2008'.
# NOTE(review): this cell's execution count is out of sequence, so it was run before the cells above were last executed.
df_train[1]["data"]["text"][18:38]

'W.P.No.15821 of 2008'