In [2]:
import json
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import string
import pickle
import numpy as np

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nalishjain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nalishjain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
with open('Data task1/NER_TRAIN_JUDGEMENT.json', 'r') as f:
    dataset = json.load(f)

with open('Data task1/NER_TEST_JUDGEMENT.json', 'r') as f:
    test_dataset = json.load(f)
print(dataset)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.15, random_state=42)





In [4]:
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))

    tokens = [word for word in tokens if word.lower() not in stop_words]
    tokens = [word for word in tokens if word not in string.punctuation]
    tokens = [word for word in tokens if word.strip() != '']

    return tokens

In [5]:
mapping = {'ORG' : 0, 'RESPONDENT' : 1, 'JUDGE' : 2, 'STATUTE' : 3, 'OTHER_PERSON' : 4, 'COURT' : 5, 'GPE' : 6, 'PETITIONER' : 7, 'WITNESS' : 8, 
 'CASE_NUMBER' : 9, 'PRECEDENT' : 10, 'DATE' : 11, 'PROVISION' : 12}

In [6]:

def bio_encoding(id, error, text, annotations):
    tokens = preprocess_text(text)
    bio_labels = ['O'] * len(tokens)
    try:
    # print(tokens)
        index = 0
        for annotation in annotations:
            # print(annotation['value'])
            label_words = preprocess_text(annotation['value']['text'])

            while index < len(tokens) and tokens[index] != label_words[0]:
                index +=1

            for i in range(len(label_words)):
                if i == 0:
                    bio_labels[index] = "B_" + annotation['value']['labels'][0]
                    error[mapping[annotation['value']['labels'][0]]] += 1

                else:
                    bio_labels[index] = "I_" + annotation['value']['labels'][0]
                index +=1
    except:
        # error.append(id)
        return ""
    return bio_labels

In [7]:
def bio_encoding_f(id, text, annotations):

    tokens = preprocess_text(text)
    bio_labels = ['O'] * len(tokens)
    
    print(tokens)
    index = 0
    for annotation in annotations:
        print(annotation['value'])
        label_words = preprocess_text(annotation['value']['text'])
        while index < len(tokens) and tokens[index] != label_words[0]:
            index +=1

        for i in range(len(label_words)):
            if i == 0:
                bio_labels[index] = "B_" + annotation['value']['labels'][0]
            else:
                bio_labels[index] = "I_" + annotation['value']['labels'][0]
            index +=1

    return bio_labels

In [10]:
train_processed_data = {}
error1 = np.zeros(13)
for id in range(len(train_dataset)):
    text = train_dataset[id]['data']['text']
    annotations = train_dataset[id]['annotations'][0]['result']
    
    labels = bio_encoding(id, error1, text, annotations)
    if labels != "":
        train_processed_data[id] = {'text': text, 'labels': labels}

print(error1)

val_processed_data = {}
error2 = np.zeros(13)
for id in range(len(val_dataset)):
    text = val_dataset[id]['data']['text']
    annotations = val_dataset[id]['annotations'][0]['result']
    
    labels = bio_encoding(id, error2, text, annotations)
    if labels != "":
        val_processed_data[id] = {'text': text, 'labels': labels}

print(error2)
print(error2/error1)

test_processed_data = {}
error = np.zeros(13)
for id in range(len(test_dataset)):
    text = test_dataset[id]['data']['text']
    annotations = test_dataset[id]['annotations'][0]['result']
    
    labels = bio_encoding(id, error, text, annotations)
    if labels != "":
        test_processed_data[id] = {'text': text, 'labels': labels}

print(error/error1)

[1176.  260.  473. 1451. 2134. 1092. 1140.  388.  713.  868. 1091. 1548.
 1976.]
[207.  44.  78. 271. 362. 189. 205.  60. 118. 151. 227. 290. 337.]
[0.17602041 0.16923077 0.16490486 0.18676775 0.16963449 0.17307692
 0.17982456 0.15463918 0.1654979  0.17396313 0.20806599 0.1873385
 0.17054656]
[0.1335034  0.01153846 0.01691332 0.15024121 0.12324274 0.16208791
 0.15526316 0.02319588 0.0743338  0.13479263 0.15398717 0.13824289
 0.12803644]


In [11]:
train_processed_data

{0: {'text': 'Therefore, while interpreting statutory provisions, the courts should keep in mind the objectives or purpose for which statute has been enacted.',
  'labels': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']},
 1: {'text': 'The petitioner in W.P.No.15821 of 2008 was never considered for appointment under the National Rural Employment Guarantee Scheme either through Employment Exchange sponsorship or by Outsourcing Agencies.',
  'labels': ['O',
   'B_CASE_NUMBER',
   'I_CASE_NUMBER',
   'O',
   'O',
   'O',
   'B_ORG',
   'I_ORG',
   'I_ORG',
   'I_ORG',
   'I_ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']},
 2: {'text': 'The factum of accident, allegation of rash and negligent driving causing death of Sukendra Pal Singh were denied.',
  'labels': ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B_OTHER_PERSON',
   'I_OTHER_PERSON',
   'I_OTHER_PERSON',
   'O']},
 3: {'text': '..36.. \n\n W.A.No.655/2012 & others Meaning thereby that except i

In [12]:
with open('Json Task1/train_processed.json', 'w') as file:
    json.dump(train_processed_data, file, indent=2)

with open('Json Task1/val_processed.json', 'w') as file:
    json.dump(val_processed_data, file, indent=2)

with open('Json Task1/test_processed.json', 'w') as file:
    json.dump(test_processed_data, file, indent=2)