In [2]:
import spacy
from spacy.tokens import DocBin
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json
from datasets import load_dataset
from collections import defaultdict

### Testing data

In [None]:

# Load spaCy's language model if necessary (for example, the English model)
nlp = spacy.blank("en")  # Replace "en" with the appropriate language code

# Load your .spacy file
train_doc_bin = DocBin().from_disk("data/train.spacy")
dev_doc_bin = DocBin().from_disk("data/dev.spacy")
# Deserialize the docs
train_docs = list(train_doc_bin.get_docs(nlp.vocab))
dev_docs = list(dev_doc_bin.get_docs(nlp.vocab))

In [None]:
# Now you can work with the docs
for doc in train_docs[2:3]:
    for ent in doc.ents:
        print(ent.text, ent.label_)

In [None]:
def getLabelsCounts(docs):
    labels = []
    for doc in docs:
        for ent in doc.ents:
            labels.append(ent.label_)

    # Convert the list of labels to a NumPy array
    labels_array = np.array(labels)
    unique_labels, counts = np.unique(labels_array, return_counts=True)
    counts = dict(zip(unique_labels, counts))
    return counts

In [None]:
trainLabelsCounts = getLabelsCounts(train_docs)
devLabelsCounts = getLabelsCounts(dev_docs)

In [None]:
def saveLabelsPie(LabelsCounts, name):
    plt.figure(figsize=(8, 8))
    colors = plt.cm.hsv(np.linspace(0, 1, len(LabelsCounts)))
    patches, texts, autotexts = plt.pie(LabelsCounts.values(), labels=LabelsCounts.keys(), 
            autopct='%1.1f%%', 
            colors=colors, 
            startangle=60,
            wedgeprops=dict(edgecolor='w'))
    for text in texts + autotexts:
        text.set_fontsize(9)
    plt.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
    plt.title(name, pad=30, fontdict = {'fontsize':20, 'fontstyle' : 'oblique'})
    plt.savefig(f"./plots/{name}.png", bbox_inches='tight', transparent=True)
    plt.show()



In [None]:
saveLabelsPie(trainLabelsCounts, "Named entity proportions in training")
saveLabelsPie(devLabelsCounts, "Named entity proportions in development")

### Creating train and dev csv

In [None]:
import spacy
import pandas as pd
from collections import defaultdict
import re

# categories = 

def format_text(text):
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\n', ' ')
    # Strip leading and trailing whitespace
    text = text.strip()
    return text

# Load your .spacy file
def load_spacy_file(file_path):
    nlp = spacy.blank("en")  # replace "en" with your model's language if different
    docs = DocBin().from_disk(file_path)
    return list(docs.get_docs(nlp.vocab))
    # return list(nlp.from_disk(file_path))

# Process documents and extract entities
# def process_docs(docs):
#     data = []
#     for doc in docs:
#         text = doc.text
#         entities = defaultdict(set)
#         for ent in doc.ents:
#             entities[ent.label_].add(format_text(ent.text))
#         entities = {label: list(ents) for label, ents in entities.items()}
#         data.append([text, entities])
#     return data


def process_docs(docs):
    data = []
    for doc in docs:
        text = doc.text
        # Using a dict to maintain insertion order and uniqueness
        entities = defaultdict(dict)
        for ent in doc.ents:
            entities[ent.label_][format_text(ent.text)] = None  # Key is the entity, value is a placeholder
        # Extracting the keys (unique entities) from each dictionary
        entities = {label: list(ents.keys()) for label, ents in entities.items()}
        data.append([text, entities])
    return data

# Convert to DataFrame
def to_dataframe(data):
    # Find all unique entity labels
    all_labels = set()
    for _, entities in data:
        all_labels.update(entities.keys())
    all_labels = sorted(all_labels)

    # Create DataFrame
    df_data = []
    for text, entities in data:
        row = [format_text(text)] + [entities.get(label, []) for label in all_labels]
        df_data.append(row)

    columns = ['sentence'] + all_labels
    return pd.DataFrame(df_data, columns=columns)

# Load data
train_docs = load_spacy_file('data/train.spacy')
dev_docs = load_spacy_file('data/dev.spacy')

# Process documents
train_data = process_docs(train_docs)
dev_data = process_docs(dev_docs)

# Convert to DataFrame
train_df = to_dataframe(train_data)
dev_df = to_dataframe(dev_data)

# Export to CSV (optional)
train_df.to_csv('./data/raw/train_data.csv', index=False)
dev_df.to_csv('./data/raw/dev_data.csv', index=False)


In [17]:
train = pd.read_csv("./data/raw/train_data.csv")
test = pd.read_csv("./data/raw/dev_data.csv")

In [18]:
dev = train.sample(frac=0.1, random_state=42) # random_state for reproducibility
train = train.drop(dev.index)

In [19]:
print(test['sentence'].iloc[900])
print(train.iloc[900])

These are plainly disputed questions of facts and it is also apposite to examine the same in these proceedings.
sentence        3) The National Human Rights Commission undert...
CASE_NUMBER                                                    []
COURT                                                          []
DATE                                                           []
GPE                                                            []
JUDGE                                                          []
LAWYER                                                         []
ORG                          ['National Human Rights Commission']
OTHER_PERSON                                                   []
PETITIONER                                                     []
PRECEDENT                                                      []
PROVISION                                                      []
RESPONDENT                                                     []
STATUTE                       

In [20]:
print(dev['sentence'].iloc[-4])
print(dev.iloc[-4])

Whether ignoring the oral and the documentary evidence, the First Appellate Court decided that the suit property is not an ancestral property and that the settlement deed, Ex.B4 executed was valid?
sentence        Whether ignoring the oral and the documentary ...
CASE_NUMBER                                                    []
COURT                                                          []
DATE                                                           []
GPE                                                            []
JUDGE                                                          []
LAWYER                                                         []
ORG                                                            []
OTHER_PERSON                                                   []
PETITIONER                                                     []
PRECEDENT                                                      []
PROVISION                                                      []
RESPONDENT

### Creating dataset that contains the prompts

In [21]:
def create_raw_entities_column(df):
    def entities_to_string(row):
        # Build a dictionary of non-empty entity categories
        entities_dict = {category: entities for category, entities in row.items() if category != 'sentence' and "entities" not in category }
        # Convert the dictionary to a JSON string
        return json.dumps(entities_dict)

    # Apply the function to each row and create the new column
    df['raw_entities'] = df.apply(entities_to_string, axis=1)
    return df

def create_dict_column(df):
    # Function to convert a JSON string to a dictionary
    def string_to_dict(json_str):
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return {}  # Returns an empty dictionary in case of a decoding error

    # Apply the function to the 'raw_entities' column to create a new dictionary column
    df['entities_dict'] = df['raw_entities'].apply(string_to_dict)
    return df

In [22]:
train_data = create_raw_entities_column(train)
dev_data = create_raw_entities_column(dev)
test_data = create_raw_entities_column(test)

In [23]:
train_data = create_dict_column(train_data)

dev_data = create_dict_column(dev_data)

test_data = create_dict_column(test_data)

In [24]:
dev_data['raw_entities'].iloc[-5]

'{"CASE_NUMBER": "[]", "COURT": "[\'District Magistrate, Muzaffarnagar\']", "DATE": "[\'13/14.1.1999\']", "GPE": "[]", "JUDGE": "[]", "LAWYER": "[]", "ORG": "[]", "OTHER_PERSON": "[]", "PETITIONER": "[]", "PRECEDENT": "[]", "PROVISION": "[]", "RESPONDENT": "[]", "STATUTE": "[]", "WITNESS": "[]"}'

In [25]:
dev_data['entities_dict'].iloc[-1]

{'CASE_NUMBER': '[]',
 'COURT': '[]',
 'DATE': "['December 23, 2004']",
 'GPE': '[]',
 'JUDGE': '[]',
 'LAWYER': '[]',
 'ORG': '[]',
 'OTHER_PERSON': '[]',
 'PETITIONER': '[]',
 'PRECEDENT': '[]',
 'PROVISION': "['Rule 141']",
 'RESPONDENT': '[]',
 'STATUTE': "['West Bengal Motor Vehicles Rules']",
 'WITNESS': '[]'}

In [26]:
len(train_data), len(dev_data), len(test_data)

(9895, 1100, 1074)

In [37]:
def create_text_col(row):
    instruction = "You are solving the NER problem in indian legal documents. You have to extract from the text, entities related to each of the following categories: CASE_NUMBER, COURT, DATE, GPE, JUDGE, LAWYER, ORG, OTHER_PERSON, PETITIONER, PRECEDENT, PROVISION, RESPONDENT, STATUTE, WITNESS. Extract them exactly as they are in the text (Don't format them). Your output always should be a dictionary in a json readable format (category: list of entities)."
    text_row = f"""<s> [INST] {instruction} Find the entities in the following text: {row['sentence']} [/INST]\n {row['raw_entities']} </s>"""
    return text_row

In [38]:
train_data['text'] = train_data.apply(create_text_col, axis=1)
dev_data['text'] = dev_data.apply(create_text_col, axis=1)
test_data['text'] = test_data.apply(create_text_col, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['text'] = train_data.apply(create_text_col, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_data['text'] = dev_data.apply(create_text_col, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['text'] = test_data.apply(create_text_col, axis=1)


In [39]:
# train_data['train'] = train_data['raw_entities']
# dev_data['train'] = dev_data['raw_entities']
# test_data['train'] = test_data['raw_entities']

# train_data['test'] = train_data['raw_entities']
# dev_data['test'] = dev_data['raw_entities']
# test_data['test'] = test_data['raw_entities']

In [40]:
selected_columns = ['sentence', 'raw_entities', 'entities_dict', 'text']
train_data = train_data[selected_columns]
dev_data = dev_data[selected_columns]
test_data = test_data[selected_columns]

In [41]:
json.loads(test_data['raw_entities'].iloc[0])

{'CASE_NUMBER': '[]',
 'COURT': "['High Court Of Delhi At New Delhi']",
 'DATE': '[]',
 'GPE': '[]',
 'JUDGE': "['Najmi Waziri']",
 'LAWYER': "['S.P. Jain', 'Himanshu Gambhir', 'Nar Singh', 'Pushkar Singh Kanwal', 'Arvind Chaudhary', 'Ram Kawar', 'Amit Kumar']",
 'ORG': '[]',
 'OTHER_PERSON': '[]',
 'PETITIONER': "['Oriental Insurance Co Ltd.']",
 'PRECEDENT': '[]',
 'PROVISION': '[]',
 'RESPONDENT': "['Zaixhu Xie', 'Qualcomm India Pvt Ltd']",
 'STATUTE': '[]',
 'WITNESS': '[]'}

In [42]:
test_data['text'].iloc[0]

'<s> [INST] You are solving the NER problem in indian legal documents. You have to extract from the text, entities related to each of the following categories: CASE_NUMBER, COURT, DATE, GPE, JUDGE, LAWYER, ORG, OTHER_PERSON, PETITIONER, PRECEDENT, PROVISION, RESPONDENT, STATUTE, WITNESS. Extract them exactly as they are in the text (Don\'t format them). Your output always should be a dictionary in a json readable format (category: list of entities). Find the entities in the following text: $~40 * In The High Court Of Delhi At New Delhi % Decided on: 31.07.2019 + Mac.App. 976/2018 & Cm Nos. 46122/2018, 15243/2019, 34195/2019 Oriental Insurance Co Ltd. ..... Appellant Through: Mr. S.P. Jain, Mr. Himanshu Gambhir, Mr. Nar Singh and Mr. Pushkar Singh Kanwal, Advocates. Versus Zaixhu Xie & Ors (M/S Qualcomm India Pvt Ltd ) ..... Respondents Through: Mr. Arvind Chaudhary, Advocate for Respondent Nos. 1& 2. Mr. Ram Kawar, Advocate for Mr. Amit Kumar Gupta, Advocate for Respondent No.4. Coram:

In [43]:
path = "./Data/Finetuning/"

In [44]:

train_data.to_csv(path+'train.csv', index=False)
dev_data.to_csv(path+'dev.csv', index=False)
test_data.to_csv(path+'test.csv', index=False)

In [None]:
import pandas as pd
from typing import List, Dict, Tuple
import ast
import spacy


nlp = spacy.load('en_core_web_sm')
categories = ['CASE_NUMBER', 'COURT', 'DATE', 'GPE', 'JUDGE', 'LAWYER', 'ORG', 'OTHER_PERSON', 'PETITIONER', 'PRECEDENT', 'PROVISION', 'RESPONDENT', 'STATUTE', 'WITNESS']
def tokenize_and_tag(df: pd.DataFrame, categories: List[str]) -> pd.DataFrame:
    # Define tag prefixes
    B_PREFIX = 'B-'
    I_PREFIX = 'I-'
    O_TAG = 'O'

    # Prepare output data
    output_data = {'tokens': [], 'ner_tags': []}

    for _, row in df.iterrows():
        sentence = row['sentence']
        entities = row['entities_dict']
        # print(entities)

        # Tokenize the sentence
        # tokens = sentence.split()  # Simple tokenization, can be replaced with a more robust tokenizer
        doc = nlp(sentence)
        tokens = [token.text for token in doc]

        # Initialize tags as 'Outside' for each token
        tags = [O_TAG for _ in tokens]

        # Update tags based on entities
        for category, entity_list in entities.items():
            entity_lista = ast.literal_eval(entity_list)
            for entity in entity_lista:
                entity_tokens = entity.split()
                # Find all occurrences of the entity in the tokens
                for i in range(len(tokens)):
                    # print(entity_tokens, tokens[i:i+len(entity_tokens)])
                    if tokens[i:i+len(entity_tokens)] == entity_tokens:
                        # Update the tags for this occurrence of the entity
                        tags[i] = B_PREFIX + category
                        for j in range(i + 1, i + len(entity_tokens)):
                            tags[j] = I_PREFIX + category

        output_data['tokens'].append(tokens)
        output_data['ner_tags'].append(tags)

    return pd.DataFrame(output_data)


In [None]:
test_prova = tokenize_and_tag(test_data, categories)

In [None]:
print(test_data['entities_dict'].iloc[0])
list(zip(test_prova['tokens'].iloc[0], test_prova['ner_tags'].iloc[0]))

### Computing f1 score from mistral model results

In [6]:
results = pd.read_csv("./results/model_results.csv")

In [7]:
results

Unnamed: 0,GroundTruth,ModelOutput,ExecutionTime
0,"{""CASE_NUMBER"": ""[]"", ""COURT"": ""[]"", ""DATE"": ""...",[INST] [INST] You are solving the NER problem ...,12.684053
1,"{""CASE_NUMBER"": ""[]"", ""COURT"": ""[]"", ""DATE"": ""...",[INST] [INST] You are solving the NER problem ...,12.374997
2,"{""CASE_NUMBER"": ""[]"", ""COURT"": ""[]"", ""DATE"": ""...",[INST] [INST] You are solving the NER problem ...,7.277846
3,"{""CASE_NUMBER"": ""[]"", ""COURT"": ""[]"", ""DATE"": ""...",[INST] [INST] You are solving the NER problem ...,8.017436
4,"{""CASE_NUMBER"": ""[]"", ""COURT"": ""[]"", ""DATE"": ""...",[INST] [INST] You are solving the NER problem ...,7.230112
...,...,...,...
532,"{""CASE_NUMBER"": ""['O.S.No.31/2009']"", ""COURT"":...",[INST] [INST] You are solving the NER problem ...,9.965929
533,"{""CASE_NUMBER"": ""['F.C.O.P.No.41 of 2012']"", ""...",[INST] [INST] You are solving the NER problem ...,8.196631
534,"{""CASE_NUMBER"": ""['Special Case (NDPS) No.17 o...",[INST] [INST] You are solving the NER problem ...,9.686907
535,"{""CASE_NUMBER"": ""[]"", ""COURT"": ""[]"", ""DATE"": ""...",[INST] [INST] You are solving the NER problem ...,7.096219


In [5]:
results['ModelOutput'].iloc[0]

'[INST] [INST] You are solving the NER problem in indian legal documents. You have to extract from the text, entities related to each of the following categories: CASE_NUMBER, COURT, DATE, GPE, JUDGE, LAWYER, ORG, OTHER_PERSON, PETITIONER, PRECEDENT, PROVISION, RESPONDENT, STATUTE, WITNESS. Extract them exactly as they are in the text (Don\'t format them). Be aware of synonyms, for instance, lawyers may be called advocates. This may happen with other categories. Your output always should be a dictionary (category: list of entities). Find the entities in the following text: The execution court held that the 5th judgment debtor can continue as the Manager only till a new Manager is elected, as per the 1934 Constitution. [/INST]\n {"CASE_NUMBER": "[]", "COURT": "[]", "DATE": "[]", "GPE": "[]", "JUDGE": "[]", "LAWYER": "[]", "ORG": "[]", "OTHER_PERSON": "[]", "PETITIONER": "[]", "PRECEDENT": "[]", "PROVISION": "[]", "RESPONDENT": "[]", "STATUTE": "[]", "WITNESS": "[]"}'

In [16]:
import pandas as pd
import json
import re

# Assuming df is your DataFrame
# df = pd.read_csv('your_dataset.csv')

# Function to correct syntax errors
def correct_syntax_errors(string):
    # Example: using regular expression to fix a specific pattern
    # This is just an example and should be tailored to the specific errors in your dataset
    corrected_string = re.sub(r'\["(.*?)"\]', r'[\1]', string)
    return corrected_string

def parse_json_string(json_str):
    try:
        # Replace single quotes with double quotes for valid JSON, if necessary
        corrected_string = correct_syntax_errors(json_str)
        return json.loads(corrected_string)
    except json.JSONDecodeError as e:
        # Print the error and the problematic string for inspection
        print(f"Error: {e}")
        print(f"Problematic string: {json_str}")
        return None

def extract_ground_truth_dict(row):
    # Extract and parse the JSON string from GroundTruth
    return parse_json_string(row['GroundTruth'])

def extract_model_output_dict(row):
    # Extract and parse the JSON string from ModelOutput
    model_output_part = row['ModelOutput'].split("[/INST]\n")[-1]
    return parse_json_string(model_output_part)

# Apply the functions to each row to create new columns
# results['GroundTruthDict'] = results.apply(extract_ground_truth_dict, axis=1)
results['ModelOutputDict'] = results.apply(extract_model_output_dict, axis=1)

Error: Expecting ',' delimiter: line 1 column 218 (char 217)
Problematic string:  {"CASE_NUMBER": "[]", "COURT": "[]", "DATE": "[]", "GPE": "[]", "JUDGE": "[]", "LAWYER": "[]", "ORG": "[]", "OTHER_PERSON": "[]", "PETITIONER": "[]", "PRECEDENT": "['Tirlok Nath v. Union of India [1967 SLR 759 (SC)]"]", "PROVISION": "[]", "RESPONDENT": "[]", "STATUTE": "[]", "WITNESS": "[]"}
Error: Unterminated string starting at: line 1 column 361 (char 360)
Problematic string:  {"CASE_NUMBER": "[]", "COURT": "[]", "DATE": "[]", "GPE": "[]", "JUDGE": "[]", "LAWYER": "[]", "ORG": "[]", "OTHER_PERSON": "[]", "PETITIONER": "[]", "PRECEDENT": "['R.K. Upadhyaya v. Shamabhai P. Patel [1987] 166 ITR 163 (SC); Jai Hanuman Trading Co. v. CIT [1977] 110 ITR 36 (P&H)(FB); CIT v. Sheo Kumari Devi [1986] 157 ITR 13 (Patna)(FB)']", "PROVISION": "['s. 147', '
