In [3]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="dominguesm/legal-bert-ner-base-cased-ptbr")

Device set to use cpu


In [4]:
import pandas as pd
df = pd.read_pickle("pickled.pkl")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   number             592 non-null    object
 1   desc               592 non-null    object
 2   subsections        592 non-null    object
 3   cases              592 non-null    object
 4   has_desc           592 non-null    bool  
 5   clean_desc         592 non-null    object
 6   subsec_codes       592 non-null    object
 7   subsec_texts       592 non-null    object
 8   num_subsections    592 non-null    int64 
 9   clean_cases        592 non-null    object
 10  num_cases          592 non-null    int64 
 11  valid_subsections  592 non-null    bool  
 12  valid_cases        592 non-null    bool  
dtypes: bool(3), int64(2), object(8)
memory usage: 48.1+ KB


In [6]:
test='''"The parties hereto agree that any dispute arising out of or in connection with this Agreement, including any question regarding its existence, validity, or termination, shall be referred to and finally resolved by arbitration under the Rules of the London Court of International Arbitration (LCIA)."

Entities to test: 
- "London Court of International Arbitration (LCIA)" (ORGANIZATION)
- "Agreement" (DOCUMENT)
- "arbitration" (LEGAL_TERM)'''

In [7]:
out=pipe(test)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
import re

def merge_tokens(entities):
    formatted_entities = []
    temp_entity = None
    
    for entity in entities:
        word = entity['word'].replace('##', '')  # Merge subword tokens
        score = entity['score']
        start, end = entity['start'], entity['end']
        
        if entity['entity'].startswith('B-'):  # Beginning of a new entity
            if temp_entity:  # Store previous entity
                formatted_entities.append(temp_entity)
            temp_entity = {
                'type': entity['entity'][2:],
                'name': word,
                'start': start,
                'end': end,
                'confidence': [score]
            }
        elif entity['entity'].startswith('I-') and temp_entity:
            temp_entity['name'] += word  # Append to existing entity
            temp_entity['end'] = end
            temp_entity['confidence'].append(score)
    
    if temp_entity:
        formatted_entities.append(temp_entity)
    
    return formatted_entities

def format_output(entities):
    merged_entities = merge_tokens(entities)
    
    output = "\n### Recognized Entities:\n"
    entity_groups = {}
    
    for ent in merged_entities:
        ent_type = ent['type']
        if ent_type not in entity_groups:
            entity_groups[ent_type] = []
        entity_groups[ent_type].append(ent)
    
    for ent_type, ents in entity_groups.items():
        output += f"\n#### {ent_type.capitalize()}s:\n"
        for ent in ents:
            avg_confidence = sum(ent['confidence']) / len(ent['confidence'])
            output += f"- **{ent['name']}** (Start: {ent['start']}, End: {ent['end']}, Confidence: {avg_confidence:.2f})\n"
    
    return output

# Example input
data = [
    {'entity': 'B-ORGANIZACAO', 'score': 0.9923943, 'index': 75, 'word': 'London', 'start': 250, 'end': 256},
    {'entity': 'I-ORGANIZACAO', 'score': 0.95155066, 'index': 76, 'word': 'Cour', 'start': 257, 'end': 261},
    {'entity': 'I-ORGANIZACAO', 'score': 0.9279648, 'index': 77, 'word': '##t', 'start': 261, 'end': 262},
    {'entity': 'I-ORGANIZACAO', 'score': 0.9921262, 'index': 78, 'word': 'of', 'start': 263, 'end': 265},
    {'entity': 'I-ORGANIZACAO', 'score': 0.99729985, 'index': 79, 'word': 'International', 'start': 266, 'end': 279},
    {'entity': 'I-ORGANIZACAO', 'score': 0.9976617, 'index': 80, 'word': 'Ar', 'start': 280, 'end': 282},
    {'entity': 'I-ORGANIZACAO', 'score': 0.9967339, 'index': 81, 'word': '##bit', 'start': 282, 'end': 285},
    {'entity': 'I-ORGANIZACAO', 'score': 0.9964142, 'index': 82, 'word': '##ration', 'start': 285, 'end': 291},
    {'entity': 'B-PESSOA', 'score': 0.8477672, 'index': 113, 'word': 'O', 'start': 375, 'end': 376},
    {'entity': 'B-PESSOA', 'score': 0.8387395, 'index': 114, 'word': '##R', 'start': 376, 'end': 377},
    {'entity': 'B-PESSOA', 'score': 0.8440088, 'index': 115, 'word': '##GA', 'start': 377, 'end': 379},
    {'entity': 'B-PESSOA', 'score': 0.8135658, 'index': 116, 'word': '##N', 'start': 379, 'end': 380},
    {'entity': 'B-PESSOA', 'score': 0.81518734, 'index': 117, 'word': '##I', 'start': 380, 'end': 381},
    {'entity': 'B-PESSOA', 'score': 0.7810171, 'index': 118, 'word': '##Z', 'start': 381, 'end': 382},
    {'entity': 'B-PESSOA', 'score': 0.8123766, 'index': 119, 'word': '##AT', 'start': 382, 'end': 384},
    {'entity': 'B-PESSOA', 'score': 0.81403464, 'index': 120, 'word': '##IO', 'start': 384, 'end': 386},
    {'entity': 'B-PESSOA', 'score': 0.82404333, 'index': 121, 'word': '##N', 'start': 386, 'end': 387}
]

# Print formatted output
print(format_output(data))



### Recognized Entities:

#### Organizacaos:
- **LondonCourtofInternationalArbitration** (Start: 250, End: 291, Confidence: 0.98)

#### Pessoas:
- **O** (Start: 375, End: 376, Confidence: 0.85)
- **R** (Start: 376, End: 377, Confidence: 0.84)
- **GA** (Start: 377, End: 379, Confidence: 0.84)
- **N** (Start: 379, End: 380, Confidence: 0.81)
- **I** (Start: 380, End: 381, Confidence: 0.82)
- **Z** (Start: 381, End: 382, Confidence: 0.78)
- **AT** (Start: 382, End: 384, Confidence: 0.81)
- **IO** (Start: 384, End: 386, Confidence: 0.81)
- **N** (Start: 386, End: 387, Confidence: 0.82)



In [9]:
import re

def merge_tokens(entities):
    formatted_entities = []
    temp_entity = None
    
    for entity in entities:
        word = entity['word'].replace('##', '')  # Merge subword tokens
        score = entity['score']
        start, end = entity['start'], entity['end']
        
        if entity['entity'].startswith('B-'):  
            # Handle case where B- appears repeatedly but should be merged (e.g., "O R GA N I Z AT IO N")
            if temp_entity and temp_entity['type'] == entity['entity'][2:]:
                temp_entity['name'] += word
                temp_entity['end'] = end
                temp_entity['confidence'].append(score)
            else:
                if temp_entity:  # Store previous entity
                    formatted_entities.append(temp_entity)
                temp_entity = {
                    'type': entity['entity'][2:],
                    'name': word,
                    'start': start,
                    'end': end,
                    'confidence': [score]
                }
        elif entity['entity'].startswith('I-') and temp_entity:
            temp_entity['name'] += ' ' + word  # Preserve spacing
            temp_entity['end'] = end
            temp_entity['confidence'].append(score)
    
    if temp_entity:
        formatted_entities.append(temp_entity)
    
    return formatted_entities

def format_output(entities):
    print(entities)
    merged_entities = merge_tokens(entities)
    
    output = "\n### Recognized Entities:\n"
    entity_groups = {}
    
    for ent in merged_entities:
        ent_type = ent['type']
        if ent_type not in entity_groups:
            entity_groups[ent_type] = []
        entity_groups[ent_type].append(ent)
    
    for ent_type, ents in entity_groups.items():
        output += f"\n#### {ent_type.capitalize()}s:\n"
        for ent in ents:
            avg_confidence = sum(ent['confidence']) / len(ent['confidence'])
            output += f"- **{ent['name']}** (Start: {ent['start']}, End: {ent['end']}, Confidence: {avg_confidence:.2f})\n"
    
    return output


In [10]:
def split_and_process(text):
    # print(text)
    chunks = [text[i:i+500] for i in range(0, len(text), 500)]  # Split into 500-char chunks
    print(len(chunks))
    results = [pipe(chunk) for chunk in chunks]
    # print(results)
    return results


In [11]:
df["ner_clean_desc"] = df["desc"].apply(split_and_process)


2
4
6
3
7
7
2
2
1
1
3
1
1
1
1
1
1
1
1
3
3
5
2
11
5
7
3
7
3
6
7
6
0
0
4
1
5
3
3
3
6
3
7
6
4
8
8
2
1
3
4
3
1
1
1
1
1
2
4
0
0
4
0
0
5
6
0
0
1
3
1
1
3
1
2
2
7
7
1
3
4
5
6
11
2
4
2
8
8
10
4
3
23
5
9
3
6
2
4
9
4
5
5
10
3
10
2
4
5
3
4
3
4
12
6
8
12
5
3
3
10
14
0
0
5
2
6
4
3
6
4
6
5
4
4
3
0
0
0
0
3
0
0
3
3
2
0
0
3
2
2
3
2
3
4
1
2
2
1
2
2
2
0
1
2
3
1
7
2
2
1
4
4
13
5
7
2
3
0
0
0
0
0
0
6
3
2
3
2
2
2
0
3
0
0
0
0
0
5
4
4
1
3
0
0
0
0
0
0
0
0
0
4
5
4
0
3
4
8
3
6
2
3
5
5
2
3
4
5
8
6
3
8
4
13
3
6
0
4
2
2
4
2
5
3
3
4
4
4
3
3
4
4
5
5
4
4
3
8
0
0
4
3
2
4
4
4
2
7
9
0
0
1
3
3
0
2
0
2
3
3
2
1
4
2
2
1
3
2
3
2
2
2
1
1
1
2
1
2
2
2
2
3
2
2
2
1
1
1
2
1
2
0
2
2
3
2
5
3
2
2
7
4
5
5
5
2
2
2
2
1
4
1
2
2
2
3
2
6
1
2
26
9
5
0
2
0
2
6
4
11
15
12
2
4
5
0
0
2
1
13
3
10
1
2
8
2
2
6
4
3
2
3
5
1
4
2
2
10
3
0
0
2
4
4
4
6
8
5
2
4
5
3
2
2
2
3
9
2
1
1
2
2
3
3
11
5
3
2
11
0
0
0
0
2
1
1
2
3
2
3
1
2
0
3
0
4
2
0
0
2
3
1
1
0
1
4
4
4
3
30
0
0
0
0
0
0
0
1
2
14
5
5
5
4
3
1
3
2
1
3
2
5
8
2
1
5
4
3
2
8
3
5
3
2
10
3
12
5
2
6
2
2
5
3
3
3
9

In [12]:
df["ner_subsec_texts"] = df["subsections"].apply(lambda x: [(sub_num,split_and_process(text)) for sub_num,text in x])


1
4
3
11
5
7
9
8
6
1
4
4
4
2
3
3
1
1
3
2
1
3
2
14
1
1
2
5
1
1
2
4
4
6
6
2
8
5
13
6
1
8
2
4
2
2
3
2
1
1
3
2
27
2
4
4
6
6
2
8
5
13
6
1
8
2
4
2
2
3
2
1
1
3
1
27
1
1
2
1
2
6
1
1
2
1
2
7
2
3
5
7
4
7
3
5
13
5
11
15
7
6
2
7
4
3
6
2
1
7
2
1
6
1
1
2
16
6
6
3
5
4
2
2
2
1
2
1
2
3
2
3
6
3
6
3
3
2
3
2
1
2
3
1
1
2
3
4
4
3
3
5
7
3
8
4
9
4
11
7
13
6
3
2
1
2
2
4
11
1
5
4
1
4
17
6
5
3
4
2
3
4
6
11
9
3
2
4
7
2
21
3
1
21
1
4
19
5
2
5
9
1
2
8
11
18
3
3
2
1
1
4
5
5
3
1
1
12
3
1
4
2
1
5
13
1
4
3
1
3
2
1
2
2
4
4
2
1
6
3
1
5
8
2
1
1
1
2
1
9
1
2
1
5
7
3
5
2
1
3
2
2
5
2
3
1
8
5
4
5
2
3
1
3
3
5
4
3
15
2
2
3
4
3
1
1
4
8
2
6
1
1
6
1
2
2
3
3
4
2
2
4
15
1
8
1
2
3
3
2
7
2
9
2
2
1
2
47
2
4
5
1
2
4
2
5
5
3
3
1
1
1
2
3
2
1
2
4
4
2
1
4
2
2
2
3
3
3
2
1
3
1
2
1
3
4
2
2
1
1
1
2
3
2
1
3
4
7
1
2
5
2
1
2
2
2
3
2
3
4
6
2
3
3
3
2
6
3
4
5
3
4
2
2
2
1
1
1
1
2
3
1
13
1
1
3
4
1
1
2
15
3
2
7
3
3
3
1
2
6
5
1
3
3
1
3
1
2
1
2
1
2
2
1
4
2
1
2
2
2
2
4
1
3
3
1
2
1
1
3
1
1
2
1
1
3
3
1
4
2
1
2
1
8
2
1
4
1
2
2
1
6
1
2
2
2
3
11
3
2
1
3
1
3
3
3


In [13]:
# df["ner_cases_texts"] = df["cases"].apply(lambda x: [split_and_process(text) for text in x])


In [14]:
import pickle

In [15]:
filename='file.pkl'

In [16]:
pickle.dump(df,open(filename,'wb'))