In [121]:
import json
import os

import numpy as np
import pandas as pd

In [122]:
# Directory that holds the NER parquet shards read by this notebook.
data_ner_path = "data/ner/"

# Seed passed to the random train/test sampling.
seed = 42

# utils

In [126]:
def df_to_json(data : pd.DataFrame) -> list[dict]:
    """Convert an NER DataFrame into a list of plain-Python records.

    Each row becomes ``{"tokenized_text": [...], "ner": [[start, end, label], ...]}``,
    where the i-th span combines the i-th entries of the row's
    ``start_token_indices``, ``end_token_indices`` and ``ner_labels``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with the columns ``tokenized_text``, ``start_token_indices``,
        ``end_token_indices`` and ``ner_labels``; every cell holds a sequence.

    Returns
    -------
    list[dict]
        One record per row, in row order. Start/end indices are cast to the
        built-in ``int`` (numpy integers are not accepted by ``json.dump``).

    Raises
    ------
    KeyError
        If one of the four columns is missing.
    IndexError
        If a row has fewer end indices or labels than start indices.
    """
    # Iterate the columns directly instead of DataFrame.apply(axis=1): this
    # avoids building a Series object for every row of a large frame.
    column_iter = zip(
        data['tokenized_text'],
        data['start_token_indices'],
        data['end_token_indices'],
        data['ner_labels'],
    )

    records = []
    for tokens, starts, ends, labels in column_iter:
        # The number of spans is driven by the start indices, as before.
        spans = [[int(starts[i]), int(ends[i]), labels[i]] for i in range(len(starts))]
        records.append({"tokenized_text" : list(tokens), "ner" : spans})
    return records

# load original training data

In [123]:
# Load the first of the ten parquet shards; shards 2..10 are appended in a later cell.
data = pd.read_parquet(os.path.join(data_ner_path,"ner_dataset_1_of_10.parquet"))

In [124]:
# Preview the first rows of the shard that was just loaded.
data.head()

Unnamed: 0,tokenized_text,start_token_indices,end_token_indices,ner_labels
0,"[Parteien, Maëlle, Meyer, ,, Beschwerdeführer,...","[1, 69, 131, 175, 309, 480, 505]","[2, 70, 132, 176, 310, 481, 506]","[a_name, a_name, a_name, a_name, a_name, a_nam..."
1,"[Tribunale, federale, Tribunal, federal, {, T,...","[44, 127, 233, 283, 387, 424, 443, 197, 292, 1...","[45, 128, 234, 284, 388, 425, 444, 198, 293, 1...","[a_name, a_name, a_name, a_name, a_name, a_nam..."
2,"[Parteien, Deborah, Thut, ,, Beschwerdeführer,...","[1, 86, 129, 159, 207, 271, 306, 190]","[2, 87, 130, 160, 208, 272, 307, 191]","[a_name, a_name, a_name, a_name, a_name, a_nam..."
3,"[Parteien, Denis, Schramm, ,, Beschwerdeführer...","[1, 93]","[2, 94]","[a_name, a_name]"
4,"[Parteien, Roland, Meier, ,, Beschwerdeführer,...","[1, 66, 193, 276]","[2, 67, 194, 277]","[a_name, a_name, a_name, a_name]"


In [125]:
# Read shards 2..10 and append them to the first shard with a single concat.
# Calling pd.concat inside the loop would re-copy the growing frame on every
# iteration.
# NOTE: this cell is not idempotent -- running it again without first
# re-running the cell that loads shard 1 appends shards 2..10 a second time.
remaining_shards = [
    pd.read_parquet(os.path.join(data_ner_path, f"ner_dataset_{i}_of_10.parquet"))
    for i in range(2, 11)
]
data = pd.concat([data, *remaining_shards], axis=0, ignore_index=True)

In [127]:
# Total number of rows after concatenating all shards.
print(len(data))

124089


## Change label names 

The raw labels are renamed as follows:
- "a_name" -> "person"
- "a_place" -> "location"
- "a_organisation" -> "organization"

In [128]:
def change_labels(ner_labels : list) -> list:
    """Translate the raw dataset labels into the new label names.

    "a_name" becomes "person", "a_organisation" becomes "organization" and
    "a_place" becomes "location". The result is a new list with one entry per
    input label, in the same order.

    Raises:
        ValueError: if a label is not one of the three known raw labels.
    """
    # (raw label, new label) pairs, checked in this order with ``==``.
    renames = (
        ("a_name", 'person'),
        ('a_organisation', 'organization'),
        ('a_place', 'location'),
    )

    def _rename(label):
        for raw_name, new_name in renames:
            if label == raw_name:
                return new_name
        raise ValueError(f"unknown label {label} encountered")

    return [_rename(label) for label in ner_labels]

In [129]:
# Rename the labels of every row in place. change_labels raises ValueError on
# labels it does not know, so running this cell a second time on the already
# renamed column fails.
data['ner_labels'] = data['ner_labels'].apply(change_labels)

In [131]:
# Preview the frame again to check that the labels were renamed.
data.head()

Unnamed: 0,tokenized_text,start_token_indices,end_token_indices,ner_labels
0,"[Parteien, Maëlle, Meyer, ,, Beschwerdeführer,...","[1, 69, 131, 175, 309, 480, 505]","[2, 70, 132, 176, 310, 481, 506]","[person, person, person, person, person, perso..."
1,"[Tribunale, federale, Tribunal, federal, {, T,...","[44, 127, 233, 283, 387, 424, 443, 197, 292, 1...","[45, 128, 234, 284, 388, 425, 444, 198, 293, 1...","[person, person, person, person, person, perso..."
2,"[Parteien, Deborah, Thut, ,, Beschwerdeführer,...","[1, 86, 129, 159, 207, 271, 306, 190]","[2, 87, 130, 160, 208, 272, 307, 191]","[person, person, person, person, person, perso..."
3,"[Parteien, Denis, Schramm, ,, Beschwerdeführer...","[1, 93]","[2, 94]","[person, person]"
4,"[Parteien, Roland, Meier, ,, Beschwerdeführer,...","[1, 66, 193, 276]","[2, 67, 194, 277]","[person, person, person, person]"


## save data with new labels

In [134]:
# Persist the relabelled dataset next to the source shards. The target is
# built from data_ner_path so it follows the configured data directory
# instead of repeating the path as a literal.
data.to_parquet(os.path.join(data_ner_path, "complete_ner_data.parquet"), engine='pyarrow')

In [132]:
# Convert the complete relabelled frame into a list of
# {"tokenized_text": ..., "ner": ...} records.
data_json = df_to_json(data)

In [133]:
# Show the number of records and one example record (index 123).
print(len(data_json))
print(data_json[123])

124089
{'tokenized_text': ['Tribunale', 'federale', 'Tribunal', 'federal', '2C_27', '/', '2007', '/', 'ADD', '/', 'elo', '{', 'T', '0', '/', '2', '}', 'Arrêt', 'du', '11', 'avril', '2007', 'IIe', 'Cour', 'de', 'droit', 'public', 'Composition', 'MM', '.', 'les', 'Juges', 'Hungerbühler', ',', 'Juge', 'présidant', ',', 'Wurzburger', 'et', 'Karlen', '.', 'Greffier', ':', 'M', '.', 'Addy', '.', 'Parties', 'Manuel', 'Steiner', ',', 'recourant', ',', 'représenté', 'par', 'Me', 'Astyanax', 'Peca', ',', 'avocat', ',', 'contre', 'Service', 'de', 'la', 'population', 'du', 'canton', 'de', 'Vaud', ',', 'avenue', 'de', 'Beaulieu', '19', ',', '1014', 'Lausanne', ',', 'Tribunal', 'administratif', 'du', 'canton', 'de', 'Vaud', ',', 'avenue', 'Eugène-Rambert', '15', ',', '1014', 'Lausanne', '.', 'Objet', 'Révocation', 'de', 'l', "'", 'autorisation', 'de', 'séjour', ',', 'recours', 'en', 'matière', 'de', 'droit', 'public', 'contre', 'l', "'", 'arrêt', 'du', 'Tribunal', 'administratif', 'du', 'canton', 'd

In [135]:
# Write the complete dataset as JSON into the configured NER data directory.
# `json` has to be imported before this cell runs, so it belongs to the import
# cell at the top of the notebook.
with open(os.path.join(data_ner_path, "complete_ner_data.json"), 'w') as f:
    json.dump(data_json, f)

# Create training and test set 

In [85]:
#creating the training set
train_set = data.sample(frac = 0.8,random_state=seed)

#saving the training set
train_path = f"data/training_set"
train_set.to_parquet(os.path.join(train_path,f"training_set.parquet"),engine='pyarrow')

In [86]:
#creating the test set
test_set = data.drop(train_set.index)

#saveing the test set
test_path = f"data/test"
test_set.to_parquet(os.path.join(test_path,f"test_set.parquet"),engine = 'pyarrow')

#free memory
del data

In [87]:
# Report the number of rows in each split.
print(f"Training set size : {len(train_set)}")
print(f"Test set size : {len(test_set)}")

Training set size : 99271
Test set size : 24818


In [90]:
# Preview the training split; rows keep their index labels from `data`.
train_set.head()

Unnamed: 0,tokenized_text,start_token_indices,end_token_indices,ner_labels
70615,"[Partecipanti, al, procedimento, Ascomp, ,, Zu...","[56, 756, 66, 776, 82, 779, 112, 725, 3382, 13...","[57, 757, 67, 777, 83, 780, 113, 726, 3383, 13...","[person, person, person, person, person, perso..."
27733,"[Participants, à, la, procédure, dame, Fljorim...","[5, 20, 100, 108, 184, 276]","[6, 21, 101, 109, 185, 277]","[person, person, person, person, person, person]"
57958,"[Participants, à, la, procédure, AXA, Assuranc...",[],[],[]
16632,"[Bundesgericht, Tribunal, fédéral, Tribunale, ...","[46, 109]","[47, 110]","[person, person]"
115448,"[Parteien, und, dem, Kantonsgericht, Luzern, ,...",[],[],[]


In [91]:
# Preview the test split; rows keep their index labels from `data`.
test_set.head()

Unnamed: 0,tokenized_text,start_token_indices,end_token_indices,ner_labels
2,"[Parteien, Deborah, Thut, ,, Beschwerdeführer,...","[1, 86, 129, 159, 207, 271, 306, 190]","[2, 87, 130, 160, 208, 272, 307, 191]","[person, person, person, person, person, perso..."
5,"[Tribunale, federale, Tribunal, federal, {, T,...","[45, 154, 223, 267, 364, 964]","[46, 155, 224, 268, 365, 965]","[person, person, person, person, person, person]"
15,"[Tribunale, federale, Tribunal, federal, {, T,...","[40, 101, 191]","[41, 102, 192]","[person, person, person]"
18,"[Parteien, Christoph, Trapp, ,, Beschwerdeführ...",[1],[2],[person]
20,"[Parteien, Magali, Traxel, ,, Beschwerdeführer...",[1],[2],[person]


# Create json from DataFrame

In [93]:
# Convert the training split into a list of {"tokenized_text": ..., "ner": ...} records.
train_json = df_to_json(train_set)

In [94]:
# Show the first training record and the number of training records.
print(train_json[0])
print(len(train_json))

{'tokenized_text': ['Partecipanti', 'al', 'procedimento', 'Ascomp', ',', 'Zurich', ',', 'Switzerland', 'Abrantix', 'SA', ',', 'patrocinata', 'dall', "'", 'avv', '.', 'Andrea', 'Ferrazzini', ',', 'ricorrente', ',', 'contro', '1', '.', 'Ministero', 'pubblico', 'del', 'Cantone', 'Ticino', ',', 'palazzo', 'di', 'giustizia', ',', 'via', 'Pretorio', '16', ',', '6901', 'Lugano', ',', '2', '.', 'Abrantix', 'Ltd', ',', 'patrocinata', 'dall', "'", 'avv', '.', 'Georg', 'Zondler', ',', '3', '.', 'Tiago', 'Todorova', ',', '4', '.', 'Cliente', '01', ',', '5', '.', 'Elisabeth', 'Friess', ',', 'entrambi', 'patrocinati', 'dall', "'", 'avv', '.', 'dott', '.', 'Elio', 'Brunetti', ',', '6', '.', 'Martina', 'Zurbriggen', ',', 'patrocinato', 'dall', "'", 'avv', '.', 'Ilario', 'Bernasconi', ',', '7', '.', 'Cliente', '02', ',', '8', '.', 'Ast', 'Swiss', 'Inc', '.', ',', '9', '.', 'Cliente', '03', ',', '10', '.', 'Vedran', 'Blatter', 'Familienstiftung', ',', '11', '.', 'Incave', 'Inc', '.', ',', '12', '.', 'Ja

In [95]:
# Convert the test split into records, then show the first record and the
# number of test records.
test_json = df_to_json(test_set)
print(test_json[0])
print(len(test_json))

{'tokenized_text': ['Parteien', 'Deborah', 'Thut', ',', 'Beschwerdeführer', ',', 'gegen', 'Strassenverkehrsamt', 'des', 'Kantons', 'Luzern', ',', 'Postfach', '162', ',', '6000', 'Luzern', '4', ',', 'Verwaltungsgericht', 'des', 'Kantons', 'Luzern', ',', 'Abgaberechtliche', 'Abteilung', ',', 'Obergrundstrasse', '46', ',', '6002', 'Luzern', '.', 'Gegenstand', 'Administrativmassnahmen', ',', 'Führerausweisentzug', ',', 'Beschwerden', 'in', 'öffentlich-rechtlichen', 'Angelegenheiten', '(', 'Art', '.', '82', 'ff', '.', 'BGG', ')', 'gegen', 'die', 'beiden', 'Urteile', 'des', 'Verwaltungsgerichts', 'des', 'Kantons', 'Luzern', ',', 'Abgaberechtliche', 'Abteilung', ',', 'vom', '4', '.', 'und', '5', '.', 'Januar', '2007', '.', 'Das', 'Bundesgericht', 'zieht', 'in', 'Erwägung', ':', '1', '.', 'Am', '19', '.', 'April', '2006', 'lenkte', 'Deborah', 'Thut', 'in', 'Luzern', 'einen', 'Personenwagen', ',', 'obwohl', 'ihm', 'der', 'Führerausweis', 'für', 'einen', 'Monat', 'entzogen', 'worden', 'war', '.'

## Sanity check

In [99]:
# Import the project-local `utils` module and reload it, so that changes made
# to the module since it was first imported in this kernel are picked up
# before the helper names are bound.
# NOTE(review): join_tokens is imported but not used in the cells shown here.
from importlib import reload
import utils
reload(utils)
from utils import join_tokens,gliner_ner_format_text_match

In [100]:
# Run the sanity-check helper on the first training record.
# NOTE(review): judging by the cell output, the helper prints each annotated
# token span next to its label -- confirm against utils.
ex = train_json[0]

gliner_ner_format_text_match(ex)





----Matching ner_labels to tokenized_text-----
Ascomp, Zurich, Switzerland  -->  organization
Abrantix  -->  organization
Abrantix  -->  organization
Tiago Todorova  -->  person
Elisabeth Friess  -->  person
Martina Zurbriggen  -->  person
Ast Swiss  -->  organization
Vedran Blatter  -->  person
Incave  -->  organization
Jamotion  -->  organization
Samuel Meixner  -->  person
Markus Nerfin  -->  person
Dhz  -->  organization
Ieu  -->  organization
Jacqueline Gräzer  -->  person
Rudolf Gomes  -->  person
Rudolf Gomes  -->  person
Ascomp, Zurich, Switzerland  -->  organization
Abrantix  -->  organization
Ascomp, Zurich, Switzerland  -->  organization
Abrantix  -->  organization
Rudolf Gomes  -->  person
Rudolf Gomes  -->  person
Ascomp, Zurich, Switzerland  -->  organization
Abrantix  -->  organization
Ascomp, Zurich, Switzerland  -->  organization
Abrantix  -->  organization
Ieu  -->  organization
Abrantix  -->  organization
Dhz  -->  organization
Markus Nerfin  -->  person
Incave  --> 

In [108]:
# Run the same sanity-check helper on one test record (index 190).
ex_test = test_json[190]

gliner_ner_format_text_match(ex_test)

----Matching ner_labels to tokenized_text-----
Brigitta Latorre  -->  person
Danièle Dudle  -->  person
Danièle Dudle  -->  person
Danièle Dudle  -->  person
Nadine Delabruyère  -->  person


## Save data to json files

In [109]:
import json


In [110]:
# Serialise the training records to JSON.
with open("data/training_set/training_set.json", mode='w') as train_fh:
    json.dump(train_json, fp=train_fh)

In [111]:
# Serialise the test records to JSON.
with open("data/test/test_set.json", mode='w') as test_fh:
    json.dump(test_json, fp=test_fh)