# Défi TextMine 2025 : Lecture des données

## Chargement

1. On charge le CSV avec `pandas`.
2. On parse les colonnes "entities" et "relations" avec `json`.

In [35]:
import pandas as pd
import json

# Load the training split (downloaded from Kaggle) into a DataFrame.
train_df = pd.read_csv("./train/train.csv")

# The "entities" and "relations" columns are stored as JSON strings in the CSV;
# decode every cell into the corresponding Python object.
for json_column in ("entities", "relations"):
    train_df[json_column] = train_df[json_column].apply(json.loads)

print(train_df.columns)
print(train_df.shape)

# Length of each text, then max / min / mean / median over all rows.
train_df["text_length"] = train_df["text"].apply(len)
text_lengths = train_df["text_length"]
print("Stats on text length:")
print(text_lengths.max())
print(text_lengths.min())
print(text_lengths.mean())
print(text_lengths.median())

# Number of entities annotated on each row, with the same four statistics.
train_df["entities_length"] = train_df["entities"].apply(len)
entity_counts = train_df["entities_length"]
print("Stats on entities length:")
print(entity_counts.max())
print(entity_counts.min())
print(entity_counts.mean())
print(entity_counts.median())

# Dump the decoded entities (one list per row) to a JSON file.
with open("entities.json", "w", encoding="UTF-8") as entities_file:
    json.dump(train_df["entities"].tolist(), entities_file)

# Number of relations annotated on each row, with the same four statistics.
train_df["relations_length"] = train_df["relations"].apply(len)
relation_counts = train_df["relations_length"]
print("Stats on relations length:")
print(relation_counts.max())
print(relation_counts.min())
print(relation_counts.mean())
print(relation_counts.median())

# Dump the decoded relations (one list per row) to a JSON file.
with open("relations.json", "w", encoding="UTF-8") as relations_file:
    json.dump(train_df["relations"].tolist(), relations_file)

# Distinct entity types found in the training rows, in alphabetical order.
# Each entity is a dict whose "type" key carries its type.
entities = sorted({
    entity["type"]
    for row_entities in train_df["entities"]
    for entity in row_entities
})

# Distinct relation types, in alphabetical order.
# Each relation is a sequence whose element at position 1 is the relation type.
relations = sorted({
    triple[1]
    for row_relations in train_df["relations"]
    for triple in row_relations
})

print("Entities:", entities)
print("Entities count:", len(entities))
print("Relations:", relations)
print("Relations count:", len(relations))
# Number of annotated examples per relation type.
# The counters are created in the order of `relations` and filled in a single
# pass over the rows, instead of re-scanning every row once per relation type.
relations_count = {relation: 0 for relation in relations}
for row_relations in train_df["relations"]:
    for rel in row_relations:
        # rel[1] is the relation type of the triple; types that are not listed
        # in `relations` are ignored, as before.
        if rel[1] in relations_count:
            relations_count[rel[1]] += 1
print("Relations count:")
print(relations_count)

# Summary statistics over the per-relation counts.
# The values are sorted BEFORE any order-dependent statistic is read: the
# median used to be taken from the unsorted list, which returned the count of
# whichever relation happened to sit in the middle alphabetically.
relations_count_values = sorted(relations_count.values())
n_relation_types = len(relations_count_values)
print("Min:", min(relations_count_values))
print("Max:", max(relations_count_values))
print("Average:", sum(relations_count_values) / n_relation_types)
middle = n_relation_types // 2
if n_relation_types % 2:
    median_count = relations_count_values[middle]
else:
    # Even number of values: average the two central ones.
    median_count = (relations_count_values[middle - 1] + relations_count_values[middle]) / 2
print("Median:", median_count)
# 90th percentile, read directly from the sorted list.
print("90th percentile:", relations_count_values[int(0.9 * n_relation_types)])

# Bucket the relation types by number of examples: < 100, 100-999, >= 1000.
# The middle label now states the range that the filter actually selects.
print("Relations having less than 100 examples:", len([rel for rel in relations_count_values if rel < 100]))
print("Relations having between 100 and 999 examples:", len([rel for rel in relations_count_values if rel >= 100 and rel < 1000]))
print("Relations having at least 1000 examples:", len([rel for rel in relations_count_values if rel >= 1000]))

# Names of the relation types with fewer than 100 examples.
relations_having_less_than_100_examples = [rel for rel in relations_count if relations_count[rel] < 100]
print("Relations having less than 100 examples:")
print(relations_having_less_than_100_examples)

Index(['id', 'text', 'entities', 'relations'], dtype='object')
(800, 4)
Stats on text length:
1501
335
706.955
688.0
Stats on entities length:
40
11
22.28375
22.0
Stats on relations length:
143
4
39.33625
37.0
Entities: ['ACCIDENT', 'AGITATING_TROUBLE_MAKING', 'BOMBING', 'CATEGORY', 'CBRN_EVENT', 'CIVILIAN', 'CIVIL_WAR_OUTBREAK', 'COLOR', 'COUP_D_ETAT', 'CRIMINAL_ARREST', 'DEMONSTRATION', 'DRUG_OPERATION', 'ECONOMICAL_CRISIS', 'ELECTION', 'EPIDEMIC', 'FIRE', 'FIRSTNAME', 'GATHERING', 'GROUP_OF_INDIVIDUALS', 'HEIGHT', 'HOOLIGANISM_TROUBLEMAKING', 'ILLEGAL_CIVIL_DEMONSTRATION', 'INTERGOVERNMENTAL_ORGANISATION', 'LASTNAME', 'LATITUDE', 'LENGTH', 'LONGITUDE', 'MATERIAL_REFERENCE', 'MATERIEL', 'MILITARY', 'MILITARY_ORGANISATION', 'NATIONALITY', 'NATURAL_CAUSES_DEATH', 'NATURAL_EVENT', 'NON_GOVERNMENTAL_ORGANISATION', 'NON_MILITARY_GOVERNMENT_ORGANISATION', 'PLACE', 'POLITICAL_VIOLENCE', 'POLLUTION', 'QUANTITY_EXACT', 'QUANTITY_FUZZY', 'QUANTITY_MAX', 'QUANTITY_MIN', 'RIOT', 'STRIKE', 'SUICI

## Manipulation

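Les entités liées à un texte sont décrites par des dictionnaires stockés dans une liste. Chaque entité est identifiée par un identifiant (`id`), typée par un type (`type`, parmi les 55 types d'entités) et associée à la liste de ses mentions dans le texte (`mentions`, chacune avec sa valeur et ses positions de début et de fin).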

In [21]:
# Inspect the training row with index label 0.
first_row = train_df.loc[0]
print(first_row)
print('='*100)
# A sample of this row's entities. The first two lookups previously read row
# 699, which mixed two different texts in a cell that is about row 0.
for entity_position in (0, 1, 5, 7, 9, 10):
    print(first_row.entities[entity_position])
print('='*100)
# First four relations, then the total number of entities and relations.
print(first_row.relations[:4])
print(len(first_row.entities))
print(len(first_row.relations))

id                                                                181
text                Anam Destresse, président de l'ONG "Ma passion...
entities            [{'id': 0, 'mentions': [{'value': 'accident', ...
relations           [[0, STARTED_IN, 9], [7, IS_LOCATED_IN, 9], [5...
text_length                                                       641
entities_length                                                    25
relations_length                                                   30
Name: 0, dtype: object
{'id': 0, 'mentions': [{'value': 'accident', 'start': 70, 'end': 78}, {'value': 'accident de circulation', 'start': 100, 'end': 123}, {'value': 'accident', 'start': 275, 'end': 283}], 'type': 'ACCIDENT'}
{'id': 1, 'mentions': [{'value': 'Anam Destresse', 'start': 0, 'end': 14}, {'value': 'Anam Destresse', 'start': 431, 'end': 445}], 'type': 'CIVILIAN'}
{'id': 5, 'mentions': [{'value': 'blessés', 'start': 470, 'end': 477}], 'type': 'GROUP_OF_INDIVIDUALS'}
{'id': 7, 'mentions': [{'val

Un autre exemple d'entités

In [23]:
# Inspect the training row with index label 699.
row_699 = train_df.loc[699]
print(row_699)
print('='*100)
# A sample of this row's entities, selected by position in the list.
for entity_position in (2, 3, 5, 12, 13, 15, 17, 18):
    print(row_699.entities[entity_position])
print('='*100)
# First four relations, then the total number of entities and relations.
print(row_699.relations[:4])
print(len(row_699.entities))
print(len(row_699.relations))

id                                                              41103
text                À Johannesburg, la lutte contre le trafic de v...
entities            [{'id': 0, 'mentions': [{'value': 'trafic', 's...
relations           [[13, IS_AT_ODDS_WITH, 15], [2, IS_LOCATED_IN,...
text_length                                                       597
entities_length                                                    27
relations_length                                                   70
Name: 699, dtype: object
{'id': 2, 'mentions': [{'value': 'envoyés', 'start': 344, 'end': 351}, {'value': 'trafic', 'start': 529, 'end': 535}, {'value': 'convoyer', 'start': 456, 'end': 464}], 'type': 'TRAFFICKING'}
{'id': 3, 'mentions': [{'value': 'arrestation', 'start': 388, 'end': 399}], 'type': 'CRIMINAL_ARREST'}
{'id': 5, 'mentions': [{'value': 'Johannesburg', 'start': 2, 'end': 14}], 'type': 'PLACE'}
{'id': 12, 'mentions': [{'value': 'Sahila William', 'start': 197, 'end': 211}], 'type': 'CIVILIAN'}


Les relations liées à un texte sont décrites par des triplets stockés dans une liste.

In [26]:
# First relation listed for the row with index label 0 (the text whose id is 181).
text_181_relation_0 = train_df.loc[0].relations[0]
# Full description of the relation.
print(text_181_relation_0)
# Its three components, in order: source entity, relation type, target entity.
for position in range(3):
    print(text_181_relation_0[position])

[0, 'STARTED_IN', 9]
0
STARTED_IN
9


In [25]:
# Load the test split into a DataFrame.
test_df = pd.read_csv("./test/test_01-07-2024.csv")

# The "entities" column is stored as JSON strings; decode every cell.
test_df["entities"] = test_df["entities"].apply(json.loads)

print(test_df.columns)
print(test_df.shape)

# Length of each text, then max / min / mean / median over all rows.
test_df["text_length"] = test_df["text"].apply(len)
test_text_lengths = test_df["text_length"]
print("Stats on text length:")
print(test_text_lengths.max())
print(test_text_lengths.min())
print(test_text_lengths.mean())
print(test_text_lengths.median())

# Number of entities annotated on each row, with the same four statistics.
test_df["entities_length"] = test_df["entities"].apply(len)
test_entity_counts = test_df["entities_length"]
print("Stats on entities length:")
print(test_entity_counts.max())
print(test_entity_counts.min())
print(test_entity_counts.mean())
print(test_entity_counts.median())

# Distinct entity types found in the test rows, in alphabetical order.
# NOTE: this rebinds the module-level name `entities`.
entities = sorted({
    entity["type"]
    for row_entities in test_df["entities"]
    for entity in row_entities
})
print("Entities:", entities)
print("Entities count:", len(entities))

Index(['id', 'text', 'entities'], dtype='object')
(400, 3)
Stats on text length:
1091
153
710.47
695.0
Stats on entities length:
38
4
21.0
21.0
Entities: ['ACCIDENT', 'AGITATING_TROUBLE_MAKING', 'BOMBING', 'CATEGORY', 'CBRN_EVENT', 'CIVILIAN', 'CIVIL_WAR_OUTBREAK', 'COLOR', 'COUP_D_ETAT', 'CRIMINAL_ARREST', 'DEMONSTRATION', 'DRUG_OPERATION', 'ECONOMICAL_CRISIS', 'ELECTION', 'EPIDEMIC', 'FIRE', 'FIRSTNAME', 'GATHERING', 'GROUP_OF_INDIVIDUALS', 'HEIGHT', 'HOOLIGANISM_TROUBLEMAKING', 'ILLEGAL_CIVIL_DEMONSTRATION', 'INTERGOVERNMENTAL_ORGANISATION', 'LASTNAME', 'LATITUDE', 'LENGTH', 'LONGITUDE', 'MATERIAL_REFERENCE', 'MATERIEL', 'MILITARY', 'MILITARY_ORGANISATION', 'NATIONALITY', 'NATURAL_CAUSES_DEATH', 'NATURAL_EVENT', 'NON_GOVERNMENTAL_ORGANISATION', 'NON_MILITARY_GOVERNMENT_ORGANISATION', 'PLACE', 'POLITICAL_VIOLENCE', 'POLLUTION', 'QUANTITY_EXACT', 'QUANTITY_FUZZY', 'QUANTITY_MAX', 'QUANTITY_MIN', 'RIOT', 'STRIKE', 'SUICIDE', 'TERRORIST_OR_CRIMINAL', 'THEFT', 'TIME_EXACT', 'TIME_FUZZY',

In [28]:
# Inspect the test row with index label 0.
first_test_row = test_df.loc[0]
print(first_test_row)
print('='*100)
# A sample of this row's entities, selected by position in the list.
for entity_position in (0, 1, 5, 7, 9, 10):
    print(first_test_row.entities[entity_position])
print('='*100)
# Total number of entities on this row, then the first ten text ids.
print(len(first_test_row.entities))
print(test_df.id.head(10))


id                                                              1204
text               “FEAR” est une organisation spécialisée dans l...
entities           [{'id': 0, 'mentions': [{'value': 'crash', 'st...
text_length                                                      603
entities_length                                                   19
Name: 0, dtype: object
{'id': 0, 'mentions': [{'value': 'crash', 'start': 277, 'end': 282}, {'value': 'accident', 'start': 487, 'end': 495}], 'type': 'ACCIDENT'}
{'id': 1, 'mentions': [{'value': 'Clavier Joshua', 'start': 406, 'end': 420}], 'type': 'CIVILIAN'}
{'id': 5, 'mentions': [{'value': 'satellite', 'start': 546, 'end': 555}], 'type': 'MATERIEL'}
{'id': 7, 'mentions': [{'value': 'smartphones', 'start': 79, 'end': 90}], 'type': 'MATERIEL'}
{'id': 9, 'mentions': [{'value': 'Argentine', 'start': 196, 'end': 205}], 'type': 'PLACE'}
{'id': 10, 'mentions': [{'value': 'Buenos Aires', 'start': 99, 'end': 111}], 'type': 'PLACE'}
19
0     1204
1     4