# **MA513 - Hands-on Machine Learning for Cybersecurity PROJECT** 

Dans ce projet nous allons dévelloper un modèle NER. Chaque étape aura sa propres explications et nous justifirons chaque choix

In [None]:
#imporation des libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, Dense, SimpleRNN

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import string
from pprint import pprint
import re

## Méthode 1 : Création de notre modèle LSTM 

### Etape 1 : Preprocessing 

Dans cette étape, nous allons effectuer un préprocessing des données pour optimiser leur qualité et garantir des performances fiables et robustes des modèles d’apprentissage automatique.

In [3]:
import json
import pandas as pd

# Chemin du fichier JSONLines en entrée
input_file1 = "data/NER-TRAINING.jsonlines"
input_file2 = "data/NER-VALIDATION.jsonlines"
input_file_test = "data/NER-TESTING.jsonlines"
# Liste pour stocker les données

def open_file(input_file):
    data = []
    # Charger les données JSONLines
    with open(input_file, "r", encoding="utf-8") as infile:
        for line in infile:
            record = json.loads(line)
            tokens = record["tokens"]
            ner_tags = record["ner_tags"]
            index = record["unique_id"]
            
            # Ajouter chaque token, tag et index à la liste
            for token, ner_tag in zip(tokens, ner_tags):
                data.append({"index": index, "tokens": token, "ner_tags": ner_tag})
    return data

def open_file2(input_file):
    data = []
    # Charger les données JSONLines
    with open(input_file, "r", encoding="utf-8") as infile:
        for line in infile:
            record = json.loads(line)
            tokens = record["tokens"]
            index = record["unique_id"]
            
            # Ajouter chaque token, tag et index à la liste
            for token in tokens:
                data.append({"index": index, "tokens": token})
    return data
# Créer une DataFrame à partir des données
#training data
data1 = open_file(input_file1)
df_train = pd.DataFrame(data1)

data2 = open_file(input_file2)
df_val = pd.DataFrame(data2)

data3 = open_file2(input_file_test)
df_test = pd.DataFrame(data3)

In [5]:
df_train.to_parquet('data_parquet/NER-TRAINING.parquet', index=False)
df_val.to_parquet('data_parquet/NER-VALIDATION.parquet', index=False)
df_test.to_parquet('data_parquet/NER-TESTING.parquet', index=False)

In [5]:
df_train = pd.read_parquet("data_parquet/NER-TRAINING.parquet")
df_val = pd.read_parquet("data_parquet/NER-VALIDATION.parquet")
df_test = pd.read_parquet("data_parquet/NER-TESTING.parquet")

In [None]:
print(df_test.head())

Dans cette étape nous avons choisi d'enregistrer nos données en parquet files car ceci représente plusieurs avantages pour notre volume de données. Meilleurs compressions, plus de flexibilité et donc meilleurs performances. 
1. ressources : https://medium.com/munchy-bytes/are-you-using-parquet-with-pandas-in-the-right-way-595c9ee7112

#### Nettoyage de données  

Dans cette étape nous allons analyser nos données afin de pouvoir retirer les éléments qui serait en trop. 

In [7]:
#Importation de notre dataset 
file_path1 = "data_parquet/NER-TRAINING.parquet" #dataset -> training
file_path2 = "data_parquet/NER-VALIDATION.parquet"
file_path3 = "data_parquet/NER-TESTING.parquet"

df_train = pd.read_parquet(file_path1)
df_val = pd.read_parquet(file_path2)
df_test = pd.read_parquet(file_path3)

In [None]:
#affichage de nos données : 
#affichage val
print("données df_train : \n \n ", df_train)
print(len(df_train['tokens'].iloc[0]))
print(len(df_train['tokens'].iloc[1]))
#affichage 
print("données df_train : \n \n ", df_val)
print(len(df_val['tokens'].iloc[0]))
print(len(df_val['tokens'].iloc[1]))

Avant même de commencer nous devons bien comprendre globalement la représentation de chaque colonnes : 

1. unique_id : un nombre entier 
2. token : un chaine de charactère, le coeur de notre dataset 
3. ner_tag : notre label 

In [None]:
#affichage des dimmensions avant nettoyage : 
print("dimensions avant nettoyage : ", df_train.shape)
print("dimensions avant nettoyage : ", df_val.shape)
print("dimensions avant nettoyage : ", df_test.shape)

Nous avons remarqué que notre dataset contient de nombreuses valeurs inutiles qui pourraient nuire à l'entraînement de notre modèle. Par exemple, les caractères spéciaux et les majuscules, qui sont peu pertinents dans ce contexte. Dans cette étape, nous allons procéder à leur suppression afin d'optimiser la qualité des données pour l'entraînement :

In [None]:
#on retire les ponctuations : 
df_train = df_train[~df_train['tokens'].isin(list('?$#@./|:()",;[]{}-'))]
print('dimensions après nettoyage : ', df_train.shape)

df_val = df_val[~df_val['tokens'].isin(list('?$#@./|:()",;[]{}-'))]
print('dimensions après nettoyage : ', df_val.shape)

df_test = df_test[~df_test['tokens'].isin(list('?$#@./|:()",;[]{}-'))]
print('dimensions après nettoyage : ', df_test.shape)

In [11]:
# Fonction adaptée pour gérer uniquement 'unique_id'
def index_mapper(df):
    
    unique_ids = df['index'].unique()
    index = np.arange(1, len(unique_ids) + 1, 1)
    index_dict = dict(zip(unique_ids, index))
    df['index'] = [index_dict[uid] for uid in df['index']]
    df.set_index('index', inplace=True)
    return df

df_train = index_mapper(df_train)
df_val = index_mapper(df_val)
df_test = index_mapper(df_test)

Nous sélectionnons maintenant uniquement les valeurs pertinentes de notre DataFrame, afin d'entraîner notre modèle de manière optimale

In [None]:
df_train_ = df_train[['tokens', 'ner_tags']]
df_train_.head()

df_val_ = df_val[['tokens', 'ner_tags']]
df_val_.head()

df_test_ = df_test
df_test_.head()

### Analyse des données 
Dans cette étape nous essayer d'analyser les données afin d'éléminer les possible outliners qui pourrais fausser notre modèles.

In [None]:
# Plot length of the sentences.
index, length = np.unique(df_train_.index, return_counts=True)
fig, ax = plt.subplots(figsize=[25,6])
N, bins, patches = ax.hist(length, bins=100)
plt.xlabel('Texte long')
plt.ylabel('Freq')
plt.show()

print('Nombre de phrase dans notre dataset : ', len(length))   ## Number of sentences

In [None]:
# Calculer la répartition des ner_tags dans df_train
train_tag_distribution = df_train['ner_tags'].explode().value_counts()

# Calculer la répartition des ner_tags dans df_val
val_tag_distribution = df_val['ner_tags'].explode().value_counts()

# Afficher les distributions
print("Répartition des ner_tags dans le dataset d'entraînement :\n", train_tag_distribution)
print("\nRépartition des ner_tags dans le dataset de validation :\n", val_tag_distribution)

### Etape 2 : Entraînement du modèle

#### Encodage du dataset

In [None]:
le = LabelEncoder()
df_train_['ner_tags'] = le.fit_transform(df_train_['ner_tags'])
df_train_.head()
df_val_['ner_tags'] = le.fit_transform(df_val_['ner_tags'])
df_val_.head()

In [None]:
## Because padded variable has to be kept as 0 and not -1. Else label encode would trouble it!
df_train_.ner_tags += 1
df_train_.head()
df_val_.ner_tags += 1
df_val_.head()

#### Tokenization du data training 
Dans cette étape, nous allons effectuer une tokenisation des données pour les segmenter en unités plus petites (comme des mots ou des phrases), afin de faciliter leur traitement par les modèles d’apprentissage automatique.

In [16]:
# Tokenize the words
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='?$#@./|:()",;[]{}-',
                                                  lower=False, 
                                                  split=' ',
                                                  num_words=2000,
                                                  oov_token='')       # Initialize

tokenizer.fit_on_texts(df_train_.tokens.values) 
tokenizer.fit_on_texts(df_val_.tokens.values) 
tokenizer.fit_on_texts(df_test.tokens.values) 

In [17]:
# Transform to numeric
tokens_train  = tokenizer.texts_to_sequences(df_train_.tokens.values)
tokens_val  = tokenizer.texts_to_sequences(df_val_.tokens.values)
tokens_test  = tokenizer.texts_to_sequences(df_test.tokens.values)

In [18]:
word_to_token = tokenizer.word_index

In [19]:
#verification de la tokenization
df_train_.tokens = tokens_train
df_train_.head()
df_val_.tokens = tokens_val
df_val_.head()
df_test.tokens = tokens_test

In [20]:
# Supprimer les éléments vides des données d'entraînement
import numpy as np

# Filtrer les tokens non vides (reste sous forme de liste Python)
b = [len(token) > 0 for token in tokens_train]  
df_train_ = df_train_.iloc[b, :]  # Filtrer le DataFrame
tokens_train = [tokens_train[i] for i, keep in enumerate(b) if keep]  # Filtrer les tokens

#
b = [len(token) > 0 for token in tokens_val]  
df_val_ = df_val_.iloc[b, :]  # Filtrer le DataFrame
tokens_val = [tokens_val[i] for i, keep in enumerate(b) if keep]  # Filtrer les tokens

b = [len(token) > 0 for token in tokens_test]  
df_test = df_test.iloc[b, :]  # Filtrer le DataFrame
tokens_test = [tokens_test[i] for i, keep in enumerate(b) if keep]  # Filtrer les tokens

In [None]:
df_train_.tokens = df_train_.tokens.apply(lambda x: x[0])
print(df_train_.head())

df_val_.tokens = df_val_.tokens.apply(lambda x: x[0])
print(df_val_.head())

df_test.tokens = df_test.tokens.apply(lambda x: x[0])
print(df_test.head())

In [None]:
df_train_ = df_train_.astype(str)
df_train_concat = df_train_.groupby(df_train_.index).agg(lambda x: ' '.join(x))
print(df_train_concat.head())

df_val_ = df_val_.astype(str)
df_val_concat = df_val_.groupby(df_val_.index).agg(lambda x: ' '.join(x))
print(df_val_concat.head())

df_test = df_test.astype(str)
df_test_concat = df_test.groupby(df_test.index).agg(lambda x: ' '.join(x))
print(df_test_concat.head())

In [None]:

df_train_concat.tokens = df_train_concat.tokens.apply(lambda x: x.split(' '))
df_train_concat.ner_tags = df_train_concat.ner_tags.apply(lambda x: x.split(' '))
print('Training Set Shape after Concatenating Sentences: ', df_train_concat.shape)

df_val_concat.tokens = df_val_concat.tokens.apply(lambda x: x.split(' '))
df_val_concat.ner_tags = df_val_concat.ner_tags.apply(lambda x: x.split(' '))
print('Training Set Shape after Concatenating Sentences: ', df_val_concat.shape)

df_test_concat.tokens = df_test_concat.tokens.apply(lambda x: x.split(' '))

#### Converting Data for Model Requirements

In [24]:
train_sentences = df_train_concat.tokens
train_tags = df_train_concat.ner_tags

val_sentences = df_val_concat.tokens
val_tags = df_val_concat.ner_tags

test_sentences = df_test_concat.tokens

In [25]:
for sentence in test_sentences:
    for token in sentence:  # Parcourt les tokens dans chaque sous-liste
        if not str(token).isdigit():  # Convertit le token en chaîne avant de vérifier
            print(token)

In [26]:
train_sentences = [list(map(int, sentence)) for sentence in train_sentences]
train_tags = [list(map(int, sentence)) for sentence in train_tags]

val_sentences = [list(map(int, sentence)) for sentence in val_sentences]
val_tags = [list(map(int, sentence)) for sentence in val_tags]

test_sentences = [list(map(int, sentence)) for sentence in test_sentences]

In [None]:
# Helper Code
batch_size = 16
train_shuffle_buffer_size = len(train_sentences)
validation_shuffle_buffer_size = len(val_sentences)

# Fill the required cells to complete the function
def transform_pad(input, output):
    input  = input.to_tensor(default_value=0, shape=[None, None])
    output = output.to_tensor(default_value=0, shape=[None, None])   
    return input, output

def transform_pad2(input):
    input  = input.to_tensor(default_value=0, shape=[None, None])
    #output = output.to_tensor(default_value=0, shape=[None, None])   
    return input

train_processed_x = tf.ragged.constant(train_sentences)
validate_processed_x = tf.ragged.constant(val_sentences)
test_processed_x = tf.ragged.constant(test_sentences)

train_processed_y = tf.ragged.constant(train_tags)
validate_processed_y = tf.ragged.constant(val_tags)

# Create TF Dataset
train_data = tf.data.Dataset.from_tensor_slices((train_processed_x, train_processed_y))
validation_data = tf.data.Dataset.from_tensor_slices((validate_processed_x, validate_processed_y))
test_data = tf.data.Dataset.from_tensor_slices(test_processed_x)

#############
# Train data
#############
train_data = train_data.shuffle(buffer_size=train_shuffle_buffer_size)
train_data = train_data.batch(batch_size)
train_data = train_data.map(transform_pad, num_parallel_calls=tf.data.AUTOTUNE)
train_data = train_data.prefetch(tf.data.AUTOTUNE)


##################
# Validation data
##################
validation_data = validation_data.batch(batch_size)
validation_data = validation_data.map(transform_pad, num_parallel_calls=tf.data.AUTOTUNE)
validation_data = validation_data.prefetch(tf.data.AUTOTUNE)

##################
# Test data
##################
test_data = test_data.batch(batch_size)
test_data = test_data.map(transform_pad2, num_parallel_calls=tf.data.AUTOTUNE)
test_data = test_data.prefetch(tf.data.AUTOTUNE)


print("train_data", train_data)
print("validation_data", validation_data)


### Etape 3 : Création du modèle

In [87]:
# Your code here

hidden_size_LSTM = 100
hidden_size_Dense = 100

def build_model():

  ## ---------------------------------------------------------------------------
  ## FORWARD LAYER -------------------------------------------------------------
  ## ---------------------------------------------------------------------------

  ## Define input layer.
  inputs_f = tf.keras.Input(shape=[None])


  ## Embedding Layer for forward.
  embedding_layer_f = tf.keras.layers.Embedding(input_dim=2000,
                                                output_dim=300,
                                                # weights=[embedding_matrix],
                                                # input_length=max_length,
                                                trainable=True,
                                                mask_zero=True)

  ## Create a forward LSTM.
  RNN1_layer_f = tf.keras.layers.LSTM(hidden_size_LSTM, return_sequences=True)

  ## Add Dropout after first LSTM
  dropout1_f = tf.keras.layers.Dropout(0.3)  ## Adjust dropout rate as needed.

  ## Create a dense layer for simulating the highway layer rather than using it here.
  dense_layer_f = tf.keras.layers.Dense(units=hidden_size_Dense, activation='linear', use_bias=False)

  ## Add Dropout after Dense layer
  dropout2_f = tf.keras.layers.Dropout(0.3)

  ## Create an additive layer.
  additive_layer_f = tf.keras.layers.Add()

  ## Create second forward LSTM.
  RNN2_layer_f = tf.keras.layers.LSTM(hidden_size_LSTM, return_sequences=True)

  ## Add Dropout after second LSTM
  dropout3_f = tf.keras.layers.Dropout(0.3)

  ## Pass Inputs ---------------------------------------------------------------

  embedding_f = embedding_layer_f(inputs_f)   ## Get forward and backward embeddings.
  r_f = dropout1_f(RNN1_layer_f(embedding_f)) ## Get LSTM outputs with dropout.
  z_f = dropout2_f(dense_layer_f(embedding_f))## Get Dense layer outputs with dropout.
  h_f = dropout3_f(RNN2_layer_f(r_f + z_f))   ## Get LSTM2 outputs with dropout.


  ## ---------------------------------------------------------------------------
  ## BACKWARD LAYER ------------------------------------------------------------
  ## ---------------------------------------------------------------------------

  ## Create a forward LSTM.
  RNN1_layer_b = tf.keras.layers.LSTM(hidden_size_LSTM, return_sequences=True, go_backwards=True)

  ## Add Dropout after first LSTM
  dropout1_b = tf.keras.layers.Dropout(0.3)

  ## Create a dense layer for simulating the highway layer rather than using it here.
  dense_layer_b = tf.keras.layers.Dense(units=hidden_size_Dense, activation=None, use_bias=False)

  ## Add Dropout after Dense layer
  dropout2_b = tf.keras.layers.Dropout(0.3)

  ## Create an additive layer.
  additive_layer_b = tf.keras.layers.Add()

  ## Create second forward LSTM.
  RNN2_layer_b = tf.keras.layers.LSTM(hidden_size_LSTM, return_sequences=True, go_backwards=True)

  ## Add Dropout after second LSTM
  dropout3_b = tf.keras.layers.Dropout(0.3)

  ## Pass Inputs ------------------------------------------------------------

  r_b = dropout1_b(RNN1_layer_b(embedding_f)) ## Get LSTM outputs with dropout.
  r_b = r_b[:,::-1,:]                         ## We need to reverse the output from go_backwards. Ref: https://medium.com/@rachit1jain/lstm-go-backwards-unravelling-its-hidden-secrets-ed094952b5cc
  z_b = dropout2_b(dense_layer_b(embedding_f))## Get Dense layer outputs with dropout.
  h_b = dropout3_b(RNN2_layer_b(r_b + z_b))   ## Get LSTM2 outputs with dropout.
  h_b = h_b[:,::-1,:]                         ## We need to reverse the output from go_backwards. Ref: https://medium.com/@rachit1jain/lstm-go-backwards-unravelling-its-hidden-secrets-ed094952b5cc


  ## ---------------------------------------------------------------------------
  ## For Outputs ---------------------------------------------------------------
  ## ---------------------------------------------------------------------------


  ## Create Softmax Layer.
  softmaxLayer = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=le.classes_.shape[0]+1, activation='softmax', name='softmaxLayer'))

  output_f = softmaxLayer(h_f)
  output_b = softmaxLayer(h_b)

  output_mf = 0.5*output_f
  output_mb = 0.5*output_b
  output = output_mf + output_mb

  ## ---------------------------------------------------------------------------
  ## Setup the Outputs ---------------------------------------------------------
  ## ---------------------------------------------------------------------------

  ## Set up the model with appropriate inputs and the output defined above 
  model = tf.keras.Model(inputs=inputs_f, outputs=output, name='Model')


  return model


In [None]:

############################
# Training Params
############################

import time

learning_rate = 5e-4
epochs = 40

# Free up memory
tf.keras.backend.clear_session()

# Build the model
model = build_model()

# Print the model architecture
print(model.summary())

# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Loss
loss = tf.keras.losses.sparse_categorical_crossentropy

# Callbacks
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),
    tf.keras.callbacks.ModelCheckpoint(filepath='model_2.{epoch:02d}-{loss:.2f}.h5.keras'),
]

# Compile
model.compile(
              loss=loss,
              optimizer=optimizer,
              metrics=[tf.keras.metrics.sparse_categorical_accuracy])



In [None]:
# Train model
start_time = time.time()

training_results = model.fit(
        train_data,
        epochs=epochs, 
        verbose=1,
        # class_weight=class_weights,
        validation_data=validation_data,
        callbacks=my_callbacks)

execution_time = (time.time() - start_time)/60.0
print("Training execution time (mins)",execution_time)

In [None]:
# Plot the trace plot of the loss of the model
plt.plot(training_results.history['loss'], label='Train')
plt.plot(training_results.history['val_loss'], label='Validation')
plt.title('Loss Plot')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(loc=0)
plt.show()

In [None]:
# Plot the trace plot of the loss of the model
plt.plot(training_results.history['sparse_categorical_accuracy'], label='Train')
plt.plot(training_results.history['val_sparse_categorical_accuracy'], label='Validation')
plt.title('Accuracy Plot')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(loc=0)
plt.show()

"""
COMMENTAIRE: 
ce graphe montre un entraînement efficace mais un début de sur-apprentissage. 
"""

#### Étape 5 : Prédiction 
Dans cette étape nous allons utiliser notre modèles afin de pouvoir prédire les différents "ner_tags" de notre fichier NER-TESTING.jsonlines.

In [None]:
# Prédictions sur les données de test
predictions = model.predict(test_data)
predictions_dense = predictions.to_tensor()
predicted_tags = predictions_dense.numpy().argmax(axis=-1)
predicted_tags[predicted_tags == 7] = 6

le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
le_mapping_new = {le_name_mapping[k]: k for k in le_name_mapping.keys()}

# Mapping des indices des classes vers les étiquettes originales
predicted_labels = [
    [le_mapping_new[tag] for tag in sentence if tag in le_mapping_new]
    for sentence in predicted_tags
]

token_list = []
label_list = []
tokens_reconstructed = [[tokenizer.index_word[idx] for idx in seq] for seq in test_sentences]
print(len(tokens_reconstructed))
for i, (tokens, labels) in enumerate(zip(tokens_reconstructed, predicted_labels)):
    print(f"Phrase {i + 1}:")
    for token, label in zip(tokens, labels):
        print(f"{token} -> {label}")
        token_list.append(token)
        label_list.append(label)
    print("-" * 30)



#### Création du dossier résultat 

In [None]:
print('token list :', len(token_list))
print('label list :', len(label_list)) 
print(token_list[0])

In [None]:
#
from collections import defaultdict

data3 = open_file2(input_file_test)
i = 0
custom_punctuation = '?$#@./|:()",;[]{}-'
custom_punctuation_list = list(custom_punctuation)
#print(custom_punctuation_list)
for item in data3:

    if item['tokens'] in custom_punctuation_list or item['tokens'] == '...':
        item['ner_tag'] = 'O'
    else:
        if i != 23391: 
            i += 1
            item['ner_tag'] = label_list[i]
            #print("items :", item['tokens'])
            #print("token :", token_list[i])
            #print(i)
        else: 
             item['ner_tag'] = 'O'
        

# Utilisation de defaultdict pour regrouper les tokens et ner_tags par index
grouped_data = defaultdict(lambda: {'tokens': [], 'ner_tags': []})

# Remplir les listes tokens et ner_tags pour chaque index
for item in data3:
    index = item['index']
    grouped_data[index]['tokens'].append(item['tokens'])
    grouped_data[index]['ner_tags'].append(item['ner_tag'])

# Transformer chaque groupe d'index en un format souhaité
formatted_data = []
for index, value in grouped_data.items():
    formatted_data.append({
        "unique_id": index,
        "tokens": value['tokens'],
        "ner_tags": value['ner_tags']
    })

# Enregistrer le résultat en jsonlines
with open('resultat/output.jsonlines', 'w') as f:
    for entry in formatted_data:
        json.dump(entry, f)
        f.write('\n')

# Afficher le résultat pour vérification
print(formatted_data)

##  Méthode 2 : Utilisation d'un modèle pré-entrainer - SciBert 

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
import warnings
from imblearn.over_sampling import RandomOverSampler
warnings.filterwarnings("ignore")

# Reading the WIESP-NER_TRAINING Dataset 
input_file1 = "data_parquet/NER-TRAINING.parquet"
input_file2 = "data_parquet/NER-VALIDATION.parquet"
df_train = pd.read_parquet(input_file1)
df_val = pd.read_parquet(input_file2)

# Renaming the columns as required by simpletransformer train method
#df = df[['index', 'tokens', 'ner_tags']]
df_train = df_train.rename(columns={'index':'sentence_id', 'tokens':'words', 'ner_tags':'labels'})
df_val = df_val.rename(columns={'index':'sentence_id', 'tokens':'words', 'ner_tags':'labels'})
df_train.set_index('sentence_id',inplace=True)
df_val.set_index('sentence_id',inplace=True)


  from pandas.core import (


#### Répartition des données

In [2]:
# Calculer la répartition des ner_tags dans df_train
train_tag_distribution = df_train['labels'].explode().value_counts()

# Calculer la répartition des ner_tags dans df_val
val_tag_distribution = df_val['labels'].explode().value_counts()

# Afficher les distributions
print("Répartition des ner_tags dans le dataset d'entraînement :\n", train_tag_distribution)
print("\nRépartition des ner_tags dans le dataset de validation :\n", val_tag_distribution)

Répartition des ner_tags dans le dataset d'entraînement :
 labels
O             95878
I-Entity      12644
B-Entity       4240
B-Action       1989
B-Modifier     1226
I-Action        518
I-Modifier       99
Name: count, dtype: int64

Répartition des ner_tags dans le dataset de validation :
 labels
O             20760
I-Entity       2907
B-Entity        923
B-Action        416
B-Modifier      280
I-Action        110
I-Modifier       22
Name: count, dtype: int64


In [3]:
print(df_train.head(5))
print(df_val.head(5))

             words labels
sentence_id              
6506         Later      O
6506            in      O
6506           May      O
6506            of      O
6506          2010      O
             words labels
sentence_id              
6422          Just      O
6422             1      O
6422          year      O
6422         later      O
6422             ,      O


In [4]:
from simpletransformers.ner import NERModel, NERArgs
from sklearn.utils.class_weight import compute_class_weight
# Calcul des poids des classes
# Calcul des poids pour chaque classe

# Setting model arguments
model_args = NERArgs()
model_args.labels_list = list(df_train.labels.unique())
model_args.num_train_epochs = 5
#model_args.class_weights = class_weights.tolist()
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = False
model_args.max_seq_length = 256
model_args.early_stopping_metric = 'eval_loss'
model_args.use_early_stopping = True
model_args.early_stopping_delta = 0.001
model_args.optimizer = 'AdamW'
model_args.early_stopping_patience = 1
model_args.do_lower_case = False
model_args.overwrite_output_dir = True
model_args.train_batch_size = 32 
model_args.learning_rate = 0.0001

# Defining the model
model = NERModel(
    "bert",
    "bert-base-cased",
    args=model_args,
    use_cuda=False
)

# Entraîner le modèle
history = model.train_model(df_train, show_running_loss=True, eval_data=df_val)

In [8]:
input_file3 = "data_parquet/NER-TESTING.parquet"
df_test = pd.read_parquet(input_file2)
df_test = df_test.rename(columns={'index':'sentence_id','tokens':'words'})
df_test.set_index('sentence_id',inplace=True)
df_test = df_test.groupby('sentence_id').agg(sentences=('words', lambda x:list(x)))
df_test.head(5)

Unnamed: 0_level_0,sentences
sentence_id,Unnamed: 1_level_1
0,"[Context, Information, Security, distribute, C..."
1,"[The, various, levels, of, the, TLP, are, repr..."
14,"[Gom, Player, originates, in, South, Korea, an..."
19,"[If, this, is, the, work, of, a, Chinese, grou..."
24,"[If, this, was, indeed, the, case, ,, then, th..."


In [9]:
# Getting prediction from the model for sentences in validation dataset
predictions, _ = model.predict(df_test.sentences, split_on_space=False)

  0%|          | 0/3 [00:00<?, ?it/s]

  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  Referenced from: <61623A3D-DA3C-3AAD-B2F0-D363151DDB3F> /Users/ecsrkhaif/anaconda3/lib/python3.10/site-packages/torchvision/image.so
  Expected in:     <ECC148AF-20FF-3EEE-BC75-4DD3E7455393> /Users/ecsrkhaif/anaconda3/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib
  warn(f"Failed to load image Python extension: {e}")
  Referenced from: <61623A3D-DA3C-3AAD-B2F0-D363151DDB3F> /Users/ecsrkhaif/anaconda3/lib/python3.10/site-packages/torchvision/image.so
  Expected in:     <ECC148AF-20FF-3EEE-BC75-4DD3E7455393> /Users/ecsrkhaif/anaconda3/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib
  warn(f"Failed to load image Python extension: {e}")
  Referenced from: <61623A3D-DA3C-3AAD-B2F0-D363151DDB3F> /Users/ecsrkhaif/anaconda3/lib/python3.10/site-packages/torchvision/image.so
  Expected in:     <ECC148AF-20FF-3EEE-BC75-4DD3E7455393> /Users/ecsrkhaif/anaconda3/lib/python3.10/site-packages/tor

Running Prediction:   0%|          | 0/11 [00:00<?, ?it/s]

In [10]:
import json

formatted_predictions = [
    [list(tag.values())[0] for tag in sentence]
    for sentence in predictions
]

# Préparer les données pour le fichier de sortie
output_data = []
for idx, (sentence, ner_tags) in zip(df_test.index, zip(df_test.sentences, formatted_predictions)):
    output_data.append({
        "unique_id": idx,  # Utiliser l'index original de la DataFrame
        "tokens": sentence,  # Les tokens de la phrase
        "ner_tags": ner_tags  # Les étiquettes extraites
    })

In [11]:
output_data

[{'unique_id': 0,
  'tokens': ['Context',
   'Information',
   'Security',
   'distribute',
   'Context',
   'Threat',
   'Intelligence',
   '(',
   'CTI',
   ')',
   'reporting',
   'under',
   'the',
   'Traffic',
   'Light',
   'Protocol',
   '(',
   'TLP',
   ')',
   ',',
   'a',
   'method',
   'of',
   'classifying',
   'a',
   'document',
   'in',
   'order',
   'to',
   'promote',
   'the',
   'distribution',
   'of',
   'sensitive',
   'information',
   'between',
   'individuals',
   ',',
   'organisations',
   'or',
   'communities',
   'in',
   'a',
   'controlled',
   'and',
   'trusted',
   'way',
   ',',
   'based',
   'on',
   'the',
   'originator',
   "'s",
   'wishes',
   '.'],
  'ner_tags': ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
  

#### Création du fichier de sortie

In [12]:
import os 

os.makedirs("resultat_scibert", exist_ok=True)
# Sauvegarder dans un fichier JSON
with open("resultat_scibert/output_predictions.jsonlines", "w") as f:
    for record in output_data:
        f.write(json.dumps(record) + "\n")

# Affichage pour vérification
with open("resultat_scibert/output_predictions.jsonlines", "r") as f:
    for line in f:
        print(line.strip())

{"unique_id": 0, "tokens": ["Context", "Information", "Security", "distribute", "Context", "Threat", "Intelligence", "(", "CTI", ")", "reporting", "under", "the", "Traffic", "Light", "Protocol", "(", "TLP", ")", ",", "a", "method", "of", "classifying", "a", "document", "in", "order", "to", "promote", "the", "distribution", "of", "sensitive", "information", "between", "individuals", ",", "organisations", "or", "communities", "in", "a", "controlled", "and", "trusted", "way", ",", "based", "on", "the", "originator", "'s", "wishes", "."], "ner_tags": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"unique_id": 1, "tokens": ["The", "various", "levels", "of", "the", "TLP", "are", "represented", "by", "the", "following", "colours", ":", "Sources", "may", "use", "TLP"