Notebook à utiliser pour faire le travail pratique # 3 sur l'analyse d'incidents.

In [4]:
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import DistilBertTokenizer, DistilBertForTokenClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch

OSError: [WinError 126] The specified module could not be found. Error loading "c:\Users\taha\Coding\.venv\Lib\site-packages\torch\lib\omptarget.sycl.wrap.dll" or one of its dependencies.

In [3]:
!pip list

Package                  Version
------------------------ -----------
aiobotocore              2.5.0
aiohttp                  3.8.4
aioitertools             0.11.0
aiosignal                1.3.1
altair                   5.0.1
androguard               3.3.5
annotated-types          0.6.0
anyio                    3.6.2
argon2-cffi              21.3.0
argon2-cffi-bindings     21.2.0
arrow                    1.2.3
asn1crypto               1.5.1
asttokens                2.2.1
async-timeout            4.0.2
attrs                    22.2.0
backcall                 0.2.0
beautifulsoup4           4.12.2
bleach                   6.0.0
blinker                  1.6.2
blis                     0.7.11
botocore                 1.29.76
cachetools               5.3.1
catalogue                2.0.10
certifi                  2022.12.7
cffi                     1.15.1
charset-normalizer       3.1.0
click                    8.1.3
cloudpathlib             0.16.0
colorama                 0.4.6
comm            

In [None]:
# Load the data from the JSON file.
# Raw string so that no backslash in the Windows path is read as an escape
# sequence.
# NOTE(review): absolute, machine-specific path - point it at your local copy.
dev_examples = r"C:\Users\Nitro\Desktop\tp3_2023\data\dev_examples.json"
# Explicit UTF-8: without it open() falls back to the platform default encoding.
with open(dev_examples, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:

# Hold out 20% of the examples for testing; the fixed seed keeps the split
# reproducible.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Load the pretrained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
)


In [None]:

# Derive the label classes dynamically from the data: every key that appears
# in any item's 'arguments' dict.
# Stored as a sorted list so that the class order is the same on every run;
# iterating a set of strings follows hash order, which is not stable across
# interpreter sessions.
all_classes = sorted({
    key
    for item in data
    if 'arguments' in item
    for key in item['arguments'].keys()
})



In [None]:

MAX_SEQUENCE_LENGTH = 128


class CustomDataset(Dataset):
    """Dataset pairing each raw text with per-class span labels.

    Each element of ``data`` is expected to provide a ``'text'`` string and,
    optionally, an ``'arguments'`` mapping from a class name to a list of
    strings to look up in the text.

    ``__getitem__`` returns ``{'text': str, 'labels': LongTensor}``.
    ``labels`` has shape ``(len(classes), max_seq_length)``: one row per class,
    in the iteration order of ``classes``, holding ``1`` on the positions
    covered by an occurrence of one of that class's arguments and ``-100``
    everywhere else.

    Positions are whitespace-separated word indices of ``text``.
    NOTE(review): these are not subword-token positions; confirm how consumers
    align them with tokenizer output.
    """

    def __init__(self, data, classes, max_seq_length=MAX_SEQUENCE_LENGTH):
        """Store the examples, the class names and the label row length."""
        self.data = data
        self.classes = classes
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    @staticmethod
    def _word_index(text, char_idx):
        """Return the index of the whitespace-separated word at ``char_idx``.

        When ``char_idx`` falls on whitespace, the index of the next word is
        returned.
        """
        preceding_words = len(text[:char_idx].split())
        starts_mid_word = (
            char_idx > 0
            and not text[char_idx - 1].isspace()
            and not text[char_idx].isspace()
        )
        # A match that starts inside a word belongs to that (already counted) word.
        return preceding_words - 1 if starts_mid_word else preceding_words

    def __getitem__(self, idx):
        """Build the label rows for example ``idx``.

        Raises:
            KeyError: if the example has an argument class missing from ``classes``.
        """
        item = self.data[idx]
        text = item['text']
        labels_dict = {key: [-100] * self.max_seq_length for key in self.classes}

        if 'arguments' in item:
            for key, values in item['arguments'].items():
                for value in values:
                    span_len = len(value.split())
                    if span_len == 0:
                        # Empty / whitespace-only argument: nothing to label.
                        continue
                    # The character cursor is kept separate from the word index
                    # written into the labels, so the search always advances
                    # past the occurrence just handled.
                    char_idx = text.find(value)
                    while char_idx != -1:
                        start_idx = self._word_index(text, char_idx)
                        # Truncate spans that run past the label row.
                        end_idx = min(start_idx + span_len, self.max_seq_length)
                        if start_idx < end_idx:
                            labels_dict[key][start_idx:end_idx] = [1] * (end_idx - start_idx)
                        char_idx = text.find(value, char_idx + 1)

        # One row per class, in the order of ``classes``.
        padded_labels = [labels_dict[key][:self.max_seq_length] for key in self.classes]
        labels = torch.tensor(padded_labels, dtype=torch.long)

        return {'text': text, 'labels': labels}

In [None]:
# Prepare the data for training: wrap each split in a CustomDataset that
# shares the same class list.
train_dataset, test_dataset = (
    CustomDataset(split, all_classes) for split in (train_data, test_data)
)


In [None]:

# Training setup: device, model, optimizer and training data loader.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One output label per argument class found in the data.
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(all_classes))
model.to(device)
# Use PyTorch's own AdamW; the AdamW class shipped with transformers is
# deprecated. eps and weight_decay are set explicitly to the values that
# class used by default, so optimisation hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)


In [None]:

# Train the model
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # After default collation a batch is
        # {'text': list of str, 'labels': LongTensor (batch, n_classes, seq_len)}
        # where a 1 marks a position belonging to that class and -100 means
        # "unlabelled".
        class_marks = batch['labels'] == 1
        is_labelled = class_marks.any(dim=1)
        if not is_labelled.any():
            # Every target would be ignored; the mean loss is undefined (NaN)
            # and would corrupt the weights, so skip this batch.
            continue

        # Collapse the per-class rows into one class id per position, as the
        # token-classification head expects; unlabelled positions stay -100.
        # If several classes mark the same position, the first class wins.
        token_labels = class_marks.int().argmax(dim=1)
        token_labels[~is_labelled] = -100

        # Tokenize the raw texts, padded/truncated to the label length so the
        # logits and the labels have the same sequence dimension.
        # NOTE(review): label positions come from CustomDataset and are not
        # aligned to the tokenizer's subword positions - confirm/fix alignment.
        encoded = tokenizer(
            batch['text'],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=token_labels.shape[-1],
        )
        inputs = {key: val.to(device) for key, val in encoded.items()}
        inputs['labels'] = token_labels.to(device)

        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()



In [None]:
# Evaluate the model: switch to evaluation mode and start from empty
# buffers for the gold and the predicted labels.
model.eval()
y_true, y_pred = [], []


In [None]:
# Collect gold and predicted class ids for every labelled position of the
# test set. Results are appended to y_true / y_pred.
for item in test_dataset:
    # item['labels'] is a LongTensor (n_classes, seq_len): 1 marks a position
    # of that class, -100 an unlabelled one.
    class_marks = item['labels'] == 1
    is_labelled = class_marks.any(dim=0)
    if not is_labelled.any():
        # Nothing to score for this example.
        continue
    # One gold class id per position (first class wins on overlap).
    true_ids = class_marks.int().argmax(dim=0)

    # Pad/truncate to the label length so predictions can be masked with
    # is_labelled.
    # NOTE(review): label positions are not aligned to subword positions -
    # confirm/fix alignment.
    inputs = tokenizer(
        item['text'],
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=is_labelled.shape[-1],
    )
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # logits: (1, seq_len, n_classes) -> predicted class id per position
    predicted_ids = torch.argmax(outputs.logits, dim=2)[0].cpu()

    y_true.extend(true_ids[is_labelled].tolist())
    y_pred.extend(predicted_ids[is_labelled].tolist())

In [None]:



# Compute the scores: micro-averaged F1 first, then the per-class report.
f1 = f1_score(
    y_true=y_true,
    y_pred=y_pred,
    average='micro',
)
print(f"F1-Score: {f1}")

classification_rep = classification_report(
    y_true=y_true,
    y_pred=y_pred,
)
print("Classification Report:\n", classification_rep)
