In [1]:
import torch
import json
import os
import argparse
import torch.nn.functional as F
import sys
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric

LABEL_LIST=[0,1]
LANGS = ['ar', 'de', 'en', 'es', 'fi', 'fr', 'hi', 'it', 'sv', 'zh']
#LANGS = ['en']
MODEL_NAME = 'FacebookAI/xlm-roberta-base'

def tokenize_and_map_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['model_output_text'], return_offsets_mapping=True, padding=True, truncation=True)
    offset_mappings = tokenized_inputs['offset_mapping']
    all_labels = examples['hard_labels']
    tok_labels_batch = []
    for batch_idx in range(len(offset_mappings)):
        offset_mapping = offset_mappings[batch_idx]
        hard_labels = all_labels[batch_idx]
        tok_labels = [0] * len(offset_mapping)
        for idx, start_end in enumerate(offset_mapping):
            start = start_end[0]
            end = start_end[1]
            for (label_start, label_end) in hard_labels:
                if start >= label_start and end <= label_end:
                    tok_labels[idx] = 1
        tok_labels_batch.append(tok_labels)
    tokenized_inputs['labels'] = tok_labels_batch
    return tokenized_inputs

def train_model(test_lang, data_path, output_dir):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=2)  # Adjust num_labels as needed

    data_files = {
        'train': [f'{data_path}\mushroom.{lang}-val.v2.jsonl' for lang in LANGS if lang != test_lang],
        'validation': f'{data_path}\mushroom.{test_lang}-val.v2.jsonl'
    }
    print(data_files)
    dataset = load_dataset('json', data_files=data_files)
    # Tokenize the dataset
    tokenized_datasets = dataset.map(lambda x: tokenize_and_map_labels(x, tokenizer), batched=True)
    print("tokenized_datasets:\n",tokenized_datasets)

    # Prepare the dataset for training
    train_dataset = tokenized_datasets['train']
    eval_dataset = tokenized_datasets['validation']

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
    )

    # Define the metric
    metric = load_metric('seqeval', trust_remote_code=True)

    def compute_metrics(p):
        predictions, labels = p
        predictions = torch.argmax(torch.tensor(predictions), dim=2)
        true_labels = [[LABEL_LIST[l] for l in label if l != -100] for label in labels]
        true_predictions = [
            [LABEL_LIST[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    trainer.evaluate()
    print(f"Model trained and evaluated successfully. Model checkpoint saved in {output_dir}")


def test_model(test_lang, model_path, data_path):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    # Load the test dataset
    test_dataset = load_dataset('json', data_files={'test': f'{data_path}\mushroom.{test_lang}-val.v2.jsonl'})['test']
    # Tokenize test dataset
    inputs = tokenizer(test_dataset['model_output_text'], padding=True, truncation=True, return_offsets_mapping=True, return_tensors="pt")

    # Get predictions for the test set
    model.eval()
    with torch.no_grad():
        outputs = model(inputs.input_ids)
    preds = torch.argmax(outputs.logits, dim=2)
    probs = F.softmax(outputs.logits, dim=2)
    # map predictions to character spans
    hard_labels_all = {}
    soft_labels_all = {}
    predictions_all = []
    for i, pred in enumerate(preds):
        hard_labels_sample = []
        soft_labels_sample = []
        positive_indices = torch.nonzero(pred == 1, as_tuple=False)
        offset_mapping = inputs['offset_mapping'][i]
        for j, offset in enumerate(offset_mapping):
            soft_labels_sample.append({'start': offset[0].item(), 'end': offset[1].item(), 'prob': probs[i][j][1].item()})
            if j in positive_indices:
                hard_labels_sample.append((offset[0].item(), offset[1].item()))
        soft_labels_all[test_dataset['id'][i]] = soft_labels_sample
        hard_labels_all[test_dataset['id'][i]] = hard_labels_sample
        predictions_all.append({'id': test_dataset['id'][i], 'hard_labels': hard_labels_sample, 'soft_labels': soft_labels_sample})
    with open(f"{test_lang}-hard_labels.json", 'w') as f:
        json.dump(hard_labels_all, f)
    with open(f"{test_lang}-soft_labels.json", 'w') as f:
        json.dump(soft_labels_all, f)
    with open(f"{test_lang}-pred.jsonl", 'w') as f:
        for pred_dict in predictions_all:
            print(json.dumps(pred_dict), file=f)
    print(f"Labels saved to {test_lang}-hard_labels.json and {test_lang}-soft_labels.json")
    print(f"Prediction file saved to {test_lang}-pred.jsonl")


def main(args):
    if args.mode == 'train':
        train_model(test_lang='en', data_path=args.data_path, output_dir=args.model_checkpoint,)
    else:
        print(f"Test model: {args.model_checkpoint}")
        test_model(test_lang=args.test_lang, model_path=args.model_checkpoint, data_path=args.data_path)

In [2]:
import argparse

# Define the parser
parser = argparse.ArgumentParser(description="Train or test the model")
parser.add_argument('--mode', type=str, choices=['train', 'test'], default='train')
parser.add_argument('--data_path', default=r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\train_ds', type=str, help="Path to the training data")
parser.add_argument('--model_checkpoint', type=str, default="./results", help="Path to the trained checkpoint")
parser.add_argument('--test_lang', type=str, default="en")

# Manually pass the arguments as a list
args = parser.parse_args(['--mode', 'train', '--data_path', r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\val_ds', '--model_checkpoint', './results', '--test_lang', 'en'])

# Call your main function
main(args)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train': ['C:\\Users\\FLopezP\\Documents\\GitHub\\Mu-SHROOM-GIL\\Datasets\\val_ds\\mushroom.ar-val.v2.jsonl', 'C:\\Users\\FLopezP\\Documents\\GitHub\\Mu-SHROOM-GIL\\Datasets\\val_ds\\mushroom.de-val.v2.jsonl', 'C:\\Users\\FLopezP\\Documents\\GitHub\\Mu-SHROOM-GIL\\Datasets\\val_ds\\mushroom.es-val.v2.jsonl', 'C:\\Users\\FLopezP\\Documents\\GitHub\\Mu-SHROOM-GIL\\Datasets\\val_ds\\mushroom.fi-val.v2.jsonl', 'C:\\Users\\FLopezP\\Documents\\GitHub\\Mu-SHROOM-GIL\\Datasets\\val_ds\\mushroom.fr-val.v2.jsonl', 'C:\\Users\\FLopezP\\Documents\\GitHub\\Mu-SHROOM-GIL\\Datasets\\val_ds\\mushroom.hi-val.v2.jsonl', 'C:\\Users\\FLopezP\\Documents\\GitHub\\Mu-SHROOM-GIL\\Datasets\\val_ds\\mushroom.it-val.v2.jsonl', 'C:\\Users\\FLopezP\\Documents\\GitHub\\Mu-SHROOM-GIL\\Datasets\\val_ds\\mushroom.sv-val.v2.jsonl', 'C:\\Users\\FLopezP\\Documents\\GitHub\\Mu-SHROOM-GIL\\Datasets\\val_ds\\mushroom.zh-val.v2.jsonl'], 'validation': 'C:\\Users\\FLopezP\\Documents\\GitHub\\Mu-SHROOM-GIL\\Datasets\\val_ds\\m

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

tokenized_datasets:
 DatasetDict({
    train: Dataset({
        features: ['id', 'lang', 'model_input', 'model_output_text', 'model_id', 'soft_labels', 'hard_labels', 'model_output_tokens', 'model_output_logits', 'input_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 449
    })
    validation: Dataset({
        features: ['id', 'lang', 'model_input', 'model_output_text', 'model_id', 'soft_labels', 'hard_labels', 'model_output_tokens', 'model_output_logits', 'input_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 50
    })
})


  metric = load_metric('seqeval', trust_remote_code=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.193795,0.0,0.0,0.0,0.935143
2,No log,0.19227,0.0,0.0,0.0,0.925314
3,No log,0.179499,0.0,0.0,0.0,0.920857
4,No log,0.168715,0.0,0.0,0.0,0.9264
5,No log,0.165664,0.0,0.0,0.0,0.932571


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Model trained and evaluated successfully. Model checkpoint saved in ./results


In [4]:
parser = argparse.ArgumentParser(description="Train or test the model")
parser.add_argument('--mode', type=str, choices=['train', 'test'], default='train')
parser.add_argument('--data_path', default=r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\train_ds', type=str, help="Path to the training data")
parser.add_argument('--model_checkpoint', type=str, default="./results", help="Path to the trained checkpoint")
parser.add_argument('--test_lang', type=str, default="en")

# Manually pass the arguments as a list
#args = parser.parse_args(['--mode', 'test', '--data_path', r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\val_ds', '--model_checkpoint', './results', '--test_lang', 'en'])
args = parser.parse_args(['--mode', 'test', '--data_path', r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\val_ds', '--model_checkpoint', r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Notebooks\results\checkpoint-145', '--test_lang', 'en'])

# Call your main function
main(args)

Test model: C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Notebooks\results\checkpoint-145


Generating test split: 0 examples [00:00, ? examples/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Labels saved to en-hard_labels.json and en-soft_labels.json
Prediction file saved to en-pred.jsonl
