In [1]:
import os
import ast
import torch
import torch.nn
import numpy as np
import pandas as pd
import sklearn.base
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import Dataset
from transformers import DataCollatorForTokenClassification
import evaluate
import seqeval
import csv
from seqeval.metrics import f1_score
from transformers import   AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from datasets import concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv("train.csv",delimiter=';')
extended_percent_samples = pd.read_csv("Extended_Percent_samples.csv",delimiter=';')
extended_volume_samples = pd.read_csv("Extended_Volume_samples.csv",delimiter=';')
extended_percent_samples2 = pd.read_csv("percent.csv",delimiter=';')
extended_volume_samples2 = pd.read_csv("volume.csv",delimiter=';')
data.head()

Unnamed: 0,sample,annotation
0,aa,"[(0, 2, 'O')]"
1,aala,"[(0, 4, 'O')]"
2,aarcca,"[(0, 6, 'O')]"
3,abon,"[(0, 4, 'O')]"
4,abso,"[(0, 4, 'B-BRAND')]"


In [3]:

label2id = {}
for index in data.index:
    row = data.loc[index]
    for label_tuple in ast.literal_eval(row['annotation']):
     label = label_tuple[2]   
     if label  == '0' :
           label  = 'O' 
     
     label2id.setdefault( label ,len(label2id))
id2label = {value: key for key, value in label2id.items()}     

In [4]:
id2label

{0: 'O',
 1: 'B-BRAND',
 2: 'B-TYPE',
 3: 'I-BRAND',
 4: 'I-TYPE',
 5: 'B-PERCENT',
 6: 'B-VOLUME',
 7: 'I-VOLUME',
 8: 'I-PERCENT'}

In [5]:
class_priority = {'O' : 0, 'B-TYPE' : 1, 'B-BRAND' : 2, 'B-VOLUME' : 3 , 'B-PERCENT' : 4}
def PrepareData(data):
 rows_list = []
 for index in data.index:
    row = data.loc[index]
    labels = []
    labels_id = []
    original_tuple = []
    for label_tuple in ast.literal_eval(row['annotation']):
       label = label_tuple[2]   
       if label  == '0' :
           label  = 'O'

       labels_id.append(label2id[label])
       labels.append(label) 
       original_tuple.append(label_tuple)
    
    
    class_for_split = max(class_priority.get(label,-1) for label in labels)
    dict1 = {'words' : row['sample'].split(' '), 'labels_id' : labels_id, 'labels' :  labels ,'labels_string' : " ".join(str(item) for item in set(labels)),'classes_for_split' : class_for_split ,  'original_tuple' : original_tuple , 'sample' : row['sample'], 'annotation' : row['annotation'] }
    rows_list.append(dict1)
 return  pd.DataFrame(rows_list)    
 

In [6]:
prepared_df = PrepareData(data)
extended_percent_df = PrepareData(extended_percent_samples)
extended_volume_df = PrepareData(extended_volume_samples)
extended_percent_df2 = PrepareData(extended_percent_samples2)
extended_volume_df2 = PrepareData(extended_volume_samples2)
data_exclusively_for_validation = prepared_df[prepared_df['classes_for_split'] > 2]
other_data = prepared_df[prepared_df['classes_for_split'] <= 2]


train,val  =  train_test_split(other_data,test_size=0.1,shuffle=True,stratify=other_data['classes_for_split'])

 

val = pd.concat([val,data_exclusively_for_validation]).drop(['labels','original_tuple','labels_string' , 'classes_for_split'],axis=1)
train = pd.concat([train,extended_percent_df, extended_percent_df, extended_percent_df, extended_volume_df, extended_volume_df, extended_volume_df, extended_percent_df2, extended_volume_df2]).drop(['labels','original_tuple','labels_string' , 'classes_for_split'],axis=1)
val_dataset = Dataset.from_pandas(val,preserve_index=False)
train_dataset =  Dataset.from_pandas(train,preserve_index=False)

In [7]:
prepared_df.groupby('classes_for_split').count()['labels'].nlargest(25) / prepared_df.shape[0]

classes_for_split
1    0.705846
2    0.264100
0    0.027008
3    0.002092
4    0.000954
Name: labels, dtype: float64

In [8]:
df_agg = prepared_df.groupby('labels_string').count()['labels'].nlargest(25)

df_agg / prepared_df.shape[0]

labels_string
B-TYPE                             0.483652
B-TYPE B-BRAND                     0.172324
I-TYPE B-TYPE                      0.144398
B-TYPE O                           0.069208
B-BRAND                            0.065649
O                                  0.027008
B-TYPE I-BRAND B-BRAND             0.011743
I-TYPE B-TYPE O                    0.008477
O B-BRAND                          0.004477
I-TYPE B-TYPE B-BRAND              0.004404
I-BRAND B-BRAND                    0.003046
B-TYPE O B-BRAND                   0.000844
I-VOLUME B-TYPE B-VOLUME           0.000734
I-TYPE B-TYPE I-BRAND B-BRAND      0.000697
I-BRAND O B-BRAND                  0.000661
B-PERCENT B-TYPE                   0.000550
I-TYPE B-TYPE B-VOLUME             0.000367
B-TYPE O B-VOLUME                  0.000294
B-PERCENT I-TYPE B-TYPE            0.000183
B-PERCENT B-TYPE I-PERCENT         0.000147
B-TYPE B-VOLUME                    0.000147
I-VOLUME I-TYPE B-TYPE B-VOLUME    0.000147
B-TYPE B-VOLUME B-

Finetune Model

In [9]:
model_name = 'FacebookAI/xlm-roberta-large'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"labels_id"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
tokenized_val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 2800/2800 [00:00<00:00, 27458.11 examples/s]
Map: 100%|██████████| 24822/24822 [00:01<00:00, 23696.07 examples/s]


In [12]:
seqeval = evaluate.load("seqeval")

def get_results(eval_predictions, predictions):
  
  
  true_predictions = [
        [id2label[p.item()] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, eval_predictions.label_ids)
    ]

  true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions,  eval_predictions.label_ids)
    ]
  
  f1 = f1_score(true_predictions, true_labels, average='macro')
  results = seqeval.compute(predictions=true_predictions, references=true_labels)
  
  return {"f1": f1, "results": results}


def compute_metrics(eval_predictions):
   softmax = torch.nn.Softmax(dim=2)
   softmax_predictions = softmax(torch.tensor(eval_predictions.predictions))
   
   predictions = np.argmax(softmax_predictions, axis=2)
   
   predictions_modified_zero_threshold = []
   for scores in softmax_predictions:
     predictions_modified_zero_threshold_batch = []
     for scores_row in scores:
       prediction = 0
       if scores_row[0] < 0.1:
         prediction = np.argmax(scores_row)
       predictions_modified_zero_threshold_batch.append(prediction)
     predictions_modified_zero_threshold.append(predictions_modified_zero_threshold_batch)
      
   predictions_modified_zero_threshold = torch.tensor(predictions_modified_zero_threshold)
   argmax_results = get_results(eval_predictions, predictions)
   f1_modified_zero_threshold = get_results(eval_predictions, predictions_modified_zero_threshold)
 

   metrics = {
        "precision": argmax_results["results"]["overall_precision"],
        "recall": argmax_results["results"]["overall_recall"],
        "accuracy": argmax_results["results"]["overall_accuracy"],
        "f1 argmax": argmax_results["f1"],
        "f1 predictions modified zero threshold": f1_modified_zero_threshold["f1"],
        "f1 brand": argmax_results["results"]["BRAND"]['f1'],
        "f1 type": argmax_results["results"]["TYPE"]['f1'],
        "f1 percent": argmax_results["results"]["PERCENT"]['f1'],
        "f1 volume": argmax_results["results"]["VOLUME"]['f1'],
        "brand support": np.float64(argmax_results["results"]["BRAND"]['number']),
        "type support": np.float64(argmax_results["results"]["TYPE"]['number']),
        "percent support": np.float64(argmax_results["results"]["PERCENT"]['number']),
        "volume support": np.float64(argmax_results["results"]["VOLUME"]['number']),
    }
   
   return metrics

In [13]:
training_args = TrainingArguments(
    output_dir="models/FacebookAI/xml-roberta-large",
    learning_rate=1e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="no",
    save_strategy="no",
    load_best_model_at_end=False,
    push_to_hub=False,
)

train_ds = concatenate_datasets([tokenized_train_dataset, tokenized_val_dataset])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model()


Step,Training Loss


KeyboardInterrupt: 