In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
def load_conll_format(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = []
        sentence = []
        labels = []

        for line in f:
            line = line.strip()
            if line == "":
                if sentence:  # Only append if the sentence list is not empty
                    data.append((sentence, labels))
                    sentence = []
                    labels = []
            else:
                parts = line.split()
                if len(parts) == 2:  # Ensure there are exactly two parts
                    token, label = parts
                    sentence.append(token)
                    labels.append(label)
                else:
                    print(f"Skipping line: {line}")  # Optional: print to debug which lines are problematic

        if sentence:  # Append the last sentence if the file doesn't end with a newline
            data.append((sentence, labels))

    return pd.DataFrame(data, columns=['tokens', 'labels'])

df = load_conll_format("/kaggle/input/newenewnewn/dataset.txt")

Skipping line: Ô∏è
Skipping line: Ô∏è
Skipping line: Ô∏è
Skipping line: Ô∏è
Skipping line: Ô∏è
Skipping line: Ô∏è
Skipping line: Ô∏è
Skipping line: Ô∏è
Skipping line: Ô∏è


In [2]:
df

Unnamed: 0,tokens,labels
0,"[2, ·àä·âµ·à≠, ·çî·à≠·àô·àµ, ·àà·â§·âµ, ·àà·â¢·àÆ, ·àà·àÜ·â¥·àç, ·ä†·åà·àç·åç·àé·âµ, ·àò·ãã·àç, ·ã®·àö...","[O, O, O, O, O, O, O, O, O, O, B-PRODUCT, O, O..."
1,"[·â¢·àã, ·ã®·àµ·åã, ·ã®·ä†·å•·äï·âµ, 1000, ·â•·à≠, ·àà·àõ·ãò·ãù, ·ãç·àµ·äï, ·çç·à¨, ·äê·ãç, ...","[B-PRODUCT, O, O, B-PRICE, I-PRICE, O, O, O, O..."
2,"[·ãã·àµ·âµ·äì, ·âÖ·äì·àΩ, ·ä†·ãµ·à´·àª, ·âÅ·å•·à≠, 1, ·àò·åà·äì·äõ, ·ãò·çç·àò·àΩ, ·åç·à´·äï·ãµ, ·àû·àç...","[O, O, O, O, O, B-LOC, I-LOC, I-LOC, I-LOC, I-..."
3,"[2, ·ä†·äï·ãµ, ·â•·à©·àΩ, ·ä•·äì, ·ä†·äï·ãµ, ·àµ·çì·âπ·àã, ·ãã·åã, 300, ·â•·à≠, ·àà·àõ·ãò·ãù...","[O, O, B-PRODUCT, O, O, B-PRODUCT, B-PRICE, I-..."
4,"[2, ·ä†·äï·ãµ, ·â•·à©·àΩ, ·ä•·äì, ·ä†·äï·ãµ, ·àµ·çì·âπ·àã, ·ãã·åã, 300, ·â•·à≠, ·ãç·àµ·äï,...","[O, O, B-PRODUCT, O, O, B-PRODUCT, B-PRICE, I-..."
...,...,...
56,"[·ãã·åã, 1000, ·â•·à≠, ·àà·àõ·ãò·ãù, ·ãç·àµ·äï, ·çç·à¨, ·äê·ãç, ·ã®·âÄ·à®·ãç, Ô∏è·å•·à´·âµ, ...","[B-PRICE, I-PRICE, I-PRICE, O, O, O, O, O, O, ..."
57,"[·àà·â§·â∂, ·àà·àµ·å¶·â≥, ·â†·å•·à´·âµ, ·â†·âÖ·äì·àΩ, ·ãç·àµ·äï, ·çç·à¨, ·ã®·âÄ·à©, ·ãï·âÉ·ãà·âΩ, ·ä†·àÅ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
58,"[·àà·àç·åÜ, ·àà·àµ·å¶·â≥, ·â†·å•·à´·âµ, ·â†·âÖ·äì·àΩ, ·ãç·àµ·äï, ·çç·à¨, ·ã®·âÄ·à©, ·ãï·âÉ·ãà·âΩ, ·ä†·àÅ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
59,"[4, 1, 400, 10000, ·â•·à≠, ·ä®, ·äê·çÉ, ·àõ·ãµ·à®·àµ, ·åã·à≠, ·àà·àõ·ãò·ãù, ...","[O, O, O, B-PRICE, I-PRICE, O, O, O, O, O, O, ..."


In [4]:
# Define label mappings
label_to_id = {
    "O": 0,
    "B-PRICE": 1,
    "I-PRICE": 2,
    "B-LOC": 3,
    "I-LOC": 4,
    "B-PRODUCT": 5,
    "I-PRODUCT": 6,
    # Add other labels as needed
}

In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Model names for fine-tuning
model_names = {
    "xlm-roberta": "xlm-roberta-base",
    "distilbert": "distilbert-base-multilingual-cased",
    "mbert": "bert-base-multilingual-cased"
}

tokenizers = {}
models = {}

# Load tokenizers and models
for model_key, model_name in model_names.items():
    tokenizers[model_key] = AutoTokenizer.from_pretrained(model_name)
    models[model_key] = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_to_id))


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from datasets import Dataset

# Define function to tokenize and align labels
def tokenize_and_align_labels(dataframe, tokenizer):
    tokenized_inputs = tokenizer(
        list(dataframe['tokens']),
        truncation=True,
        is_split_into_words=True,
        padding=True,
        return_tensors="pt"
    )

    labels = []
    for i, label in enumerate(dataframe['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100] * len(tokenized_inputs['input_ids'][i])  # Use -100 for padding tokens
        
        for word_index in range(len(word_ids)):
            if word_ids[word_index] is not None:  # Check if it's a valid word token
                current_label = label[word_ids[word_index]]
                label_ids[word_index] = label_to_id[current_label]
        
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize data for each model
tokenized_data = {}
for model_key, tokenizer in tokenizers.items():
    tokenized_data[model_key] = tokenize_and_align_labels(df, tokenizer)


In [7]:
# Convert the tokenized data into Hugging Face Datasets for training and validation
datasets = {}
for model_key, data in tokenized_data.items():
    dataset = Dataset.from_dict({
        'input_ids': data['input_ids'],
        'attention_mask': data['attention_mask'],
        'labels': data['labels']
    })

    # Split the dataset into training and validation sets (80/20 split)
    datasets[model_key] = dataset.train_test_split(test_size=0.2)


In [10]:
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate after every epoch
    logging_steps=10,  # Log after every 10 steps
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",  # Directory to save logs
    logging_first_step=True,  # Log the first step as well
)

# Fine-tune each model and store the trainers
trainers = {}
for model_key, model in models.items():
    trainers[model_key] = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets[model_key]['train'],
        eval_dataset=datasets[model_key]['test']
    )

    # Train the model
    print(f"Training model: {model_key}")
    trainers[model_key].train()




Training model: xlm-roberta


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.8294,0.742567
2,0.8294,0.618416
3,0.8294,0.610165


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Training model: distilbert


Epoch,Training Loss,Validation Loss
1,0.581,0.513163
2,0.581,0.483254
3,0.581,0.468558


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Training model: mbert


Epoch,Training Loss,Validation Loss
1,0.5517,0.626722
2,0.5517,0.586671
3,0.5517,0.576116


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [11]:
# Evaluate the models on the validation dataset
results = {}
for model_key, trainer in trainers.items():
    print(f"Evaluating model: {model_key}")
    eval_result = trainer.evaluate()
    results[model_key] = eval_result

# Print the results
for model_key, result in results.items():
    print(f"Results for {model_key}: {result}")


Evaluating model: xlm-roberta


Evaluating model: distilbert


Evaluating model: mbert


Results for xlm-roberta: {'eval_loss': 0.6101651191711426, 'eval_runtime': 0.2765, 'eval_samples_per_second': 47.019, 'eval_steps_per_second': 3.617, 'epoch': 3.0}
Results for distilbert: {'eval_loss': 0.4685583710670471, 'eval_runtime': 0.0782, 'eval_samples_per_second': 166.301, 'eval_steps_per_second': 12.792, 'epoch': 3.0}
Results for mbert: {'eval_loss': 0.5761160254478455, 'eval_runtime': 0.1149, 'eval_samples_per_second': 113.14, 'eval_steps_per_second': 8.703, 'epoch': 3.0}


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Flatten and filter out padding tokens (-100)
    true_labels = []
    predicted_labels = []
    
    for i in range(len(labels)):
        true_labels.extend([label for label, pred_label in zip(labels[i], preds[i]) if label != -100])
        predicted_labels.extend([pred_label for label, pred_label in zip(labels[i], preds[i]) if label != -100])
    
    # Compute metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average="macro")
    recall = recall_score(true_labels, predicted_labels, average="macro")
    f1 = f1_score(true_labels, predicted_labels, average="macro")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [13]:
trainers = {}
for model_key, model in models.items():
    trainers[model_key] = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets[model_key]['train'],
        eval_dataset=datasets[model_key]['test'],
        compute_metrics=compute_metrics  # Use the updated compute_metrics function
    )

# Evaluate the models
results = {}
for model_key, trainer in trainers.items():
    print(f"Evaluating model: {model_key}")
    eval_result = trainer.evaluate()
    results[model_key] = eval_result

# Print results
for model_key, result in results.items():
    print(f"Results for {model_key}: {result}")


Evaluating model: xlm-roberta


Evaluating model: distilbert


  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluating model: mbert




Results for xlm-roberta: {'eval_loss': 0.6101651191711426, 'eval_accuracy': 0.8245496997998666, 'eval_precision': 0.1177928142571238, 'eval_recall': 0.14285714285714285, 'eval_f1': 0.1291198746408984, 'eval_runtime': 0.3262, 'eval_samples_per_second': 39.857, 'eval_steps_per_second': 3.066}
Results for distilbert: {'eval_loss': 0.4685583710670471, 'eval_accuracy': 0.8668941979522184, 'eval_precision': 0.12384202827888835, 'eval_recall': 0.14285714285714285, 'eval_f1': 0.13267171585270304, 'eval_runtime': 0.0913, 'eval_samples_per_second': 142.393, 'eval_steps_per_second': 10.953}
Results for mbert: {'eval_loss': 0.5761160254478455, 'eval_accuracy': 0.8474114441416893, 'eval_precision': 0.12105877773452704, 'eval_recall': 0.14285714285714285, 'eval_f1': 0.13105773282764432, 'eval_runtime': 0.1274, 'eval_samples_per_second': 102.032, 'eval_steps_per_second': 7.849}


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import Trainer, TrainingArguments
import warnings
warnings.filterwarnings("ignore")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Flatten and filter out padding tokens (-100)
    true_labels = []
    predicted_labels = []

    for i in range(len(labels)):
        true_labels.extend([label for label, pred_label in zip(labels[i], preds[i]) if label != -100])
        predicted_labels.extend([pred_label for label, pred_label in zip(labels[i], preds[i]) if label != -100])

    # Compute metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average="macro")
    recall = recall_score(true_labels, predicted_labels, average="macro")
    f1 = f1_score(true_labels, predicted_labels, average="macro")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Assuming models and datasets are defined elsewhere
# models = {...}
# datasets = {...}

trainers = {}

# Define TrainingArguments for Trainer
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save checkpoints and results
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    logging_dir="./logs",  # Directory for storing logs
    logging_steps=10,  # Number of steps between logging updates
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Example: Train for 3 epochs
    save_steps=100,  # Save model checkpoints every 100 steps
    logging_first_step=True,
)

# Create trainers for each model
for model_key, model in models.items():
    trainers[model_key] = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets[model_key]['train'],
        eval_dataset=datasets[model_key]['test'],
        compute_metrics=compute_metrics,
    )

# Train and evaluate each model
results = {}
for model_key, trainer in trainers.items():
    print(f"Training and evaluating model: {model_key}")
    trainer.train()
    eval_result = trainer.evaluate()

    results[model_key] = eval_result
    print(f"Results for {model_key}: {eval_result}")

# Optionally, you can access the evaluation results per epoch from the Trainer object directly.


Training and evaluating model: xlm-roberta


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1388,0.162264,0.973316,0.415035,0.424536,0.419589
2,0.1388,0.1508,0.973316,0.413809,0.42637,0.419883
3,0.1388,0.145427,0.96998,0.416641,0.422312,0.419446


Results for xlm-roberta: {'eval_loss': 0.14542677998542786, 'eval_accuracy': 0.9699799866577719, 'eval_precision': 0.4166411355243466, 'eval_recall': 0.42231170932627243, 'eval_f1': 0.4194464951177891, 'eval_runtime': 0.234, 'eval_samples_per_second': 55.552, 'eval_steps_per_second': 4.273, 'epoch': 3.0}
Training and evaluating model: distilbert


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1713,0.243769,0.932878,0.548666,0.455753,0.423656
2,0.1713,0.136208,0.96132,0.49909,0.53559,0.516065
3,0.1713,0.136929,0.96587,0.642508,0.588259,0.573261


Results for distilbert: {'eval_loss': 0.13692906498908997, 'eval_accuracy': 0.9658703071672355, 'eval_precision': 0.6425079300870166, 'eval_recall': 0.5882592800899887, 'eval_f1': 0.5732605202151494, 'eval_runtime': 0.1093, 'eval_samples_per_second': 118.963, 'eval_steps_per_second': 9.151, 'epoch': 3.0}
Training and evaluating model: mbert


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2078,0.393601,0.871935,0.395734,0.234014,0.259467
2,0.2078,0.255662,0.934605,0.500282,0.460569,0.456918
3,0.2078,0.245124,0.933243,0.488957,0.455696,0.448963


Results for mbert: {'eval_loss': 0.24512434005737305, 'eval_accuracy': 0.9332425068119891, 'eval_precision': 0.48895739740022653, 'eval_recall': 0.4556964586477678, 'eval_f1': 0.4489632563593071, 'eval_runtime': 0.1478, 'eval_samples_per_second': 87.964, 'eval_steps_per_second': 6.766, 'epoch': 3.0}


In [17]:
import shap
import torch

# Choose a sample text from your validation dataset for SHAP analysis
sample_text = ["·ã≠·àÖ ·àù·à≠·âµ ·â†·ä†·ã≤·àµ ·ä†·â†·â£ ·ä•·åÖ·åç ·â†·à≠·ä´·â≥ ·åà·äï·ãò·â• ·ã≠·äñ·à´·àç·ç¢"]

# Select a model for SHAP analysis (e.g., "xlm-roberta")
model_key = 'xlm-roberta'
model = models[model_key]
tokenizer = tokenizers[model_key]

# Tokenize the sample text
tokenized_sample = tokenizer(
    sample_text,
    truncation=True,
    padding=True,
    return_tensors="pt"
)

# Define prediction function for SHAP
def prediction_function(inputs):
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
    return predictions.cpu().numpy()

# Convert tokenized input to SHAP-compatible format
explainer = shap.Explainer(prediction_function, tokenizer)
shap_values = explainer(tokenized_sample)

# Visualize the SHAP values
shap.plots.text(shap_values[0])


TypeError: XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): XLMRobertaIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): XLMRobertaOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=7, bias=True)
) argument after ** must be a mapping, not numpy.ndarray

In [18]:
from lime.lime_text import LimeTextExplainer

# Create an instance of LimeTextExplainer
explainer = LimeTextExplainer(class_names=["O", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE", "B-PRODUCT", "I-PRODUCT"])

# Define the prediction function for LIME
def lime_prediction_function(texts):
    # Tokenize input texts
    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        # Predict with model
        outputs = model(**tokenized_inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
    
    # Convert predictions to the format expected by LIME
    return predictions.cpu().numpy()

# Sample text for LIME analysis
sample_text = "·ã≠·àÖ ·àù·à≠·âµ ·â†·ä†·ã≤·àµ ·ä†·â†·â£ ·ä•·åÖ·åç ·â†·à≠·ä´·â≥ ·åà·äï·ãò·â• ·ã≠·äñ·à´·àç·ç¢"

# Generate LIME explanation
lime_explanation = explainer.explain_instance(
    sample_text,
    classifier_fn=lime_prediction_function,
    num_features=10  # Number of features to show
)

# Display the LIME explanation in a notebook (for Jupyter/Colab environments)
lime_explanation.show_in_notebook(text=True)

# To view LIME explanation in other environments, you can export it as HTML
# lime_explanation.save_to_file('lime_explanation.html')


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)