## Install Necessary Packages

In [3]:
!pip install transformers datasets seqeval



## Import Necessary Libraries

In [4]:
import os
import shutil
from collections import Counter
from pathlib import Path

import pandas as pd
import torch
from datasets import Dataset
from seqeval.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

### Load the Dataset

In [5]:
def load_conll_dataset(file_path):
    """Read a two-column CoNLL file into a DataFrame with one row per sentence.

    Each non-blank line is split on whitespace and must yield exactly two
    fields (token, tag); any other non-blank line is reported and skipped.
    A blank line ends the current sentence.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        pd.DataFrame with columns "tokens" and "ner_tags", each cell holding
        the list of tokens / tags of one sentence.
    """
    all_tokens = []
    all_tags = []
    current_tokens = []
    current_tags = []

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            if not stripped:
                # Blank line: close the sentence collected so far (if any)
                if current_tokens:
                    all_tokens.append(current_tokens)
                    all_tags.append(current_tags)
                current_tokens, current_tags = [], []
                continue

            fields = stripped.split()
            if len(fields) != 2:
                print(f"Skipping malformed line: {stripped}")
                continue

            current_tokens.append(fields[0])
            current_tags.append(fields[1])

    # The file may end without a trailing blank line
    if current_tokens:
        all_tokens.append(current_tokens)
        all_tags.append(current_tags)

    return pd.DataFrame({"tokens": all_tokens, "ner_tags": all_tags})


### Split the dataset

In [6]:
# Function to split the CoNLL dataset into training and validation sets
def split_conll_dataset(conll_df, train_ratio=0.8, random_state=42):
    """Shuffle and split the sentence DataFrame into training and validation parts.

    Args:
        conll_df: DataFrame to split (one row per sentence).
        train_ratio: Passed to ``train_test_split`` as ``train_size``.
        random_state: Seed passed to ``train_test_split``; the default of 42
            reproduces the split this notebook has always used.

    Returns:
        Tuple ``(train_df, val_df)``.
    """
    train_df, val_df = train_test_split(
        conll_df,
        train_size=train_ratio,
        random_state=random_state,
        shuffle=True,
    )

    return train_df, val_df

# Location of the labelled CoNLL file. Set the CONLL_DATA_PATH environment variable to
# point at the file on another machine; the default is the original author's local path.
file_path = os.environ.get(
    "CONLL_DATA_PATH",
    "C:/Users/HP/10 Acadamy PRojects/New folder (4)/amharic-ecommerce-scraper/data/labeled_data_CoNLL.txt",
)
conll_df = load_conll_dataset(file_path)  # Load the dataset
train_dataset, val_dataset = split_conll_dataset(conll_df)  # Split the dataset

# Check the sizes of the resulting datasets
print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")


Training set size: 36
Validation set size: 10


### Label Mapping

In [7]:
# Tag inventory (BIO scheme). A tag's position in the tuple below is its integer id.
label_to_id = {
    tag: index
    for index, tag in enumerate((
        "O",          # outside any entity
        "B-Product",  # first token of a Product entity
        "I-Product",  # continuation of a Product entity
        "B-PRICE",    # first token of a Price entity
        "I-PRICE",    # continuation of a Price entity
        "B-LOC",      # first token of a Location entity
        "I-LOC",      # continuation of a Location entity
    ))
}

# Inverse lookup: integer id -> tag string, used when decoding predictions
id_to_label = dict(zip(label_to_id.values(), label_to_id.keys()))

### Tokenize and Align Labels

In [8]:
def tokenize_and_align_labels(examples, tokenizer, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level NER tags to sub-word tokens.

    Args:
        examples: Batch mapping with ``'tokens'`` (a list of word lists) and
            ``'ner_tags'`` (a list of string-tag lists, one tag per word).
        tokenizer: Callable invoked with ``truncation=True`` and
            ``is_split_into_words=True``. Its result must offer
            ``word_ids(batch_index=i)`` and support item assignment.
        label_all_tokens: When True, the non-first sub-word pieces of a word are
            labelled as well; a word tagged ``B-X`` then gets ``I-X`` on those
            pieces (if ``I-X`` exists in ``label_to_id``) so that one word never
            produces several entity starts. When False those pieces get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        integer label ids per sentence, aligned with the sub-word tokens.

    Raises:
        KeyError: If a tag (after ``'0'``/``'o'`` are normalised to ``'O'``) is
            missing from the module-level ``label_to_id`` mapping.
    """
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    def continuation_label_id(tag):
        # Later pieces of a word that opens an entity are "inside" that entity.
        if tag.startswith("B-"):
            inside_tag = "I-" + tag[2:]
            if inside_tag in label_to_id:
                return label_to_id[inside_tag]
        return label_to_id[tag]

    labels = []
    for i, raw_tags in enumerate(examples['ner_tags']):
        # Annotation typos: digit zero and lowercase 'o' both mean the 'O' tag
        tags = ['O' if tag in ('0', 'o') else tag for tag in raw_tags]

        unknown_tags = sorted({tag for tag in tags if tag not in label_to_id})
        if unknown_tags:
            raise KeyError(
                f"Unknown NER tag(s) {unknown_tags} in example {i}; "
                f"known tags are {sorted(label_to_id)}"
            )

        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Token with no source word (e.g. a special token): mark as ignored
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First piece of a word keeps the word's own tag
                label_ids.append(label_to_id[tags[word_idx]])
            elif label_all_tokens:
                label_ids.append(continuation_label_id(tags[word_idx]))
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

### Train and Evaluate Model

In [9]:
# Function to fine-tune the model
def train_and_evaluate_model(model_name, train_dataset, val_dataset, label_list, batch_size=16, epochs=15):
    """Fine-tune a token-classification model and return its evaluation metrics.

    NOTE: a later cell in this notebook defines a function with the same name;
    once that cell runs, it replaces this definition.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        train_dataset: Dataset exposing ``.map``; used for training.
        val_dataset: Dataset exposing ``.map``; used for evaluation.
        label_list: Entity labels; only its length is used (``num_labels``).
        batch_size: Per-device batch size for training and evaluation.
        epochs: Number of training epochs.

    Returns:
        The dictionary returned by ``trainer.evaluate()``.
    """
    print(f"Training model: {model_name}")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

    # Tokenize both splits and align the word-level tags to sub-word tokens
    training_dataset = train_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)
    evaluation_dataset = val_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Model ids may contain "/" (e.g. "org/model"); keep output folders flat
    safe_model_name = model_name.replace('/', '-')

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_{safe_model_name}",
        eval_strategy="epoch",  # same keyword as the later definition of this function
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_dir=f"./logs_{safe_model_name}",
        logging_steps=50
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=training_dataset,
        eval_dataset=evaluation_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    print(f"Evaluation results for {model_name}:", eval_results)
    return eval_results

### Compute Metrics

In [10]:
def compute_metrics(pred):
    """Turn model outputs into seqeval accuracy and a classification report.

    Args:
        pred: Object with ``label_ids`` (gold label ids) and ``predictions``
            (scores whose last axis is reduced with ``argmax``).

    Returns:
        Dict with ``"accuracy"`` (result of ``accuracy_score``) and
        ``"report"`` (result of ``classification_report``).
    """
    gold_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    true_labels = []
    true_preds = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        gold_tags = []
        predicted_tags = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from scoring
            if gold_id == -100:
                continue
            gold_tags.append(id_to_label[gold_id])
            predicted_tags.append(id_to_label[predicted_id])
        true_labels.append(gold_tags)
        true_preds.append(predicted_tags)

    # Score the decoded tag sequences with seqeval
    report = classification_report(true_labels, true_preds)
    accuracy = accuracy_score(true_labels, true_preds)

    return {"accuracy": accuracy, "report": report}

### Compare Models

In [11]:
# Function to compare models
def compare_models(models, dataset, label_list, val_dataset=None, test_size=0.2, seed=42):
    """Fine-tune every model in ``models`` on the same data and collect the metrics.

    Args:
        models: Iterable of model identifiers.
        dataset: Training data. When ``val_dataset`` is None this must provide
            ``train_test_split(test_size=..., seed=...)`` returning a mapping
            with ``"train"`` and ``"test"`` entries; it is then split once and
            the same split is reused for every model.
        label_list: Entity labels forwarded to ``train_and_evaluate_model``.
        val_dataset: Optional validation data. When given, ``dataset`` is used
            unsplit for training.
        test_size: Held-out fraction used when ``dataset`` has to be split.
        seed: Seed used when ``dataset`` has to be split.

    Returns:
        Dict mapping each model identifier to its evaluation result.
    """
    if val_dataset is None:
        # Only one dataset supplied: hold out a validation part from it
        split = dataset.train_test_split(test_size=test_size, seed=seed)
        train_split, val_split = split["train"], split["test"]
    else:
        train_split, val_split = dataset, val_dataset

    results = {}
    for model_name in models:
        results[model_name] = train_and_evaluate_model(model_name, train_split, val_split, label_list)
    return results

In [12]:
# Load the labeled CoNLL dataset
#conll_df = load_conll_dataset("/kaggle/input/collection-ner/NER_Collection_data.txt")
#dataset = Dataset.from_pandas(conll_df)

### Count Labels in the Dataset

In [13]:
# Function to count each label in the dataset
def count_labels(dataset):
    """Print and return how often each tag occurs in ``dataset['ner_tags']``.

    Args:
        dataset: Object whose ``'ner_tags'`` entry is an iterable of tag lists.

    Returns:
        collections.Counter mapping each tag to its number of occurrences.
    """
    label_counts = Counter(
        tag
        for sentence_tags in dataset['ner_tags']
        for tag in sentence_tags
    )

    # One line per tag, in order of first appearance
    for label in label_counts:
        print(f"Label: {label}, Count: {label_counts[label]}")

    return label_counts

In [14]:
# Tag frequencies of the training split as loaded, i.e. before map_labels is applied below
train_label_counts = count_labels(train_dataset)

Label: O, Count: 237
Label: B-Product, Count: 11
Label: I-Product, Count: 281
Label: B-LOC, Count: 27
Label: I-LOC, Count: 257
Label: B-PRICE, Count: 19
Label: I-PRICE, Count: 36


In [15]:
# Tag frequencies of the validation split as loaded, i.e. before map_labels is applied below
evaluation_label_counts = count_labels(val_dataset)

Label: B-Product, Count: 6
Label: I-Product, Count: 123
Label: O, Count: 97
Label: B-PRICE, Count: 8
Label: I-PRICE, Count: 18
Label: B-LOC, Count: 11
Label: I-LOC, Count: 80


#### Map labels to correct labels

In [16]:
# Function to map incorrect labels to correct labels
def map_labels(dataset):
    """Return a copy of ``dataset`` whose ``ner_tags`` use the canonical tag spellings.

    The input DataFrame is left untouched, so the function can be re-run safely
    and can be applied to a slice of another DataFrame.

    Args:
        dataset: pandas DataFrame with a ``'ner_tags'`` column of tag lists.

    Returns:
        A new DataFrame with the same columns; tags listed in the mapping are
        replaced, all other tags are kept as they are.
    """
    # Known misspellings / variants -> canonical tag
    label_mapping = {
        'B-PROD': 'B-Product',
        'I-PROD': 'I-Product',    # counterpart of 'B-PROD'
        'B-PRODUCT': 'B-Product',
        'I-PRODUCT': 'I-Product',
        'B-Price': 'B-PRICE',
        'I-Price': 'I-PRICE',
        'IO': 'O',
    }

    corrected_tags = dataset['ner_tags'].apply(
        lambda tags: [label_mapping.get(tag, tag) for tag in tags]
    )

    # assign() builds a new frame instead of writing into the caller's object
    return dataset.assign(ner_tags=corrected_tags)


In [17]:
# Normalise the training tags, then convert the result to a Hugging Face Dataset.
# NOTE(review): `train_dataset` is rebound here from a pandas DataFrame to a `Dataset`,
# so re-running this cell without re-running the split cell hands a `Dataset`
# (not a DataFrame) to map_labels.
train_df = map_labels(train_dataset)
train_dataset = Dataset.from_pandas(train_df)

# Verify the label counts after remapping
label_counts = count_labels(train_df)

Label: O, Count: 237
Label: B-Product, Count: 11
Label: I-Product, Count: 281
Label: B-LOC, Count: 27
Label: I-LOC, Count: 257
Label: B-PRICE, Count: 19
Label: I-PRICE, Count: 36


In [18]:
# Normalise the validation tags, then convert the result to a Hugging Face Dataset.
# NOTE(review): `val_dataset` is rebound here from a pandas DataFrame to a `Dataset`,
# so re-running this cell without re-running the split cell hands a `Dataset`
# (not a DataFrame) to map_labels.
val_df = map_labels(val_dataset)
val_dataset = Dataset.from_pandas(val_df)

# Verify the label counts after remapping (overwrites `label_counts` from the training cell)
label_counts = count_labels(val_df)

Label: B-Product, Count: 6
Label: I-Product, Count: 123
Label: O, Count: 97
Label: B-PRICE, Count: 8
Label: I-PRICE, Count: 18
Label: B-LOC, Count: 11
Label: I-LOC, Count: 80


In [19]:
"""
# Function to save the dataset to storage
def save_dataset(dataset, file_path):
    
    Save the modified dataset to a specified file path in CSV format.
    
    Args:
        dataset (pd.DataFrame): The DataFrame containing the dataset.
        file_path (str): The file path where the dataset will be saved.
    
    dataset.to_csv(file_path, index=False)
    print(f"Dataset saved to {file_path}")

# Save the mapped dataset to a CSV file
save_dataset(conll_df, "preprocessed_conll_data.txt")

"""

'\n# Function to save the dataset to storage\ndef save_dataset(dataset, file_path):\n\n    Save the modified dataset to a specified file path in CSV format.\n\n    Args:\n        dataset (pd.DataFrame): The DataFrame containing the dataset.\n        file_path (str): The file path where the dataset will be saved.\n\n    dataset.to_csv(file_path, index=False)\n    print(f"Dataset saved to {file_path}")\n\n# Save the mapped dataset to a CSV file\nsave_dataset(conll_df, "preprocessed_conll_data.txt")\n\n'

### List Models and Labels

In [20]:
"""
# List of entity labels 
label_list = ['O', 'B-Product', 'I-Product', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC']

# Define models for comparison
models = [
    "xlm-roberta-base",  
    "bert-base-multilingual-cased",  
    "distilbert-base-multilingual-cased"  
]

# Compare models
results = compare_models(models, dataset, label_list)

"""

'\n# List of entity labels \nlabel_list = [\'O\', \'B-Product\', \'I-Product\', \'B-PRICE\', \'I-PRICE\', \'B-LOC\', \'I-LOC\']\n\n# Define models for comparison\nmodels = [\n    "xlm-roberta-base",  \n    "bert-base-multilingual-cased",  \n    "distilbert-base-multilingual-cased"  \n]\n\n# Compare models\nresults = compare_models(models, dataset, label_list)\n\n'

### Print Comparison Results

In [21]:
"""
# Print out comparison results
for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {result['eval_accuracy']}")
    print(result['eval_report'])

"""

'\n# Print out comparison results\nfor model_name, result in results.items():\n    print(f"Model: {model_name}")\n    print(f"Accuracy: {result[\'eval_accuracy\']}")\n    print(result[\'eval_report\'])\n\n'

In [22]:
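# NOTE: an API key was stored here in plain text and has been redacted. Revoke that key and load secrets from an environment variable instead of keeping them in the notebook.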

### Fine-tune a Single Model at a Time

In [23]:
# Function to train and evaluate one model
def run_single_model(model_name, train_dataset, val_dataset, label_list):
    """Fine-tune one model, print its accuracy and report, and return the metrics.

    Args:
        model_name: Model identifier forwarded to ``train_and_evaluate_model``.
        train_dataset: Training data forwarded unchanged.
        val_dataset: Validation data forwarded unchanged.
        label_list: Entity labels forwarded unchanged.

    Returns:
        The result of ``train_and_evaluate_model``; it must contain the keys
        ``'eval_accuracy'`` and ``'eval_report'``.
    """
    metrics = train_and_evaluate_model(model_name, train_dataset, val_dataset, label_list)

    # Short summary of the run
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['eval_accuracy']}")
    print(metrics['eval_report'])

    return metrics


In [24]:
def train_and_evaluate_model(model_name, train_dataset, val_dataset, label_list, batch_size=16, epochs=15):
    """Fine-tune ``model_name`` for token classification and return the evaluation metrics.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        train_dataset: Dataset exposing ``.map``; used for training.
        val_dataset: Dataset exposing ``.map``; used for evaluation.
        label_list: Entity labels; only its length is used (``num_labels``).
        batch_size: Per-device batch size for training and evaluation.
        epochs: Number of training epochs.

    Returns:
        The dictionary returned by ``trainer.evaluate()``.
    """
    print(f"Training model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

    def encode_batch(batch):
        # Tokenize one batch and align its word-level tags to sub-word tokens
        return tokenize_and_align_labels(batch, tokenizer)

    training_dataset = train_dataset.map(encode_batch, batched=True)
    evaluation_dataset = val_dataset.map(encode_batch, batched=True)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    # "/" in a model id is replaced so the output folders stay flat
    training_args = TrainingArguments(
        output_dir=f"./results_{model_name.replace('/', '-')}",
        logging_dir=f"./logs_{model_name.replace('/', '-')}",
        logging_steps=50,
        eval_strategy="epoch",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=2e-5,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=training_dataset,
        eval_dataset=evaluation_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    metrics = trainer.evaluate()
    print(f"Evaluation results for {model_name}:", metrics)
    return metrics

In [27]:
# Entity labels; only the length of this list is used downstream (as num_labels).
# The order shown here matches the ids in label_to_id.
label_list = ['O', 'B-Product', 'I-Product', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC']

# Define the model to run
model_name = "distilbert-base-multilingual-cased"

# Run and evaluate the model
# NOTE(review): the output recorded below shows download time-outs followed by an OSError.
eval_result = run_single_model(model_name, train_dataset, val_dataset, label_list)


Training model: distilbert-base-multilingual-cased


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/pytorch_model.bin: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


OSError: distilbert-base-multilingual-cased does not appear to have a file named pytorch_model.bin but there is a file for TensorFlow weights. Use `from_tf=True` to load this model from those weights.

In [None]:
# Manual recovery step: remove the local Hugging Face hub cache so that an interrupted
# download is fetched again on the next run. This deletes every cached model.
# Assumes the cache is in its default location under the user's home directory.
hf_hub_cache = Path.home() / ".cache" / "huggingface" / "hub"
shutil.rmtree(hf_hub_cache, ignore_errors=True)