In [3]:
import zipfile
import os

def unzip_zip_file(file_path):
    # Check if the file is a .zip file
    if not file_path.endswith('.zip'):
        print("The specified file is not a .zip file.")
        return

    # Get the folder name without the .zip extension
    folder_name = os.path.splitext(file_path)[0]

    # Create the folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # Extract the .zip file contents to the folder
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(folder_name)

    print(f"Extracted to {folder_name}")

# Example usage
unzip_zip_file('/content/Data.zip')


Extracted to /content/Data


In [4]:
!pip install transformers datasets
import json
import os
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Step 1: Load Data
def load_data(folder_path):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                json_data = json.load(f)
                for i in range(len(json_data['text'])):
                    text = json_data['text'][i]
                    entities = json_data['entities']
                    data.append({"text": text, "entities": entities})
    return data

data = load_data('/content/Data')

# Define the label map
label_map = {
    'BANK': 0,
    'ORG': 1,
    'PERSON': 2,
    'OFFICIAL': 3,
    'NATIONALITY': 4,
    'COUNTRY': 5,
    'MEDIA': 6,
    'FINANCIAL_INSTRUMENT': 7,
    'TIME': 8,
    'QUNATITY_OR_UNIT': 9,
    'GOVERNMENT_ENTITY': 10,
    'CORP': 11,
    'PRODUCT_OR_SERVICE': 12,
    'STOCK_EXCHANGE': 13,
    'CURRENCY': 14,
    'ROLE': 15,
    'GPE': 16,
    'CITY': 17,
    'FinMarket': 18,
    'Metrics': 19,
    'Events': 20,
}
reverse_label_map = {v: k for k, v in label_map.items()}

# Step 2: Update Tokenizer and Model to use CAMeL
tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-ner")
model = AutoModelForTokenClassification.from_pretrained(
    "CAMeL-Lab/bert-base-arabic-camelbert-msa-ner",
    num_labels=len(label_map),
    ignore_mismatched_sizes=True
)

# Step 3: Tokenize Data and Align Labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['text'], truncation=True, padding='max_length', max_length=512, return_offsets_mapping=True
    )

    labels = []
    for i, entities in enumerate(examples['entities']):
        label_ids = [-100] * len(tokenized_inputs['input_ids'][i])  # Initialize with -100 for ignored labels

        for entity in entities:
            if entity['type'] not in label_map:
                continue  # Skip unknown entity types

            start, end = entity['start'], entity['end']
            word_ids = tokenized_inputs['offset_mapping'][i]

            for j, (start_offset, end_offset) in enumerate(word_ids):
                if start_offset is None or end_offset is None:
                    continue  # Skip special tokens (e.g., [CLS], [SEP])
                if start_offset >= start and end_offset <= end:
                    label_ids[j] = label_map[entity['type']]

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs




Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-msa-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-msa-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([21, 768]) in the model inst

In [5]:
# Create a Dataset and split it for training and evaluation
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Tokenize the datasets
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=["text", "entities"])

# Compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Convert labels and predictions to the original label names, ignoring special tokens
    true_labels = [[reverse_label_map[l] for l in label if l != -100] for label in labels]
    true_preds = [[reverse_label_map[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]

    # Flatten the lists for metric calculations
    true_labels_flat = [item for sublist in true_labels for item in sublist]
    true_preds_flat = [item for sublist in true_preds for item in sublist]

    # Calculate accuracy and F1 scores
    accuracy = accuracy_score(true_labels_flat, true_preds_flat)
    f1 = f1_score(true_labels_flat, true_preds_flat, average='weighted')

    # Print classification report
    report = classification_report(true_labels_flat, true_preds_flat)
    print(report)

    return {
        'accuracy': accuracy,
        'f1': f1
    }

# Step 4: Training Configuration
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    save_steps=500
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

# Step 5: Training and Evaluation
trainer.train()
trainer.evaluate()

# Save Model and Tokenizer
model.save_pretrained('/content/n')
tokenizer.save_pretrained('/content/n')

Map:   0%|          | 0/23607 [00:00<?, ? examples/s]

Map:   0%|          | 0/2624 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0907,0.384907,0.903798,0.902698
2,0.9639,0.34095,0.911838,0.909329
3,0.6688,0.336438,0.913936,0.911637


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                      precision    recall  f1-score   support

                BANK       0.29      0.17      0.21       749
                CITY       1.00      0.99      0.99      2814
                CORP       0.97      0.94      0.96      8146
             COUNTRY       0.89      0.85      0.87      2713
            CURRENCY       0.97      0.57      0.72       803
              Events       0.96      0.98      0.97     24470
FINANCIAL_INSTRUMENT       0.00      0.00      0.00       131
           FinMarket       0.00      0.00      0.00        62
   GOVERNMENT_ENTITY       0.24      0.24      0.24       187
                 GPE       0.42      0.26      0.32       289
               MEDIA       1.00      0.99      0.99      1920
             Metrics       0.21      0.03      0.06       174
         NATIONALITY       0.24      0.18      0.21       316
            OFFICIAL       0.53      0.67      0.59       310
                 ORG       0.00      0.00      0.00        44
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                      precision    recall  f1-score   support

                BANK       0.31      0.20      0.25       749
                CITY       1.00      0.99      0.99      2814
                CORP       0.96      0.95      0.96      8146
             COUNTRY       0.86      0.86      0.86      2713
            CURRENCY       0.90      0.58      0.70       803
              Events       0.96      0.98      0.97     24470
FINANCIAL_INSTRUMENT       0.22      0.02      0.03       131
           FinMarket       0.00      0.00      0.00        62
   GOVERNMENT_ENTITY       0.33      0.35      0.34       187
                 GPE       0.48      0.25      0.33       289
               MEDIA       1.00      0.99      0.99      1920
             Metrics       0.30      0.09      0.14       174
         NATIONALITY       0.25      0.28      0.27       316
            OFFICIAL       0.70      0.68      0.69       310
                 ORG       0.00      0.00      0.00        44
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                      precision    recall  f1-score   support

                BANK       0.36      0.18      0.24       749
                CITY       1.00      0.99      0.99      2814
                CORP       0.96      0.95      0.96      8146
             COUNTRY       0.88      0.86      0.87      2713
            CURRENCY       0.83      0.60      0.69       803
              Events       0.96      0.98      0.97     24470
FINANCIAL_INSTRUMENT       0.39      0.12      0.19       131
           FinMarket       0.00      0.00      0.00        62
   GOVERNMENT_ENTITY       0.33      0.36      0.34       187
                 GPE       0.48      0.28      0.35       289
               MEDIA       1.00      0.99      0.99      1920
             Metrics       0.28      0.12      0.17       174
         NATIONALITY       0.26      0.23      0.24       316
            OFFICIAL       0.67      0.68      0.68       310
                 ORG       0.36      0.20      0.26        44
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                      precision    recall  f1-score   support

                BANK       0.36      0.18      0.24       749
                CITY       1.00      0.99      0.99      2814
                CORP       0.96      0.95      0.96      8146
             COUNTRY       0.88      0.86      0.87      2713
            CURRENCY       0.83      0.60      0.69       803
              Events       0.96      0.98      0.97     24470
FINANCIAL_INSTRUMENT       0.39      0.12      0.19       131
           FinMarket       0.00      0.00      0.00        62
   GOVERNMENT_ENTITY       0.33      0.36      0.34       187
                 GPE       0.48      0.28      0.35       289
               MEDIA       1.00      0.99      0.99      1920
             Metrics       0.28      0.12      0.17       174
         NATIONALITY       0.26      0.23      0.24       316
            OFFICIAL       0.67      0.68      0.68       310
                 ORG       0.36      0.20      0.26        44
       

('/content/n/tokenizer_config.json',
 '/content/n/special_tokens_map.json',
 '/content/n/vocab.txt',
 '/content/n/added_tokens.json',
 '/content/n/tokenizer.json')