In [None]:
import zipfile
import os

def unzip_zip_file(file_path):
    """Extract a .zip archive into a folder named after the archive.

    The destination is ``file_path`` with its extension stripped, e.g.
    ``/content/Data.zip`` is extracted into ``/content/Data``.

    Args:
        file_path: Path (``str`` or ``os.PathLike``) of the archive. The
            extension check is case-insensitive, so ``Data.ZIP`` is accepted.

    Returns:
        The extraction folder path as a string, or ``None`` when ``file_path``
        does not have a ``.zip`` extension (a message is printed in that case).

    Raises:
        zipfile.BadZipFile: If the file has a .zip extension but is not a
            valid archive.
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Accept pathlib.Path and friends as well as plain strings.
    file_path = os.fspath(file_path)

    # Compare case-insensitively so archives such as "Data.ZIP" are not rejected.
    if not file_path.lower().endswith('.zip'):
        print("The specified file is not a .zip file.")
        return None

    # Get the folder name without the .zip extension
    folder_name = os.path.splitext(file_path)[0]

    # exist_ok=True replaces the separate exists() check and its
    # check-then-create race.
    os.makedirs(folder_name, exist_ok=True)

    # Extract the .zip file contents to the folder
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(folder_name)

    print(f"Extracted to {folder_name}")
    return folder_name

# Example usage: unpack the dataset archive. The contents land in a folder named
# after the archive without its extension (here /content/Data), which is the
# folder the data-loading cell reads from.
unzip_zip_file('/content/Data.zip')

Extracted to /content/Data


In [None]:
!pip install transformers datasets
import json
import os
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, accuracy_score, f1_score

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import os
# Set the Weights & Biases environment switches before the Trainer is created.
# NOTE(review): the recorded Trainer output reports that WANDB_DISABLED is
# deprecated and suggests using report_to (e.g. "none") instead — consider
# moving this to TrainingArguments.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"

# Step 1: Load Data
def load_data(folder_path):
    """Read every ``.json`` file in ``folder_path`` into a flat list of examples.

    Each file must be a JSON object with a ``text`` key and an ``entities``
    key. One ``{"text": ..., "entities": ...}`` record is produced per text.

    Args:
        folder_path: Directory containing the ``.json`` annotation files.
            Files with any other extension are ignored.

    Returns:
        A list of dicts with keys ``"text"`` and ``"entities"``, ordered by
        file name and then by position of the text inside the file.

    Raises:
        KeyError: If a file lacks the ``text`` key, or has texts but lacks
            the ``entities`` key.
        json.JSONDecodeError: If a file is not valid JSON.
    """
    data = []
    # os.listdir returns names in arbitrary order; sort so the example order
    # (and therefore any seeded split made from it) is the same on every
    # machine.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        texts = json_data['text']
        # A bare string would otherwise be iterated character by character,
        # yielding one example per character; treat it as a single document.
        if isinstance(texts, str):
            texts = [texts]

        for text in texts:
            # NOTE(review): every text from a file receives that file's
            # complete `entities` list (it is not indexed per text). Confirm
            # this matches the dataset schema; if entities are stored per
            # text, they should be paired by index instead.
            entities = json_data['entities']
            data.append({"text": text, "entities": entities})
    return data

# Load all annotated examples from the extracted dataset folder into memory as
# a list of {"text", "entities"} records.
data = load_data('/content/Data')

# Define the label map
# Entity type name -> integer class id. Ids are assigned by position in the
# list below, so the order must not change once a model has been trained.
# NOTE(review): 'QUNATITY_OR_UNIT' is kept exactly as spelled; presumably it
# matches the spelling used in the dataset annotations — confirm before fixing.
label_map = {
    entity_type: class_id
    for class_id, entity_type in enumerate([
        'BANK',
        'ORG',
        'PERSON',
        'OFFICIAL',
        'NATIONALITY',
        'COUNTRY',
        'MEDIA',
        'FINANCIAL_INSTRUMENT',
        'TIME',
        'QUNATITY_OR_UNIT',
        'GOVERNMENT_ENTITY',
        'CORP',
        'PRODUCT_OR_SERVICE',
        'STOCK_EXCHANGE',
        'CURRENCY',
        'ROLE',
        'GPE',
        'CITY',
        'FinMarket',
        'Metrics',
        'Events',
    ])
}

# Inverse lookup: integer class id -> entity type name.
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))

In [None]:
# Step 2: Load Wojood Tokenizer and Model
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Tokenizer shipped with the checkpoint; it is used later with
# return_offsets_mapping=True to align character spans to tokens.
tokenizer = AutoTokenizer.from_pretrained("SinaLab/ArabicWojood-FlatNER")

# Load the checkpoint with a classification head sized for our label set.
# id2label / label2id are stored in the model config so the saved model reports
# entity names instead of generic "LABEL_n" ids at inference time.
# ignore_mismatched_sizes=True lets the head be re-created when its shape
# differs from the checkpoint's.
# NOTE(review): the recorded output of this cell lists embedding and encoder
# weights (not only the classifier) as "newly initialized", which suggests the
# checkpoint's parameter names do not match BertForTokenClassification and the
# pretrained weights may not be loaded at all — verify before trusting results.
model = AutoModelForTokenClassification.from_pretrained(
    "SinaLab/ArabicWojood-FlatNER",
    num_labels=len(label_map),
    id2label=reverse_label_map,
    label2id=label_map,
    ignore_mismatched_sizes=True
)

# Step 3: Tokenize Data and Align Labels
def tokenize_and_align_labels(examples):
    """Tokenize a batch of texts and build token-level label ids from entity spans.

    Args:
        examples: A batch dict with ``'text'`` (list of strings) and
            ``'entities'`` (one list per text of dicts carrying ``'type'``,
            ``'start'`` and ``'end'``; start/end are compared against the
            tokenizer's character offsets).

    Returns:
        The tokenizer output (padded/truncated to 512 tokens, including
        ``offset_mapping``) with an added ``'labels'`` list. A token gets the
        entity's class id when its character span lies fully inside the
        entity span; every other position is ``-100``.
    """
    tokenized_inputs = tokenizer(
        examples['text'], truncation=True, padding='max_length', max_length=512, return_offsets_mapping=True
    )

    labels = []
    for i, entities in enumerate(examples['entities']):
        # Character (start, end) span of every token in this example; fetched
        # once per example rather than once per entity.
        offsets = tokenized_inputs['offset_mapping'][i]

        # Initialize with -100 for ignored labels.
        # NOTE(review): tokens outside every entity stay at -100, so there is
        # no "outside"/O class in label_map for the model to learn — confirm
        # this is intended.
        label_ids = [-100] * len(tokenized_inputs['input_ids'][i])

        for entity in entities:
            label_id = label_map.get(entity['type'])
            if label_id is None:
                continue  # Skip unknown entity types

            start, end = entity['start'], entity['end']

            for j, (start_offset, end_offset) in enumerate(offsets):
                # Special and padding tokens carry an empty (0, 0) span rather
                # than None. Without this check, any entity starting at
                # character 0 would label every [CLS]/[SEP]/[PAD] position.
                if start_offset is None or end_offset is None or start_offset == end_offset:
                    continue
                if start_offset >= start and end_offset <= end:
                    label_ids[j] = label_id

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

Some weights of BertForTokenClassification were not initialized from the model checkpoint at SinaLab/ArabicWojood-FlatNER and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.Laye

In [None]:
# Wrap the in-memory records in a Hugging Face Dataset, then hold out 10% for
# evaluation. The seed makes the split repeatable for a fixed record order.
# After the split, `dataset` holds the 'train' and 'test' parts used below.
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Tokenize both splits in batches and attach the aligned labels; the raw
# "text" and "entities" columns are dropped once they have been consumed.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=["text", "entities"])

# Compute metrics function
def compute_metrics(pred):
    """Compute token-level accuracy and weighted F1, and print a per-class report.

    Args:
        pred: Prediction object supplied by the Trainer, exposing
            ``predictions`` (class scores per token) and ``label_ids``
            (gold class ids, with ``-100`` at ignored positions).

    Returns:
        A dict with keys ``'accuracy'`` and ``'f1'`` (weighted average).

    Side effects:
        Prints the scikit-learn classification report on every call.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Convert ids to label names, keeping only positions whose gold label is
    # not the ignore index (-100). Both lists are built flat in one pass.
    true_labels_flat = []
    true_preds_flat = []
    for pred_row, label_row in zip(preds, labels):
        for pred_id, label_id in zip(pred_row, label_row):
            if label_id == -100:
                continue
            true_labels_flat.append(reverse_label_map[label_id])
            true_preds_flat.append(reverse_label_map[pred_id])

    # zero_division=0 yields the same scores as the default (an undefined
    # precision/recall counts as 0) but does not emit an
    # UndefinedMetricWarning for every class the model never predicts.
    accuracy = accuracy_score(true_labels_flat, true_preds_flat)
    f1 = f1_score(true_labels_flat, true_preds_flat, average='weighted', zero_division=0)

    # Print classification report
    report = classification_report(true_labels_flat, true_preds_flat, zero_division=0)
    print(report)

    return {
        'accuracy': accuracy,
        'f1': f1
    }

Map:   0%|          | 0/23607 [00:00<?, ? examples/s]

Map:   0%|          | 0/2624 [00:00<?, ? examples/s]

In [None]:
# Step 4: Training Configuration
# Hyperparameters for fine-tuning. Evaluation runs once per epoch, while
# checkpoints are written every 500 steps and only the 2 most recent are kept.
# NOTE(review): the recorded output warns that WANDB_DISABLED is deprecated and
# recommends report_to instead; passing report_to here would be the supported
# way to control logging integrations.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    save_steps=500
)

# Bind the model, the hyperparameters, the tokenized train/test splits and the
# metric callback. compute_metrics is invoked on each evaluation pass.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Step 5: Training and Evaluation
# Run fine-tuning, then one more evaluation pass on the held-out split.
# NOTE(review): the dict returned by trainer.evaluate() is neither stored nor
# the last expression of the cell, so it is not displayed; only the report
# printed by compute_metrics is visible.
trainer.train()
trainer.evaluate()

# Save Model and Tokenizer
# Both are written to the same folder so it can be reloaded as one checkpoint.
model.save_pretrained('/content/TestingWojood')
tokenizer.save_pretrained('/content/TestingWojood')

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.1989,1.414756,0.641662,0.568951
2,1.5915,0.87527,0.761957,0.724739
3,1.3646,0.710706,0.814884,0.79707


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                      precision    recall  f1-score   support

                BANK       0.00      0.00      0.00       328
                CITY       0.33      0.67      0.44      2203
                CORP       0.55      0.26      0.35      8298
             COUNTRY       0.34      0.01      0.01      4433
            CURRENCY       0.00      0.00      0.00       660
              Events       0.68      0.96      0.80     25856
FINANCIAL_INSTRUMENT       0.00      0.00      0.00       210
           FinMarket       0.00      0.00      0.00        57
   GOVERNMENT_ENTITY       0.00      0.00      0.00       712
                 GPE       0.00      0.00      0.00       244
               MEDIA       0.00      0.00      0.00      2324
             Metrics       0.00      0.00      0.00       207
         NATIONALITY       0.00      0.00      0.00       419
            OFFICIAL       0.24      0.09      0.13       183
                 ORG       0.00      0.00      0.00       524
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                      precision    recall  f1-score   support

                BANK       0.10      0.02      0.03       328
                CITY       0.82      0.98      0.89      2203
                CORP       0.84      0.78      0.81      8298
             COUNTRY       0.13      0.03      0.05      4433
            CURRENCY       0.00      0.00      0.00       660
              Events       0.82      0.96      0.88     25856
FINANCIAL_INSTRUMENT       0.00      0.00      0.00       210
           FinMarket       0.00      0.00      0.00        57
   GOVERNMENT_ENTITY       0.00      0.00      0.00       712
                 GPE       0.75      0.02      0.05       244
               MEDIA       0.74      0.98      0.85      2324
             Metrics       0.00      0.00      0.00       207
         NATIONALITY       0.03      0.17      0.05       419
            OFFICIAL       0.06      0.19      0.09       183
                 ORG       0.00      0.00      0.00       524
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                      precision    recall  f1-score   support

                BANK       0.13      0.05      0.07       328
                CITY       0.81      0.98      0.89      2203
                CORP       0.86      0.80      0.83      8298
             COUNTRY       0.53      0.69      0.60      4433
            CURRENCY       0.29      0.07      0.11       660
              Events       0.93      0.94      0.94     25856
FINANCIAL_INSTRUMENT       0.00      0.00      0.00       210
           FinMarket       0.00      0.00      0.00        57
   GOVERNMENT_ENTITY       0.00      0.00      0.00       712
                 GPE       0.28      0.09      0.14       244
               MEDIA       0.86      0.98      0.92      2324
             Metrics       0.09      0.02      0.03       207
         NATIONALITY       0.00      0.00      0.00       419
            OFFICIAL       0.10      0.34      0.15       183
                 ORG       0.00      0.00      0.00       524
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                      precision    recall  f1-score   support

                BANK       0.13      0.05      0.07       328
                CITY       0.81      0.98      0.89      2203
                CORP       0.86      0.80      0.83      8298
             COUNTRY       0.53      0.69      0.60      4433
            CURRENCY       0.29      0.07      0.11       660
              Events       0.93      0.94      0.94     25856
FINANCIAL_INSTRUMENT       0.00      0.00      0.00       210
           FinMarket       0.00      0.00      0.00        57
   GOVERNMENT_ENTITY       0.00      0.00      0.00       712
                 GPE       0.28      0.09      0.14       244
               MEDIA       0.86      0.98      0.92      2324
             Metrics       0.09      0.02      0.03       207
         NATIONALITY       0.00      0.00      0.00       419
            OFFICIAL       0.10      0.34      0.15       183
                 ORG       0.00      0.00      0.00       524
       

('/content/TestingWojood/tokenizer_config.json',
 '/content/TestingWojood/special_tokens_map.json',
 '/content/TestingWojood/vocab.txt',
 '/content/TestingWojood/added_tokens.json',
 '/content/TestingWojood/tokenizer.json')

In [None]:
from google.colab import files
import shutil

# Compress the folder into a zip file
# make_archive appends the '.zip' suffix itself, producing
# /content/TestingWojood.zip from the saved model folder.
shutil.make_archive('/content/TestingWojood', 'zip', '/content/TestingWojood')

# Download the zip file to your PC
# (google.colab.files is only available inside a Colab runtime.)
files.download('/content/TestingWojood.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>