In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, Features, Sequence, Value
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from seqeval.metrics import f1_score, precision_score, recall_score
import torch
import json
import ast
from typing import List, Dict, Tuple

class AzerbaijaniNERPipeline:
    """Fine-tuning pipeline for token classification (NER) on a parquet dataset.

    The pipeline reads a parquet file with ``tokens`` and ``ner_tags`` columns,
    drops malformed rows, splits the data 80/10/10 into train/validation/test,
    aligns word-level tags with sub-word tokens and fine-tunes a Hugging Face
    token-classification model.

    Args:
        model_name: Checkpoint passed to ``AutoTokenizer`` / ``AutoModelForTokenClassification``.
        output_dir: Directory for checkpoints, logs and the final model (created if missing).
        num_labels: Number of distinct tag ids; valid ids are ``0 .. num_labels - 1``.
        max_length: Maximum number of sub-word tokens kept per example (longer inputs are truncated).
    """

    def __init__(self, model_name="bert-base-multilingual-cased", output_dir="az-ner-model",
                 num_labels=25, max_length=512):
        self.model_name = model_name
        self.output_dir = output_dir
        self.max_length = max_length
        os.makedirs(self.output_dir, exist_ok=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.initialize_label_mappings(num_labels)

    def initialize_label_mappings(self, num_labels: int = 25):
        """Build ``label2id`` / ``id2label`` for tag ids ``0 .. num_labels - 1``.

        Label names are simply the stringified ids ("0", "1", ...).
        """
        self.label2id = {str(i): i for i in range(num_labels)}
        self.id2label = {v: k for k, v in self.label2id.items()}

    def parse_list_string(self, s: object) -> List:
        """Return the list encoded by ``s``, or ``[]`` when ``s`` is not a usable list.

        Accepts a string holding a Python list literal, or a value that is
        already a ``list`` / ``numpy.ndarray``. Anything else (``None``, NaN,
        malformed text, a literal that is not a list) yields ``[]``.
        """
        if isinstance(s, np.ndarray):
            s = s.tolist()
        if isinstance(s, list):
            return s
        if not isinstance(s, str):
            return []
        try:
            # literal_eval only evaluates literals, so untrusted text cannot run code.
            result = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []
        return result if isinstance(result, list) else []

    def _normalize_tag(self, tag) -> int:
        """Return ``tag`` as a valid label id, or 0 when it is not one.

        Only non-negative decimal ints / digit strings inside the label range
        are kept; booleans, floats, negatives and out-of-range values become 0.
        """
        if isinstance(tag, bool) or not isinstance(tag, (int, str)):
            return 0
        try:
            text = str(tag)
            value = int(text) if text.isdecimal() else -1
        except ValueError:
            # Absurdly long digit strings exceed Python's int conversion limit.
            return 0
        return value if 0 <= value < len(self.label2id) else 0

    def clean_and_validate_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Parse and validate every row of ``df``.

        A row is kept only when both ``tokens`` and ``ner_tags`` parse to
        non-empty lists of equal length. Invalid individual tags are replaced
        by 0 rather than dropping the row.

        Returns:
            A DataFrame with exactly the columns ``tokens`` and ``ner_tags``
            (present even when no row survives).

        Raises:
            KeyError: If ``df`` lacks a ``tokens`` or ``ner_tags`` column.
        """
        print("Cleaning and validating data...")

        processed_data = []
        skipped_rows = 0

        for raw_tokens, raw_tags in zip(df['tokens'], df['ner_tags']):
            tokens = self.parse_list_string(raw_tokens)
            ner_tags = self.parse_list_string(raw_tags)

            if not tokens or not ner_tags or len(tokens) != len(ner_tags):
                skipped_rows += 1
                continue

            processed_data.append({
                'tokens': tokens,
                'ner_tags': [self._normalize_tag(tag) for tag in ner_tags],
            })

        print(f"Skipped {skipped_rows} invalid rows")
        print(f"Processed {len(processed_data)} valid rows")

        # Fix the columns so an empty result still has the expected schema.
        return pd.DataFrame(processed_data, columns=['tokens', 'ner_tags'])

    def create_features(self) -> "Features":
        """Create feature descriptions for the dataset"""
        return Features({
            'tokens': Sequence(Value('string')),
            'ner_tags': Sequence(Value('int64'))
        })

    def load_dataset(self, parquet_path: str) -> "DatasetDict":
        """Load ``parquet_path``, clean it and split it into train/validation/test.

        The split is 80% train, 10% validation, 10% test with a fixed seed (42).
        Split sizes, one training sample and the training label distribution
        are printed.

        Raises:
            ValueError: If no row of the file passes validation.
        """
        print(f"Loading dataset from {parquet_path}...")

        df = pd.read_parquet(parquet_path)
        print(f"Initial dataset size: {len(df)} rows")

        processed_df = self.clean_and_validate_data(df)
        if processed_df.empty:
            raise ValueError(f"No valid rows found in {parquet_path}")

        # Create dataset with explicit feature definitions
        dataset = Dataset.from_pandas(
            processed_df,
            features=self.create_features(),
            preserve_index=False
        )

        # 80/20 first, then halve the 20% into validation and test.
        train_test = dataset.train_test_split(test_size=0.2, seed=42)
        test_valid = train_test['test'].train_test_split(test_size=0.5, seed=42)

        dataset_dict = DatasetDict({
            'train': train_test['train'],
            'validation': test_valid['train'],
            'test': test_valid['test']
        })

        print("\nDataset splits:")
        for split, ds in dataset_dict.items():
            print(f"{split} set size: {len(ds)} examples")

        print("\nSample from training set:")
        sample = dataset_dict['train'][0]
        print(f"Tokens: {sample['tokens']}")
        print(f"Tags: {sample['ner_tags']}")

        print("\nLabel distribution in training set:")
        all_labels = []
        # Read only the tag column instead of materialising every full example.
        for tags in dataset_dict['train']['ner_tags']:
            all_labels.extend(tags)
        label_counts = pd.Series(all_labels).value_counts().sort_index()
        for label, count in label_counts.items():
            print(f"Label {label}: {count} occurrences")

        return dataset_dict

    @staticmethod
    def _align_labels(word_ids, word_labels) -> List[int]:
        """Spread word-level labels over sub-word positions.

        The first sub-word of each word receives the word's label; special
        tokens (``None`` word id) and continuation sub-words receive -100.
        """
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(int(word_labels[word_idx]))
            previous_word_idx = word_idx
        return label_ids

    def tokenize_and_align_labels(self, examples):
        """Tokenize a batch of pre-split sentences and attach aligned ``labels``.

        No padding is applied here: the data collator used in ``train`` pads
        each batch to its own longest sequence, so padding every example to
        ``max_length`` up front would only waste compute and memory.
        """
        tokenized_inputs = self.tokenizer(
            examples["tokens"],
            truncation=True,
            is_split_into_words=True,
            max_length=self.max_length,
        )

        tokenized_inputs["labels"] = [
            self._align_labels(tokenized_inputs.word_ids(batch_index=i), word_labels)
            for i, word_labels in enumerate(examples["ner_tags"])
        ]
        return tokenized_inputs

    def _to_seqeval_tag(self, label_id) -> str:
        """Convert a numeric label id into a tag string seqeval can parse.

        seqeval reads the first character of a tag as the chunk marker and the
        remainder as the entity type, so bare ids such as "17" are misread.
        NOTE(review): assumes id 0 is the "outside" class (it is the fallback
        for invalid tags in this pipeline) - confirm against the dataset card.
        """
        label_id = int(label_id)
        return "O" if label_id == 0 else f"I-{self.id2label[label_id]}"

    def compute_metrics(self, eval_preds):
        """Compute entity-level precision, recall and F1 with seqeval.

        Positions labelled -100 (special tokens and continuation sub-words)
        are excluded from both predictions and references.
        """
        predictions, labels = eval_preds
        predictions = np.argmax(predictions, axis=2)

        true_predictions = []
        true_labels = []
        for prediction, label in zip(predictions, labels):
            kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
            true_predictions.append([self._to_seqeval_tag(p) for p, _ in kept])
            true_labels.append([self._to_seqeval_tag(l) for _, l in kept])

        return {
            "precision": precision_score(true_labels, true_predictions),
            "recall": recall_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions)
        }

    def train(self, dataset_dict: "DatasetDict"):
        """Fine-tune the model on ``dataset_dict`` and save it to ``output_dir``.

        Evaluates on the validation split every 100 steps and reloads the
        checkpoint with the best F1 at the end.

        Returns:
            The fitted ``Trainer``.
        """
        print("Initializing model...")
        model = AutoModelForTokenClassification.from_pretrained(
            self.model_name,
            num_labels=len(self.label2id),
            id2label=self.id2label,
            label2id=self.label2id
        )

        print("Preparing datasets...")
        tokenized_datasets = dataset_dict.map(
            self.tokenize_and_align_labels,
            batched=True,
            remove_columns=dataset_dict["train"].column_names
        )

        training_args = TrainingArguments(
            output_dir=self.output_dir,
            eval_strategy="steps",  # current name of the former evaluation_strategy argument
            eval_steps=100,
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=5,
            weight_decay=0.01,
            push_to_hub=False,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            logging_dir=os.path.join(self.output_dir, 'logs'),
            logging_steps=50,
            report_to="none"  # Disable wandb logging
        )

        print("Initializing trainer...")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            tokenizer=self.tokenizer,
            # Pads inputs and labels per batch.
            data_collator=DataCollatorForTokenClassification(self.tokenizer),
            compute_metrics=self.compute_metrics
        )

        print("Starting training...")
        trainer.train()

        print("Saving model...")
        trainer.save_model(self.output_dir)

        return trainer

def main():
    """Run the whole workflow: load the data, fine-tune, then score the test split."""
    ner = AzerbaijaniNERPipeline()
    splits = ner.load_dataset("train-00000-of-00001.parquet")
    fitted_trainer = ner.train(splits)

    print("Performing final evaluation...")
    # The test split is still raw text here, so tokenize it the same way as in training.
    held_out = splits["test"]
    tokenized_test = held_out.map(
        ner.tokenize_and_align_labels,
        batched=True,
        remove_columns=held_out.column_names,
    )
    test_results = fitted_trainer.evaluate(tokenized_test)
    print("\nFinal Test Results:", json.dumps(test_results, indent=2))


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading dataset from train-00000-of-00001.parquet...
Initial dataset size: 99545 rows
Cleaning and validating data...
Skipped 3592 invalid rows
Processed 95953 valid rows

Dataset splits:
train set size: 76762 examples
validation set size: 9595 examples
test set size: 9596 examples

Sample from training set:
Tokens: ['Axır', 'vaxta', 'qədər', 'gecikdirmək', 'namazı', 'yerli-dibli', 'qılmamağa', 'gətirib', 'çıxarda', 'bilər', '.']
Tags: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Label distribution in training set:
Label 0: 1041401 occurrences
Label 1: 43483 occurrences
Label 2: 10845 occurrences
Label 3: 50748 occurrences
Label 4: 20986 occurrences
Label 5: 1725 occurrences
Label 6: 3114 occurrences
Label 7: 1054 occurrences
Label 8: 6324 occurrences
Label 9: 6449 occurrences
Label 10: 6367 occurrences
Label 11: 3462 occurrences
Label 12: 471 occurrences
Label 13: 481 occurrences
Label 14: 28125 occurrences
Label 15: 3918 occurrences
Label 16: 4878 occurrences
Label 17: 17546 occurre

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing datasets...


Map:   0%|          | 0/76762 [00:00<?, ? examples/s]

Map:   0%|          | 0/9595 [00:00<?, ? examples/s]

Map:   0%|          | 0/9596 [00:00<?, ? examples/s]



Initializing trainer...
Starting training...


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, Features, Sequence, Value
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from seqeval.metrics import f1_score, precision_score, recall_score
import torch
import json
import ast
from typing import List, Dict, Tuple
from datetime import datetime
import logging

# Set up logging: configure the root logger at INFO level with a
# "timestamp - level - message" format, sending every record both to the
# console stream and to the file 'training.log' in the working directory.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('training.log')
    ]
)

def get_training_args(output_dir: str) -> TrainingArguments:
    """Build the ``TrainingArguments`` used for fine-tuning.

    Evaluation and checkpointing both happen every 100 steps, so the best
    checkpoint (by F1) can be reloaded at the end of training.

    Args:
        output_dir: Directory for checkpoints; logs go to ``<output_dir>/logs``.

    Returns:
        The configured ``TrainingArguments``.
    """
    return TrainingArguments(
        output_dir=output_dir,
        eval_strategy="steps",  # Updated from evaluation_strategy
        eval_steps=100,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        push_to_hub=False,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir=os.path.join(output_dir, 'logs'),
        logging_steps=50,
        report_to="none",  # Disable wandb logging
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,  # Cap the number of checkpoints kept on disk
        warmup_steps=500,
        # Mixed precision needs a CUDA device; requesting it on a CPU-only
        # machine makes TrainingArguments fail, so enable it only when available.
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=4,  # Parallelize data loading
        group_by_length=True,  # Reduce padding by grouping similar length sequences
    )

class AzerbaijaniNERPipeline:
    """Fine-tuning pipeline for token classification (NER) on a parquet dataset.

    The pipeline reads a parquet file with ``tokens`` and ``ner_tags`` columns,
    drops malformed rows, splits the data 80/10/10 into train/validation/test,
    aligns word-level tags with sub-word tokens and fine-tunes a Hugging Face
    token-classification model with early stopping.

    Args:
        model_name: Checkpoint passed to ``AutoTokenizer`` / ``AutoModelForTokenClassification``.
        output_dir: Directory for checkpoints, logs, metrics and the final model (created if missing).
        num_labels: Number of distinct tag ids; valid ids are ``0 .. num_labels - 1``.
        max_length: Maximum number of sub-word tokens kept per example (longer inputs are truncated).
    """

    def __init__(self, model_name="bert-base-multilingual-cased", output_dir="az-ner-model",
                 num_labels=25, max_length=512):
        self.model_name = model_name
        self.output_dir = output_dir
        self.max_length = max_length
        os.makedirs(self.output_dir, exist_ok=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.initialize_label_mappings(num_labels)

    def initialize_label_mappings(self, num_labels: int = 25):
        """Build ``label2id`` / ``id2label`` for tag ids ``0 .. num_labels - 1``.

        Label names are simply the stringified ids ("0", "1", ...).
        """
        self.label2id = {str(i): i for i in range(num_labels)}
        self.id2label = {v: k for k, v in self.label2id.items()}

    def parse_list_string(self, s: object) -> List:
        """Return the list encoded by ``s``, or ``[]`` when ``s`` is not a usable list.

        Accepts a string holding a Python list literal, or a value that is
        already a ``list`` / ``numpy.ndarray``. Anything else (``None``, NaN,
        malformed text, a literal that is not a list) yields ``[]``.
        """
        if isinstance(s, np.ndarray):
            s = s.tolist()
        if isinstance(s, list):
            return s
        if not isinstance(s, str):
            return []
        try:
            # literal_eval only evaluates literals, so untrusted text cannot run code.
            result = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []
        return result if isinstance(result, list) else []

    def _normalize_tag(self, tag) -> int:
        """Return ``tag`` as a valid label id, or 0 when it is not one.

        Only non-negative decimal ints / digit strings inside the label range
        are kept; booleans, floats, negatives and out-of-range values become 0.
        """
        if isinstance(tag, bool) or not isinstance(tag, (int, str)):
            return 0
        try:
            text = str(tag)
            value = int(text) if text.isdecimal() else -1
        except ValueError:
            # Absurdly long digit strings exceed Python's int conversion limit.
            return 0
        return value if 0 <= value < len(self.label2id) else 0

    def clean_and_validate_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Parse and validate every row of ``df``.

        A row is kept only when both ``tokens`` and ``ner_tags`` parse to
        non-empty lists of equal length. Invalid individual tags are replaced
        by 0 rather than dropping the row.

        Returns:
            A DataFrame with exactly the columns ``tokens`` and ``ner_tags``
            (present even when no row survives).

        Raises:
            KeyError: If ``df`` lacks a ``tokens`` or ``ner_tags`` column.
        """
        print("Cleaning and validating data...")

        processed_data = []
        skipped_rows = 0

        for raw_tokens, raw_tags in zip(df['tokens'], df['ner_tags']):
            tokens = self.parse_list_string(raw_tokens)
            ner_tags = self.parse_list_string(raw_tags)

            if not tokens or not ner_tags or len(tokens) != len(ner_tags):
                skipped_rows += 1
                continue

            processed_data.append({
                'tokens': tokens,
                'ner_tags': [self._normalize_tag(tag) for tag in ner_tags],
            })

        print(f"Skipped {skipped_rows} invalid rows")
        print(f"Processed {len(processed_data)} valid rows")

        # Fix the columns so an empty result still has the expected schema.
        return pd.DataFrame(processed_data, columns=['tokens', 'ner_tags'])

    def create_features(self) -> "Features":
        """Create feature descriptions for the dataset"""
        return Features({
            'tokens': Sequence(Value('string')),
            'ner_tags': Sequence(Value('int64'))
        })

    def load_dataset(self, parquet_path: str) -> "DatasetDict":
        """Load ``parquet_path``, clean it and split it into train/validation/test.

        The split is 80% train, 10% validation, 10% test with a fixed seed (42).
        Split sizes, one training sample and the training label distribution
        are printed.

        Raises:
            ValueError: If no row of the file passes validation.
        """
        print(f"Loading dataset from {parquet_path}...")

        df = pd.read_parquet(parquet_path)
        print(f"Initial dataset size: {len(df)} rows")

        processed_df = self.clean_and_validate_data(df)
        if processed_df.empty:
            raise ValueError(f"No valid rows found in {parquet_path}")

        dataset = Dataset.from_pandas(
            processed_df,
            features=self.create_features(),
            preserve_index=False
        )

        # 80/20 first, then halve the 20% into validation and test.
        train_test = dataset.train_test_split(test_size=0.2, seed=42)
        test_valid = train_test['test'].train_test_split(test_size=0.5, seed=42)

        dataset_dict = DatasetDict({
            'train': train_test['train'],
            'validation': test_valid['train'],
            'test': test_valid['test']
        })

        print("\nDataset splits:")
        for split, ds in dataset_dict.items():
            print(f"{split} set size: {len(ds)} examples")

        print("\nSample from training set:")
        sample = dataset_dict['train'][0]
        print(f"Tokens: {sample['tokens']}")
        print(f"Tags: {sample['ner_tags']}")

        print("\nLabel distribution in training set:")
        all_labels = []
        # Read only the tag column instead of materialising every full example.
        for tags in dataset_dict['train']['ner_tags']:
            all_labels.extend(tags)
        label_counts = pd.Series(all_labels).value_counts().sort_index()
        for label, count in label_counts.items():
            print(f"Label {label}: {count} occurrences")

        return dataset_dict

    @staticmethod
    def _align_labels(word_ids, word_labels) -> List[int]:
        """Spread word-level labels over sub-word positions.

        The first sub-word of each word receives the word's label; special
        tokens (``None`` word id) and continuation sub-words receive -100.
        """
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(int(word_labels[word_idx]))
            previous_word_idx = word_idx
        return label_ids

    def tokenize_and_align_labels(self, examples):
        """Tokenize a batch of pre-split sentences and attach aligned ``labels``.

        No padding is applied here: the data collator used in ``train`` pads
        each batch to its own longest sequence. Padding every example to
        ``max_length`` up front would make all sequences the same length,
        wasting compute and leaving length-based batching with nothing to group.
        """
        tokenized_inputs = self.tokenizer(
            examples["tokens"],
            truncation=True,
            is_split_into_words=True,
            max_length=self.max_length,
        )

        tokenized_inputs["labels"] = [
            self._align_labels(tokenized_inputs.word_ids(batch_index=i), word_labels)
            for i, word_labels in enumerate(examples["ner_tags"])
        ]
        return tokenized_inputs

    def _to_seqeval_tag(self, label_id) -> str:
        """Convert a numeric label id into a tag string seqeval can parse.

        seqeval reads the first character of a tag as the chunk marker and the
        remainder as the entity type, so bare ids such as "17" are misread.
        NOTE(review): assumes id 0 is the "outside" class (it is the fallback
        for invalid tags in this pipeline) - confirm against the dataset card.
        """
        label_id = int(label_id)
        return "O" if label_id == 0 else f"I-{self.id2label[label_id]}"

    def compute_metrics(self, eval_preds):
        """Compute entity-level precision, recall and F1 with seqeval.

        Positions labelled -100 (special tokens and continuation sub-words)
        are excluded from both predictions and references.
        """
        predictions, labels = eval_preds
        predictions = np.argmax(predictions, axis=2)

        true_predictions = []
        true_labels = []
        for prediction, label in zip(predictions, labels):
            kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
            true_predictions.append([self._to_seqeval_tag(p) for p, _ in kept])
            true_labels.append([self._to_seqeval_tag(l) for _, l in kept])

        return {
            "precision": precision_score(true_labels, true_predictions),
            "recall": recall_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions)
        }

    def train(self, dataset_dict: "DatasetDict"):
        """Fine-tune the model with early stopping, timing and metric logging.

        Saves the model, the tokenizer and ``training_metrics.json`` into
        ``output_dir``.

        Returns:
            The fitted ``Trainer``.
        """
        logging.info("Initializing model...")
        model = AutoModelForTokenClassification.from_pretrained(
            self.model_name,
            num_labels=len(self.label2id),
            id2label=self.id2label,
            label2id=self.label2id
        )

        logging.info("Preparing datasets...")
        tokenized_datasets = dataset_dict.map(
            self.tokenize_and_align_labels,
            batched=True,
            remove_columns=dataset_dict["train"].column_names,
            num_proc=4  # Parallelize preprocessing
        )

        training_args = get_training_args(self.output_dir)

        # Stop once the tracked metric has failed to improve by at least 0.01
        # for 3 consecutive evaluations.
        callbacks = [
            EarlyStoppingCallback(
                early_stopping_patience=3,
                early_stopping_threshold=0.01
            )
        ]

        logging.info("Initializing trainer...")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            tokenizer=self.tokenizer,
            # Pads inputs and labels per batch.
            data_collator=DataCollatorForTokenClassification(self.tokenizer),
            compute_metrics=self.compute_metrics,
            callbacks=callbacks
        )

        logging.info("Training parameters:")
        logging.info(f"Number of training examples: {len(tokenized_datasets['train'])}")
        logging.info(f"Number of validation examples: {len(tokenized_datasets['validation'])}")
        logging.info(f"Training batch size: {training_args.per_device_train_batch_size}")
        logging.info(f"Number of epochs: {training_args.num_train_epochs}")

        start_time = datetime.now()
        logging.info(f"Starting training at {start_time}")

        train_result = trainer.train()

        end_time = datetime.now()
        training_time = end_time - start_time
        logging.info(f"Training completed at {end_time}")
        logging.info(f"Total training time: {training_time}")

        metrics = train_result.metrics
        logging.info(f"Training metrics: {metrics}")

        logging.info("Saving model, tokenizer, and metrics...")
        trainer.save_model(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

        metrics_path = os.path.join(self.output_dir, "training_metrics.json")
        with open(metrics_path, "w") as f:
            json.dump(metrics, f, indent=2)

        return trainer

def main():
    """Run the whole workflow and persist the test metrics.

    Loads the data, fine-tunes the model, evaluates on the test split and
    writes the results to ``test_results.json`` in the pipeline's output
    directory. Any exception is logged with its traceback and re-raised.
    """
    try:
        ner = AzerbaijaniNERPipeline()
        splits = ner.load_dataset("train-00000-of-00001.parquet")
        fitted_trainer = ner.train(splits)

        logging.info("Performing final evaluation...")
        # The test split is still raw text here, so tokenize it the same way as in training.
        held_out = splits["test"]
        tokenized_test = held_out.map(
            ner.tokenize_and_align_labels,
            batched=True,
            remove_columns=held_out.column_names,
        )
        test_results = fitted_trainer.evaluate(tokenized_test)
        logging.info(f"\nFinal Test Results: {json.dumps(test_results, indent=2)}")

        results_path = os.path.join(ner.output_dir, "test_results.json")
        with open(results_path, "w") as out_file:
            json.dump(test_results, out_file, indent=2)
    except Exception as e:
        logging.error(f"Error occurred during training: {str(e)}", exc_info=True)
        raise


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading dataset from train-00000-of-00001.parquet...
Initial dataset size: 99545 rows
Cleaning and validating data...
Skipped 3592 invalid rows
Processed 95953 valid rows

Dataset splits:
train set size: 76762 examples
validation set size: 9595 examples
test set size: 9596 examples

Sample from training set:
Tokens: ['Axır', 'vaxta', 'qədər', 'gecikdirmək', 'namazı', 'yerli-dibli', 'qılmamağa', 'gətirib', 'çıxarda', 'bilər', '.']
Tags: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Label distribution in training set:
Label 0: 1041401 occurrences
Label 1: 43483 occurrences
Label 2: 10845 occurrences
Label 3: 50748 occurrences
Label 4: 20986 occurrences
Label 5: 1725 occurrences
Label 6: 3114 occurrences
Label 7: 1054 occurrences
Label 8: 6324 occurrences
Label 9: 6449 occurrences
Label 10: 6367 occurrences
Label 11: 3462 occurrences
Label 12: 471 occurrences
Label 13: 481 occurrences
Label 14: 28125 occurrences
Label 15: 3918 occurrences
Label 16: 4878 occurrences
Label 17: 17546 occurre

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=4):   0%|          | 0/76762 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/9595 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/9596 [00:00<?, ? examples/s]

