In [15]:
# Import necessary libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

# Step 1: Load the labeled data from a CSV file
with open('../data/labeled_data.csv', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Split lines into tokens and labels
data = []
for line in lines:
    token, label = line.strip().rsplit(' ', 1)
    data.append((token, label))

# Create a DataFrame
labeled_df = pd.DataFrame(data, columns=['token', 'label'])

# Step 2: Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(labeled_df)

# Create a dataset for training and validation
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Step 3: Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('Davlan/bert-base-multilingual-cased-finetuned-amharic')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['token'], truncation=True)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Step 4: Prepare the labels for token classification
def align_labels_with_tokens(examples):
    labels = examples['label']
    word_ids = tokenizer(examples['token'], truncation=True).word_ids()
    label_ids = [-100] * len(word_ids)  # Default to -100 for ignored tokens
    for i, word_id in enumerate(word_ids):
        if word_id is not None:
            label_ids[i] = label_mapping.get(labels[word_id], -100)  # Adjust label mapping
    examples['labels'] = label_ids
    return examples

# Create a mapping for labels
label_mapping = {label: idx for idx, label in enumerate(labeled_df['label'].unique())}

# Apply label alignment
tokenized_train = tokenized_train.map(align_labels_with_tokens, batched=True)
tokenized_test = tokenized_test.map(align_labels_with_tokens, batched=True)

# Step 5: Load the model
model = AutoModelForTokenClassification.from_pretrained(
    'Davlan/bert-base-multilingual-cased-finetuned-amharic', 
    num_labels=len(label_mapping)  # Use the length of the label mapping
)

# Step 6: Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Step 7: Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Step 8: Train the model
trainer.train()


Map:   0%|          | 0/32745 [00:00<?, ? examples/s]

Map:   0%|          | 0/8187 [00:00<?, ? examples/s]

Map:   0%|          | 0/32745 [00:00<?, ? examples/s]

ArrowInvalid: Column 5 named labels expected length 1000 but got length 3

In [16]:
# Import necessary libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments

# Step 1: Load the pre-tokenized and labeled data from the CSV
labeled_data = pd.read_csv('../data/labeled_data.csv')

# Assuming your CSV has columns 'input_ids' and 'labels'
# Convert input_ids and labels from strings to lists (if they are stored as strings in the CSV)
labeled_data['input_ids'] = labeled_data['input_ids'].apply(eval)  # Convert string representation of lists to actual lists
labeled_data['labels'] = labeled_data['labels'].apply(eval)        # Same for labels

# Step 2: Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(labeled_data)

# Create a dataset for training and validation
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Step 3: Load the model (No need for tokenizer since the data is already tokenized)
model = AutoModelForTokenClassification.from_pretrained(
    'Davlan/bert-base-multilingual-cased-finetuned-amharic', 
    num_labels=len(labeled_data['labels'].explode().unique())  # Number of unique labels
)

# Step 4: Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Step 5: Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Step 6: Train the model
trainer.train()


KeyError: 'input_ids'

In [19]:
# Import necessary libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

# Step 1: Load the data from the CSV file
labeled_df = pd.read_csv('../data/labeled_data.csv')

# Step 2: Split the "Labeled Data" column into tokens and labels
data = []
for entry in labeled_df['Labeled Data']:
    token, label = entry.split(' ')  # Assuming tokens and labels are separated by a space
    data.append((token, label))

# Create a DataFrame with 'token' and 'label' columns
labeled_df = pd.DataFrame(data, columns=['token', 'label'])

# Step 3: Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(labeled_df)

# Create a dataset for training and validation
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Step 4: Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('Davlan/bert-base-multilingual-cased-finetuned-amharic')

# Convert tokens into input_ids
def convert_to_input_ids(examples):
    # Tokenize tokens directly (tokens are already labeled)
    tokenized_inputs = tokenizer(examples['token'], truncation=True, padding='max_length', max_length=512)
    return tokenized_inputs

# Tokenize the datasets (without label alignment logic)
tokenized_train = train_dataset.map(convert_to_input_ids, batched=True)
tokenized_test = test_dataset.map(convert_to_input_ids, batched=True)

# Step 5: Load the model
label_mapping = {label: idx for idx, label in enumerate(labeled_df['label'].unique())}
model = AutoModelForTokenClassification.from_pretrained(
    'Davlan/bert-base-multilingual-cased-finetuned-amharic',
    num_labels=len(label_mapping)  # Use the length of the label mapping
)

# Step 6: Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Step 7: Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Step 8: Train the model
trainer.train()


Map:   0%|          | 0/32744 [00:00<?, ? examples/s]

Map:   0%|          | 0/8187 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: too many dimensions 'str'

In [27]:
# Import necessary libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

# Step 1: Load the data from the CSV file
labeled_df = pd.read_csv('../data/labeled_data.csv')

# Step 2: Split the "Labeled Data" column into tokens and labels
data = []
for entry in labeled_df['Labeled Data']:
    # Ensure correct splitting by handling entries where token and label are separated by space
    try:
        token, label = entry.rsplit(' ', 1)  # Split from the right, in case token contains spaces
        data.append((token, label))
    except ValueError:
        print(f"Skipping invalid entry: {entry}")

# Create a DataFrame with 'token' and 'label' columns
labeled_df = pd.DataFrame(data, columns=['token', 'label'])

# Step 3: Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(labeled_df)

# Create a dataset for training and validation
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Step 4: Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('Davlan/bert-base-multilingual-cased-finetuned-amharic')

# Convert tokens into input_ids
def convert_to_input_ids(examples):
    # Tokenize tokens directly (tokens are already labeled)
    tokenized_inputs = tokenizer(examples['token'], truncation=True, padding='max_length', max_length=512)
    return tokenized_inputs

# Tokenize the datasets (without label alignment logic)
tokenized_train = train_dataset.map(convert_to_input_ids, batched=True)
tokenized_test = test_dataset.map(convert_to_input_ids, batched=True)

# Step 5: Load the model
label_mapping = {label: idx for idx, label in enumerate(labeled_df['label'].unique())}
model = AutoModelForTokenClassification.from_pretrained(
    'Davlan/bert-base-multilingual-cased-finetuned-amharic',
    num_labels=len(label_mapping)  # Use the length of the label mapping
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Start training
trainer.train()



Map:   0%|          | 0/32744 [00:00<?, ? examples/s]

Map:   0%|          | 0/8187 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: too many dimensions 'str'

In [30]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer

# Step 1: Load the labeled data
labeled_data = pd.read_csv('../data/labeled_data.csv', encoding='utf-8')

# Check the dtype and first few rows of the 'Labeled Data' column
print("Data Type of 'Labeled Data':", labeled_data['Labeled Data'].dtype)
print(labeled_data['Labeled Data'].head())

# Ensure 'Labeled Data' is of string type
labeled_data['Labeled Data'] = labeled_data['Labeled Data'].astype(str)

# Split it into 'token' and 'label'
labeled_data[['token', 'label']] = labeled_data['Labeled Data'].str.split(' ', n=1, expand=True)

# Step 2: Create a mapping for labels (label to ID)
unique_labels = labeled_data['label'].unique()
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
id_to_label = {idx: label for idx, label in enumerate(unique_labels)}

# Step 3: Prepare the dataset for Hugging Face
train_data = labeled_data[['token', 'label']]
train_data['label'] = train_data['label'].map(label_to_id)

# Create Dataset from pandas DataFrame
train_dataset = Dataset.from_pandas(train_data)

# Step 4: Load the tokenizer
model_name = "bert-base-uncased"  # Update this to your desired model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 5: Tokenization function
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['token'].tolist(), truncation=True, padding='max_length', max_length=128)
    labels = []

    for i, label in enumerate(examples['label']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word ids
        label_ids = [-100] * len(tokenized_inputs['input_ids'])  # Default label id

        for word_id in set(word_ids):  # Set to avoid duplicates
            if word_id is None:
                continue
            label_ids[word_id] = label[i]  # Set the label ID

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Step 6: Tokenize the dataset
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)

# Step 7: Load the model
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(unique_labels))

# Step 8: Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    evaluation_strategy="epoch",     # Evaluate every epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Strength of weight decay
)

# Step 9: Create Trainer instance
trainer = Trainer(
    model=model,                        # The instantiated 🤗 Transformers model to be trained
    args=training_args,                 # Training arguments, defined above
    train_dataset=tokenized_train,      # Training dataset
)

# Step 10: Train the model
trainer.train()

# Step 11: Save the model
trainer.save_model('./trained_model')  # Save the model to disk

# Note: Now you can use the trained model for predictions and inference.


Data Type of 'Labeled Data': object
0    ተገጣጣሚዎቹን O
1       የእንጨት O
2        ብሎኮች O
3       ከመነሻዬ O
4        ሲሸምቱ O
Name: Labeled Data, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['label'] = train_data['label'].map(label_to_id)


Map:   0%|          | 0/40931 [00:00<?, ? examples/s]

AttributeError: 'list' object has no attribute 'tolist'