In [1]:
import inspect
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
# Input location: the CSV file name and the folder that contains it
# (a relative path, so it resolves against the current working directory).
csv_file = "text-animals.csv"
folder_name = "data/trans"

Loads a CSV file of text samples and class labels, cleans it, splits it into training and test sets with the same 80/20 ratio for every class, converts the labels to numeric identifiers, and creates `Dataset` objects from the resulting frames

In [3]:
file_path = os.path.join(folder_name, csv_file)

# Column names are supplied explicitly, so a header row (if the file has one)
# is read as ordinary data; drop it together with any incomplete rows.
df = pd.read_csv(file_path, names=["text", "label"])
df = df[df["label"] != "label"].dropna().reset_index(drop=True)

print("Count of examples for each class:\n", df["label"].value_counts())

# Stratifying on the label keeps every class at the same 80/20 ratio in both
# splits. This replaces the manual "split each class, then concatenate" loop
# and no longer reuses train_df/test_df as loop temporaries.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
# A fresh RangeIndex keeps the shuffled pandas index out of the Dataset objects.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print("Size train:", train_df.shape)
print("Size test:", test_df.shape)

# Sorting the class names makes the label <-> id mapping deterministic.
labels = sorted(df["label"].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

train_df["label_id"] = train_df["label"].map(label2id)
test_df["label_id"] = test_df["label"].map(label2id)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

Count of examples for each class:
 label
dog          100
horse        100
elephant     100
butterfly    100
chicken      100
cat          100
cow          100
sheep        100
squirrel     100
spider       100
Name: count, dtype: int64
Size train: (800, 2)
Size test: (200, 2)


Loads a pre-trained DistilBERT model for text classification, tokenises the data, converts it to PyTorch format and prepares training and test datasets

In [4]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The number of output classes and both label mappings come from the label
# set built in the data-preparation cell.
classifier_config = dict(
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **classifier_config)

def tokenize_function(example):
    """Tokenize the ``text`` field of ``example`` with the notebook's tokenizer.

    Every sequence is truncated or padded to exactly 128 tokens.

    Args:
        example: mapping with a ``"text"`` entry.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(example["text"], **encode_options)

def _tokenize_for_torch(dataset):
    """Tokenize one split, expose ``label_id`` as ``labels`` and emit torch tensors."""
    prepared = dataset.map(tokenize_function, batched=True)
    prepared = prepared.rename_column("label_id", "labels")
    # Only these three columns are returned (as torch tensors) when indexing.
    prepared.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return prepared


train_dataset = _tokenize_for_torch(train_dataset)
test_dataset = _tokenize_for_torch(test_dataset)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 800/800 [00:00<00:00, 24077.87 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 162444.00 examples/s]


Configures the training parameters for the DistilBERT model, defines an accuracy metric, creates a Trainer object to train and evaluate the model, and then runs the training process

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, and the old keyword is rejected once the deprecation period ends.
# Pick whichever keyword the installed version accepts so the cell runs on both.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./trans/results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same schedule (once per epoch), as
    # load_best_model_at_end needs matching evaluation/save strategies.
    save_strategy="epoch",
    logging_dir="./trans/logs",
    logging_steps=10,
    load_best_model_at_end=True,
    # Must match the key returned by compute_metrics.
    metric_for_best_model="accuracy",
    **{_eval_strategy_arg: "epoch"},
)

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        ``{"accuracy": fraction of predictions equal to the labels}``.
    """
    scores, gold = eval_pred
    hits = np.argmax(scores, axis=-1) == gold
    return {"accuracy": np.mean(hits)}

# Wire the model, hyper-parameters, both splits and the metric callback together.
# NOTE(review): the test split is also the evaluation set used during training,
# so metrics reported on it afterwards are not from an untouched hold-out set.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Kept as the cell's last expression so the returned training summary is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1234,0.080506,0.995
2,0.0204,0.016455,1.0
3,0.0146,0.012976,1.0


TrainOutput(global_step=300, training_loss=0.36689042682449025, metrics={'train_runtime': 150.9552, 'train_samples_per_second': 15.899, 'train_steps_per_second': 1.987, 'total_flos': 79491778560000.0, 'train_loss': 0.36689042682449025, 'epoch': 3.0})

Evaluates the trained model on test data and then uses it to predict the class of an animal from the input text, outputting classification results for several examples

In [6]:
# Score the trained model on the test split and show the raw metrics.
results = trainer.evaluate(eval_dataset=test_dataset)
print("Results on the test sample:", results)

def predict_animal(text):
    """Predict the animal class for a single piece of text.

    Args:
        text: the sentence to classify.

    Returns:
        The label name looked up in ``id2label`` for the highest-scoring class.
    """
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    # Training may have moved the model to an accelerator, while freshly
    # tokenized tensors start on the CPU; put them on the model's device.
    inputs = inputs.to(model.device)

    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    prediction = torch.argmax(outputs.logits, dim=1).item()
    return id2label[prediction]

# Ad-hoc sentences for a quick sanity check of the classifier.
new_texts = [
    "A little dog was barking in the park.",
    "I saw a spider weaving its web on the porch.",
    "The picture shows a beautiful dog playing with a ball",
]

# map() is lazy, so each prediction is computed right before it is printed.
for text, animal in zip(new_texts, map(predict_animal, new_texts)):
    print(f"Текст: {text}\nPredict animal: {animal}\n")

Results on the test sample: {'eval_loss': 0.016454851254820824, 'eval_accuracy': 1.0, 'eval_runtime': 3.0034, 'eval_samples_per_second': 66.592, 'eval_steps_per_second': 8.324, 'epoch': 3.0}
Текст: A little dog was barking in the park.
Predict animal: dog

Текст: I saw a spider weaving its web on the porch.
Predict animal: spider

Текст: The picture shows a beautiful dog playing with a ball
Predict animal: dog



Saving the fine-tuned model and its tokenizer

In [7]:
# Target folder for the exported artefacts; created up front, and an already
# existing directory is not treated as an error.
save_directory = "./trans/trained_model_trans"
os.makedirs(name=save_directory, exist_ok=True)

In [8]:
# Write the trained model first, then the tokenizer, into the same directory.
for export in (trainer.save_model, tokenizer.save_pretrained):
    export(save_directory)

print(f"Saved: {save_directory}")

Saved: ./trans/trained_model_trans
