In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
import torch


In [27]:
# Load dataset
df = pd.read_csv('Intent_detection_Data(Hoja1).csv', encoding='latin1')

# Map intents to numerical IDs
label2id = {label: idx for idx, label in enumerate(df["INTENT"].unique())}
id2label = {idx: label for label, idx in label2id.items()}
df["label"] = df["INTENT"].map(label2id)

# Display the first few rows of the dataset
print(df.head())


                                             REQUEST            INTENT  label
0  Good day! Are there any single phase equivalen...  CatalogSelection      0
1  Good day! I would like to ask for the differen...  CatalogSelection      0
2               hi do we have a current transformer?  CatalogSelection      0
3  Hi, may I ask if these items are still active,...  CatalogSelection      0
4  I am looking for a magnetic contactor for LC1K...  CatalogSelection      0


In [28]:
# Split dataset into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["REQUEST"], df["label"], test_size=0.2, stratify=df["INTENT"], random_state=42
)


In [40]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(label2id)
)

# Handle padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


loading configuration file config.json from cache at /Users/kawtarissam/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/kawtarissam/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594

In [41]:
# Configure device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [42]:
# Create datasets from training and validation data
train_dataset = Dataset.from_dict({"REQUEST": train_texts.tolist(), "label": train_labels.tolist()})
val_dataset = Dataset.from_dict({"REQUEST": val_texts.tolist(), "label": val_labels.tolist()})


In [43]:
# Define tokenization function
def tokenize_function(examples):
    return tokenizer(examples["REQUEST"], padding="max_length", truncation=True, max_length=512)

# Apply tokenization to train and validation datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Format datasets for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map: 100%|██████████| 128/128 [00:00<00:00, 2474.95 examples/s]
Map: 100%|██████████| 32/32 [00:00<00:00, 3751.09 examples/s]


In [44]:
# Adjust the model's token embeddings to match the tokenizer
model.resize_token_embeddings(len(tokenizer))


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 30522. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(30522, 768, padding_idx=0)

In [45]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="/Users/kawtarissam/Desktop/chatbot2/my_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=100,  # Adjust as needed
    log_level="info",
    no_cuda=True
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [46]:
import evaluate
accuracy = evaluate.load("accuracy")
# Function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

In [47]:
# Define data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    
)


In [48]:
trainer.train()
results = trainer.evaluate()
print("Validation Accuracy:", results["eval_accuracy"])

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: REQUEST. If REQUEST are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 128
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 640
  Number of trainable parameters = 109,485,316
 16%|█▌        | 100/640 [03:49<20:37,  2.29s/it]
 16%|█▌        | 100/640 [03:49<20:37,  2.29s/it]

{'loss': 1.4173, 'grad_norm': 21.55948257446289, 'learning_rate': 1.6875e-05, 'epoch': 0.78}


 20%|██        | 128/640 [04:52<19:58,  2.34s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: REQUEST. If REQUEST are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 32
  Batch size = 1

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

 20%|██        | 128/640 [05:06<19:58,  2.34s/it]
[A
[A

{'eval_loss': 1.1689668893814087, 'eval_accuracy': 0.46875, 'eval_runtime': 13.664, 'eval_samples_per_second': 2.342, 'eval_steps_per_second': 2.342, 'epoch': 1.0}


 31%|███▏      | 200/640 [07:50<16:55,  2.31s/it]
 31%|███▏      | 200/640 [07:50<16:55,  2.31s/it]

{'loss': 1.0435, 'grad_norm': 10.0105562210083, 'learning_rate': 1.375e-05, 'epoch': 1.56}


 40%|████      | 256/640 [10:01<14:14,  2.22s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: REQUEST. If REQUEST are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 32
  Batch size = 1

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

 40%|████      | 256/640 [10:14<14:14,  2.22s/it]
[A
[A

{'eval_loss': 0.524774432182312, 'eval_accuracy': 0.84375, 'eval_runtime': 13.1861, 'eval_samples_per_second': 2.427, 'eval_steps_per_second': 2.427, 'epoch': 2.0}


 47%|████▋     | 300/640 [11:53<12:19,  2.18s/it]
 47%|████▋     | 300/640 [11:53<12:19,  2.18s/it]

{'loss': 0.4787, 'grad_norm': 41.179847717285156, 'learning_rate': 1.0625e-05, 'epoch': 2.34}


 60%|██████    | 384/640 [14:59<09:04,  2.13s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: REQUEST. If REQUEST are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 32
  Batch size = 1

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

 60%|██████    | 384/640 [15:13<09:04,  2.13s/it]
[A
[A

{'eval_loss': 0.40331292152404785, 'eval_accuracy': 0.84375, 'eval_runtime': 14.6067, 'eval_samples_per_second': 2.191, 'eval_steps_per_second': 2.191, 'epoch': 3.0}


 62%|██████▎   | 400/640 [15:50<08:52,  2.22s/it]
 62%|██████▎   | 400/640 [15:50<08:52,  2.22s/it]  

{'loss': 0.1821, 'grad_norm': 0.5565797090530396, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.12}


 78%|███████▊  | 500/640 [19:46<05:24,  2.32s/it]
 78%|███████▊  | 500/640 [19:46<05:24,  2.32s/it]  Saving model checkpoint to /Users/kawtarissam/Desktop/chatbot2/my_model/checkpoint-500
Configuration saved in /Users/kawtarissam/Desktop/chatbot2/my_model/checkpoint-500/config.json


{'loss': 0.1815, 'grad_norm': 0.2898178994655609, 'learning_rate': 4.3750000000000005e-06, 'epoch': 3.91}


Model weights saved in /Users/kawtarissam/Desktop/chatbot2/my_model/checkpoint-500/model.safetensors
 80%|████████  | 512/640 [20:26<06:28,  3.04s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: REQUEST. If REQUEST are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 32
  Batch size = 1

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

 80%|████████  | 512/640 [20:42<06:28,  3.04s/it]
[A
[A

{'eval_loss': 0.35099202394485474, 'eval_accuracy': 0.9375, 'eval_runtime': 15.1224, 'eval_samples_per_second': 2.116, 'eval_steps_per_second': 2.116, 'epoch': 4.0}


 94%|█████████▍| 600/640 [24:07<01:22,  2.07s/it]
 94%|█████████▍| 600/640 [24:07<01:22,  2.07s/it]  

{'loss': 0.1539, 'grad_norm': 0.26460182666778564, 'learning_rate': 1.25e-06, 'epoch': 4.69}


100%|██████████| 640/640 [25:29<00:00,  2.04s/it]Saving model checkpoint to /Users/kawtarissam/Desktop/chatbot2/my_model/checkpoint-640
Configuration saved in /Users/kawtarissam/Desktop/chatbot2/my_model/checkpoint-640/config.json
Model weights saved in /Users/kawtarissam/Desktop/chatbot2/my_model/checkpoint-640/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: REQUEST. If REQUEST are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 32
  Batch size = 1

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

100%|██████████| 640/640 [25:46<00:00,  2.04s/it]
[A
[A

Training completed. Do not forget to share your model on huggingface.co/models =)



100%|████

{'eval_loss': 0.3701702952384949, 'eval_accuracy': 0.90625, 'eval_runtime': 15.4343, 'eval_samples_per_second': 2.073, 'eval_steps_per_second': 2.073, 'epoch': 5.0}
{'train_runtime': 1546.8914, 'train_samples_per_second': 0.414, 'train_steps_per_second': 0.414, 'train_loss': 0.5407536635175347, 'epoch': 5.0}


100%|██████████| 32/32 [00:14<00:00,  2.15it/s]

Validation Accuracy: 0.90625





In [49]:
print("Validation Accuracy:", results["eval_accuracy"])

Validation Accuracy: 0.90625


In [50]:
model.save_pretrained("./my_model")  # Saving the model
tokenizer.save_pretrained("./my_model")  #  tokenizer

Configuration saved in ./my_model/config.json
Model weights saved in ./my_model/model.safetensors
tokenizer config file saved in ./my_model/tokenizer_config.json
Special tokens file saved in ./my_model/special_tokens_map.json


('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/vocab.txt',
 './my_model/added_tokens.json',
 './my_model/tokenizer.json')