In [10]:
import pandas as pd
# Read the raw intent-detection CSV into a DataFrame and preview its first rows
df = pd.read_csv('Intent_detection_Data(Hoja1).csv', encoding='latin1')
print(df.head())

# Give every distinct intent an integer id (ids follow the order returned by
# .unique()) so the labels can be used as classification targets
intent_names = df["INTENT"].unique()
label2id = dict(zip(intent_names, range(len(intent_names))))
# Inverse lookup: integer id -> intent name
id2label = dict(enumerate(label2id))
# New "label" column holding the integer id of each row's intent
df["label"] = df["INTENT"].map(label2id)


                                             REQUEST            INTENT
0  Good day! Are there any single phase equivalen...  CatalogSelection
1  Good day! I would like to ask for the differen...  CatalogSelection
2               hi do we have a current transformer?  CatalogSelection
3  Hi, may I ask if these items are still active,...  CatalogSelection
4  I am looking for a magnetic contactor for LC1K...  CatalogSelection


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on the intent column keeps
# the class proportions comparable across both splits; the fixed seed makes the
# split repeatable.
request_texts = df["REQUEST"]
intent_ids = df["label"]
split_parts = train_test_split(
    request_texts,
    intent_ids,
    test_size=0.2,
    stratify=df["INTENT"],
    random_state=42,
)
train_texts, val_texts, train_labels, val_labels = split_parts

In [12]:
# Sanity check: the smallest and largest training label ids should lie in [0, num_labels - 1]
lowest_label_id, highest_label_id = min(train_labels), max(train_labels)
print(lowest_label_id, highest_label_id)


0 3


In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Single source of truth for the checkpoint so the tokenizer and the model cannot drift apart
MODEL_CHECKPOINT = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Load the pretrained backbone with a fresh classification head sized to the number of intents.
# The intent <-> id mappings are stored in the model config so that predictions and saved
# checkpoints report the real intent names instead of the generic LABEL_0, LABEL_1, ...
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)



Loading checkpoint shards: 100%|██████████| 2/2 [00:42<00:00, 21.17s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Padding requires a pad token: when the tokenizer does not define one,
# reuse its end-of-sequence token for that purpose
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

In [15]:
import torch

# Use the GPU when CUDA reports one as available; otherwise fall back to the CPU
# rather than crashing
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
 

In [19]:
from datasets import Dataset

def tokenize_function(examples):
    """Tokenize the "text" entries of a batch, padding/truncating each to 512 tokens.

    Args:
        examples: mapping with a "text" key holding the raw request strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(texts, **encoding_options)

# Wrap each split's raw texts and integer labels in a Dataset, then tokenize it
# in batches with tokenize_function
train_columns = {"text": train_texts.tolist(), "label": train_labels.tolist()}
train_dataset = Dataset.from_dict(train_columns)
train_dataset = train_dataset.map(tokenize_function, batched=True)

val_columns = {"text": val_texts.tolist(), "label": val_labels.tolist()}
val_dataset = Dataset.from_dict(val_columns)
val_dataset = val_dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 128/128 [00:00<00:00, 1712.58 examples/s]
Map: 100%|██████████| 32/32 [00:00<00:00, 3860.60 examples/s]


In [20]:
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader

# Manually register "[PAD]" as the tokenizer's padding token. The tokenization helper
# defined next pads with it, turning the datasets into purely numerical model inputs.
pad_token_spec = {"pad_token": "[PAD]"}
tokenizer.add_special_tokens(pad_token_spec)
def preprocess_function(examples):
    """Tokenize the "text" entries of a batch with padding and truncation enabled.

    Args:
        examples: mapping with a 'text' key holding the raw request strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    batch_texts = examples['text']
    return tokenizer(batch_texts, truncation=True, padding=True)

# Columns exposed as PyTorch tensors when the datasets are indexed
torch_columns = ['input_ids', 'attention_mask', 'label']

# Re-tokenize the training split with preprocess_function, then switch it to torch output
train_dataset = train_dataset.map(preprocess_function, batched=True)
train_dataset.set_format(type='torch', columns=list(torch_columns))

# Same two steps for the validation split
val_dataset = val_dataset.map(preprocess_function, batched=True)
val_dataset.set_format(type='torch', columns=list(torch_columns))


Map: 100%|██████████| 128/128 [00:00<00:00, 15366.39 examples/s]
Map: 100%|██████████| 32/32 [00:00<00:00, 6217.53 examples/s]


In [21]:
# Tell the model which id is padding. LlamaForSequenceClassification classifies from the
# last non-padding token of each row and locates it through config.pad_token_id; when that
# is unset it reads the final position instead (a padding token for padded inputs) and
# refuses batch sizes above 1.
model.config.pad_token_id = tokenizer.pad_token_id

# Grow the embedding matrix so the token added to the tokenizer has a row of its own
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(128257, 3072)

In [22]:
# Training arguments.
# `eval_strategy` and `use_cpu` are the current names of the deprecated
# `evaluation_strategy` and `no_cuda` options; the settings themselves are unchanged.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # run evaluation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
    use_cpu=True,  # Ensure we're using CPU (or adjust as needed)
)



In [23]:
# Use PyTorch's AdamW: the copy exported by `transformers` is deprecated in favour of
# torch.optim.AdamW.
from torch.optim import AdamW

# Define the optimizer
optimizer = AdamW(
    model.parameters(),  # The model's parameters to optimize
    lr=2e-5,  # Learning rate
    weight_decay=0.01,  # Weight decay (helps regularize the model)
)



In [24]:
from torch.utils.data import DataLoader

# Smoke-test the DataLoader: fetch only the first batch and print it so that
# input_ids, attention_mask and label can be checked for consistency
train_dataloader = DataLoader(train_dataset, batch_size=1)
batch_iterator = iter(train_dataloader)
try:
    batch = next(batch_iterator)
except StopIteration:
    # Empty dataset: nothing to show
    pass
else:
    print(batch)


{'label': tensor([2]), 'input_ids': tensor([[128000,  85625,  13228,     12,   7261,     17,     12,  24538,   4767,
         128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256,
         128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256,
         128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256,
         128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256,
         128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256,
         128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256,
         128256, 128256, 128256, 128256, 128256, 128256, 128256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [None]:
# Initialize Trainer
from transformers import DataCollatorWithPadding

# Collator that pads each batch dynamically at collation time
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Gather everything the Trainer needs: the model, its arguments, both dataset splits,
# the hand-built optimizer (no custom LR scheduler is supplied) and the collator
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, None),
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)

# Launch fine-tuning
trainer.train()

Currently training with a batch size of: 1
The following columns in the training set don't have a corresponding argument in `LlamaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `LlamaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 128
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 640
  Number of trainable parameters = 3,212,765,184
  0%|          | 0/640 [00:00<?, ?it/s]

In [None]:
from transformers import Trainer

# Register "[PAD]" as the tokenizer's padding token, then echo it back for confirmation
pad_token_spec = {"pad_token": "[PAD]"}
tokenizer.add_special_tokens(pad_token_spec)
print("Padding token:", tokenizer.pad_token)  # Expected output: [PAD]


Padding token: [PAD]
