<a href="https://colab.research.google.com/github/IshaqKHATTAK/Kaggle-work/blob/main/Bert_spanish_text_classificatio_version2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments


### Data analysis and loading

In [None]:
# Read the sentence/category table from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/train5q.csv')

In [None]:
# Display the loaded frame to inspect its columns and a sample of rows.
df

Unnamed: 0,sentence,category
0,delivery,delivery
1,delivery gracias,delivery
2,delivery por favor,delivery
3,enviame el pedido,delivery
4,enviame la pizza,delivery
...,...,...
735,si esta bien,yes
736,si gracias,yes
737,si perfecto,yes
738,si por favor,yes


In [None]:
# Keep only the rows that carry a category; rows whose 'category' is missing
# are discarded.
df = df.dropna(axis='index', subset=['category'])

In [None]:
# Sanity check: number of rows still missing a category (expected to be 0).
df['category'].isnull().sum()

0

### Preprocessing and model building

Preprocessing data for training

In [None]:
def preprocess_and_prepare_data(df,tokenizer):
  """Tokenize the sentences and integer-encode the categories for training.

  Args:
    df: DataFrame with a 'sentence' column (text) and a 'category' column
      (class name). The frame is only read; it is not modified.
    tokenizer: Tokenizer that is called with a list of strings and returns a
      mapping containing 'input_ids' and 'attention_mask' (one entry per
      sentence).

  Returns:
    A tuple ``(dataset, le)``: ``dataset`` is a list with one dict per row of
    ``df`` (in row order) holding 'input_ids', 'attention_mask' and 'labels'
    tensors; ``le`` is the LabelEncoder fitted on ``df['category']``, needed
    later to map predicted indices back to category names.
  """
  le = LabelEncoder()
  # Encode into a separate array instead of overwriting df['category'].
  # Writing the codes back into the caller's frame made the function
  # non-idempotent: a second call fitted the encoder on integers, so
  # `le.classes_` (and inverse_transform) no longer held the category names.
  encoded_labels = le.fit_transform(df['category'])

  # Tokenize every sentence in a single call; the same truncation and padding
  # options are applied to each sentence as before.
  encodings = tokenizer(df['sentence'].tolist(), truncation=True, padding='max_length')

  # Create dataset format for Trainer: one dict of tensors per example.
  dataset = [
      {
          'input_ids': torch.tensor(input_ids),
          'attention_mask': torch.tensor(attention_mask),
          'labels': torch.tensor(int(label), dtype=torch.long)
      }
      for input_ids, attention_mask, label in zip(
          encodings['input_ids'], encodings['attention_mask'], encoded_labels
      )
  ]
  return dataset,le


Training the model

In [None]:
def train_model(df,dataset,model):
  """Fine-tune ``model`` on ``dataset`` using a stratified 80/20 split.

  Args:
    df: No longer read; kept so existing ``train_model(df, dataset, model)``
      calls keep working. The split is stratified on the labels stored in
      ``dataset`` itself, so it cannot get out of step with a separate frame.
    dataset: List of example dicts, each with 'input_ids', 'attention_mask'
      and a scalar 'labels' entry.
    model: Model passed to ``Trainer``.

  Returns:
    The ``Trainer`` after ``train()`` has run. Because
    ``load_best_model_at_end=True`` and ``metric_for_best_model='f1'``, it
    holds the checkpoint with the best validation macro-F1.

  Raises:
    ValueError: Propagated from ``train_test_split`` when the examples cannot
      be split (for instance when no class has at least two examples).
  """
  import inspect
  import warnings
  from collections import Counter

  labels = [int(example['labels']) for example in dataset]
  label_counts = Counter(labels)

  # A stratified split needs at least two examples per class. Classes with a
  # single example are held out of the split and added to the training set,
  # instead of aborting the whole run with a ValueError.
  splittable, splittable_labels, singletons = [], [], []
  for example, label in zip(dataset, labels):
    if label_counts[label] < 2:
      singletons.append(example)
    else:
      splittable.append(example)
      splittable_labels.append(label)

  train_dataset, val_dataset = train_test_split(
    splittable, test_size=0.2, random_state=42, stratify=splittable_labels
  )

  if singletons:
    rare_labels = sorted(label for label, count in label_counts.items() if count < 2)
    warnings.warn(
      f"Labels {rare_labels} have a single example each; they are used for "
      "training only and are absent from the validation set."
    )
    train_dataset = train_dataset + singletons

  # Stop when macro-F1 has not improved by at least 0.01 for 2 evaluations.
  early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.01
  )

  def compute_metrics(pred):
    """Return macro-F1 and accuracy for one evaluation pass."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='macro')
    accuracy = accuracy_score(labels, preds)
    return {
        'f1': f1,
        'accuracy': accuracy
    }

  # Newer transformers releases name this argument `eval_strategy`; older ones
  # only know `evaluation_strategy`. Use whichever the installed version accepts.
  accepted_args = inspect.signature(TrainingArguments.__init__).parameters
  if 'eval_strategy' in accepted_args:
    eval_strategy_key = 'eval_strategy'
  else:
    eval_strategy_key = 'evaluation_strategy'

  # Define training arguments
  training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    save_total_limit=3,
    **{eval_strategy_key: "epoch"}
  )

  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
  )

  trainer.train()

  return trainer


In [None]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint; the classifier gets one output per distinct category.
checkpoint_name = 'dccuchile/bert-base-spanish-wwm-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
num_labels = df['category'].nunique()
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=num_labels)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build the list of tensor examples and the fitted label encoder.
dataset, label_encoder = preprocess_and_prepare_data(df=df, tokenizer=tokenizer)

In [None]:
# `dataset` is a plain list of example dicts, so it cannot be indexed with a
# column name (`dataset['category']` raises TypeError). Stratify on the label
# stored inside each example instead. Every class needs at least two examples
# for a stratified split to succeed.
train_dataset, val_dataset = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
    stratify=[example['labels'].item() for example in dataset],
)


In [None]:
# Pull the label out of every example as a plain Python scalar.
extracted_labels = []
for example in dataset:
    extracted_labels.append(example['labels'].item())


In [None]:
from collections import Counter

# Labels as stored in the dataset examples (tensor -> Python scalar).
dataset_labels = []
for example in dataset:
    dataset_labels.append(example['labels'].item())

# The dataset must contain exactly one example per DataFrame row.
assert len(df) == len(dataset_labels), "Length mismatch between dataset and DataFrame."

# Count how often each label occurs on both sides so they can be compared.
dataset_label_counts = Counter(dataset_labels)
df_label_counts = Counter(df['category'])

print("DataFrame Label Distribution:", df_label_counts)
print("Dataset Label Distribution:", dataset_label_counts)


DataFrame Label Distribution: Counter({6: 92, 4: 86, 13: 57, 2: 56, 8: 50, 15: 50, 5: 43, 12: 41, 1: 37, 3: 37, 10: 37, 9: 36, 0: 34, 7: 28, 11: 28, 14: 27, 16: 1})
Dataset Label Distribution: Counter({6: 92, 4: 86, 13: 57, 2: 56, 8: 50, 15: 50, 5: 43, 12: 41, 1: 37, 3: 37, 10: 37, 9: 36, 0: 34, 7: 28, 11: 28, 14: 27, 16: 1})


In [None]:
# Class names known to the encoder and the integer each one is encoded as.
classes = label_encoder.classes_
encoded_values = label_encoder.transform(classes)

# Show the mapping, one "name -> code" line per class.
print("Class -> Encoded Label")
for position in range(len(classes)):
    class_name = classes[position]
    encoded_value = encoded_values[position]
    print(f"{class_name} -> {encoded_value}")

Class -> Encoded Label
delivery -> 0
hello -> 1
hours -> 2
location -> 3
menu -> 4
no -> 5
order -> 6
pay-cash -> 7
pay-mp -> 8
pay-mp-later -> 9
pay-now -> 10
payments -> 11
pickup -> 12
status -> 13
thanks -> 14
yes -> 15
nan -> 16


In [None]:
# Keep only examples whose class occurs more than once, because a stratified
# split cannot place a single-example class in both partitions.
label_counts = Counter(dataset_labels)
filtered_dataset = []
filtered_labels = []
for example in dataset:
    example_label = example['labels'].item()
    if label_counts[example_label] > 1:
        filtered_dataset.append(example)
        filtered_labels.append(example_label)

# Stratified 80/20 split of the remaining examples.
train_dataset, val_dataset = train_test_split(
    filtered_dataset,
    stratify=filtered_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Stratified split attempted on the unfiltered dataset. The recorded output
# below shows scikit-learn rejecting it because one class has only 1 member.
train_test_split(dataset, test_size=0.2, random_state=42, stratify=extracted_labels)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
# Fine-tune the model; the returned Trainer is kept for saving afterwards.
trainer = train_model(df=df, dataset=dataset, model=model)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

Save the trained model and the label encoder so they can be reloaded later when needed.

In [None]:
# The lines below are disabled. Uncomment them once training has completed to
# write the model to ./trainer_checkpoint and the fitted label encoder to
# ./label_encoder.pkl.
# trainer.save_model("./trainer_checkpoint")

# import pickle
# with open("./label_encoder.pkl", "wb") as f:
#     pickle.dump(label_encoder, f)

### Testing

**How to test and run the model.**


1.   You need an internet connection, because the tokenizer is downloaded from the internet.
2.   Keep this file and `trainer_checkpoint` in the same directory; otherwise, set the path used to load the model from `trainer_checkpoint`.
3.   Keep the label encoder `.pkl` file in the same directory as well; otherwise, set the path for it too.
4.   Run the cell below to load the model, tokenizer, and label encoder.
5.   Run the next cell to load the prediction function.

6.   Replace the content of `sentence` with your own text, run the cell, and see the prediction.



In [None]:
# Reload the fine-tuned model, the tokenizer and the label encoder.
import pickle

# Load from ./trainer_checkpoint, the directory named in the save cell and in
# the instructions above (this cell previously pointed at ./model_save).
# Despite its name, `trainer_loaded` holds the classification model itself,
# not a Trainer.
trainer_loaded = BertForSequenceClassification.from_pretrained("./trainer_checkpoint")
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

# Reload the label encoder.
# NOTE: unpickling can execute arbitrary code; only load a .pkl file that you
# created yourself.
with open("./label_encoder.pkl", "rb") as f:
    label_encoder_loaded = pickle.load(f)

In [None]:
#prediction
def predict_intent(model,sentence,le,tokenizer):
  """Predict the category name of a single sentence.

  Args:
    model: Classification model; called with the tokenizer output as keyword
      arguments and expected to return an object with a ``logits`` attribute.
      It is switched to evaluation mode.
    sentence: Text to classify.
    le: Fitted label encoder used to map the predicted index back to its
      category name via ``inverse_transform``.
    tokenizer: Tokenizer called with ``return_tensors='pt'``.

  Returns:
    The category name predicted for ``sentence``.
  """
  # A single sentence needs no padding to the maximum length; padding=True
  # pads only up to the longest input, which avoids running the model over
  # padding tokens.
  inputs = tokenizer(sentence, truncation=True, padding=True, return_tensors='pt')

  # Put the inputs on the same device as the model, so a model that lives on
  # the GPU (e.g. right after training) does not fail with a device mismatch.
  device = getattr(model, 'device', None)
  if device is not None:
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

  model.eval()
  with torch.no_grad():
    logits = model(**inputs).logits

  # Index of the highest-scoring class for the (single) input row.
  predicted_label_idx = torch.argmax(logits, dim=1).item()

  return le.inverse_transform([predicted_label_idx])[0]

In [None]:
# Classify one example sentence with the reloaded model and print the result.
sentence = 'Quiero una pizza grande de queso con una Coca-Cola de 2 litros.'
predicted_label = predict_intent(trainer_loaded, sentence, label_encoder_loaded, tokenizer)
print(predicted_label)

order
