In [128]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset

In [129]:
# Single checkpoint name shared by the tokenizer and the encoder so the two
# always match; "roberta-base" is the alternative noted by the author.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

The following layers were not sharded: transformer.layer.*.attention.k_lin.bias, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.attention.out_lin.bias, transformer.layer.*.ffn.lin*.weight, embeddings.position_embeddings.weight, embeddings.word_embeddings.weight, transformer.layer.*.ffn.lin*.bias, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.attention.k_lin.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.output_layer_norm.weight, transformer.layer.*.sa_layer_norm.bias


In [130]:
# Load the train/test CSV files; the result is indexed by split name
# ("train" / "test") in the cells that follow.
dataset = load_dataset(
    'csv',
    data_files=dict(train="dataset/train.csv", test="dataset/test.csv"),
)

In [131]:
# Peek at the first four training rows; slicing returns a dict of column lists
# with the `text` and `category` columns.
dataset['train'][:4]

{'text': ['I am still waiting on my card?',
  "What can I do if my card still hasn't arrived after 2 weeks?",
  'I have been waiting over a week. Is the card still coming?',
  'Can I track my card while it is in the process of delivery?'],
 'category': ['card_arrival', 'card_arrival', 'card_arrival', 'card_arrival']}

In [132]:
# Number of distinct intent categories present in the training split.
len(dataset['train'].unique('category'))

77

In [133]:
# Token count of every training text, in dataset order
# (length of the id sequence returned by tokenizer.encode).
train_texts = dataset['train']['text']
token_lengts = [len(token_ids) for token_ids in map(tokenizer.encode, train_texts)]

In [134]:
import numpy as np
# Summary statistics of the per-text token counts, computed first and
# reported afterwards.
mean_len = np.mean(token_lengts)
max_len = np.max(token_lengts)
min_len = np.min(token_lengts)
print(f" Mean: {mean_len}")
print(f"Max :{max_len}")
print(f"Min:{min_len}")

 Mean: 16.21373587923623
Max :98
Min:4


In [135]:
# Ascending copy of the token counts; `token_lengts` itself keeps dataset order.
token_sorted = list(token_lengts)
token_sorted.sort()

In [136]:
# Print the shortest queries: texts whose tokenized length is 4 or 5.
# The lengths were already computed into `token_lengts` (same order as the
# training texts), so they are reused here instead of calling
# tokenizer.encode again — the original tokenized each text up to twice
# per iteration.
for text, n_tokens in zip(dataset['train']['text'], token_lengts):
    if n_tokens in (4, 5):
        print(f"text: {text}")

text: Report stolen card
text: Cancel a transaction
text: stop the transaction
text: Cancel Transaction
text: top up cancellation
text: Reverted top up
text: Unusual direct deposit
text: passcode retrieval
text: Lost password
text: pending transaction?
text: phone is gone
text: Transfer declined.
text: cancel my account
text: Changing my PIN
text: My transfer failed
text: Activate my card
text: Card activation steps
text: Supported countries


In [137]:
# Ten smallest token counts in the training split.
token_sorted[:10]

[4, 4, 4, 5, 5, 5, 5, 5, 5, 5]

In [138]:
# Ten largest token counts. The longest training text is 98 tokens, so
# max_length = 128 is enough to cover every training example without truncation.
token_sorted[-10:]

[74, 77, 79, 79, 80, 81, 83, 87, 93, 98]

In [139]:
# Number of distinct token-count values observed across the training texts.
len(set(token_lengts))

78

In [101]:
# Show the feature type of the `category` column.
# NOTE(review): this cell's execution count (101) is out of sequence, and the
# saved output shows a ClassLabel, which is only produced by the
# `class_encode_column` cell further down. On a fresh Restart & Run All this
# cell will likely print a plain string feature instead — confirm, and
# consider moving it after the encoding cell.
print(dataset['train'].features['category'])

ClassLabel(names=['Refund_not_showing_up', 'activate_my_card', 'age_limit', 'apple_pay_or_google_pay', 'atm_support', 'automatic_top_up', 'balance_not_updated_after_bank_transfer', 'balance_not_updated_after_cheque_or_cash_deposit', 'beneficiary_not_allowed', 'cancel_transfer', 'card_about_to_expire', 'card_acceptance', 'card_arrival', 'card_delivery_estimate', 'card_linking', 'card_not_working', 'card_payment_fee_charged', 'card_payment_not_recognised', 'card_payment_wrong_exchange_rate', 'card_swallowed', 'cash_withdrawal_charge', 'cash_withdrawal_not_recognised', 'change_pin', 'compromised_card', 'contactless_not_working', 'country_support', 'declined_card_payment', 'declined_cash_withdrawal', 'declined_transfer', 'direct_debit_payment_not_recognised', 'disposable_card_limits', 'edit_personal_details', 'exchange_charge', 'exchange_rate', 'exchange_via_app', 'extra_charge_on_statement', 'failed_transfer', 'fiat_currency_support', 'get_disposable_virtual_card', 'get_physical_card', 'g

In [141]:
import pandas as pd
# Convert the training split to pandas and summarise how many examples each
# category has. The smallest class has 35 examples, which the author initially
# judged sufficient without augmentation.
# NOTE(review): a later section of this notebook nevertheless explores
# augmentation for classes with fewer than 100 examples — reconcile the two.
df_train = dataset['train'].to_pandas()
category_counts = df_train['category'].value_counts()
print(category_counts.describe())

count     77.000000
mean     129.909091
std       32.942207
min       35.000000
25%      112.000000
50%      127.000000
75%      159.000000
max      187.000000
Name: count, dtype: float64


In [140]:
# Count rows of the training frame that are exact duplicates of an earlier row
# (all columns compared).
# NOTE(review): this cell's execution count (140) precedes the one of the cell
# that defines `df_train` (141) — verify the notebook runs top to bottom.
df_train.duplicated().sum()

0

In [162]:
# Categories with fewer than 100 training examples: candidates for augmentation.
per_class_counts = df_train['category'].value_counts()
is_rare = per_class_counts < 100
low_freq = per_class_counts[is_rare]
print(len(low_freq))

11


In [163]:
# Names of the low-frequency categories (the index of the filtered counts).
low_freq.index.tolist()

['getting_virtual_card',
 'get_disposable_virtual_card',
 'top_up_limits',
 'receiving_money',
 'atm_support',
 'compromised_card',
 'lost_or_stolen_card',
 'card_swallowed',
 'card_acceptance',
 'virtual_card_not_working',
 'contactless_not_working']

# Data Augmentation

"I lost my card" --> "My card is missing"

In [164]:
import nlpaug.augmenter.word as naw
import torch

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA (the device used to be hard-coded
# to 'cuda').
aug_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Word-level augmenter configured to substitute words using the
# distilbert-base-uncased model, choosing among the top 5 candidates.
aug = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased',
    action="substitute",
    device=aug_device,
    top_k=5
)
text = "I lost my credit card"

# Print three augmentations of the same sentence to judge their quality by eye.
for _ in range(3):
    print(aug.augment(text))

['please check my credit card']
['i lost my heart …']
['i lost no credit …']


**I won't use this approach: the generated sentences are too poor and often change the meaning.**

I will use an LLM for augmentation instead.

In [None]:
# TODO(review): leftover note — presumably a reminder to try
# label_smoothing_factor=0.1 when configuring training; confirm or remove.

In [116]:
import google.generativeai as genai
import pandas as pd
import json
import time
import os
from dotenv import load_dotenv
from tqdm import tqdm
load_dotenv()

True

In [166]:
# Read the Gemini API key from the environment (populated by load_dotenv)
# and hand it to the SDK.
api_key = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=api_key)

# Instruction text that `augment` places at the top of every prompt it sends.
SYSTEM_PROMPT = """
You are an expert AI data augmentor for a banking customer service chatbot.
Your goal is to generate diverse, natural English phrasings for a given customer intent.

STRICT RULES:
1. MEANING: The intent must remain exactly the same as the original text.
2. STYLE: Use a mix of formal (polite) and casual (short, slang, typos) language.
3. FORMAT: Return ONLY a raw JSON list of strings. Do not use Markdown code blocks. Do not add explanations.
4. QUANTITY: Generate exactly 3 variations per request.
"""

# NOTE(review): this rebinds `model`, which an earlier cell assigned to the
# Hugging Face AutoModel; from here on `model` is the Gemini client.
model = genai.GenerativeModel("gemini-2.5-flash")

def augment(user_text):
    """Ask the Gemini model for paraphrases of one customer utterance.

    The prompt is the module-level SYSTEM_PROMPT followed by the utterance
    under a "USER INTENT:" heading.

    Args:
        user_text: The original customer text to paraphrase.

    Returns:
        The model's reply text with surrounding whitespace stripped. The
        prompt asks for a raw JSON list of strings, but the reply is returned
        unparsed.
    """
    # Assembled from adjacent literals; the resulting text is the same as a
    # single multi-line f-string, including the leading and trailing newline.
    prompt = (
        "\n"
        "SYSTEM INSTRUCTIONS (MUST FOLLOW):\n"
        f"{SYSTEM_PROMPT}\n"
        "\n"
        "--- DO NOT BREAK THE ABOVE RULES ---\n"
        "\n"
        "USER INTENT:\n"
        f"{user_text}\n"
        "\n"
        "OUTPUT:\n"
    )
    reply = model.generate_content(prompt)
    return reply.text.strip()


In [None]:
from datasets import ClassLabel

# Encode the string `category` column as integer class ids (ClassLabel);
# the stratified split in the next cell needs a ClassLabel column.
# Guarded so the cell can be re-run safely: class_encode_column raises when
# the column has already been converted.
if not isinstance(dataset['train'].features['category'], ClassLabel):
    dataset = dataset.class_encode_column("category")

# Show the label schema and a small sample of encoded ids instead of dumping
# the entire column into the cell output.
print(dataset['train'].features['category'])
print(dataset['train'][:10]['category'])

In [None]:
# Hold out 10% of the training split, stratified on `category` so both parts
# keep the same class proportions; the fixed seed makes the split repeatable.
train_split = dataset['train']
splitted_data = train_split.train_test_split(test_size=0.1, stratify_by_column='category', seed=42)