In [1]:
import pandas as pd

In [2]:
# Load the bank FAQ dataset from the working directory; later cells use its
# "Question" and "Class" columns.
bank = pd.read_csv("BankFAQs.csv")
# Preview the first rows as a quick sanity check of the loaded columns.
bank.head()

Unnamed: 0,Question,Answer,Class
0,What are the documents required for opening a ...,Following documents are required to open a Cur...,accounts
1,Can I transfer my Current Account from one bra...,"Yes, Current Accounts can be transferred from ...",accounts
2,My present status is NRI. What extra documents...,NRI/PIO can open the proprietorship/partnershi...,accounts
3,What are the documents required for opening a ...,Following documents are required for opening a...,accounts
4,What documents are required to change the addr...,Following documents are required to change the...,accounts


In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Encoder that maps the string values of the "Class" column to integer ids
# (fitted in the next cell) and back to class names at prediction time.
label_encoder = LabelEncoder()

In [5]:
# Fit the encoder on the class names and keep the resulting integer ids in a
# new "label" column next to the original text columns.
class_ids = label_encoder.fit_transform(bank["Class"])
bank["label"] = class_ids

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the questions for validation. Stratifying on the label keeps
# the class proportions the same in both splits; the fixed seed makes the
# split reproducible.
all_questions = bank["Question"].tolist()
all_labels = bank["label"].tolist()
train_text, val_text, train_label, val_label = train_test_split(
    all_questions,
    all_labels,
    test_size=0.2,
    stratify=bank["label"],
    random_state=42,
)

In [8]:
from transformers import BertTokenizer

# Tokenizer for the "bert-base-uncased" checkpoint; the same checkpoint name is
# used when the classification model is created further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [9]:
# Both splits are tokenized with identical settings: truncate to at most 128
# tokens and pad each split to a uniform length.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_text, **tokenize_kwargs)
val_encodings = tokenizer(val_text, **tokenize_kwargs)

In [10]:
import torch

In [11]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per example.
        labels: Sequence with one label per example, aligned with
            ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, label under ``'labels'``."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap the tokenized splits and their labels so each can be indexed example by
# example; these are the datasets handed to the Trainer below.
train_dataset = Dataset(train_encodings, train_label)
val_dataset = Dataset(val_encodings, val_label)


In [12]:
from transformers import BertForSequenceClassification

# One output class per distinct value the label encoder saw in "Class".
num_labels = len(label_encoder.classes_)

# Load the pretrained checkpoint with a classification head sized for our
# classes. The head's weights are newly initialised (see the warning printed
# below), so the model must be fine-tuned before its predictions mean anything.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=num_labels
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers import TrainingArguments

# Fine-tuning configuration: 4 epochs, batches of 16 for both training and
# evaluation, weight decay of 0.01, and a validation pass after every epoch.
training_args = TrainingArguments(
    output_dir='./results',
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # deprecated and rejected by current Transformers releases.
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)







In [14]:
from transformers import Trainer

# Trainer used for fine-tuning. No `compute_metrics` is passed here, so the
# per-epoch evaluation in the training log reports the validation loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


In [15]:
# Run fine-tuning with the arguments configured above; with those settings the
# validation set is evaluated at the end of every epoch.
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.541,0.461093
2,0.3076,0.293099
3,0.0485,0.271037
4,0.0129,0.274052


TrainOutput(global_step=356, training_loss=0.35204597361636963, metrics={'train_runtime': 2778.6927, 'train_samples_per_second': 2.041, 'train_steps_per_second': 0.128, 'total_flos': 119511228541200.0, 'train_loss': 0.35204597361636963, 'epoch': 4.0})

In [16]:
# Evaluate the fine-tuned model on the validation set. This trainer was built
# without `compute_metrics`, so the result holds the loss and timing figures.
eval_results = trainer.evaluate()
print(eval_results)


{'eval_loss': 0.27405211329460144, 'eval_runtime': 41.4655, 'eval_samples_per_second': 8.561, 'eval_steps_per_second': 0.555, 'epoch': 4.0}


In [17]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (per-class scores)
            and ``label_ids`` (true class ids).

    Returns:
        dict: ``{"accuracy": <fraction of examples predicted correctly>}``.
    """
    # The highest-scoring class along the last axis is the predicted class id.
    predicted_ids = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_ids)}


# Rebuild the Trainer around the already fine-tuned model, this time with
# `compute_metrics`, so evaluation also reports accuracy. No training happens
# in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Re-run evaluation on the validation set to obtain accuracy alongside the loss.
trainer.evaluate()


{'eval_loss': 0.27405211329460144,
 'eval_model_preparation_time': 0.0064,
 'eval_accuracy': 0.9295774647887324,
 'eval_runtime': 43.7587,
 'eval_samples_per_second': 8.113,
 'eval_steps_per_second': 0.526}

In [18]:
# Record the class names in the model config before saving. Without this the
# checkpoint only knows numeric label ids, so it cannot turn predictions back
# into FAQ class names unless the fitted LabelEncoder is still in memory.
model.config.id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
model.config.label2id = {name: idx for idx, name in model.config.id2label.items()}

# Save model weights/config and tokenizer files side by side in one directory
# so the pair can be reloaded together.
model.save_pretrained("bert-faq-model")
tokenizer.save_pretrained("bert-faq-model")


('bert-faq-model\\tokenizer_config.json',
 'bert-faq-model\\special_tokens_map.json',
 'bert-faq-model\\vocab.txt',
 'bert-faq-model\\added_tokens.json')

In [20]:
def predict_faq(question):
    """Predict the FAQ class name for a single question.

    Args:
        question (str): The customer question to classify.

    Returns:
        The class name (as stored in the original "Class" column) with the
        highest predicted score.
    """
    # Tokenize with the same truncation / max-length settings used for training.
    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # The model may live on an accelerator after training; the inputs must be
    # on the same device or the forward pass fails.
    inputs = inputs.to(model.device)

    # Run inference in eval mode so dropout is disabled, then restore the
    # previous mode so calling this function does not change the model's state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        if was_training:
            model.train()

    predicted_class_id = torch.argmax(logits, dim=1).item()
    # Map the integer class id back to its original class name.
    return label_encoder.inverse_transform([predicted_class_id])[0]

# Example: classify one free-text question and print the predicted class name.
question = "How can I apply for a credit card?"
print(predict_faq(question))


security
