In [1]:
import pandas as pd

In [13]:
df = pd.read_csv('..\TrialFAQHandling\BankFAQs.csv')

In [14]:
df.head()

Unnamed: 0,Question,Answer,Class
0,Do I need to enter ‘#’ after keying in my Card...,Please listen to the recorded message and foll...,security
1,What details are required when I want to perfo...,"To perform a secure IVR transaction, you will ...",security
2,How should I get the IVR Password if I hold a...,An IVR password can be requested only from the...,security
3,How do I register my Mobile number for IVR Pas...,Please call our Customer Service Centre and en...,security
4,How can I obtain an IVR Password,By Sending SMS request: Send an SMS 'PWD<space...,security


In [15]:
df.drop(columns = ['Class'], inplace =  True)

In [16]:
df['Question'] = df['Question'].apply(lambda x : 'Question : ' + x if isinstance(x, str) else x)
df['Answer'] = df['Answer'].apply(lambda x : 'Answer : ' + x if isinstance(x, str) else x)


In [19]:
df.to_csv('..\TrialFAQHandling\FAQ_RAG.csv',index=False)

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics.pairwise import cosine_similarity



# Load the data
faq_data = pd.read_csv("BankFAQs.csv")

# Preprocess the text data
faq_data['Question'] = faq_data['Question'].str.lower().str.replace('[^a-zA-Z\s]', '')
faq_data['Answer'] = faq_data['Answer'].str.lower().str.replace('[^a-zA-Z\s]', '')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(faq_data['Question'], faq_data['Class'], test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Load pre-trained model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(faq_data['Class'].unique()))

# Encode questions and classes
input_ids = []
attention_masks = []
labels = []
for question, class_label in zip(faq_data['Question'], faq_data['Class']):
    encoded_dict = tokenizer.encode_plus(question, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    labels.append(class_label)

# Convert labels to PyTorch tensor
label_map = {label: idx for idx, label in enumerate(faq_data['Class'].unique())}
labels = [label_map[label] for label in labels]
labels = torch.LongTensor(labels)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
for epoch in range(3):
    model.train()
    for input_id, attention_mask, label in zip(input_ids, attention_masks, labels):
        optimizer.zero_grad()
        outputs = model(input_ids=input_id, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Predict answer for a user query
# ... (same as before)
        

# Predict answer for a user query
def get_answer(user_query):
    input_dict = tokenizer.encode_plus(user_query, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    outputs = model(input_dict['input_ids'], input_dict['attention_mask'])
    predicted_class = outputs.logits.argmax(-1).item()
    relevant_data = faq_data[faq_data['Class'] == predicted_class]
    # ... (find the most similar question and retrieve the answer)
    # relevant_data = faq_data #[faq_data['Class'] == predicted_class]
    if not relevant_data.empty:
        questions_vec = vectorizer.transform(relevant_data['Question'])
        similarities = cosine_similarity(user_query_vec, questions_vec)
        most_similar_index = similarities.argmax()
        most_similar_question = relevant_data.iloc[most_similar_index]
        answer = most_similar_question['Answer']
        return answer
    else:
        return "Sorry, I couldn't find a relevant answer in the FAQ."
    
# Example usage
user_query = "How long your OTP is valid?" #What is the validity period of the OTP"
answer = get_answer(user_query)
print(f"Answer: {answer}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sentiment Analysis

In [21]:
from transformers.pipelines import PIPELINE_REGISTRY

#Get the list of tasks that are supported by Huggingface pipeline
print(PIPELINE_REGISTRY.get_supported_tasks())


['audio-classification', 'automatic-speech-recognition', 'conversational', 'depth-estimation', 'document-question-answering', 'feature-extraction', 'fill-mask', 'image-classification', 'image-feature-extraction', 'image-segmentation', 'image-to-image', 'image-to-text', 'mask-generation', 'ner', 'object-detection', 'question-answering', 'sentiment-analysis', 'summarization', 'table-question-answering', 'text-classification', 'text-generation', 'text-to-audio', 'text-to-speech', 'text2text-generation', 'token-classification', 'translation', 'video-classification', 'visual-question-answering', 'vqa', 'zero-shot-audio-classification', 'zero-shot-classification', 'zero-shot-image-classification', 'zero-shot-object-detection']


In [22]:
#Get information about a specific task
print("\nDefault Model for Sentiment Analysis: ")
print(PIPELINE_REGISTRY.check_task('sentiment-analysis')[1].get('default'))


Default Model for Sentiment Analysis: 
{'model': {'pt': ('distilbert/distilbert-base-uncased-finetuned-sst-2-english', 'af0f99b'), 'tf': ('distilbert/distilbert-base-uncased-finetuned-sst-2-english', 'af0f99b')}}


In [27]:
from transformers import pipeline
import os

#Load a pipeline. This will download the model checkpoint from huggingface and cache it
#locally on disk. If model is already available in cache, it will simply use the cached version
#Download will usually take a long time, depending on network bandwidth

sentiment_classifier = pipeline(task = "sentiment-analysis" , model = 'cardiffnlp/twitter-roberta-base-sentiment-latest')

#Cache usually available at : <<user-home>>.cache\huggingface\hub

cache_dir = os.path.expanduser('~') + "/.cache/huggingface/hub"
print("Huggingface Cache directory is : ", cache_dir)

#Contents of cache directory
os.listdir(cache_dir)

#Predict sentiment using the pipeline
sentiment_results=sentiment_classifier("This is a great course")
print(sentiment_results) #sentiment_results[0]['label']

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Huggingface Cache directory is :  C:\Users\nanth/.cache/huggingface/hub


['.locks',
 'models--cardiffnlp--twitter-roberta-base-sentiment-latest',
 'models--deepset--roberta-base-squad2',
 'models--distilbert--distilbert-base-uncased-finetuned-sst-2-english',
 'models--distilbert-base-cased-distilled-squad',
 'models--distilbert-base-uncased',
 'models--facebook--s2t-small-librispeech-asr',
 'models--facebook--wav2vec2-base-960h',
 'models--openai--whisper-large-v3',
 'models--openai--whisper-tiny.en',
 'models--SeamlessM4T',
 'models--speechbrain--sepformer-wham',
 'models--speechbrain--sepformer-whamr',
 'version.txt']

[{'label': 'positive', 'score': 0.9754324555397034}]


# Intent detection 

In [31]:
from datasets import load_dataset

dataset = load_dataset("PolyAI/banking77")

Downloading data: 100%|██████████| 298k/298k [00:00<00:00, 4.06MB/s]
Downloading data: 100%|██████████| 93.9k/93.9k [00:00<00:00, 1.33MB/s]
Generating train split: 100%|██████████| 10003/10003 [00:00<00:00, 341883.68 examples/s]
Generating test split: 100%|██████████| 3080/3080 [00:00<00:00, 385039.38 examples/s]


In [33]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("PolyAI/banking77")

# The dataset is a dictionary containing the train and test splits
train_dataset = dataset['train']
test_dataset = dataset['test']

# You can access the data in each split as a list
train_texts = train_dataset['text']
train_labels = train_dataset['label']

# You can also convert the dataset to a pandas DataFrame
import pandas as pd
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})

# Now you can use the data for training a model, performing analysis, etc.


In [34]:
train_labels

[11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
