In [1]:
!pip install accelerate
!pip install pandas
!pip install sqlite3
!pip install transformers
!pip install torch

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [2]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from nltk.corpus import wordnet
import random

# Load the intents and examples
intents_df = pd.read_csv('./intents_and_examples.csv')

# Prepare the dataset
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

class IntentDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Encode the labels
label_map = {label: idx for idx, label in enumerate(intents_df['Intent'].unique())}
intents_df['label'] = intents_df['Intent'].map(label_map)

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    intents_df['Example'].tolist(),
    intents_df['label'].tolist(),
    test_size=0.2,
    random_state=42
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [4]:

# Synonym replacement for data augmentation
def synonym_replacement(text, n):
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

# Apply data augmentation
augmented_texts = []
augmented_labels = []

for text, label in zip(train_texts, train_labels):
    augmented_texts.append(text)
    augmented_labels.append(label)
    augmented_texts.append(synonym_replacement(text, 1))
    augmented_labels.append(label)


In [9]:

# Create new augmented dataset
train_dataset = IntentDataset(augmented_texts, augmented_labels, tokenizer, max_len=64)
val_dataset = IntentDataset(val_texts, val_labels, tokenizer, max_len=64)

# Load pre-trained XLM-RoBERTa model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=len(label_map))

best_accuracy = 0
best_params = {}

param_grid = {
    'learning_rate': [5e-5],
    'per_device_train_batch_size': [2],
    'num_train_epochs': [4]
}

for lr in param_grid['learning_rate']:
    for batch_size in param_grid['per_device_train_batch_size']:
        for epochs in param_grid['num_train_epochs']:
            training_args = TrainingArguments(
                output_dir='./results',
                num_train_epochs=epochs,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=8,
                warmup_steps=500,
                weight_decay=0.01,
                logging_dir='./logs',
                logging_steps=30,
                learning_rate=lr,
                evaluation_strategy="epoch"
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))}
            )

            trainer.train()
            eval_results = trainer.evaluate()
            accuracy = eval_results['eval_accuracy']
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {
                    'learning_rate': lr,
                    'per_device_train_batch_size': batch_size,
                    'num_train_epochs': epochs
                }

# Output the best parameters and accuracy
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

#model should provide information, should be able to detect unidentified objects
#solutions similar to bizbot
#


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,2.018,1.728548,0.478992
2,0.9855,0.830346,0.764706
3,0.41,0.777749,0.831933
4,0.1616,0.619163,0.865546


Best Parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 2, 'num_train_epochs': 4}
Best Accuracy: 0.865546218487395


In [10]:
# Save the model
model.save_pretrained('./financial_xlm_roberta_model')
tokenizer.save_pretrained('./financial_xlm_roberta_tokenizer')

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Test Accuracy: {eval_results['eval_accuracy']:.4f}")

Test Accuracy: 0.8655


In [16]:
import pandas as pd
import sqlite3

# Load the CSV file
financial_data = pd.read_csv('./Dummy_Financial_Data.csv')

# Connect to SQLite database (or create it)
conn = sqlite3.connect('./financial_data.db')

# Convert the DataFrame to a SQL table
financial_data.to_sql('financial_data', conn, if_exists='replace', index=False)

# Verify the table
query = "SELECT * FROM financial_data LIMIT 5;"
financial_data_sample = pd.read_sql(query, conn)
print(financial_data_sample)

# Close the connection
conn.close()

   Account Number            Name  Balance                       Email  \
0      7230780854      John White  5960.25    casey.thomas@example.com   
1      8736393608  Casey Anderson  9245.09     cameron.white@yahoo.com   
2      2441783699        Jane Doe  6483.57       john.taylor@yahoo.com   
3      2452236112    Taylor Smith  5414.68    john.johnson@example.com   
4      2621649680    Jordan Smith  9384.00  cameron.taylor@example.com   

   Phone Number Account Type  Credit Score                       Address  
0    6916345629         Loan           592    851 Maple St, Fairview, NY  
1    3145173384      Savings           801  960 Broadway, Greenville, TX  
2    3104465091         Loan           840   854 Park Ave, Riverside, NY  
3    8774850601     Checking           329    789 Broadway, Fairview, PA  
4    7810087179       Credit           643   711 Maple St, Riverside, TX  


In [19]:

import pandas as pd
import sqlite3
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import torch

# Load the trained model and tokenizer
model = XLMRobertaForSequenceClassification.from_pretrained('./financial_xlm_roberta_model')
tokenizer = XLMRobertaTokenizer.from_pretrained('./financial_xlm_roberta_tokenizer')
model.eval()

# Connect to the database
def query_database(query, params=()):
    conn = sqlite3.connect('./financial_data.db')
    cursor = conn.cursor()
    cursor.execute(query, params)
    result = cursor.fetchall()
    conn.close()
    return result

# Load the intents and examples to get the label_map
intents_df = pd.read_csv('./intents_and_examples.csv')
label_map = {label: idx for idx, label in enumerate(intents_df['Intent'].unique())}

# Map intent ID to label
def get_intent_label(intent_id):
    return list(label_map.keys())[list(label_map.values()).index(intent_id)]

# Function to process user input and generate response
def process_input(user_input, account_number=None):
    if account_number is None:
        if user_input.isdigit():  # Assuming account numbers are digits
            return None, f"Account number {user_input} recognized. How can I help you with your account today?", user_input
        else:
            return None, "Please provide your account number.", None

    # Process the user's request with the account number
    inputs = tokenizer(user_input, return_tensors='pt', truncation=True, padding=True, max_length=64)
    outputs = model(**inputs)
    intent_id = torch.argmax(outputs.logits).item()
    intent = get_intent_label(intent_id)

    print(f"DEBUG: User Input: {user_input}, Detected Intent: {intent}")

    if intent == 'check_balance':
        response = query_database("SELECT Balance FROM financial_data WHERE [Account Number] = ?", (account_number,))
        response_text = f"Your account balance is {response[0][0]}" if response else "Account not found."
    elif intent == 'search_transactions':
        response = query_database("SELECT * FROM financial_data WHERE [Account Number] = ? LIMIT 5", (account_number,))
        response_text = response if response else "No transaction history found."
    elif intent == 'check_human':
        response_text = "I am an AI created to assist you with financial inquiries."
    elif intent == 'open_account':
        response_text = "To open a new account, please visit our website or nearest branch with your identification documents."
    elif intent == 'close_account':
        response_text = "To close your account, please visit our nearest branch or contact our customer support."
    elif intent == 'loan_inquiry':
        response_text = "You can inquire about loans and their eligibility criteria on our website or by visiting our branch."
    elif intent == 'credit_card_application':
        response_text = "You can apply for a credit card through our website or by visiting our nearest branch."
    elif intent == 'contact_support':
        response_text = "You can contact our customer support through the chat feature on our website or by calling our support number."
    else:
        response_text = "I'm sorry, I didn't understand that."

    return intent, response_text, account_number

# Main loop to run the chatbot in the terminal
def run_chatbot():
    account_number = None
    print("Welcome to the Financial Chatbot!")
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            print("Chatbot: Goodbye!")
            break
        intent, response, account_number = process_input(user_input, account_number)
        print(f"Chatbot: {response}")

# Run the chatbot
run_chatbot()


Welcome to the Financial Chatbot!
You: hello
Chatbot: Please provide your account number.
You: 2441783699
Chatbot: Account number 2441783699 recognized. How can I help you with your account today?
You: show me my transaction history
DEBUG: User Input: show me my transaction history, Detected Intent: search_transactions
Chatbot: [(2441783699, 'Jane Doe', 6483.57, 'john.taylor@yahoo.com', 3104465091, 'Loan', 840, '854 Park Ave, Riverside, NY')]
You: what is my account balance
DEBUG: User Input: what is my account balance, Detected Intent: check_balance
Chatbot: Your account balance is 6483.57
You: are you human
DEBUG: User Input: are you human, Detected Intent: check_human
Chatbot: I am an AI created to assist you with financial inquiries.
You: good bye
DEBUG: User Input: good bye, Detected Intent: goodbye
Chatbot: I'm sorry, I didn't understand that.
You: hello
DEBUG: User Input: hello, Detected Intent: greet
Chatbot: I'm sorry, I didn't understand that.
You: we good
DEBUG: User Input: 