# Data Collection and Preprocessing

In [1]:
import pandas as pd

# Simulated dataset
data = {
    'Query': ['How can I reset my password?', 'What is the refund policy?', 'How do I track my order?'],
    'Response': ['To reset your password, click on "Forgot Password" on the login page.',
                 'Our refund policy lasts 30 days. To initiate a return, contact support.',
                 'You can track your order using the tracking number provided in your email.']
}

df = pd.DataFrame(data)
print(df)

                          Query  \
0  How can I reset my password?   
1    What is the refund policy?   
2      How do I track my order?   

                                            Response  
0  To reset your password, click on "Forgot Passw...  
1  Our refund policy lasts 30 days. To initiate a...  
2  You can track your order using the tracking nu...  


# Model Fine-Tuning

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Load the pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Set the padding token to the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['Query'], padding='max_length', truncation=True)

# Create a Dataset object
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Prepare the labels
tokenized_dataset = tokenized_dataset.map(lambda examples: {'labels': examples['input_ids']}, batched=True)

# Convert to PyTorch tensors
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Fine-tuning setup
training_args = TrainingArguments(output_dir="./results", num_train_epochs=3, per_device_train_batch_size=4)
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 3/3 [00:00<00:00, 41.57 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 125.06 examples/s]
100%|██████████| 3/3 [04:28<00:00, 89.64s/it]

{'train_runtime': 268.8979, 'train_samples_per_second': 0.033, 'train_steps_per_second': 0.011, 'train_loss': 7.201426823933919, 'epoch': 3.0}





TrainOutput(global_step=3, training_loss=7.201426823933919, metrics={'train_runtime': 268.8979, 'train_samples_per_second': 0.033, 'train_steps_per_second': 0.011, 'total_flos': 2351670755328.0, 'train_loss': 7.201426823933919, 'epoch': 3.0})

# Embedding Creation and Storage

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load Sentence-BERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for the queries
embeddings = model.encode(df['Query'].tolist())

# Store embeddings in a simulated vector database (dictionary)
vector_database = {}
for i, embedding in enumerate(embeddings):
    vector_database[i] = embedding



# Develop Retrieval-Augmented Generation (RAG) System

In [4]:
def retrieve_similar(query, vector_database, model):
    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, list(vector_database.values()))
    most_similar = np.argmax(similarities)
    return most_similar

test_cases = """
I forgot my password, how do I reset it?
How can I change my account password?
What is your return policy?
Where can I find my order details?
""" 
query = "I forgot my password, how do I reset it?"
retrieved_index = retrieve_similar(query, vector_database, model)
response = df['Response'][retrieved_index]
print(response)

To reset your password, click on "Forgot Password" on the login page.
