Creating JSON file for our dataset

In [2]:
import json

#  Define input & output file paths
input_file = r"C:\Users\gvais\OneDrive\Desktop\faq\faq text.txt"
output_file = r"C:\Users\gvais\OneDrive\Desktop\faq\faq_data.json"

In [3]:
# pip install chardet

In [4]:
import chardet  # Detects file encoding

In [5]:
# List to store extracted FAQs
faqs = []
question = None
answer = ""

In [6]:
# Detect File Encoding First (To Avoid Unicode Errors)
with open(input_file, "rb") as f:
    raw_data = f.read(50000)  # Read a portion of the file
    result = chardet.detect(raw_data)
    detected_encoding = result["encoding"]

print(f" Detected Encoding: {detected_encoding}")

 Detected Encoding: Windows-1252


In [7]:
#  Read the text file with detected encoding (Avoid Unicode Errors)
with open(input_file, "r", encoding=detected_encoding, errors="ignore") as f:
    lines = f.readlines()

In [8]:
#  Try multiple encodings (utf-8-sig, latin-1, or errors="replace" to avoid crashes)
with open(input_file, "r", encoding="utf-8-sig", errors="replace") as file:
    for line in file:
        line = line.strip()

        if line.startswith("Q-"):  # Identify questions
            if question and answer:  # Store previous Q&A before starting a new one
                faqs.append({"question": question, "answer": answer.strip()})
            question = line[2:].strip()  # Remove 'Q-' and store question
            answer = ""  # Reset answer for new question

        elif line.startswith("A-"):  # Identify answers
            answer = line[2:].strip()  # Remove 'A-' and store answer

        else:  # Append multi-line answers
            answer += " " + line



In [9]:

# Append the last Q&A pair
if question and answer:
    faqs.append({"question": question, "answer": answer.strip()})

# Save FAQs to JSON file
with open(output_file, "w", encoding="utf-8") as json_file:
    json.dump({"faqs": faqs}, json_file, indent=4, ensure_ascii=False)

print(f" FAQs successfully extracted and saved to: {output_file}")

 FAQs successfully extracted and saved to: C:\Users\gvais\OneDrive\Desktop\faq\faq_data.json


In [10]:
import json

faq_file = r"C:\Users\gvais\OneDrive\Desktop\faq\faq_data.json"  # Update path

In [11]:
#  Load JSON file correctly
with open(faq_file, "r", encoding="utf-8") as f:
    faqs = json.load(f)

In [12]:
#  Check the type of `faqs`
print("Type of faqs:", type(faqs))  # Should be <class 'list'>

#  If it's a dictionary, print keys
if isinstance(faqs, dict):
    print("Dictionary keys:", faqs.keys())

#  If it's a list, print first 3 entries
if isinstance(faqs, list):
    print(faqs[:3])  # Should show a list of dictionaries

Type of faqs: <class 'dict'>
Dictionary keys: dict_keys(['faqs'])


Tokenizing our data

In [13]:
#loading llama's pretrained tokenizer
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

text = "What is BharatPe? BharatPe is a financial technology company..."
tokens = tokenizer(text, return_tensors="pt")

print(tokens.input_ids)  # Tokenized representation


  from .autonotebook import tqdm as notebook_tqdm


tensor([[    1,  1724,   338,   350,  8222,   271, 15666, 29973,   350,  8222,
           271, 15666,   338,   263, 18161, 15483,  5001,   856]])


In [14]:

# Load your FAQs
with open(r"C:\Users\gvais\OneDrive\Desktop\faq\bharatpe_faqs.json", "r", encoding="utf-8") as f:
    faqs = json.load(f)

In [15]:
import transformers
print(transformers.__version__)

4.48.1


In [16]:
#  Load JSON file correctly
with open(faq_file, "r", encoding="utf-8") as f:
    faqs_dict = json.load(f)  # This loads a dictionary

In [17]:
#  Extract the FAQ list from the dictionary
faqs = faqs_dict.get("faqs", [])  # Safely get the list

#  Check if it's now a list
print("Type of faqs after extraction:", type(faqs))  # Should be <class 'list'>
print(faqs[:3])  # Print first 3 FAQs to verify

Type of faqs after extraction: <class 'list'>
[{'question': 'what is bharatpe?', 'answer': "bharatpe is a new way to pay, and makes spending easy. in collaboration with lending partners, you can spend now and pay next month. a line of credit is offered through bharatpe by rbi registered lenders (after necessary checks and kyc completion) which can be used anytime, anywhere to purchase goods and services - either by scanning a qr, sending money directly to your bank account, making utility bill payments or availing a personal loan from the assigned bharatpe limit. when it's time to pay your bill, you can clear your dues in one go, or convert to an equated monthly installment (emi) with tenure of your choice. you can also earn exciting cashback and rewards. get started now!"}, {'question': 'how do i sign up for bharatpe?', 'answer': 'you can sign up for bharatpe by verifying your mobile number and then completing your kyc using pan and aadhaar number or any other information as required 

In [18]:


# Tokenize all questions and answers
tokenized_faqs = []
for faq in faqs:
    tokens = tokenizer(faq["question"] + " " + faq["answer"], return_tensors="pt")
    tokenized_faqs.append(tokens.input_ids.tolist())


In [19]:
# Save the tokenized dataset
with open("tokenized_faqs.json", "w") as f:
    json.dump(tokenized_faqs, f)


VECTOR EMBEDDING

In [20]:
import json
import torch
from transformers import AutoTokenizer, AutoModel

In [21]:
#  Load the pre-trained embedding model
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Change to OpenAI model if needed
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [22]:
#  Load BPE Tokenized Dataset
file_path = r"C:\Users\gvais\OneDrive\Desktop\faq\bharatpe_faqs.json"  # Use full path
with open(file_path, "r", encoding="utf-8") as file:
    faq_data = json.load(file)

In [23]:
# Function to Convert Text to Vector Embeddings
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().tolist()

In [24]:
#  Generate Embeddings for Each FAQ
embeddings = []
for item in faq_data["faqs"]:
    vector = generate_embedding(item["question"] + " " + item["answer"])
    embeddings.append({"question": item["question"], "embedding": vector})


In [25]:
#  Save Embeddings to a JSON File
embedding_file = r"C:\Users\gvais\OneDrive\Desktop\faq\faq_embeddings.json"
with open(embedding_file, "w", encoding="utf-8") as f:
    json.dump(embeddings, f, indent=4)

print(f" Embeddings saved at: {embedding_file}")

 Embeddings saved at: C:\Users\gvais\OneDrive\Desktop\faq\faq_embeddings.json


Generate & Store Embeddings using Ollama

In [26]:
import json
import ollama
import chromadb

In [27]:
#  Load the FAQ dataset
file_path = r"C:\Users\gvais\OneDrive\Desktop\faq\bharatpe_faqs.json"
with open(file_path, "r", encoding="utf-8") as file:
    faq_data = json.load(file)

In [28]:
#  Initialize ChromaDB for Storing Embeddings
chroma_client = chromadb.PersistentClient(path=r"C:\Users\gvais\OneDrive\Desktop\faq\chroma_db")
collection = chroma_client.get_or_create_collection(name="faq_embeddings")

In [29]:
# ran for 1+ hr and still running so commemted it out for now
# #  Generate and Store Embeddings
# for item in faq_data["faqs"]:
#     text = item["question"] + " " + item["answer"]
#     response = ollama.embeddings(model="mistral", prompt=text)  # Use your preferred LLaMA model
#     embedding = response["embedding"]

#     # Store in ChromaDB
#     collection.add(
#         ids=[item["question"]],  # Use question as unique ID
#         embeddings=[embedding],
#         metadatas=[{"question": item["question"], "answer": item["answer"]}]
#     )

# print(" Embeddings successfully generated and stored in ChromaDB!")


Convert Our Tokenized Dataset into QLoRA-Compatible JSONL

In [50]:

#  Paths for input and output files
bpe_tokenized_file = r"C:\Users\gvais\OneDrive\Desktop\faq\bharatpe_faqs.json"  # BPE tokenized dataset
output_jsonl_file = r"C:\Users\gvais\OneDrive\Desktop\faq\bharatpe_qlora.jsonl"  # QLoRA fine-tuning dataset

In [51]:
#  Load the tokenized FAQ dataset
with open(bpe_tokenized_file, "r", encoding="utf-8") as file:
    faq_data = json.load(file)


In [52]:
#  Convert the dataset to QLoRA format
with open(output_jsonl_file, "w", encoding="utf-8") as jsonl_file:
    for item in faq_data["faqs"]:
        jsonl_entry = {
            "instruction": item["question"],  # Question from FAQ dataset
            "input": "",  # No additional input for now
            "output": item["answer"]  # Answer from FAQ dataset
        }
        jsonl_file.write(json.dumps(jsonl_entry) + "\n")  # Write in JSONL format

print(f" QLoRA dataset successfully saved at: {output_jsonl_file}")

 QLoRA dataset successfully saved at: C:\Users\gvais\OneDrive\Desktop\faq\bharatpe_qlora.jsonl


 Split into Training & Validation Sets

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
#  Load formatted dataset
with open(output_file, "r", encoding="utf-8") as f:
    data = f.readlines()

In [55]:
#  Split into 90% training & 10% validation
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

In [56]:
#  Save splits
with open("train.jsonl", "w", encoding="utf-8") as f:
    f.writelines(train_data)

with open("val.jsonl", "w", encoding="utf-8") as f:
    f.writelines(val_data)

print(" Train-Test split done!")


 Train-Test split done!


Load the Model with LoRA (QLoRA for Efficient Fine-Tuning)

In [57]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
import torch

In [58]:
#  Load tokenizer & model
model_name = "facebook/opt-1.3b"  # You can also use "mistralai/Mistral-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [59]:
# Load training dataset
from datasets import load_dataset
dataset = load_dataset("json", data_files={"train": "train.jsonl", "validation": "val.jsonl"})

Generating train split: 173 examples [00:00, 3737.53 examples/s]
Generating validation split: 20 examples [00:00, 1077.69 examples/s]


In [62]:
# ✅ Define training parameters
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    logging_dir="./logs",
)
