Copyright © Agilesh Arumugam

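This notebook builds a simple retrieval-based chatbot, in the style of early Siri / Google Assistant: it finds the stored question most similar to the user's request and returns that question's answer.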

# Imports

In [14]:
# SentenceTransformer produces the embeddings and CrossEncoder scores
# (query, candidate) pairs; both are used by the VectorDB class below.
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder
import sentence_transformers as st
# Library versions are printed so the notebook's outputs can be reproduced.
print("Sentence Transformer: ", st.__version__)
import numpy as np
print("Numpy:", np.__version__)
from typing import List
import os
import json

# NOTE(review): in the visible cells torch/torchvision/torchaudio are only
# used for the version and CUDA report below.
import torch
import torchvision
import torchaudio

# Report the PyTorch stack and whether a CUDA device is usable.
print("PyTorch:", torch.__version__)
print("Torchvision:", torchvision.__version__)
print("Torchaudio:", torchaudio.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version (from torch):", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())

Sentence Transformer:  5.0.0
Numpy: 2.0.1
PyTorch: 2.5.1
Torchvision: 0.20.1
Torchaudio: 2.5.1
CUDA available: True
CUDA version (from torch): 11.8
cuDNN version: 90100


# VectorDB Class

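We define a simple `VectorDB` class that stores each prompt from our dataset on disk together with its embedding vector. To find the closest stored prompt, it scores the query against every stored prompt with a cross-encoder.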

In [2]:
class VectorDB:
    """File-backed store of text prompts and their sentence embeddings.

    Two parallel text files live under ``path``: ``meta.txt`` holds one
    prompt per line and ``vectors.txt`` holds the embedding of the prompt
    on the same line number, written as space-separated numbers.

    Similarity lookups score the query against every stored prompt with a
    cross-encoder; the stored vectors are written but not read back by
    this class.
    """

    def __init__(self, path=""):
        """Load both models and start from an empty store under ``path``.

        Args:
            path: Directory for the store's files; created if missing. An
                empty string means the current working directory.

        Note:
            Any existing ``meta.txt`` / ``vectors.txt`` in ``path`` is
            wiped, because ``clear()`` is called on construction.
        """
        self.transformer_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.path = path
        self.similarity_checking_model = CrossEncoder('cross-encoder/stsb-roberta-base')
        # os.makedirs("") raises FileNotFoundError, so only create the
        # directory when a non-empty path was given.
        if self.path:
            os.makedirs(self.path, exist_ok=True)

        self.meta_path = os.path.join(self.path, "meta.txt")
        self.vector_path = os.path.join(self.path, "vectors.txt")
        self.clear()

    def add_vector(self, text: str):
        """Embed ``text`` and append it to the store unless already present.

        Line breaks inside ``text`` are replaced by spaces and surrounding
        whitespace is stripped, because the storage format is one record
        per line. Empty text (after normalisation) is ignored.

        Args:
            text: The prompt to store.
        """
        # A raw newline would split one prompt across several meta lines
        # and break the line-for-line pairing with vectors.txt.
        text = " ".join(text.splitlines()).strip()
        if not text:
            return

        # Skip prompts that are already stored (exact match after stripping).
        with open(self.meta_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip() == text:
                    return

        text_vector = self.transformer_model.encode(text)

        with open(self.meta_path, "a", encoding="utf-8") as f_meta:
            f_meta.write(text + "\n")

        with open(self.vector_path, "a", encoding="utf-8") as f_vec:
            f_vec.write(' '.join(map(str, text_vector)) + "\n")

    def get_most_similar(self, text):
        """Return the stored prompt most similar to ``text``.

        All (query, candidate) pairs are scored in a single ``predict``
        call so the model can batch them instead of being invoked once per
        stored prompt.

        Args:
            text: The query to compare against every stored prompt.

        Returns:
            A ``(prompt, score)`` tuple for the highest-scoring stored
            prompt (the earliest one on ties), or ``None`` when the store
            holds no prompts.
        """
        with open(self.meta_path, "r", encoding="utf-8") as f:
            candidates = [line.strip() for line in f if line.strip()]

        if not candidates:
            return None

        scores = self.similarity_checking_model.predict(
            [(text, candidate) for candidate in candidates]
        )
        # max() keeps the first maximal element, matching a strict ">" scan.
        best_score, best_text = max(
            zip(scores, candidates), key=lambda pair: pair[0]
        )
        return best_text, best_score

    def clear(self):
        """Empty the store, leaving both backing files present but empty."""
        for file_path in (self.meta_path, self.vector_path):
            # Mode "w" truncates an existing file or creates a missing one.
            with open(file_path, "w", encoding="utf-8"):
                pass

Sample code showing how to use the VectorDB class.

In [3]:
# Create a store backed by the local "VectorDB" directory. Construction
# loads both models and empties any existing store files in that directory.
vector_db = VectorDB("VectorDB")

In [4]:
# Store a few sample prompts, then look up the closest one for a query
# that is not stored verbatim. The result is a (prompt, score) tuple.
vector_db.add_vector("How to cook noodles?")
vector_db.add_vector("Hello World!")
vector_db.add_vector("How to cook pasta?")
print(vector_db.get_most_similar("How the FUCK do I cook spaghetti?"))

('How to cook noodles?', np.float32(0.84038234))


# Dataset Class

We define a dataset class that looks up the answer to a question by streaming the JSONL file line by line, rather than loading the entire dataset into memory.

In [5]:
class AgiDataset:
    """Question/answer dataset read lazily from a JSON Lines file.

    Each non-blank line of the file is a JSON object with a ``"user"``
    (question) and an ``"assistant"`` (answer) field. The file is
    re-read on every call, so the dataset is never held in memory.
    """

    def __init__(self, path: str):
        """Validate and remember the dataset location.

        Args:
            path: Path to an existing ``.jsonl`` file.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If ``path`` does not end with ``.jsonl``.
        """
        if not os.path.exists(path):
            raise FileNotFoundError("That path doesn't exist!")
        self.path = path
        if not path.endswith(".jsonl"):
            raise ValueError("AgiDataset class instantiation needs a path to a .jsonl file.")

    def get_answer_for(self, question):
        """Return the answer stored for ``question``, or ``None``.

        Questions are compared with surrounding whitespace stripped on
        both sides, so a question that was stored elsewhere in stripped
        form still finds its record.

        Args:
            question: The question text to look up.

        Returns:
            The ``"assistant"`` value of the first matching record, or
            ``None`` when no record matches.
        """
        if not isinstance(question, str):
            return None
        wanted = question.strip()

        with open(self.path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) are not valid JSON.
                if not line.strip():
                    continue
                obj = json.loads(line)
                if obj["user"].strip() == wanted:
                    return obj["assistant"]
        return None

    def add_questions_to_vector_db(self, vector_db: "VectorDB"):
        """Add every question in the dataset to ``vector_db``.

        Args:
            vector_db: Object exposing ``add_vector(text)``; it is called
                once per record with the record's ``"user"`` value.
        """
        with open(self.path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) are not valid JSON.
                if not line.strip():
                    continue
                obj = json.loads(line)
                vector_db.add_vector(obj["user"])

Sample code showing how to use the AgiDataset class.

In [6]:
# Point the dataset at the JSONL file of question/answer records.
dataset = AgiDataset("data/data.jsonl")

In [7]:
# Drop the sample prompts added earlier, then index every question from
# the dataset so requests can be matched against them.
vector_db.clear()
dataset.add_questions_to_vector_db(vector_db)

# Pipeline

We now walk through how an actual request is answered.

In [8]:
# Example request, phrased differently from any stored question.
user_request = "I want to pay my exam fees, how do I do it?"

# Step 1: find the stored question most similar to the request.
# NOTE: get_most_similar returns None when the store is empty, in which
# case this tuple unpacking would fail.
most_similar_string, similarity = vector_db.get_most_similar(user_request)
print(f"Most similar question in database: {most_similar_string}, with similarity of {similarity}.")

# Step 2: look up the answer recorded for that question in the dataset.
answer = dataset.get_answer_for(most_similar_string)
print(f"Answer for the request is: {answer}")

Most similar question in database: Where do I pay my exam fees?, with similarity of 0.7649897336959839.
Answer for the request is: Login to your student portal (https://sp.srmist.edu.in/srmiststudentportal/students/loginManager/youLogin.jsp) and click “Fee Payment” and click “Fee Details”. There you can see how much exam fees or fees in general you have to pay.


We wrap these two steps in the following function, which declines to answer when the best match falls below a similarity threshold.

In [9]:
def get_answer_for(request, vector_db, dataset, threshold=0.5):
    """Answer ``request`` using the closest stored question.

    Args:
        request: The user's request text.
        vector_db: Object whose ``get_most_similar(request)`` returns a
            ``(question, similarity)`` tuple, or ``None`` when it holds
            no questions.
        dataset: Object whose ``get_answer_for(question)`` returns the
            stored answer, or ``None`` when there is none.
        threshold: Minimum similarity required to answer; matches scoring
            below it are declined.

    Returns:
        An ``(answer, similarity)`` tuple. ``answer`` is the fallback
        message "I can't answer that." when nothing is stored, the best
        match is below ``threshold``, or the dataset has no answer for the
        matched question. ``similarity`` is ``0.0`` when nothing is stored.
    """
    fallback = "I can't answer that."

    match = vector_db.get_most_similar(request)
    if match is None:
        # Empty store: there is no candidate and therefore no score.
        return fallback, 0.0

    most_similar_string, similarity = match
    if similarity < threshold:
        return fallback, similarity

    answer = dataset.get_answer_for(most_similar_string)
    if answer is None:
        # The matched question has no record in the dataset.
        return fallback, similarity
    return answer, similarity

In [10]:
# Try the full pipeline on a request and show the answer with its score.
answer, similarity = get_answer_for("I want to see my exam timetable?", vector_db, dataset)
print(answer, similarity)

Login to your student portal (https://sp.srmist.edu.in/srmiststudentportal/students/loginManager/youLogin.jsp) and click “Scribe Request” on the left. Choose the month and year of your exams. There you can see the dates of your examinations. 0.7891468
