In [1]:
import pandas as pd
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.chains import TransformChain
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain.schema import Document
from langchain.load import dumps, loads
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_core.runnables import RunnablePassthrough

from matplotlib import pyplot as plt

In [2]:
# --- Configuration -----------------------------------------------------------
DATASET_PATH = './Dataset/instagram.csv'
K_RETRIEVER_VALUE = 5        # number of chunks returned per similarity search
MAX_REVIEWS = 10000          # only the first MAX_REVIEWS rows of the CSV are indexed
CHUNK_SIZE_TOKENS = 120      # splitter chunk size (token-based splitter)
CHUNK_OVERLAP_TOKENS = 10    # tokens shared between neighbouring chunks

# NOTE(review): no API key is passed here; presumably the OpenAI clients read
# their credentials from the environment -- confirm before running.
openai_embedding = OpenAIEmbeddings()

llm = ChatOpenAI(temperature=0)

# --- Load the reviews --------------------------------------------------------
df = pd.read_csv(DATASET_PATH)[:MAX_REVIEWS]
dict_data = df.to_dict(orient="records")

# One Document per review; rating and date travel along as metadata so they
# remain available after the text has been split into chunks.
documents = [
    Document(
        page_content=item["review_description"],
        metadata={"rating": item["rating"], "review_date": item["review_date"]}
    )
    for item in dict_data
]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=CHUNK_SIZE_TOKENS,
    chunk_overlap=CHUNK_OVERLAP_TOKENS
)

# --- Split and index ---------------------------------------------------------
print("Splitting...")
splits = text_splitter.split_documents(documents)
print("Storing...")
# Index the chunks produced above. The previous version passed `documents`
# here, which left `splits` unused and embedded the unsplit reviews.
vector_store = Chroma.from_documents(documents=splits, embedding=openai_embedding)
retriever = vector_store.as_retriever(search_kwargs={"k": K_RETRIEVER_VALUE})
print("Complete")


Splitting...
Storing...
Complete


In [6]:
# Prompt that asks the LLM to rephrase the user question five ways, one per line.
template = """You are an AI language model assistant. Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search. Please focus on the clarity of the question and add more details to it. Provide these alternative questions separated by newlines. Original question: {query}"""

prompt_perspectives = ChatPromptTemplate.from_template(template)

# Chain: {"query": str} -> list[str] of alternative questions.
generate_queries = (
            prompt_perspectives
            | llm
            | StrOutputParser()
            # The model may separate questions with blank lines; strip each line
            # and drop empty ones so no empty query is sent to the retriever.
            | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
        )

# Metadata fields the self-query retriever may turn into filters.
# NOTE(review): `rating` is declared as float; confirm this matches the dtype
# actually stored in the vector store metadata.
metadata_field_info = [

    AttributeInfo(
        name="rating",
        description="A user rating scale ranging from 1 to 5, where 1 indicates poor quality and 5 represents excellent quality",
        type="float"
    ),
]

document_content_description = "User rating of an application"

# Retriever that lets the LLM derive a structured query (search text plus
# metadata filter) from a natural-language question.
self_retriever = SelfQueryRetriever.from_llm(
    llm,
    vector_store,
    document_content_description,
    metadata_field_info,
)


In [7]:
def get_only_unique(documents):
    """Flatten per-query result lists and drop duplicate documents.

    Args:
        documents: Iterable of lists, one list of retrieved documents per query.

    Returns:
        A flat list with each distinct document once, in the order it was
        first encountered. Two documents count as duplicates when their
        `dumps` serialisation is identical.
    """
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while preserving first-seen order; the
    # previous set() gave an arbitrary order that changed between runs.
    return [loads(item) for item in dict.fromkeys(serialized)]

def process_multiple_queries(inputs):
    """Run every query through the self-query retriever and merge the results.

    Args:
        inputs: Mapping with key "query" holding either a list of query
            strings or a single query string.

    Returns:
        {"documents": [...]} with the de-duplicated documents from all queries.
    """
    queries = inputs["query"]
    # A bare string would otherwise be iterated character by character.
    if isinstance(queries, str):
        queries = [queries]
    # Skip blank entries (e.g. empty lines from the query-expansion step) so
    # no retriever call is spent on an empty query.
    queries = [query for query in queries if isinstance(query, str) and query.strip()]

    retrieval_results = [self_retriever.invoke(query) for query in queries]
    unique_docs = get_only_unique(retrieval_results)

    return {"documents": unique_docs}

# Chain wrapper around process_multiple_queries:
# {"query": [str, ...]} -> {"documents": [Document, ...]}
retrieval_chain = TransformChain(
    input_variables=["query"],
    output_variables=["documents"],
    transform=process_multiple_queries
)

# Prompt for the final answer; expects `context` and `query`.
template = """You are an AI assistant for question-answering tasks. Use the following pieces of retrieved context and information to answer the question. If you don't know the answer, say that you don't know. If the data is not relevant to the question, don't use the data. Put newline symbol if necessary. Context: {context} Question: {query}"""
prompt = ChatPromptTemplate.from_template(template)


def _format_context(docs):
    """Render retrieved documents as plain text for the prompt.

    Each document becomes one paragraph: its metadata (if any) in parentheses
    followed by its text. A plain string is returned unchanged, and items
    without `page_content` fall back to `str(item)`.
    """
    if isinstance(docs, str):
        return docs
    rendered = []
    for doc in docs:
        text = getattr(doc, "page_content", None)
        if text is None:
            text = str(doc)
        metadata = getattr(doc, "metadata", None) or {}
        header = ", ".join(f"{key}: {value}" for key, value in metadata.items())
        rendered.append(f"({header}) {text}" if header else text)
    return "\n\n".join(rendered)


# Chain: {"context": documents, "query": str} -> answer string.
# Each prompt variable is taken from its own input key. The previous version
# mapped both keys to RunnablePassthrough(), which passes its input through
# unchanged, so both placeholders received the entire input dict.
generation_chain = (
    {
        "context": lambda inputs: _format_context(inputs["context"]),
        "query": lambda inputs: inputs["query"],
    }
    | prompt
    | llm
    | StrOutputParser()
)



In [9]:
from html import escape as html_escape

from flask import Flask, url_for,redirect, render_template, Response,request

# Pass the module's real name (the variable, not the literal string
# '__name__') so Flask can resolve the application's root path.
app = Flask(__name__)

@app.route('/')
def home():
    """Render the question form and, when a prompt is supplied, its answer.

    Reads the `prompt` query-string parameter. For a non-blank prompt the
    request runs query expansion, retrieval and answer generation using the
    module-level chains, then renders `index.html` with the prompt and the
    answer. A missing or blank prompt renders the page with an empty result.
    """
    prompt = request.args.get('prompt', '')

    result = ""
    # Whitespace-only prompts are treated as empty so they trigger no LLM calls.
    if prompt.strip():
        queries = generate_queries.invoke({"query":prompt})
        print(f"Query Translation Length : {len(queries)}")
        # The module-level retrieval_chain is reused; it does not need to be
        # rebuilt on every request.
        docs = retrieval_chain.invoke({"query": queries})
        print("Relevant docs length : "+ str(len(docs['documents'])))
        result = generation_chain.invoke({"query": prompt, "context":docs['documents']})
        # The answer is model output built from user input and review text, so
        # escape it before adding our own <br> markup.
        # NOTE(review): this assumes index.html renders `result` without
        # auto-escaping (e.g. `|safe`) -- confirm against the template.
        result = html_escape(result).replace('\n', '<br>')

    return render_template('index.html', prompt=prompt, result=result)

if __name__ == "__main__":
    # debug=True enables the interactive debugger; keep this for local use only.
    # use_reloader=False stops the server from restarting the process it runs in.
    app.run(port=5050,debug=True, use_reloader=False)


 * Serving Flask app '__name__'
 * Debug mode: on


 * Running on http://127.0.0.1:5050
Press CTRL+C to quit
127.0.0.1 - - [13/Jan/2025 03:39:41] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2025 03:39:41] "GET /static/js/script.js HTTP/1.1" 404 -
127.0.0.1 - - [13/Jan/2025 03:39:41] "GET /static/styles/index.css HTTP/1.1" 304 -


Query Translation Length : 5
Relevant docs length : 20


127.0.0.1 - - [13/Jan/2025 03:39:54] "GET /?prompt=What+are+the+specific+features+or+aspects+that+users+appreciate+the+most+in+our+application? HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2025 03:39:54] "GET /static/styles/index.css HTTP/1.1" 304 -
127.0.0.1 - - [13/Jan/2025 03:39:54] "GET /static/js/script.js HTTP/1.1" 404 -


Query Translation Length : 5
Relevant docs length : 17


127.0.0.1 - - [13/Jan/2025 03:40:10] "GET /?prompt=What+are+the+specific+features+or+aspects+that+users+appreciate+the+most+in+our+application?+Make+it+in+points! HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2025 03:40:10] "GET /static/styles/index.css HTTP/1.1" 304 -
127.0.0.1 - - [13/Jan/2025 03:40:10] "GET /static/js/script.js HTTP/1.1" 404 -


Query Translation Length : 5
Relevant docs length : 17


127.0.0.1 - - [13/Jan/2025 03:40:31] "GET /?prompt=What+are+the+primary+reasons+users+express+dissatisfaction+with+Instagram? HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2025 03:40:31] "GET /static/styles/index.css HTTP/1.1" 304 -
127.0.0.1 - - [13/Jan/2025 03:40:31] "GET /static/js/script.js HTTP/1.1" 404 -


Query Translation Length : 5
Relevant docs length : 15


127.0.0.1 - - [13/Jan/2025 03:40:48] "GET /?prompt=Can+you+identify+emerging+trends+or+patterns+in+recent+user+reviews+that+may+impact+our+product+strategy? HTTP/1.1" 200 -
127.0.0.1 - - [13/Jan/2025 03:40:48] "GET /static/styles/index.css HTTP/1.1" 304 -
127.0.0.1 - - [13/Jan/2025 03:40:48] "GET /static/js/script.js HTTP/1.1" 404 -
