In [1]:
import pandas as pd
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.chains import TransformChain
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain.schema import Document
from langchain.load import dumps, loads
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_core.runnables import RunnablePassthrough

from matplotlib import pyplot as plt

In [2]:
# --- Notebook configuration -------------------------------------------------
# Location of the review CSV and the number of documents the plain retriever
# returns per search.
DATASET_PATH = './Dataset/instagram.csv'
K_RETRIEVER_VALUE = 5

# OpenAI clients shared by every later cell. No API key is passed here, so
# credentials must come from outside this cell (presumably the environment).
openai_embedding = OpenAIEmbeddings()
llm = ChatOpenAI(temperature=0)

# The question driving the whole pipeline. Swap in one of the alternatives
# below to run the notebook against a different question.
query = (
    "What are the specific features or aspects that users appreciate "
    "the most in our application?"
)
# query = "What are the primary reasons users express dissatisfaction with Instagram?"
# query = "Can you identify emerging trends or patterns in recent user reviews that may impact our product strategy?"

### Store to Vector DB

In [None]:
# Only the first N_REVIEWS rows are used by this notebook. Passing `nrows`
# stops the parser after that many rows instead of loading the whole file
# and slicing it afterwards.
N_REVIEWS = 5000

df = pd.read_csv(DATASET_PATH, nrows=N_REVIEWS)
# One dict per review row; used later to build the documents to index.
dict_data = df.to_dict(orient="records")
df.info()

In [None]:
# A bare expression is rendered only when it is the last line of a cell, so
# the preview has to be displayed explicitly here.
display(df.head(3))
# Number of reviews per rating value.
print(df['rating'].value_counts())

In [None]:
# Missing values per column; rendered explicitly because it is not the last
# expression of the cell.
null_counts = df.isna().sum()
display(null_counts)

# Count of fully duplicated rows, shown as the cell's output value.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

In [None]:
# Preview the first three rows of the loaded reviews.
display(df.iloc[:3])

In [None]:
# Measure how long each review is in tokens, to inform the chunk size used
# when splitting the documents.
data = df['review_description']
encoding = tiktoken.get_encoding('cl100k_base')

# Empty CSV cells are loaded by pandas as NaN (a float), while `encode`
# expects a string; treat missing reviews as empty text so one bad row does
# not abort the whole pass.
review_texts = data.fillna('').astype(str)

token_count = [len(encoding.encode(text)) for text in review_texts]
# `default=0` keeps the result defined when there are no reviews at all.
max_token = max(token_count, default=0)
print(f"Highest Token Count : {max_token}")

fig, ax = plt.subplots()
ax.hist(token_count, bins=100, color='cyan', edgecolor='black')
ax.set_title('Token Count Distribution')
ax.set_xlabel('Number of Tokens')
ax.set_ylabel('Frequency')
plt.show()

In [None]:

# Wrap every review in a Document, carrying rating and date as metadata.
documents = [
    Document(
        page_content=item["review_description"],
        metadata={"rating": item["rating"], "review_date": item["review_date"]}
    )
    for item in dict_data
    # Rows with missing review text (NaN) carry nothing to embed; skip them.
    if isinstance(item["review_description"], str)
]

# Token-based splitter: chunks of at most 120 tokens with a 10-token overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=120,
    chunk_overlap=10
)

print("Splitting...")
splits = text_splitter.split_documents(documents)
print("Storing...")
# Index the chunks produced above. Passing the unsplit `documents` here would
# silently discard the splitting step.
vector_store = Chroma.from_documents(documents=splits, embedding=openai_embedding)
retriever = vector_store.as_retriever(search_kwargs={"k": K_RETRIEVER_VALUE})


In [None]:
# Sanity check of the plain similarity retriever against the question.
print(f"Question : {query}")

# `invoke` is the runnable entry point used for every other retriever and
# chain call in this notebook; `get_relevant_documents` is deprecated.
results = retriever.invoke(query)
for result in results:
    print(f"Review Chunk: {result.page_content}")
    print(f"Metadata: {result.metadata}")
    print()

### Query Translation

In [None]:
# Prompt asking the model to rephrase the question from several angles, one
# alternative per line.
template = """You are an AI language model assistant. Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search. Please focus on the clarity of the question and add more details to it. Provide these alternative questions separated by newlines. Original question: {query}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)


def _split_into_queries(text):
    """Split the model's reply into one query per non-empty line.

    Blank or whitespace-only lines are dropped so they are never used as
    search queries, and surrounding whitespace is trimmed from the rest.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


# prompt -> chat model -> plain string -> list of query strings
generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | _split_into_queries
)

print(generate_queries.invoke({"query":query}))

### Query Construction

In [11]:
# Short description of what each stored document contains, handed to the
# self-query retriever together with the metadata schema below.
document_content_description = "User rating of an application"

# (name, description, type) for every metadata attribute the retriever may
# reference when it builds a query.
# NOTE(review): review_date is declared as "integer" — confirm this matches
# how the CSV actually stores the date.
_attribute_specs = (
    (
        "review_date",
        "The time when the review was submitted",
        "integer",
    ),
    (
        "rating",
        "A user rating scale ranging from 1 to 5, where 1 indicates poor quality and 5 represents excellent quality",
        "float",
    ),
)
metadata_field_info = [
    AttributeInfo(name=attr_name, description=attr_description, type=attr_type)
    for attr_name, attr_description, attr_type in _attribute_specs
]

# Retriever that uses the LLM to build its query from the question before
# searching the vector store.
self_retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vector_store,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
)

self_retriver_result = self_retriever.invoke(query)

In [None]:
# Inspect the self-query retriever output: the raw list, how many documents
# it holds, then each document on its own line.
retrieved_count = len(self_retriver_result)
print(self_retriver_result)
print(retrieved_count)

for retrieved_doc in self_retriver_result:
    print(retrieved_doc)

### Retrieval

In [13]:
def get_only_unique(documents):
    """Flatten per-query retrieval results and drop duplicate documents.

    Args:
        documents: An iterable of iterables of documents, one inner iterable
            per query.

    Returns:
        A list with each distinct document exactly once, in the order it was
        first encountered. Two documents count as equal when their
        serialized (`dumps`) forms are identical.
    """
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while preserving first-seen order; a set
    # would return the documents in an arbitrary order that can change from
    # run to run.
    unique_docs = list(dict.fromkeys(serialized))
    return [loads(doc) for doc in unique_docs]

def process_multiple_queries(inputs):
    """Run every query through the self-query retriever and merge the hits.

    Args:
        inputs: Mapping whose "query" entry holds the queries to run, either
            a list of strings or a single string.

    Returns:
        A dict with one key, "documents", holding the de-duplicated
        documents retrieved across all queries.
    """
    queries = inputs["query"]
    # A bare string would otherwise be iterated character by character.
    if isinstance(queries, str):
        queries = [queries]
    # Skip blank entries (e.g. empty lines left over from splitting the
    # model's reply) so no retrieval call is spent on an empty query.
    cleaned_queries = [q.strip() for q in queries if q and q.strip()]

    retrieval_results = [self_retriever.invoke(q) for q in cleaned_queries]
    unique_docs = get_only_unique(retrieval_results)

    return {"documents": unique_docs}


# Expand the original question into alternative phrasings using the
# query-generation chain; the result feeds the multi-query retrieval below.
queries = generate_queries.invoke({"query":query})

In [None]:
# Show the generated query variants before they are used for retrieval.
print(queries)

In [None]:
# Expose the multi-query retrieval function as a chain step that maps
# "query" to "documents".
retrieval_chain = TransformChain(
    transform=process_multiple_queries,
    input_variables=["query"],
    output_variables=["documents"],
)

# Feed the generated query variants through the chain and show the raw result.
retrieval_inputs = {"query": queries}
docs = retrieval_chain.invoke(retrieval_inputs)
print(docs)


In [None]:
# List the merged retrieval output: the document count first, then each
# document on its own line.
test = docs['documents']
document_total = len(test)
print(document_total)
for retrieved_doc in test:
    print(retrieved_doc)

### Generation

In [None]:
# Answering prompt: retrieved reviews go into {context}, the user question
# into {query}.
template = """You are an AI assistant for question-answering tasks. Use the following pieces of retrieved context and information to answer the question. If you don't know the answer, say that you don't know. If the data is not relevant to the question, don't use the data. Context: {context} Question: {query}"""
prompt = ChatPromptTemplate.from_template(template)


def _format_context(documents):
    """Render retrieved documents as plain text for the prompt.

    Each document contributes its review text followed by its metadata;
    entries are separated by a blank line.
    """
    return "\n\n".join(
        f"Review: {doc.page_content}\nMetadata: {doc.metadata}"
        for doc in documents
    )


generation_chain = (
    # Each prompt slot must pick its own field from the input dict. Using a
    # passthrough for both would put the entire input dict (question and
    # documents together) into {context} and again into {query}.
    {
        "context": lambda inputs: _format_context(inputs["context"]),
        "query": lambda inputs: inputs["query"],
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Last expression of the cell: the generated answer is shown as its output.
generation_chain.invoke({"query": query, "context": docs['documents']})
