# LangChain: Q&A over Documents

In [1]:
import os

from dotenv import load_dotenv, find_dotenv
# Read the local .env file located by find_dotenv(); the result is bound to `_`
# so the notebook does not echo it.
# NOTE(review): presumably this is where the OpenAI credentials come from — confirm.
_ = load_dotenv(find_dotenv())

In [None]:
# Loading packages
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import CSVLoader, JSONLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.indexes.vectorstore import VectorstoreIndexCreator

# Required by every `with get_openai_callback() as cb:` cell below.
from langchain.callbacks import get_openai_callback

from IPython.display import Markdown

In [4]:
# Path of the CSV data set, relative to the notebook's working directory.
file = './Data.csv'
# Loader for that CSV file; it is handed to the index creator further down.
loader = CSVLoader(file_path=file)

In [None]:
# Embedding model (library defaults) passed to VectorstoreIndexCreator below.
embeddings = OpenAIEmbeddings()

In [None]:
# Configure the index builder first, then ingest the CSV loader's documents
# as a separate step.
csv_index_creator = VectorstoreIndexCreator(
    embedding=embeddings,
    vectorstore_cls=DocArrayInMemorySearch,
)
index = csv_index_creator.from_loaders([loader])

In [None]:
# Chat model used for the CSV question; max_tokens is set to 500.
llm_model = ChatOpenAI(model_name="gpt-4o", max_tokens=500)

In [None]:
# Natural-language question sent to the CSV-backed index in the next cell.
query = "I want a cheap pillow, which one should I choose ?"

In [None]:
# Ask the index the question, using `llm_model` to produce the answer.
response = index.query(query, llm=llm_model)

In [None]:
# Last expression of the cell: wraps the answer text so the notebook shows it as Markdown.
Markdown(response)

## Q&A over JSON Documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [None]:
# Rebind the chat model for the JSON section (no max_tokens argument here).
llm_model = ChatOpenAI(model_name="gpt-4o")

In [None]:
# JSON data set used for the rest of the notebook (rebinds `file`).
file = './august_ranked_player_count_info.json'

# The jq expression iterates the top-level array and, for each entry of a
# record's `.data` array, emits an object pairing the record's date with that
# game's id, player count, rank and Steam app id.
# NOTE(review): text_content=False is presumably needed because the extracted
# values are objects rather than strings — confirm against the JSONLoader docs.
json_loader = JSONLoader(
    file_path=file,
    jq_schema='.[] | {date: .date, games: .data[] | {game_id, player_count, game_rank, steam_app_id}}',
    text_content=False,
)

In [None]:
# Splitter handed to the index creator below via its `text_splitter` argument.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,  # chunk size as measured by length_function; increase as needed
    chunk_overlap=200,  # overlap between chunks, same unit; increase as needed
    length_function=len  # lengths are measured with the built-in len()
)

In [None]:
# Embedding model for the JSON index; same construction as in the CSV section.
embeddings = OpenAIEmbeddings()

In [None]:
# Configure the index builder (now with the custom splitter), then ingest the
# JSON loader's documents as a separate step. Rebinds `index`.
json_index_creator = VectorstoreIndexCreator(
    embedding=embeddings,
    text_splitter=text_splitter,
    vectorstore_cls=DocArrayInMemorySearch,
)
index = json_index_creator.from_loaders([json_loader])

In [None]:
# Per-game question for the JSON-backed index: player counts for one game id, sorted.
query1 = "pull out the player count trend for game id : 21095 , sort them by descending order of the player count"

In [None]:
# Only the tracked call sits inside the callback context; `cb` keeps the
# recorded usage after the block exits.
with get_openai_callback() as cb:
    response = index.query(query1, llm=llm_model)

# Usage report for the call above.
print(f"Total Tokens: {cb.total_tokens}")
print(f"Prompt Tokens: {cb.prompt_tokens}")
print(f"Completion Tokens: {cb.completion_tokens}")
print(f"Total Cost (USD): ${cb.total_cost}")

In [None]:
# Show the answer to `query1` as Markdown (last expression of the cell).
Markdown(response)

In [None]:
# Aggregation-style question for the JSON-backed index: top game ids over all dates.
query2 = "Find the top 20 game ids which have highest player count across all the dates"

In [None]:
# Only the tracked call sits inside the callback context; `cb` keeps the
# recorded usage after the block exits.
with get_openai_callback() as cb:
    response = index.query(query2, llm=llm_model)

# Usage report for the call above.
print(f"Total Tokens: {cb.total_tokens}")
print(f"Prompt Tokens: {cb.prompt_tokens}")
print(f"Completion Tokens: {cb.completion_tokens}")
print(f"Total Cost (USD): ${cb.total_cost}")

In [None]:
# Show the answer to `query2` as Markdown (last expression of the cell).
Markdown(response)

## Tackling Aggregations with JSON

In [7]:
import pandas as pd
import json
from langchain.agents import create_json_agent
from langchain.agents.agent_toolkits import JsonToolkit
from langchain.tools.json.tool import JsonSpec
from langchain.llms import OpenAI
from langchain.requests import TextRequestsWrapper

In [None]:
# Load the raw records into memory. The encoding is stated explicitly so the
# read does not fall back to the platform's locale default, which would make
# the result machine-dependent for non-ASCII content.
with open("./august_ranked_player_count_info.json", "r", encoding="utf-8") as j_file:
    json_data = json.load(j_file)

In [None]:
# def transform_data(data):
#     transformed = {}
#     for item in data:
#         date = item["date"]
#         games = {str(game["game_id"]): {
#                     "steam_app_id": game["steam_app_id"],
#                     "player_count": game["player_count"],
#                     "game_rank": game["game_rank"]
#                   } for game in item["data"]}
#         transformed[date] = games
#     return transformed

In [None]:
# Re-key the records by date: each record's 'date' maps to its 'data' payload.
# If a date occurs more than once the later record wins, exactly as with the
# index-based loop this comprehension replaces.
modified_json = {record['date']: record['data'] for record in json_data}

# # Transform the data
# modified_json = transform_data(json_data)

In [None]:
# Create a JsonSpec object
json_spec = JsonSpec(dict_=modified_json)

# Create a JsonToolkit
json_toolkit = JsonToolkit(spec=json_spec)

In [None]:
# Rebind the chat model to gpt-4o-mini for the JSON agent.
llm_model = ChatOpenAI(model_name="gpt-4o-mini")

In [None]:
# Agent that answers questions using `llm_model` and the JSON toolkit built above.
json_agent = create_json_agent(
    llm=llm_model,
    toolkit=json_toolkit,
    verbose=True,
    # Disabled options kept for reference (not passed to the agent):
    # max_iterations=30,
    # early_stopping_method="generate"
    # handle_parsing_errors = True
)

In [None]:
# query = "What is the game id which has highest player count"
# result = json_agent.run(query)
# print(result)

In [None]:
# Aggregation question handed to the JSON agent in the next cell (rebinds `query`).
query = "What is the highest player count on any day and what's the game id of it across all dates"

In [None]:
# Only the tracked agent run sits inside the callback context; `cb` keeps the
# recorded usage after the block exits.
with get_openai_callback() as cb:
    result = json_agent.run(query)

# Usage report for the run above.
print(f"Total Tokens: {cb.total_tokens}")
print(f"Prompt Tokens: {cb.prompt_tokens}")
print(f"Completion Tokens: {cb.completion_tokens}")
print(f"Total Cost (USD): ${cb.total_cost}")

In [None]:
# Show the agent's answer as Markdown (last expression of the cell).
Markdown(result)

## Achieving Aggregations with Normal Prompting

In [179]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [180]:
# Chat model for the plain-prompt approach.
llm_model = ChatOpenAI(model_name='gpt-4o-mini')

In [181]:
# Prompt text with two placeholders: {data} receives the game statistics and
# {question} receives the user's question.
template = """
You are an AI assistant specialized in analyzing game statistics data. Given the following JSON data structure representing daily game statistics:

{data}

Please answer the following question:
{question}

Provide a detailed answer with relevant calculations and explanations. If the question requires aggregation or analysis across multiple dates or games, perform the necessary calculations.
"""

In [182]:
# Turn the template string into a chat prompt object for use in the chain.
prompt = ChatPromptTemplate.from_template(template)

In [183]:
# Compose prompt -> chat model -> string output parser with the pipe operator.
chain = prompt | llm_model | StrOutputParser()

In [188]:
# Question for the plain-prompt chain (rebinds `query`).
query = "What is the highest player count on any day and what's the game id of it across last 40% of data"

In [None]:
# The template tells the model it is receiving JSON, so the records are
# serialized with json.dumps before templating. Passing the list directly
# would insert its Python repr instead (single-quoted keys, None/True/False),
# which is not valid JSON. `json` is the module already imported for loading
# `json_data`.
with get_openai_callback() as cb:
    result = chain.invoke({'data': json.dumps(json_data), 'question': query})

    # Usage recorded by the callback for the call above.
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Total Cost (USD): ${cb.total_cost}")