In [30]:
import csv
from langchain.docstore.document import Document 
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

columns_to_embed=["Match Name","Series Name", "Match Date","Team1 Name", "Team1 Runs Scored","Team1 Wickets Fell","Team2 Name","Team2 Runs Scored","Team2 Wickets Fell","Match Venue (Stadium)","Match Venue (City)","Match Venue (Country)","Umpire 1","Umpire 2","Match Referee","Toss Winner","Toss Winner Choice","Match Winner","Match Result Text"]
columns_to_metadata = ["Match Name","Series Name", "Match Date","Team1 Name", "Team1 Runs Scored","Team1 Wickets Fell","Team2 Name","Team2 Runs Scored","Team2 Wickets Fell","Match Venue (Stadium)","Match Venue (City)","Match Venue (Country)","Umpire 1","Umpire 2","Match Referee","Toss Winner","Toss Winner Choice","Match Winner","Match Result Text"]

model="llama3"
llm = ChatOllama(model=model, temperature=0)


In [31]:
# Process the CSV into the embedable content vs the metadata and put it into Document format so that we can chunk it into pieces.
docs = []
with open('t20i_Matches_Data.csv', newline="", encoding='utf-8-sig') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for i, row in enumerate(csv_reader):
        to_metadata = {col: row[col] for col in columns_to_metadata if col in row}
        values_to_embed = {k: row[k] for k in columns_to_embed if k in row}
        to_embed = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in values_to_embed.items())
        newDoc = Document(page_content=to_embed, metadata=to_metadata)
        docs.append(newDoc)

print(docs[0])

page_content='Match Name: Australia Vs India Only T20I\nSeries Name: India tour of Australia  - 2007 (2007/08)\nMatch Date: 2008-02-01\nTeam1 Name: India\nTeam1 Runs Scored: 74.0\nTeam1 Wickets Fell: 10.0\nTeam2 Name: Australia\nTeam2 Runs Scored: 75.0\nTeam2 Wickets Fell: 1.0\nMatch Venue (Stadium): Melbourne Cricket Ground\nMatch Venue (City): Melbourne\nMatch Venue (Country): Australia\nUmpire 1: BNJ Oxenford\nUmpire 2: SJA Taufel\nMatch Referee: JJ Crowe\nToss Winner: India\nToss Winner Choice: bat\nMatch Winner: Australia\nMatch Result Text: Australia won by 9 wickets (with 52 balls remaining)' metadata={'Match Name': 'Australia Vs India Only T20I', 'Series Name': 'India tour of Australia  - 2007 (2007/08)', 'Match Date': '2008-02-01', 'Team1 Name': 'India', 'Team1 Runs Scored': '74.0', 'Team1 Wickets Fell': '10.0', 'Team2 Name': 'Australia', 'Team2 Runs Scored': '75.0', 'Team2 Wickets Fell': '1.0', 'Match Venue (Stadium)': 'Melbourne Cricket Ground', 'Match Venue (City)': 'Melbou

In [32]:
splitter = CharacterTextSplitter(separator = "\n",
                                chunk_size=500, 
                                chunk_overlap=0,
                                length_function=len)
documents = splitter.split_documents(docs)

In [33]:
ollama_emb = OllamaEmbeddings(
    model=model,
)
vectorstore = Chroma.from_documents(
    documents,
    embedding=ollama_emb,
)

In [34]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

metadata_field_info = [
    AttributeInfo(
        name="Match Name",
        description="Name of the match",
        type="string",
    ),
    AttributeInfo(
        name="Series Name",
        description="Name of the series",
        type="string",
    ),
    AttributeInfo(
        name="Rating",
        description="The rating of the product as a number from 0 to 5. Ex. 4.5",
        type="string",
    ),
    AttributeInfo(
        name="Description", 
        description="Description of the product", type="string"
    ),
    AttributeInfo(
        name="Features", 
        description="Features of the product", 
        type="string"
    ),
]


retriever = vectorstore.as_retriever(
    search_type="similarity",
)

In [76]:
# message = """
# Answer this question using the provided context only.

# {question}

# Context:
# {context}
# """

# prompt = ChatPromptTemplate.from_messages([("human", message)])

# rag_chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm

# response = rag_chain.invoke("when did the first australia vs india match happen")

# print(response.content)

from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage
from operator import itemgetter
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

### Contextualize question ###
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)


### Answer question ###
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "The context provided has data regarding cricket matches played for years"
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


### Statefully manage chat history ###
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)
# response = rag_chain.invoke("among all the matches that have happened, when did the first australia vs india match happen")
config = {"configurable": {"session_id": "first"}}

conversational_rag_chain.invoke(
    {"input": "How many matches did we have Match Winner as India"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

Parent run ba749533-2ce9-4a17-9a0b-c1def5ddfbfe not found for run 2b4d5649-5a95-47c8-b8a5-9052cfa3b182. Treating as a root run.


'Based on the provided context, there are three matches where the match winner was India:\n\n1. India Vs Ireland 12Th Match Group A (2009)\n2. India Vs Pakistan 19Th Match Super 10 Group 2 (2016)\n3. Australia Vs India 1St T20I (2020)'

In [77]:
conversational_rag_chain.invoke(
    {"input": "how many total cricket matches are there in the context"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

Parent run cdb928ce-bc0d-4221-82fc-73c93fd27556 not found for run 65327ee4-cc5f-4daf-a710-05f0427d4ab2. Treating as a root run.


'Based on the provided context, there are 4 matches mentioned:\n\n1. Ireland vs ? (2009)\n2. ? vs ? (abandoned with a toss)\n3. ? vs ? (abandoned with a toss)\n4. New Zealand vs ? (2016)\n\nNote that not all match details are provided in the context, but there are 4 mentions of matches in total.'

In [79]:
vectorstore.delete_collection()