In [1]:
import chromadb

chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [2]:
from chromadb.utils import embedding_functions

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


Created collection (name: test_<name_of_the_game>)

- cyberpunk2077 "test_cyberpunk2077"
- Monster Hunter World "test_mhw"
- dota2 "test_dota2"

In [10]:
collection = chroma_client.get_collection('test_cyberpunk2077')

Create collection

In [16]:
collection = chroma_client.create_collection(name="test_dota2", embedding_function=sentence_transformer_ef)

In [17]:
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter

file_dir_path = Path("dota2/")

loader = DirectoryLoader(str(file_dir_path), glob="./*.txt", loader_cls=TextLoader)
docs = loader.load()

# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(docs)

# create documents list and metadata list for Chroma
documents = []
metadata = []
for doc in docs:
    documents.append(doc.page_content)
    metadata.append(doc.metadata)

In [18]:
# add the documents to the collection
collection.add(
    documents=documents,
    ids=[str(i) for i in range(len(documents))],
    metadatas=metadata
)

---

In [19]:
collection.peek()

{'ids': ['0', '1', '10', '11', '12', '13', '14', '15', '16', '17'],
 'embeddings': [[-0.08958929032087326,
   -0.001978689106181264,
   0.024290025234222412,
   -0.026174167171120644,
   0.009678468108177185,
   -0.05608528107404709,
   -0.016450325027108192,
   0.042121537029743195,
   0.017129477113485336,
   0.06613221019506454,
   -0.08582614362239838,
   -0.046830203384160995,
   -0.0449550598859787,
   -0.022964831441640854,
   0.041414257138967514,
   0.018190573900938034,
   0.05469305068254471,
   -0.09486570954322815,
   0.05145211145281792,
   -0.057726457715034485,
   0.02094111405313015,
   -0.10176771134138107,
   -0.023256609216332436,
   0.07869853079319,
   -0.04862517863512039,
   0.05026165768504143,
   -0.050506118685007095,
   0.03541887179017067,
   -0.08270946890115738,
   -0.03742228448390961,
   -0.017057787626981735,
   0.07862571626901627,
   -0.008853079751133919,
   -0.06284867972135544,
   0.021295974031090736,
   0.003843621350824833,
   0.105389393866062

---

LLM retrieval with Chromadb docker

In [12]:
from langchain_community.llms import Ollama

In [13]:
llm = Ollama(model="llama2")        # assuming the port is 11434

In [25]:
from langchain.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

db = Chroma(collection_name="test_dota2", client=chroma_client, embedding_function=embedding_function)

In [26]:
# make a chain

from langchain.chains import RetrievalQA

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=db.as_retriever(),
    return_source_documents=True
)

In [27]:
## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [31]:
prompt = 'What is the game about?'

# retriever
retriever = db.as_retriever(search_kwargs={'k': 5})     # define number of documents to retrieve

docs = retriever.get_relevant_documents(prompt)

print(len(docs))
print('\n\n')
for doc in docs:
    print(doc.page_content)
    print()
    print('Source:', end='')
    print(doc.metadata['source'])
    print('\n\n')

5



Play a match every evening for a couple of weeks, and you start to see how Dota 2's wealth of disparate systems and mechanics combine into their own harmony, and you begin to understand how there are hundreds of elements that affect the game. Dota 2 is a tense war of accumulation and attrition. The biggest problem, which is coincidentally where the real excitement of the game lies, comes from struggling to process and interpret dozens upon dozens of mitigating circumstances while simultaneously trying to keep your cool.

Source:dota2/dota2_03.txt



Aiding these two teams, named the Radiant and the Dire, are waves of AI assistants, called creeps. Batches of creeps spawn at 30-second intervals and charge merrily up the map's three pathways. In the clumps of remaining land lies a jungle, where numerous AI opponents spawn, offering lucrative benefits to players who take them on successfully. Finally, and in a bid to stop both teams from simply marching into one another's base, each t

---

In [32]:
# make a chain

# create the chain to answer questions 
chain = RetrievalQA.from_chain_type(llm=llm, 
                                    chain_type="stuff", 
                                    retriever=retriever, 
                                    return_source_documents=True)

# full example
llm_response = chain.invoke(prompt)
llm_response

{'query': 'What is the game about?',
 'result': "Based on the given context, Dota 2 is a multiplayer online battle arena (MOBA) game that combines various elements such as RPG, RTS, and twitch-based action-RPG. The game involves two teams, Radiant and Dire, competing to destroy each other's base while defending their own. The game has a steep learning curve and requires players to work together and coordinate their actions effectively to succeed.\n\nThe gameplay involves right-clicking on points on the map to move to them, right-clicking on enemies to attack them, and using special abilities and items bought at shops around the map to gain an advantage over the opponents. The game is known for its complexity and depth, with hundreds of elements that affect the gameplay, making it a challenging and rewarding experience for players who commit to learning it.\n\nIn summary, Dota 2 is a complex and deep MOBA game that requires teamwork, strategy, and quick reflexes to succeed.",
 'source_d

In [30]:
prompt = \
'''you are a gamer who are reading reviews of a game to understand the characteristics of the game, then deciding whether purchasing the game or not. Generate seven short sentence for each aspect in ['Gameplay', 'Audio', 'Graphics', 'Community', 'Performance', 'Bug', 'Suggestion'].} Output them in json format as {'ASPECT':'SUMMARY'}. Output 'NA' in the 'SUMMARY' if the review does not contain content related to that 'ASPECT'. Do not output other thing except the json.'''

llm_response = chain(prompt)
process_llm_response(llm_response)

{
"ASPECT": "Gameplay",
"SUMMARY": "Dota 2 is complicated and exhausting, but also rewarding for those who commit to learning its complexities."
},
{
"ASPECT": "Audio",
"SUMMARY": "The game features a variety of voice lines and sound effects that add to the overall immersion."
},
{
"ASPECT": "Graphics",
"SUMMARY": "Dota 2's visuals are slick and polished, with detailed character models and environments."
},
{
"ASPECT": "Community",
"SUMMARY": "The game has a dedicated community of players who create and share custom content, such as taunts and announcers."
},
{
"ASPECT": "Performance",
"SUMMARY": "Dota 2 is available for free-to-play and runs smoothly on most computers, but some players may experience lag or other performance issues."
},
{
"ASPECT": "Bug",
"SUMMARY": "The game may have some bugs or glitches, but the developers are actively working to fix them."
},
{
"ASPECT": "Suggestion",
"SUMMARY": "Some players may suggest improvements to the game's design or features, but the overa

---

In [66]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_core.prompts import PromptTemplate

# system_template = \
# '''You are a reviewer of the game. Use the following pieces of context to answer any question about the game.
# If you don't know the answer, just say 'NA'. Do NOT try to make up an answer.
# ---
# {context}'''

# prompt, let say write a summary of the game with some predefined aspects
# Gameplay, Graphics, Sound, Performance, Bug, Suggestion, Price, Overall

# TODO: fine-tune the prompt to use the theory I stated below
prompt_template = \
'''You are reading reviews of a game to understand the characteristics of the game. Use the following pieces of context to answer user's question. 

{summaries}

Question: {question}

If you don't know the answer, just output a json with all values in the json as 'NA'. Do NOT try to make up an answer.
Only output the JSON. Do NOT output other text.'''

my_question = \
'''Extract the following aspects of the game from the reviews. Output a json with each of the aspects as key, and the extracted information as the value. The format of the json is {"ASPECT":"INFORMATION"}. The aspects are: [Gameplay, Graphics, Sound, Performance, Bug, Suggestion, Price, Overall]
'''

chain =  RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=prompt_template,
            input_variables=["summaries", "question"],
        )
    },
    return_source_documents=True,
)

In [67]:
response = chain.invoke(
    {
        'question': my_question
    }
)

response

{'question': 'Extract the following aspects of the game from the reviews. Output a json with each of the aspects as key, and the extracted information as the value. The format of the json is {"ASPECT":"INFORMATION"}. The aspects are: [Gameplay, Graphics, Sound, Performance, Bug, Suggestion, Price, Overall]\n',
 'answer': '{\n"Gameplay": "Incredibly deep and rewarding gameplay",\n"Graphics": "Beautiful and varied worlds, each with different stats attached that you can appreciate and compare. When it launched Monster Hunter World quickly earned its place as one of the best looking console games available. Now it’s done the same on PC.",\n"Sound": "Engaging story",\n"Performance": "Multiplayer matches can be fiddly to set up",\n"Bug": "NA",\n"Suggestion": "NA",\n"Price": "NA",\n"Overall": "One of the best games I have ever played. It is that rich. It is that deep. It is that good."\n}',
 'sources': '',
 'source_documents': [Document(page_content='Monster Hunter World: Just as good on PC R

In [68]:
print(response['answer'])

{
"Gameplay": "Incredibly deep and rewarding gameplay",
"Graphics": "Beautiful and varied worlds, each with different stats attached that you can appreciate and compare. When it launched Monster Hunter World quickly earned its place as one of the best looking console games available. Now it’s done the same on PC.",
"Sound": "Engaging story",
"Performance": "Multiplayer matches can be fiddly to set up",
"Bug": "NA",
"Suggestion": "NA",
"Price": "NA",
"Overall": "One of the best games I have ever played. It is that rich. It is that deep. It is that good."
}


---

Chain of thought (break large task to smaller task)

- Prompt the llm for each aspect of the game.
- Then ask llm to output a json with better summarizing.

It performs better, as allows knowledge distills to each task.

In [33]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_core.prompts import PromptTemplate

# system_template = \
# '''You are a reviewer of the game. Use the following pieces of context to answer any question about the game.
# If you don't know the answer, just say 'NA'. Do NOT try to make up an answer.
# ---
# {context}'''

# prompt, let say write a summary of the game with some predefined aspects
# Gameplay, Graphics, Sound, Performance, Bug, Suggestion, Price, Overall

# TODO: fine-tune the prompt to use the theory I stated below

prompt_template = \
'''You are reading reviews of a game to understand the characteristics of the game. Use the following pieces of context to answer user's question. 

{summaries}

Question: {question}

If you don't know the answer, output only "NA". Do NOT try to make up an answer. Do NOT output other text.'''

my_question_template = \
'''Extract the the following aspect of the game from the reviews. Output a paragraph with less than 200 words. The aspect is: '''

aspects = ['Gameplay', 'Sound', 'Graphics', 'Performance', 'Bug', 'Suggestion', 'Price', 'Overall']
aspects_response = {k: '' for k in aspects}

for aspect in aspects:
    my_question = my_question_template + f'{aspect}'
    print(my_question)



    chain =  RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={
            "prompt": PromptTemplate(
                template=prompt_template,
                input_variables=["summaries", "question"],
            )
        },
        return_source_documents=True,
    )

    response = chain.invoke(
        {
            'question': my_question
        }
    )

    print(response)
    print('\n\n')
    print(response['answer'])
    aspects_response[aspect] = response['answer']
    print('\n\n')
    print(response['source_documents'])

    print('\n\n\n')

Extract the the following aspect of the game from the reviews. Output a paragraph with less than 200 words. The aspect is: Gameplay
{'question': 'Extract the the following aspect of the game from the reviews. Output a paragraph with less than 200 words. The aspect is: Gameplay', 'answer': "Gameplay: Dota 2 is a game of near infinite depth and variety that rewards dedication and teamwork with a brilliantly social experience. The game is complicated, exhausting, and sometimes cruel, but its many complexities form an incredibly satisfying and exciting multiplayer game. The game's punishing design is sometimes enough to drive you up the wall, but it's worth sticking the hours in: success in Dota 2 is about learning to effectively juggle both the broad strokes and finer details. The game's wealth of disparate systems and mechanics combine into their own harmony, and understanding how there are hundreds of elements that affect the game takes time and dedication.", 'sources': '', 'source_docu

In [34]:
for k, v in aspects_response.items():
    print(k)
    print(v)
    print('\n\n')

Gameplay
Gameplay: Dota 2 is a game of near infinite depth and variety that rewards dedication and teamwork with a brilliantly social experience. The game is complicated, exhausting, and sometimes cruel, but its many complexities form an incredibly satisfying and exciting multiplayer game. The game's punishing design is sometimes enough to drive you up the wall, but it's worth sticking the hours in: success in Dota 2 is about learning to effectively juggle both the broad strokes and finer details. The game's wealth of disparate systems and mechanics combine into their own harmony, and understanding how there are hundreds of elements that affect the game takes time and dedication.



Sound
NA. According to the reviews, there is no information provided about the sound design of Dota 2.



Graphics
The reviewers mention the graphics of Dota 2 in the following way:

* "Valve's artists deserve praise for a crisp and readable style that, after some practice, makes it possible to tell what's 

In [35]:
str(aspects_response)

'{\'Gameplay\': "Gameplay: Dota 2 is a game of near infinite depth and variety that rewards dedication and teamwork with a brilliantly social experience. The game is complicated, exhausting, and sometimes cruel, but its many complexities form an incredibly satisfying and exciting multiplayer game. The game\'s punishing design is sometimes enough to drive you up the wall, but it\'s worth sticking the hours in: success in Dota 2 is about learning to effectively juggle both the broad strokes and finer details. The game\'s wealth of disparate systems and mechanics combine into their own harmony, and understanding how there are hundreds of elements that affect the game takes time and dedication.", \'Sound\': \'NA. According to the reviews, there is no information provided about the sound design of Dota 2.\', \'Graphics\': \'The reviewers mention the graphics of Dota 2 in the following way:\\n\\n* "Valve\\\'s artists deserve praise for a crisp and readable style that, after some practice, ma

In [36]:
from langchain_core.prompts import ChatPromptTemplate

system_template = \
'''You are reading reviews of a game to understand the characteristics of the game. Use the following pieces of context to answer user's question.
'''

summary_template = \
'''Extract the following aspects of the game from the reviews, and write a short 20 words description for each aspect. The aspects are: [Gameplay, Graphics, Sound, Performance, Bug, Suggestion, Price, Overall]. Output a JSON with each of the aspects as key, and the information as the value. Only output the JSON. Do NOT output other text.

The context is wrapped by three consecutive apostrophes. The context is as follows:
\'\'\'
{context}
\'\'\'
'''

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_template),
    ("human", summary_template),
])

chain = chat_prompt | llm
response = chain.invoke({"context":str(aspects_response)})

print(response)

Here is the JSON output:

{
"Gameplay": "Near infinite depth and variety that rewards dedication and teamwork with a brilliantly social experience.",
"Graphics": "Crisp and readable style that makes it possible to tell what's going on even in massive brawls.",
"Sound": "NA",
"Performance": "Complex and challenging, but also satisfying and exciting for those who are willing to put in the effort to learn and play the game.",
"Bug": "NA",
"Suggestion": "Extensive customization options for players to personalize their experience, including new taunts, announcers, and HUD skins created and voted into the game by the community.",
"Price": "Free-to-play, with no in-game purchases or pay-to-win elements.",
"Overall": "Excellent game that offers a unique and rewarding experience for players, despite some challenges in the learning curve and online community."
}
