In [1]:
import _sample_reviews

In [2]:
from datetime import datetime
from hashlib import sha224

# create an object with review_ID, review_text and the datetime of creation
def create_review(review_text, review_ID):    
    review_obj = {
        "review_ID": review_ID,
        "review_text": review_text,
        "datetime": datetime.now()
    }

    # create a hash of the review_text for creating unique collection with Chromadb
    hash = sha224(str(review_obj).encode()).hexdigest()
    review_obj['hash'] = hash

    return review_obj

# the temporary review_ID
review_ID = 123
# select a comment to test with
review_text = _sample_reviews.sample_fyp_433
review_obj = create_review(review_text, review_ID)

print(review_obj)
print('\n\n')
print('Len of review_text:', len(review_text.split()))


{'review_ID': 123, 'review_text': 'Great time! played many hours on console first! now trying to get my collection to switch to pc. Highly recommend', 'datetime': datetime.datetime(2024, 3, 21, 13, 17, 51, 4601), 'hash': 'b40752733fe1d3059228a21bd596ee23e16e85e4b25f7138027da300'}



Len of review_text: 20


In [29]:
# define the llm to run with (different LLMs)

from langchain_community.llms import Ollama

llm_mistral7b = Ollama(
    model='mistral:7b-instruct-v0.2-q4_0', temperature=0.4,
    num_gpu = 1,        # disable/enable gpu for testing
)

llm_mixtral8x7b = Ollama(
    model='mixtral:8x7b-instruct-v0.1-q4_0', temperature=0.4,
    num_gpu = 1,        # disable/enable gpu for testing
)

llm_llama2_7b = Ollama(
    model='llama2:7b-chat-q4_0', temperature=0.4, num_gpu = 1
)
# llm_gemma_7b = Ollama()
# llm_gemma_2b = Ollama()

# for looping through the LLM models
Llm_models = {
    'mistral_7b': llm_mistral7b,
    # 'mixtral_8x7b': llm_mixtral8x7b,
    'llama2_7b': llm_llama2_7b
}

In [4]:
# for embedding in testing, we use sentence transformer
# sentence transformer embed text into vector space with dimension 384 for semantic search.

from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.vectorstores import Chroma

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

embedding_func = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

def custom_len_func(text:str) -> int:
  return len(tokenizer.encode(text, add_special_tokens=False))

text_spliter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
  tokenizer=tokenizer,
  chunk_size=250,
  chunk_overlap=40,
)

docs = text_spliter.create_documents(
  [review_obj['review_text']], metadatas=[{"source":"review_01"}]
)

# define in-memory db and retriever for RAG
db = Chroma.from_documents(documents=docs, embedding=embedding_func, collection_name=review_obj['hash'])
retriever = db.as_retriever(search_kwargs={"k": 5})

  from .autonotebook import tqdm as notebook_tqdm
Downloading tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 270kB/s]
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 657kB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 794kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 373kB/s]


In [37]:
# define callbacks for detecting token usage

from langchain_core.callbacks.base import BaseCallbackHandler
from langchain_core.outputs.llm_result import LLMResult
from collections import deque

class TokenUsageCallbackHandler(BaseCallbackHandler):

    def __init__(self, deque: deque = None):
        super().__init__()
        self.deque = deque

    def on_llm_end(self, response: LLMResult, **kwargs) -> None:
        # print(response)

        generation = response.generations[0][0]
        gen_info = generation.generation_info

        # get token usage
        # ref: https://github.com/orgs/langfuse/discussions/1179
        token_usage = gen_info.get('prompt_eval_count', 0) + gen_info.get('eval_count', 0)
        # get time costed (local machine)
        # instead of getting total duration, we get the prompt_eval_duration and eval_duration to exclude the load duration (e.g. to load the model to the gpu, etc.)
        time_costed = gen_info.get('prompt_eval_duration', 1e-10) + gen_info.get('eval_duration', 1e-10)     # in ns, a small value to indicate a inf time when it fails


        # create an object to store the token usage and time costed
        token_usage_obj = {
            'token_usage': token_usage,
            'time_costed': time_costed
        }

        # append the object to the deque
        self.deque.append(token_usage_obj)



common_deque = deque()
chain_config = {
    "callbacks": [TokenUsageCallbackHandler(common_deque)],
}

In [38]:
def combine_token_usage_obj(token_usage_obj_list:list[dict]) -> dict:
    token_usage = 0
    time_costed = 0
    for obj in token_usage_obj_list:
        token_usage += obj['token_usage']
        time_costed += obj['time_costed']

    return {
        'token_usage': token_usage,
        'time_costed': time_costed
    }

---

Task 1: Spam detection

A chain to first prompt reasons and a pre-mature decision of whether the review is a prompt, then a second prompt is used to determine whether the review is a prompt.

Evaluation: accuracy of classification

In [39]:
import _prompts

In [40]:
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate

def prompt_spam_detection(llm_model_name:str, llm_model):

    chat_prompt_01 = ChatPromptTemplate.from_messages([
        ("system", _prompts.SYSTEM_TEMPLATE),
        ("user", _prompts.SPAM_TEMPLATE_01)
    ])

    chain_01 = chat_prompt_01 | llm_model

    resp_01 = chain_01.invoke({
        "review":str(review_text)
    }, config=chain_config)

    token_usage_01 = common_deque.popleft()

    chat_prompt_02 = ChatPromptTemplate.from_messages([
        ("system", _prompts.SYSTEM_TEMPLATE),
        ("user", _prompts.SPAM_TEMPLATE_01),
        ("ai", resp_01),
        ("user", _prompts.SPAM_TEMPLATE_02)
    ])

    chain_02 = chat_prompt_02 | llm_model

    resp_02 = chain_02.invoke({
        "review":str(review_text)
    }, config=chain_config)

    token_usage_02 = common_deque.popleft()


    # get token usage result and time costed
    token_usage_result = combine_token_usage_obj([token_usage_01, token_usage_02])

    # show result:
    print('-' * 10 + f'Result for {llm_model_name}' + '-' * 10)
    print(resp_01); print(); print(resp_02)
    print('Token usage:', token_usage_result['token_usage'])
    print('Time costed: {:.04} s'.format(float(token_usage_result['time_costed']) / 1e9))
    print('\n\n')

    return {
        'resp_01': resp_01,
        'resp_02': resp_02,
        'token_usage': token_usage_result['token_usage'],
        'time_costed': token_usage_result['time_costed']
    }


In [41]:
# loop over the LLM models

spam_detection_result = {}

for llm_model_name, llm_model in Llm_models.items():
    spam_detection_result[llm_model_name] = prompt_spam_detection(llm_model_name, llm_model)

----------Result for mistral_7b----------
 1. The review contains positive sentiment towards the game, which is not unusual or suspicious in itself. However, it's quite short and lacks specific details that would typically be found in a more informative review.
2. The review mentions playing the game on both console and PC, but it does not provide any comparison or indication of why the reviewer is trying to switch platforms. This lack of context could potentially raise suspicions if this pattern were consistent across multiple reviews.
3. The review ends with a recommendation, which is a common feature in genuine reviews. However, the absence of any additional information (such as the game's name or genre) makes it difficult to verify the authenticity of the recommendation.

 NO.
Token usage: 469
Time costed: 3.282 s



----------Result for llama2_7b----------
Based on the provided review, here are three reasons why I believe it is not a spam for a game:

1. The reviewer mentions play

Task 2: Summarization (content extraction with RAG)

Extract related content with vector storage and write a summary for the extracted aspects

Then output a JSON with the aspect as key, and the summary as value

Evaluation: hallucination, relevence of the summaries, ability to ahere with the output format.

We test with 10 aspects. Different combinations in wrapping the aspects were tested before, and (3, 3, 4) balances token efficiency and output quality the best.

To evaluate the ability to ahere with the output format, we run the same prompt 5 times and observe the output consistency.


In [42]:
GAME_ASPECTS = ['Gameplay', 'Narrative', 'Accessibility', 'Sound', 'Graphics & Art Design', 'Performance', 'Bug', 'Suggestion', 'Price', 'Overall']

In [51]:
def prompt_aspect_extraction(llm_model_name:str, llm_model, repeat_count=5):

    # aspects_response = {k: '' for k in GAME_ASPECTS}
    responses = {}

    for (start, end) in [(0, 3), (3, 6), (6, 10)]:
        aspects = GAME_ASPECTS[start:end]
        responses[str(aspects)] = {}        # storing the result

        rag_question = _prompts.QUESTION_TEMPLATE_01 + f"{'is ' if len(aspects) <= 1 else 'are '}" + ': ' + f'{aspects}'
        output_format_tempate = _prompts.OUTPUT_FORMAT_TEMPATE.format(
            aspects_list_01=str(aspects)[1:-1].replace('\'', '\"'), output_json_template=str({k: '...' for k in aspects}).replace('\'', '\"')
        )

        relevant_docs = retriever.get_relevant_documents(query=rag_question, k=5)

        for i in range(repeat_count):

            chat_prompt_01 = ChatPromptTemplate.from_messages([
                ("system", _prompts.SYSTEM_TEMPLATE),
                ("user", _prompts.KEYWORD_TEMPLATE_01)
            ])

            chain_01 = chat_prompt_01 | llm_model

            print('The prompt:', chat_prompt_01.format_messages(
                aspects = str(aspects),
                output_format = output_format_tempate,
                summaries = str('\n'.join([d.page_content for d in relevant_docs]))
            ))

            resp_01 = chain_01.invoke({
                "aspects":str(aspects),
                "output_format":output_format_tempate,
                "summaries": str('\n'.join([d.page_content for d in relevant_docs]))
            }, config=chain_config)

            token_usage_01 = common_deque.popleft()

            # get token usage result and time costed
            token_usage_result = combine_token_usage_obj([token_usage_01])

            # show result:
            print('-' * 10 + f'Result for {llm_model_name} at attempt={i+1}' + '-' * 10)
            print(resp_01); print()
            print('Token usage:', token_usage_result['token_usage'])
            print('Time costed: {:.04} s'.format(float(token_usage_result['time_costed']) / 1e9))
            print('\n\n')

            responses[str(aspects)][f'attempt_{i}'] = {
                'resp_01': resp_01,
                'token_usage': token_usage_result['token_usage'],
                'time_costed': token_usage_result['time_costed']
            }

    return responses


In [52]:
aspect_extraction_result = {}

for llm_model_name, llm_model in Llm_models.items():
    aspect_extraction_result[llm_model_name] = prompt_aspect_extraction(llm_model_name, llm_model)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


The prompt: [SystemMessage(content="You are reading reviews of a game to understand the characteristics of the game. Use the following pieces of context to answer user's question."), HumanMessage(content='You are reading reviews of a game to understand the characteristics of the game. Extract the following aspect of the game from the reviews.\nThe aspects are [\'Gameplay\', \'Narrative\', \'Accessibility\']. For each aspect, output a paragraph with less than 50 words. Then create a JSON with apsects name as key and the paragraph as value.\nOutput the JSON as a single line with no spaces between the key, value pairs. For example, if the aspects are "Gameplay", "Narrative", "Accessibility", the JSON should be: {"Gameplay": "...", "Narrative": "...", "Accessibility": "..."}\nOnly output the JSON. Do NOT output other text.\n\nGreat time! played many hours on console first! now trying to get my collection to switch to pc. Highly recommend\n\nIf you don\'t know the answer, output only "NA". 

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


----------Result for mistral_7b at attempt=5----------
 {"Gameplay": "Played many hours on console, recommending the game.", "Narrative": "NA", "Accessibility": "NA"}

Token usage: 32
Time costed: 0.5554 s



The prompt: [SystemMessage(content="You are reading reviews of a game to understand the characteristics of the game. Use the following pieces of context to answer user's question."), HumanMessage(content='You are reading reviews of a game to understand the characteristics of the game. Extract the following aspect of the game from the reviews.\nThe aspects are [\'Sound\', \'Graphics & Art Design\', \'Performance\']. For each aspect, output a paragraph with less than 50 words. Then create a JSON with apsects name as key and the paragraph as value.\nOutput the JSON as a single line with no spaces between the key, value pairs. For example, if the aspects are "Sound", "Graphics & Art Design", "Performance", the JSON should be: {"Sound": "...", "Graphics & Art Design": "...", "Performan

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


----------Result for mistral_7b at attempt=5----------
 {"Sound": "Review mentions playing the game for many hours on console without specifying anything about the sound.",
 "Graphics & Art Design": "No explicit mention of graphics or art design in this review.",
 "Performance": "NA"}

Token usage: 51
Time costed: 0.8704 s



The prompt: [SystemMessage(content="You are reading reviews of a game to understand the characteristics of the game. Use the following pieces of context to answer user's question."), HumanMessage(content='You are reading reviews of a game to understand the characteristics of the game. Extract the following aspect of the game from the reviews.\nThe aspects are [\'Bug\', \'Suggestion\', \'Price\', \'Overall\']. For each aspect, output a paragraph with less than 50 words. Then create a JSON with apsects name as key and the paragraph as value.\nOutput the JSON as a single line with no spaces between the key, value pairs. For example, if the aspects are "Bug", "Suggest

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


----------Result for mistral_7b at attempt=5----------
 {"Bug": "NA", "Suggestion": "Highly recommend switching from console to PC version", "Price": "NA", "Overall": "Great time played many hours on console"}

Token usage: 42
Time costed: 0.7424 s



The prompt: [SystemMessage(content="You are reading reviews of a game to understand the characteristics of the game. Use the following pieces of context to answer user's question."), HumanMessage(content='You are reading reviews of a game to understand the characteristics of the game. Extract the following aspect of the game from the reviews.\nThe aspects are [\'Gameplay\', \'Narrative\', \'Accessibility\']. For each aspect, output a paragraph with less than 50 words. Then create a JSON with apsects name as key and the paragraph as value.\nOutput the JSON as a single line with no spaces between the key, value pairs. For example, if the aspects are "Gameplay", "Narrative", "Accessibility", the JSON should be: {"Gameplay": "...", "Narrative

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


----------Result for llama2_7b at attempt=5----------
{"Gameplay": "...Great time! played many hours on console first! now trying to get my collection to switch to pc...", "Narrative": "...", "Accessibility": "..."}

Token usage: 41
Time costed: 0.6909 s



The prompt: [SystemMessage(content="You are reading reviews of a game to understand the characteristics of the game. Use the following pieces of context to answer user's question."), HumanMessage(content='You are reading reviews of a game to understand the characteristics of the game. Extract the following aspect of the game from the reviews.\nThe aspects are [\'Sound\', \'Graphics & Art Design\', \'Performance\']. For each aspect, output a paragraph with less than 50 words. Then create a JSON with apsects name as key and the paragraph as value.\nOutput the JSON as a single line with no spaces between the key, value pairs. For example, if the aspects are "Sound", "Graphics & Art Design", "Performance", the JSON should be: {"Sound": 

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


----------Result for llama2_7b at attempt=5----------
{"Sound": "Great soundtrack and immersive audio experience.", "Graphics & Art Design": "Visually stunning with detailed character designs and environments.", "Performance": "Smooth gameplay with minimal lag or glitches."}

Token usage: 51
Time costed: 0.8594 s



The prompt: [SystemMessage(content="You are reading reviews of a game to understand the characteristics of the game. Use the following pieces of context to answer user's question."), HumanMessage(content='You are reading reviews of a game to understand the characteristics of the game. Extract the following aspect of the game from the reviews.\nThe aspects are [\'Bug\', \'Suggestion\', \'Price\', \'Overall\']. For each aspect, output a paragraph with less than 50 words. Then create a JSON with apsects name as key and the paragraph as value.\nOutput the JSON as a single line with no spaces between the key, value pairs. For example, if the aspects are "Bug", "Suggestion", "Pri

Task 3: keyword generation

Given the extracted summary from task 2, we asked LLM to generate not more than 5 keywords/keyphrase to describe each aspect. Also output a JSON object, at once.

However, if there is no content (or NA) for an aspect, there should be no keyword, or only 'NA' for that aspect

Evaluation: hallucination, relevent of the keywords to the generated summaries, ability to ahere with the output format.

We have least strigent output format for that (there's no need to pass a output format for this task), as passing one will shift the attention towards the output format, ignoring the context. Also, the context itself is a good example of the expected output format.

Passing 10 aspects at once.

In our application, task 2 and task 3 is chained (i.e. response from task 2 will be directly applied to task 3)

In [53]:
# we provide 10 generated aspects response from different reviews
# instead of using the response from different models, we believed this is more controlled and thus better evaluate the performance of the models
# the aspects response are first generated using mixtral8x7b and then manually reviewed and corrected to ensure the quality of the response

import _aspects_responses

In [54]:
def prompt_aspect_keyword_extraction(llm_model_name:str, llm_model, aspects_response:dict, repeat_count=5):

    responses = {}

    for i in range(repeat_count):

        chat_prompt_01 = ChatPromptTemplate.from_messages([
            ("system", _prompts.SYSTEM_TEMPLATE),
            ("user", _prompts.KEYWORD_TEMPLATE_02)
        ])

        chain_01 = chat_prompt_01 | llm_model

        resp_01 = chain_01.invoke({
            "aspects":GAME_ASPECTS,
            "context": aspects_response
        }, config=chain_config)

        token_usage_01 = common_deque.popleft()
        
        token_usage_result = combine_token_usage_obj([token_usage_01])

        # show result:
        print('-' * 10 + f'Result for {llm_model_name} at attempt={i+1}' + '-' * 10)
        print(resp_01); print()
        print('Token usage:', token_usage_result['token_usage'])
        print('Time costed: {:.04} s'.format(float(token_usage_result['time_costed']) / 1e9))
        print('\n\n')

        responses[f'attempt_{i}'] = {
            'resp_01': resp_01,
            'token_usage': token_usage_result['token_usage'],
            'time_costed': token_usage_result['time_costed']
        }

    return responses


In [56]:
aspect_keyword_extraction_result = {}

for i, aspects_response in enumerate(_aspects_responses.ALL_ASPECTS_RESPONSES):

    aspect_keyword_extraction_result_per_aspect_response = {}
    print('-'* 10 + f'Aspect response {i:02}' + '-'* 10)

    for llm_model_name, llm_model in Llm_models.items():
        aspect_keyword_extraction_result_per_aspect_response[llm_model_name] = prompt_aspect_keyword_extraction(llm_model_name, llm_model, aspects_response, repeat_count=2)

    aspect_keyword_extraction_result[f'aspects_response_{i:02}'] = aspect_keyword_extraction_result_per_aspect_response

    print('\n\n')

----------Aspect response 00----------
----------Result for mistral_7b at attempt=1----------
 {
"Gameplay": ["performance issues", "running 25-35 fps", "disappointing for $70 title"],
"Narrative": ["NA"],
"Accessibility": ["NA"],
"Sound": ["sound details not mentioned"],
"Graphics & Art Design": ["odd appearance", "max settings", "disabling up-scaling doesn't help"],
"Performance": ["poorly optimized", "running 25-35 fps on both low and ultra settings", "not acceptable for $70 title"],
"Bug": ["NA"],
"Suggestion": ["run at 50-60 fps on lowest settings", "optimize settings to get better frames"],
"Price": ["$70"],
"Overall": ["time-consuming optimization process", "refund not possible due to hours invested"]
}

Token usage: 426
Time costed: 4.109 s



----------Result for mistral_7b at attempt=2----------
 ```json
{
  "Gameplay": ["performance issues", "25 - 35 fps on low settings", "disappointing for $70 title"],
  "Narrative": ["NA"],
  "Accessibility": ["NA"],
  "Sound": ["sound det

KeyboardInterrupt: 