In [1]:
import os
import sys
from llama_index import SimpleDirectoryReader
from translate import Translator
# sys.path entries must be directories, not module files: register the project
# root (the parent of the `chatbot` package) so that `chatbot.llm_string_mapping`
# can be imported regardless of the current working directory.
sys.path.append("/home/k017/Documents/llm_studienprojekt")

import torch
from llama_index import ServiceContext, VectorStoreIndex, StorageContext, load_index_from_storage, \
    set_global_service_context
from llama_index.llms import HuggingFaceLLM
from transformers import AutoModelForCausalLM, AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from chatbot.llm_string_mapping import system_prompts_mapping, query_wrapper_prompts_mapping, model_mapping
# Key used to look up the model id and the prompt templates in the mappings
# imported from chatbot.llm_string_mapping.
model_str = "neural_chat"
# Location of the persisted vector stores, built from the parent directory of
# the current working directory plus "../vector_stores".
vector_stores_path = os.path.join(os.path.abspath(os.path.dirname(os.getcwd())), "../vector_stores")
vector_store_name = "index_test"
# Half precision only when a GPU is available; float32 on the CPU otherwise.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda" if torch.cuda.is_available() else "cpu"
# Source documents; only needed when the index has to be (re)built below.
documents = SimpleDirectoryReader(os.path.join(os.getcwd(), "file_storage/captures")).load_data()

model_id = model_mapping[model_str]
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_id)
llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    generate_kwargs={"temperature": 0.7, "do_sample": True},
    system_prompt=system_prompts_mapping[model_str],
    query_wrapper_prompt=query_wrapper_prompts_mapping[model_str],
    device_map="auto",
    # stopping_ids=[50278, 50279, 50277, 1, 0],
    # tokenizer_kwargs={"max_length": 4096},
    model_kwargs={"torch_dtype": torch_dtype}
)

# TODO: Test different embedding models? (jinaai/jina-embeddings-v2-base-en, BAAI/bge-small-en-v1.5)
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model="local:BAAI/bge-base-en-v1.5")
set_global_service_context(service_context)

# TODO: Attention! Currently, this always looks for the vector-store "index_test",
#  which was only built for one audio file.
#  When using on more or different one, this name should be changed!
persist_dir = os.path.join(vector_stores_path, vector_store_name)
try:
    st_cont = StorageContext.from_defaults(persist_dir=persist_dir)
    vector_index = load_index_from_storage(st_cont)
except Exception as load_error:
    # `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, and the reason for the fallback is reported instead
    # of being silently discarded.
    print(f"Could not load vector store from {persist_dir!r} ({load_error!r}); rebuilding it.")
    vector_index = VectorStoreIndex.from_documents(documents, service_context=service_context,
                                                   show_progress=True)
    vector_index.storage_context.persist(persist_dir=persist_dir)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]


Testfälle definieren

In [3]:
# Define the test cases.
# Each entry is a dict with two keys:
#   "frage"             - the question sent to the chat engine
#   "erwartete_antwort" - the expected (reference) answer it is compared against
# The questions are asked in this order on one chat engine, and several of them
# ("she", "it", "this") refer back to earlier questions.
test_cases = [
    {
        # NOTE(review): the question says "Annika" while the expected answer says "Anna" - confirm the name.
        "frage": "Who is  Annika Jüris?",
        "erwartete_antwort":  "Anna Jüris is a journalist and correspondent in France. She juggles two roles, covering both French politics and climate issues in the country.",
    },
    {
        "frage": "What does she say about the borders in context of climate change?",
        "erwartete_antwort": "She says that climate reporters research globally. But they don't recognize borders because the climate doesn't adhere to them.",
    },
    {
        # NOTE(review): same question and expected answer as the previous case -
        # confirm whether asking it twice is intentional.
        "frage": "What does she say about the borders in context of climate change?",
        "erwartete_antwort":"She says that climate reporters research globally. But they don't recognize borders because the climate doesn't adhere to them.",
    },
    {
        "frage": "What title does her book have?",
        "erwartete_antwort": "The book that she wrote has the title The Climate Dirt Lobby",

    },
    {
        # NOTE(review): the text after "hindering progress." looks like a truncated
        # copy of the next case's expected answer ("limate change deniers, ...") -
        # confirm whether it belongs here.
        "frage": "Can you tell me more about it?",
        "erwartete_antwort":"The book addresses the escalating emissions both in Germany and globally and explores the underlying reasons behind this phenomenon. It aims to shed light on the unanswered question of why emissions continue to rise and to identify the obstacles hindering progress.limate change deniers, those who say it's not made from human and the breakers. Those who say from morning to evening that the climate crisis is very bad and now we have to do a lot about it and we want to do that and that is the goal of our government.And in the end, however, prevent any law that would actually have dampened the climate crisis or that would then terminate the coal exit in 2038 or something like that",

    },
    {
        "frage": "According to this Interview, there are three different types of people in regard to the climate change. What types are this?",
        "erwartete_antwort": "climate change deniers, those who say it's not made from human and the breakers. Those who say from morning to evening that the climate crisis is very bad and now we have to do a lot about it and we want to do that and that is the goal of our government.And in the end, however, prevent any law that would actually have dampened the climate crisis or that would then terminate the coal exit in 2038 or something like that"
    },
    {
        "frage": "On what Group of people do they focus out of those three groups?",
        "erwartete_antwort": "Those who say from morning to evening that the climate crisis is very bad and now we have to do a lot about it and we want to do that and that is the goal of our government."
    },
    {
        "frage": "Does she think that the politicians don't understand that the climate change is there?",
        "erwartete_antwort": "She thinks that have understood it intellectually but they still fight against it ideologically."
    },
    {
        "frage": "What is the goal of the federal governemnt in this regard?",
        "erwartete_antwort": "The goal of the federal government is to have zero emissions by 2045."
    },
    {
        "frage": "Do Germany have enough wind turbines?",
        "erwartete_antwort": "No, they don't have enough wind turbines. They fight for every wind turbine in the village"
    },
    {
        "frage": "Why don't they have enough?",
        "erwartete_antwort": "It's not easy to build them up."
    },
    {
        "frage": "What is the biggest problem in Germany regarding climate change?",
        "erwartete_antwort": "The biggest problem is the coal energy and lobby."
    },
    {
        "frage": "When should this be abolished?",
        "erwartete_antwort": "The date of departure is 2038, later than in any other country, except Poland, in the EU."
    },
    {
        "frage": "What does she tell about Renate Künast?",
        "erwartete_antwort": "Renate Künast was in the green Party and the Minister of Agriculture."
    },
    {
        "frage": "In which Party has she been?",
        "erwartete_antwort": "Renate Künast has been in the green party."
    },
]


Condense Parameter Test

In [4]:
import Levenshtein
import pandas as pd
from itertools import product
def test_chat_engine(vector_index, test_cases):
    condense_factors = [0.3, 0.9]
    max_tokens_values = [150, 200]
    split_condense_threshold_values = [100, 150]
    query_engine_tool_probabilities = [0.3, 0.9]
    parameter_combinations = list(product(condense_factors, max_tokens_values, split_condense_threshold_values, query_engine_tool_probabilities))
    results = []
    x = 0
    distance_list = []
    list_answer = []
    list_expected_answer=[]
    list_question=[]
    for chat_mode in ["condense"]:
        for params in parameter_combinations:
            condense_factor, max_tokens, split_condense_threshold, query_engine_tool_probability = params
            chat_engine = vector_index.as_chat_engine(
                chat_mode=f"{chat_mode}_question",
                condense_factor=condense_factor,
                max_tokens=max_tokens,
                split_condense_threshold=split_condense_threshold,
                condense_strategy="extractive",
                query_engine_tool_probability=query_engine_tool_probability
            )
            for test_case in test_cases:
                frage = test_case["frage"]
                erwartete_antwort = test_case["erwartete_antwort"]
                
                response_chat = chat_engine.chat(frage)
                antwort = response_chat.response
                distance = Levenshtein.distance(antwort, erwartete_antwort)
                distance_list.append(distance)
                
                list_question.append(frage)
                list_answer.append(antwort)
                list_expected_answer.append(erwartete_antwort)

            results.append({
                "Chat Engine": chat_mode,
                "Condense Factor": condense_factor, 
                "max Tokens": max_tokens, 
                "split": split_condense_threshold,
                "Query engine prob.": query_engine_tool_probability,
                "Distance List": distance_list,
                "Questions": list_question,
                "Answers": list_answer,
                "Expected Answers": list_expected_answer
            })
    return results

# Run the parameter sweep over all test cases.
results = test_chat_engine(vector_index=vector_index, test_cases=test_cases)

# Collect the per-combination result dicts in a DataFrame.
df_results = pd.DataFrame(data=results)

# Persist the results as a CSV file (without the index column).
df_results.to_csv(path_or_buf="test_results.csv", index=False)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

KeyboardInterrupt: 

Condense Plus Context

In [6]:
import Levenshtein
import pandas as pd
from itertools import product
def test_chat_engine(vector_index, test_cases):
    distance_list = []
    list_answer = []
    list_expected_answer=[]
    list_question=[]
    results=[]
    for chat_mode in ["condense_plus_context"]:
        distance_list = []
        list_answer = []
        list_expected_answer = []
        list_question = []
        
        chat_engine = vector_index.as_chat_engine(
            chat_mode="condense_question",
            verbose=False,
        )
        for test_case in test_cases:
            frage = test_case["frage"]
            erwartete_antwort = test_case["erwartete_antwort"]
            
            response_chat = chat_engine.chat(frage)
            antwort = response_chat.response
            distance = Levenshtein.distance(antwort, erwartete_antwort)
            distance_list.append(distance)
            
            list_question.append(frage)
            list_answer.append(antwort)
            list_expected_answer.append(erwartete_antwort)

        results.append({
            "Chat Engine": chat_mode,
            "Distance List": distance_list,
            "Questions": list_question,
            "Answers": list_answer,
            "Expected Answers": list_expected_answer
        })

    return results

# Execute the test cases against the chat engine.
results = test_chat_engine(vector_index=vector_index, test_cases=test_cases)

# Turn the list of result dicts into a DataFrame.
df_results = pd.DataFrame(data=results)

# Write the DataFrame to a CSV file, omitting the index column.
df_results.to_csv(path_or_buf="test_results.csv", index=False)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [7]:
import Levenshtein
import pandas as pd
from itertools import product
from llama_index.memory import ChatMemoryBuffer
def test_chat_engine(vector_index, test_cases):
    list_question=[]
    list_expected_answer =[]
    list_answer=[]
    results=[]

    memory = ChatMemoryBuffer.from_defaults(token_limit=3900)
    chat_engine = vector_index.as_chat_engine(
        chat_mode="condense_plus_context",
        memory=memory,
        context_prompt=(
            "You are a chatbot, able to have normal interactions, as well as talk"
        ),
        verbose=False,
    )
    for test_case in test_cases:
        frage = test_case["frage"]
        erwartete_antwort = test_case["erwartete_antwort"]
        
        
        response_chat = chat_engine.chat(frage)
        antwort = response_chat.response
        distance = Levenshtein.distance(antwort, erwartete_antwort)
        
        list_question.append(frage)
        list_answer.append(antwort)
        list_expected_answer.append(erwartete_antwort)

    results.append({
        "Questions": list_question,
        "Answers": list_answer,
        "Expected Answers": list_expected_answer
    })
    
        # DataFrame erstellen
    df_results = pd.DataFrame(results)
    # CSV-Datei speichern
    csv_filename = f"test_results_query.csv"
    df_results.to_csv(csv_filename, index=False)

    # DataFrame ausgeben
    print(f"DataFrame für Chat Engine query engine:")
    print(df_results)

    return results
# Run the test cases.
results = test_chat_engine(vector_index=vector_index, test_cases=test_cases)

# Create the DataFrame from the returned result dicts.
df_results = pd.DataFrame(data=results)

# Store the DataFrame as CSV (index column left out).
df_results.to_csv(path_or_buf="test_results.csv", index=False)



No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setti

KeyboardInterrupt: 

In [11]:
!pip install scikit-learn
!pip install matplotlib
%matplotlib inline
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
# BERT checkpoint whose tokenizer and encoder are used by get_bert_embedding().
# Note: this rebinds the notebook-level names `tokenizer` and `model`.
model_name = 'bert-base-uncased'
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertModel.from_pretrained(model_name),
)

def get_bert_embedding(text):
    """Embed ``text`` as the mean of the model's last hidden states.

    Relies on the notebook-level ``tokenizer`` and ``model``. The input is
    truncated to at most 512 tokens before it is passed to the model.

    Returns:
        The token-averaged hidden state with the leading batch axis removed.
    """
    encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        hidden_states = model(**encoded).last_hidden_state
    # Average over the tokens (dim=1), then drop the batch axis.
    return hidden_states.mean(dim=1).squeeze(0)

import ast

# Path to the CSV file with the test results.
# NOTE(review): absolute, machine-specific path; the test cells write
# "test_results.csv" relative to the working directory - confirm both refer to
# the same file.
csv_datei_pfad = '/home/k017/Documents/llm_studienprojekt/test_results.csv'
df = pd.read_csv(csv_datei_pfad, sep=",")

# Only the last row of the file is evaluated.
row = df.iloc[-1]

# The list-valued columns are stored in the CSV as Python list literals.
# ast.literal_eval only accepts literals, whereas eval() would execute any code
# that ends up in the file.
questions = ast.literal_eval(row["Questions"])
given_answers = ast.literal_eval(row["Answers"])
expected_answers = ast.literal_eval(row["Expected Answers"])

print(f"Frage: {questions}")
print("="*50)

similarities = []

# Pair every question with its own answers; previously the raw, unparsed
# question list was printed for every single answer.
for question, given, expected in zip(questions, given_answers, expected_answers):
    given_embedding = get_bert_embedding(given)
    expected_embedding = get_bert_embedding(expected)

    # cosine_similarity expects 2-D inputs, hence the reshape to (1, n_features).
    similarity = cosine_similarity(given_embedding.reshape(1, -1), expected_embedding.reshape(1, -1))

    print(f"Frage: {question}")
    print(f"Gegebene Antwort: {given}")
    print(f"Erwartete Antwort: {expected}")
    print(f"Cosine Similarity: {similarity[0][0]}")
    print("-"*50)

    similarities.append(similarity[0][0])

print("="*50)
print("="*50)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Frage: ['Who is  Annika Jüris?', 'What does she say about the borders in context of climate change?', 'What does she say about the borders in context of climate change?', 'What title does her book have?', 'Can you tell me more about it?', 'According to this Interview, there are three different types of people in regard to the climate change. What types are this?', 'On what Group of people do they focus out of those three groups?', "Does she think that the politicians don't understand that the climate change is there?", 'What is the goal of the federal governemnt in this regard?', 'Do Germany have enough wind turbines?', "Why don't they have enough?", 'What is the biggest problem in Germany regarding climate change?', 'When should this be a