In [None]:
### EXECUTED IN CHATBOT ENVIRONMENT ###
import pandas as pd
import json
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

from src.chatbot.agents.agent_openai_tools import CampusManagementOpenAIToolsAgent
from src.chatbot.tools.search_web_tool import visited_links
from src.config.core_config import settings

# Read the CSV
df = pd.read_csv("data/questions_dataset_reviewed_translated_de_en_cut.csv")
df_copy = df.head(2).copy()

all_results = []

for i, row in df_copy.iterrows():
    # ------------------------------------------------------------
    # 1) GET GERMAN QUESTION
    # ------------------------------------------------------------
    question_de = row["german_question_text_q"]
    print(question_de)
    question_id = row["question_id_q"]
    settings.language = "Deutsch"

    # Now create a fresh agent with memory
    agent_executor_de = CampusManagementOpenAIToolsAgent.run(
        language="Deutsch",
        memory=ConversationBufferWindowMemory(
            memory_key="chat_history", return_messages=True, k=0
        ),
    )
    # Chatbot returns an answer in German
    result_de = agent_executor_de(question_de)
    visited_links_de = visited_links()
    visited_links.clear()

    # ------------------------------------------------------------
    # 2) GET ENGLISH QUESTION
    # ------------------------------------------------------------
    question_en = row["english_question_text_q"]
    print(question_en)
    settings.language = "English"

    # Create a fresh agent with memory
    agent_executor_en = CampusManagementOpenAIToolsAgent.run(
        language="English",
        memory=ConversationBufferWindowMemory(
            memory_key="chat_history", return_messages=True, k=0
        ),
    )
    # Chatbot should now return an answer in English
    result_en = agent_executor_en(question_en)
    visited_links_en = visited_links()
    visited_links.clear()

    # ------------------------------------------------------------
    # 3) Combine results
    # ------------------------------------------------------------
    row_dict = {
        "question_id": question_id,
        "question_de": question_de,
        "output_de": result_de["output"],
        "visited_links_de": visited_links_de,
        "question_en": question_en,
        "output_en": result_en["output"],
        "visited_links_en": visited_links_en,
    }
    all_results.append(row_dict)

# ------------------------------------------------------------
# 4) SAVE RESULTS
# ------------------------------------------------------------
with open("chatbot_responses.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

df_copy["chatbot_answer_de"] = [res["output_de"] for res in all_results]
df_copy["chatbot_answer_en"] = [res["output_en"] for res in all_results]
df_copy["chatbot_visited_urls_de"] = [
    ";".join(res["visited_links_de"]) for res in all_results
]
df_copy["chatbot_visited_urls_en"] = [
    ";".join(res["visited_links_en"]) for res in all_results
]

df_copy.to_csv("data/chatbot_data/chatbot_answers_first_20_de_en_test.csv", index=False, quoting=1)


### Add context from chatobot answer, the search query used for the website search and the latency of the whole process

In [2]:
import os
from langsmith import Client
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime, timedelta

load_dotenv()

# 1) Connect to LangSmith
api_key = os.getenv("LANGCHAIN_API_KEY")
project_name = os.getenv("LANGCHAIN_PROJECT")
endpoint = os.getenv("LANGCHAIN_ENDPOINT")
client = Client(api_key=api_key)

# 2) List all runs of type "tool"
tool_runs = client.list_runs(
    project_name=project_name,
    start_time=datetime.now() - timedelta(days=6),
    run_type="tool"
)

# 3) Create a DataFrame (df_tool_runs) with columns: run_id, parent_run_id, start_time, status
tool_runs_dict = [
    {
        "run_id": str(run.id),  # or run.id if you want the UUID object
        "parent_run_id": str(run.parent_run_id) if run.parent_run_id else None,
        "start_time": run.start_time,
        "status": run.status,
    }
    for run in tool_runs
]
df_tool_runs = pd.DataFrame(tool_runs_dict)
df_tool_runs.sort_values("start_time", inplace=True, ascending=True)
df_tool_runs.reset_index(drop=True, inplace=True)

# print how many rows are in the DataFrame
print(f"Number of tool runs: {len(df_tool_runs)}")

Number of tool runs: 78


In [3]:
import tqdm
import ast
from datetime import datetime
def get_tool_and_parent_text(run_id: str, parent_run_id: str, client: Client):
    """
    Given a tool run ID, fetch that run's details and its parent run's details.
    Return:
      - parent_input (the question or chain input from the parent run//user question)
      - tool_output (the output from the tool run itself//chatbot context)
      - tool_input (the input to the tool run itself//chatbot search query)
      - latency (the time taken for the trace run to complete)
    If something fails or key is missing, return None, None.
    """
    no_question_text = "No question text found"
    no_context_used = "No context used"
    no_querey_found = "No query found"
    latency = None
    # Fetch the tool run's output - the chatbot context
    try:
        tool_run = client.read_run(run_id=run_id)
    except Exception as e:
        print(f"Error reading tool run {run_id}: {e}")
        return no_question_text, no_context_used, no_querey_found, latency
    try:
        tool_output = tool_run.outputs.get("output", no_context_used)
    except Exception as e:
        print(f"Error reading tool output for run {run_id}: {e}")
        tool_output = no_context_used
    # Fetch tool run's input - the chatbot search query
    try:
        tool_input = tool_run.inputs.get("input", no_querey_found)
    except Exception as e:
        print(f"Error reading tool input for run {run_id}: {e}")
        tool_input = no_querey_found
    # tool_input:str:{'query': 'Stipendien und Finanzhilfen für internationale Studierende'}
    # extract the query from the tool_input
    if tool_input == no_querey_found:
        tool_input = no_querey_found
    else:
        # Convert the string to a dictionary
        dict = ast.literal_eval(tool_input)
        # Extract the value of the 'query' key
        tool_input = dict['query']

    if parent_run_id is None:
        return no_question_text, tool_output, tool_input, latency
    # Fetch the parent run's input - the user question
    try:
        parent_run = client.read_run(parent_run_id)
        parent_input = parent_run.inputs.get("input", no_question_text)
    except Exception as e:
        print(f"Error reading parent run {parent_run_id}: {e}")
        parent_input = no_question_text
    # Calculate the latency of the parten run
    try:
        parent_start_time = parent_run.start_time
        parent_end_time = parent_run.end_time
        latency = parent_end_time - parent_start_time
        #print(f"run id: {parent_run.id}, start time: {parent_start_time}, end time: {parent_end_time}, latency: {latency}")
        # to seconds
        latency = latency.total_seconds()
    except Exception as e:
        print(f"Error calculating latency for run {run_id}: {e}")
        latency = None

    return parent_input, tool_output, tool_input, latency


def fetch_question_context_pairs(
    df_tool: pd.DataFrame,
    client: Client
) -> list:
    """
    Iterate over df_tool to fetch the parent's input (question), the tool's output (context) and the tool's input (search query).
    Returns a list of (question, context) pairs.
    """
    question_context_pairs = []

    for _, row in tqdm.tqdm(df_tool.iterrows(), total=len(df_tool)):
        run_id = row["run_id"]
        parent_run_id = row["parent_run_id"]
        parent_input, tool_output, tool_input, latency = get_tool_and_parent_text(run_id, parent_run_id, client)
        question_context_pairs.append((parent_input, tool_output, tool_input, latency))

    return question_context_pairs

def add_context_to_final_df(
    df_final: pd.DataFrame,
    question_context_pairs: list,
    question_col: str,
    context_col: str,
    query_col: str,
    latency_col: str
) -> pd.DataFrame:
    """
    Merge the question-context pairs into df_final by matching the question column.
    """
    for (q_str, ctx_str, query_str, latency) in question_context_pairs:
        mask = df_final[question_col] == q_str
        if mask.any():
            idx = df_final.loc[mask].index
            row_index = idx[0]
            df_final.at[row_index, context_col] = ctx_str
            df_final.at[row_index, query_col] = query_str
            df_final.at[row_index, latency_col] = latency


    return df_final

# # Example usage:
# # =========== Read your final DataFrame (English example) ===========
# df_en = pd.read_csv("../../data/final_merged_dataset_short_en.csv")
# df_de = pd.read_csv("../../data/final_merged_dataset_short_de.csv")

# # Add a new column for storing the context
# context_col_name = "chatbot_context"
# query_col_name = "chatbot_search_query"
# latency_col_name = "latency"

# # Fetch question-context pairs
# question_context_pairs = fetch_question_context_pairs(df_tool=df_tool_runs, client=client)

# # Add context to the final DataFrame
# df_en_updated = add_context_to_final_df(
#     df_final=df_en, 
#     question_context_pairs=question_context_pairs, 
#     question_col="english_question_text_q", 
#     context_col=context_col_name,
#     query_col=query_col_name,
#     latency_col=latency_col_name)

# df_de_updated = add_context_to_final_df(
#     df_final=df_de, 
#     question_context_pairs=question_context_pairs, 
#     question_col="german_question_text_q", 
#     context_col=context_col_name,
#     query_col=query_col_name,
#     latency_col=latency_col_name)

# # Save the updated DataFrame
# out_csv_en = "../../data/final_merged_dataset_short_en_with_context.csv"
# out_csv_de = "../../data/final_merged_dataset_short_de_with_context.csv"
# df_en_updated.to_csv(out_csv_en, index=False)
# df_de_updated.to_csv(out_csv_de, index=False)
# print("Saved updated DataFrame with context to:", out_csv_en)
# print("Saved updated DataFrame with context to:", out_csv_de)
# df_en_updated.head()

In [6]:
# Example usage:
# =========== Read your final DataFrame (English example) ===========
df = pd.read_csv("../../data/chatbot_data/chatbot_answers_first_20_de_en.csv")

# Columns for storing the context
# English
context_col_name_en = "chatbot_context_en"
query_col_name_en = "chatbot_search_query_en"
latency_col_name_en = "latency_en"
# German
context_col_name_de = "chatbot_context_de"
query_col_name_de = "chatbot_search_query_de"
latency_col_name_de = "latency_de"

# Fetch question-context pairs
question_context_pairs = fetch_question_context_pairs(df_tool=df_tool_runs, client=client)

# Add context to the final DataFrame English
df = add_context_to_final_df(
    df_final=df, 
    question_context_pairs=question_context_pairs, 
    question_col="english_question_text_q", 
    context_col=context_col_name_en,
    query_col=query_col_name_en,
    latency_col=latency_col_name_en)

df = add_context_to_final_df(
    df_final=df, 
    question_context_pairs=question_context_pairs, 
    question_col="german_question_text_q", 
    context_col=context_col_name_de,
    query_col=query_col_name_de,
    latency_col=latency_col_name_de)

# Save the updated DataFrame
out_csv = "../../data/chatbot_data/chatbot_answers_first_20_de_en_test_2.csv"
df.to_csv(out_csv, index=False)
print("Saved updated DataFrame with context to:", out_csv)
df.head()

  6%|▋         | 5/78 [00:07<01:41,  1.38s/it]

Error reading tool output for run 45cb6ae1-7a42-465b-8f6f-a1afb96998e8: 'NoneType' object has no attribute 'get'


 22%|██▏       | 17/78 [00:24<01:21,  1.34s/it]

Error reading tool output for run ae8ff84d-9365-4b59-94b2-07f86bd1097d: 'NoneType' object has no attribute 'get'


100%|██████████| 78/78 [01:56<00:00,  1.50s/it]

Saved updated DataFrame with context to: ../../data/chatbot_answers_first_20_de_en_test_2.csv





Unnamed: 0,german_question_text_q,english_question_text_q,role_q,program_q,participant_id_q,question_id_q,question_language_q,age_q,gender_q,gender[other]_q,...,chatbot_answer_de,chatbot_answer_en,chatbot_visited_urls_de,chatbot_visited_urls_en,chatbot_context_en,chatbot_search_query_en,latency_en,chatbot_context_de,chatbot_search_query_de,latency_de
0,Welche Master Studiengänge kann ich mit einem ...,Which master's degree programs can I do with a...,international,"Bachelor of Science, Cognitive Science",24,9,de,A2,A2,,...,Mit einem Bachelorabschluss in Cognitive Scien...,With a Bachelor's degree in Cognitive Science ...,https://www.uni-osnabrueck.de/studieninteressi...,https://www.ikw.uni-osnabrueck.de/studieninter...,Information taken from:https://www.uni-osnabru...,Masterstudiengänge für Absolventen eines Bache...,11.673712,Information taken from:https://www.uni-osnabru...,Master Studiengänge nach einem Bachelor in Cog...,21.348208
1,Wo finde ich Wohnungsmöglichkeiten in der Nähe...,Where can I find housing options near the univ...,prospective,,72,83,de,A1,A2,,...,In der Nähe der Universität Osnabrück gibt es ...,Finding housing options near the University of...,https://www.uni-osnabrueck.de/studieninteressi...,https://www.uni-osnabrueck.de/studieninteressi...,Information taken from:https://www.uni-osnabru...,Wohnmöglichkeiten in der Nähe der Universität ...,16.048288,Information taken from:https://www.uni-osnabru...,Wohnungsmöglichkeiten in der Nähe der Universi...,16.215333
2,Welche Tools stellt die Uni mir zur Verfügung?,What tools does the university provide me with?,enrolled,Bachelor of Science Cognitive Science,18,105,de,A2,A1,,...,Die Universität Osnabrück stellt ihren Studier...,The University of Osnabrück provides a variety...,https://www.uni-osnabrueck.de/kommunikation/ko...,https://www.uni-osnabrueck.de/kommunikation/ko...,Information taken from:https://www.uni-osnabru...,Welche Werkzeuge stellt die Universität Osnabr...,34.478961,Information taken from:https://www.uni-osnabru...,Welche Tools stellt die Universität Osnabrück ...,16.948566
3,Wie melde ich mich für eine Bachelorarbeit an?,How do I register for a bachelor's thesis?,international,"Bachelor of Science, Cognitive Science",24,111,de,A2,A2,,...,Um sich für eine Bachelorarbeit an der Univers...,To register for your bachelor's thesis at the ...,https://www.lili.uni-osnabrueck.de/fachbereich...,https://www.ikw.uni-osnabrueck.de/studierende/...,Information taken from:https://www.ikw.uni-osn...,Wie registriere ich mich für eine Bachelorarbeit?,17.695043,Information taken from:https://www.lili.uni-os...,Wie melde ich mich für eine Bachelorarbeit an?,13.214948
4,Wo kann ich meine bisher erbrachten Leistungen...,Where can I view my previously provided servic...,enrolled,Wirtschaftsrecht,46,123,de,A3,A2,,...,Sie können Ihre bisher erbrachten Leistungen u...,You can view your previously provided services...,https://www.lili.uni-osnabrueck.de/fachbereich...,https://www.uni-osnabrueck.de/universitaet/org...,Information taken from:https://www.uni-osnabru...,Wo kann ich meine zuvor erbrachten Leistungen ...,11.56174,Information taken from:https://www.lili.uni-os...,Wo kann ich meine bisher erbrachten Leistungen...,10.415501


In [14]:
client.read_run(run_id=df_tool_runs["parent_run_id"][0]).start_time


datetime.datetime(2024, 12, 30, 11, 49, 0, 776993)

In [17]:
tool_runs_dict

[{'run_id': '718baa7f-0456-4bd7-822f-0c2c960b5e42',
  'parent_run_id': 'bfdda99c-e135-4322-b7e9-792c5050e60f',
  'start_time': datetime.datetime(2024, 12, 30, 11, 49, 50, 990423),
  'status': 'success'},
 {'run_id': '777dcd76-962d-4088-b796-2605cfabb408',
  'parent_run_id': 'b3410f73-3e9f-48c9-b213-3a97f8c629b9',
  'start_time': datetime.datetime(2024, 12, 30, 11, 49, 34, 774765),
  'status': 'success'},
 {'run_id': '0f7babc4-6d9e-4954-aa37-41e5b0415684',
  'parent_run_id': '35d78517-c4b7-4fa6-9da9-0ed3d6bb2ef1',
  'start_time': datetime.datetime(2024, 12, 30, 11, 49, 23, 918523),
  'status': 'success'},
 {'run_id': '54d55eb3-090d-49ad-b740-9453cea7685b',
  'parent_run_id': '003ff3ad-5673-4c2b-8353-a5b718d70bf3',
  'start_time': datetime.datetime(2024, 12, 30, 11, 49, 1, 930953),
  'status': 'success'}]