In [28]:
import os
from openai import OpenAI
import json
import pandas as pd

system_prompt_response_format_all = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions across multiple dimensions. Follow these detailed instructions to provide your evaluation:

1. System Setting:
The system to be evaluated is tasked with answering questions related to the University of Osnabrück. If a question is unrelated to the University, the system is instructed to politely decline. Consider this directive when evaluating.

2. Evaluation Dimensions (Rate each dimension on a scale of 0 to 4):
- Hallucination: Refers to the presence of factually incorrect or unfaithful information in the answer. Any claim that cannot be verified using the provided context or widely known facts is considered a hallucination.
- Answer Accuracy: The degree to which a response precisely addresses the user’s question by providing correct, complete, and relevant information that aligns with the question’s intent. Factual correctness is necessary but not sufficient; the response must also be on-point, thorough, and responsive to the exact context and purpose of the question.
- User Satisfaction: Reflects the user's subjective evaluation of the answer's quality, emphasizing its effectiveness in addressing their question, delivering meaningful value, and creating a positive overall experience.
- Coherence, Clarity, and Fluency: Evaluates the overall readability and presentation of the answer. A response that scores well in this dimension is logically structured, free of grammatical errors, easy to understand, and expressed in a natural, flowing manner.
- Context Quality: Evaluates the adequacy and relevance of the provided context in supporting the answer. A high-quality context is directly tied to the user’s question, providing all necessary information to formulate a complete and accurate response. If no context is provided, its absence is assessed for its impact on the quality of the answer.

3. Scoring Guidelines:
For each dimension, assign a score from 0 to 4 and follow these interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

4. Evaluation Steps (Chain of Thought):
For each dimension, follow these steps to ensure thorough and consistent evaluations:

Step 1. Understand the Question and Context:
   - Read the user question carefully.
   - Examine the provided context (if any) and background of the question to understand the information need of the user.

Step 2. Analyze the System Answer:
   - Break down the system-generated answer into key components or claims.
   - Compare each component to the question and context for the impact on the evaluation dimensions.

Step 3. Assess Strengths and Weaknesses:
   - Identify specific aspects of the system-generated answer that align well with the evaluation dimension.
   - Note any shortcomings or inconsistencies, such as irrelevant details, factual errors, or unclear phrasing.

Step 4. Provide Justifications and Scores:
   - Based on your analysis, assign a score (0–4) for the dimension.
   - Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

5. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated dimensions.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""
system_prompt_response_format_all_reference = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions across multiple dimensions. Follow these detailed instructions to provide your evaluation:

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the university, the system is instructed to politely decline. Consider this directive when evaluating.

2. Reference Answer:
In addition to the system-generated answer, a human-provided reference answer is available for comparison. Use the reference answer to assess the quality of the system's response. The reference answer represents a reliable benchmark for evaluating correctness, completeness, and appropriateness.

3. Evaluation Dimensions (Rate each dimension on a scale of 0 to 4):
- Hallucination: Refers to the presence of factually incorrect or unfaithful information in the answer. Any claim that cannot be verified using the provided context or widely known facts is considered a hallucination.
- Answer Accuracy: The degree to which a response precisely addresses the user’s question by providing correct, complete, and relevant information that aligns with the question’s intent. Factual correctness is necessary but not sufficient; the response must also be on-point, thorough, and responsive to the exact context and purpose of the question.
- User Satisfaction: Reflects the user's subjective evaluation of the answer's quality, emphasizing its effectiveness in addressing their question, delivering meaningful value, and creating a positive overall experience.
- Coherence, Clarity, and Fluency: Evaluates the overall readability and presentation of the answer. A response that scores well in this dimension is logically structured, free of grammatical errors, easy to understand, and expressed in a natural, flowing manner.
- Context Quality: Evaluates the adequacy and relevance of the provided context in supporting the answer. A high-quality context is directly tied to the user’s question, providing all necessary information to formulate a complete and accurate response. If no context is provided, its absence is assessed for its impact on the quality of the answer.


4. Scoring Guidelines:
For each dimension, assign a score from 0 to 4 and follow these interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

5. Evaluation Steps (Chain of Thought):
For each dimension, follow these steps to ensure thorough and consistent evaluations:

Step 1. Understand the Question, Context, and Reference Answer:
	- Read the user question carefully.
	- Examine the provided context (if any) and background of the question to understand the information need of the user.
	- Review the reference answer to establish a benchmark the evaluation dimensions.

Step 2. Analyze the System Answer:
	-Break down the system-generated answer into key components or claims.
	-Compare each component to the question, context, and reference answer for alignment and impact on the evaluation dimensions.

Step 3. Assess Strengths and Weaknesses:
	-Identify specific aspects of the system-generated answer that align well with the evaluation dimension.
	-Note any shortcomings or inconsistencies, such as irrelevance, factual errors, or unclear phrasing, especially in comparison to the reference answer.

Step 4. Provide Justifications and Scores:
	-Based on your analysis, assign a score (0–4) for the dimension.
	-Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

6. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated dimensions.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""

system_prompt_response_format_hallucination = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions. Follow these detailed instructions to provide your evaluation of the dimension: Hallucination.

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the University, the system is instructed to politely decline. Consider this directive when evaluating.

2. Evaluation Dimension:
- Hallucination: Refers to the presence of factually incorrect or unfaithful information in the answer. Any claim that cannot be verified using the provided context or widely known facts is considered a hallucination.

3. Scoring Guidelines:
Assign a score from 0 to 4 based on the following interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

4. Evaluation Steps (Chain of Thought):
For this dimension, follow these steps to ensure a thorough and consistent evaluation:

Step 1. Understand the Question and Context:
   - Read the user question carefully.
   - Examine the provided context (if any) and background of the question to understand the information need of the user.

Step 2. Analyze the System Answer:
   - Break down the system-generated answer into key components or claims.
   - Compare each component to the question and context for its impact on Hallucination.

Step 3. Assess Strengths and Weaknesses:
   - Identify specific aspects of the system-generated answer that align well with Hallucination.
   - Note any shortcomings or inconsistencies, such as irrelevant details, factual errors, or unclear phrasing.

Step 4. Provide Justifications and Scores:
   - Based on your analysis, assign a score (0–4) for Hallucination.
   - Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

5. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated aspects of the system-generated answer.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""
system_prompt_response_format_hallucination_reference = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions. Follow these detailed instructions to provide your evaluation of the dimension: Hallucination.

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the university, the system is instructed to politely decline. Consider this directive when evaluating.

2. Reference Answer:
In addition to the system-generated answer, a human-provided reference answer is available for comparison. Use the reference answer to assess the quality of the system's response. The reference answer represents a reliable benchmark for evaluating correctness, completeness, and appropriateness.

3. Evaluation Dimension:
- Hallucination: Refers to the presence of factually incorrect or unfaithful information in the answer. Any claim that cannot be verified using the provided context or widely known facts is considered a hallucination.

4. Scoring Guidelines:
For each dimension, assign a score from 0 to 4 and follow these interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

5. Evaluation Steps (Chain of Thought):
For this dimension, follow these steps to ensure a thorough and consistent evaluation:

Step 1. Understand the Question, Context, and Reference Answer:
	- Read the user question carefully.
	- Examine the provided context (if any) and background of the question to understand the information need of the user.
	- Review the reference answer to establish a benchmark for Hallucination.

Step 2. Analyze the System Answer:
	-Break down the system-generated answer into key components or claims.
	-Compare each component to the question, context, and reference answer for alignment and impact on Hallucination.

Step 3. Assess Strengths and Weaknesses:
	-Identify specific aspects of the system-generated answer that align well with Hallucination.
	-Note any shortcomings or inconsistencies, such as irrelevance, factual errors, or unclear phrasing, especially in comparison to the reference answer.

Step 4. Provide Justifications and Scores:
	-Based on your analysis, assign a score (0–4) for Hallucination.
	-Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

6. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated aspects of the system-generated answer.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""

system_prompt_response_format_accuracy = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions. Follow these detailed instructions to provide your evaluation of the dimension: Answer Accuracy.

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the University, the system is instructed to politely decline. Consider this directive when evaluating.

2. Evaluation Dimension:
- Answer Accuracy: The degree to which a response precisely addresses the user’s question by providing correct, complete, and relevant information that aligns with the question’s intent. Factual correctness is necessary but not sufficient; the response must also be on-point, thorough, and responsive to the exact context and purpose of the question.

3. Scoring Guidelines:
Assign a score from 0 to 4 based on the following interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

4. Evaluation Steps (Chain of Thought):
For this dimension, follow these steps to ensure a thorough and consistent evaluation:

Step 1. Understand the Question and Context:
   - Read the user question carefully.
   - Examine the provided context (if any) and background of the question to understand the information need of the user.

Step 2. Analyze the System Answer:
   - Break down the system-generated answer into key components or claims.
   - Compare each component to the question and context for its impact on Answer Accuracy.

Step 3. Assess Strengths and Weaknesses:
   - Identify specific aspects of the system-generated answer that align well with Answer Accuracy.
   - Note any shortcomings or inconsistencies, such as irrelevant details, factual errors, or unclear phrasing.

Step 4. Provide Justifications and Scores:
   - Based on your analysis, assign a score (0–4) for Answer Accuracy.
   - Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

5. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated aspects of the system-generated answer.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""
system_prompt_response_format_accuracy_reference = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions. Follow these detailed instructions to provide your evaluation of the dimension: Answer Accuracy.

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the university, the system is instructed to politely decline. Consider this directive when evaluating.

2. Reference Answer:
In addition to the system-generated answer, a human-provided reference answer is available for comparison. Use the reference answer to assess the quality of the system's response. The reference answer represents a reliable benchmark for evaluating correctness, completeness, and appropriateness.

3. Evaluation Dimension:
- Answer Accuracy: The degree to which a response precisely addresses the user’s question by providing correct, complete, and relevant information that aligns with the question’s intent. Factual correctness is necessary but not sufficient; the response must also be on-point, thorough, and responsive to the exact context and purpose of the question.

4. Scoring Guidelines:
For each dimension, assign a score from 0 to 4 and follow these interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

5. Evaluation Steps (Chain of Thought):
For this dimension, follow these steps to ensure a thorough and consistent evaluation:

Step 1. Understand the Question, Context, and Reference Answer:
	- Read the user question carefully.
	- Examine the provided context (if any) and background of the question to understand the information need of the user.
	- Review the reference answer to establish a benchmark for Answer Accuracy.

Step 2. Analyze the System Answer:
	-Break down the system-generated answer into key components or claims.
	-Compare each component to the question, context, and reference answer for alignment and impact on Answer Accuracy.

Step 3. Assess Strengths and Weaknesses:
	-Identify specific aspects of the system-generated answer that align well with Answer Accuracy.
	-Note any shortcomings or inconsistencies, such as irrelevance, factual errors, or unclear phrasing, especially in comparison to the reference answer.

Step 4. Provide Justifications and Scores:
	-Based on your analysis, assign a score (0–4) for Answer Accuracy.
	-Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

6. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated aspects of the system-generated answer.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""

system_prompt_response_format_satisfaction = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions. Follow these detailed instructions to provide your evaluation of the dimension: User Satisfaction.

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the University, the system is instructed to politely decline. Consider this directive when evaluating.

2. Evaluation Dimension:
- User Satisfaction: Reflects the user's subjective evaluation of the answer's quality, emphasizing its effectiveness in addressing their question, delivering meaningful value, and creating a positive overall experience.

3. Scoring Guidelines:
Assign a score from 0 to 4 based on the following interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

4. Evaluation Steps (Chain of Thought):
For this dimension, follow these steps to ensure a thorough and consistent evaluation:

Step 1. Understand the Question and Context:
   - Read the user question carefully.
   - Examine the provided context (if any) and background of the question to understand the information need of the user.

Step 2. Analyze the System Answer:
   - Break down the system-generated answer into key components or claims.
   - Compare each component to the question and context for its impact on User Satisfaction.

Step 3. Assess Strengths and Weaknesses:
   - Identify specific aspects of the system-generated answer that align well with User Satisfaction.
   - Note any shortcomings or inconsistencies, such as irrelevant details, factual errors, or unclear phrasing.

Step 4. Provide Justifications and Scores:
   - Based on your analysis, assign a score (0–4) for User Satisfaction.
   - Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

5. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated aspects of the system-generated answer.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""
system_prompt_response_format_satisfaction_reference = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions. Follow these detailed instructions to provide your evaluation of the dimension: User Satisfaction.

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the university, the system is instructed to politely decline. Consider this directive when evaluating.

2. Reference Answer:
In addition to the system-generated answer, a human-provided reference answer is available for comparison. Use the reference answer to assess the quality of the system's response. The reference answer represents a reliable benchmark for evaluating correctness, completeness, and appropriateness.

3. Evaluation Dimension:
- User Satisfaction: Reflects the user's subjective evaluation of the answer's quality, emphasizing its effectiveness in addressing their question, delivering meaningful value, and creating a positive overall experience.

4. Scoring Guidelines:
For each dimension, assign a score from 0 to 4 and follow these interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

5. Evaluation Steps (Chain of Thought):
For this dimension, follow these steps to ensure a thorough and consistent evaluation:

Step 1. Understand the Question, Context, and Reference Answer:
	- Read the user question carefully.
	- Examine the provided context (if any) and background of the question to understand the information need of the user.
	- Review the reference answer to establish a benchmark for User Satisfaction.

Step 2. Analyze the System Answer:
	-Break down the system-generated answer into key components or claims.
	-Compare each component to the question, context, and reference answer for alignment and impact on User Satisfaction.

Step 3. Assess Strengths and Weaknesses:
	-Identify specific aspects of the system-generated answer that align well with User Satisfaction.
	-Note any shortcomings or inconsistencies, such as irrelevance, factual errors, or unclear phrasing, especially in comparison to the reference answer.

Step 4. Provide Justifications and Scores:
	-Based on your analysis, assign a score (0–4) for User Satisfaction.
	-Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

6. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated aspects of the system-generated answer.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""

system_prompt_response_format_coherence = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions. Follow these detailed instructions to provide your evaluation of the dimension: Coherence, Clarity, and Fluency.

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the University, the system is instructed to politely decline. Consider this directive when evaluating.

2. Evaluation Dimension:
- Coherence, Clarity, and Fluency: Evaluates the overall readability and presentation of the answer. A response that scores well in this dimension is logically structured, free of grammatical errors, easy to understand, and expressed in a natural, flowing manner.

3. Scoring Guidelines:
Assign a score from 0 to 4 based on the following interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

4. Evaluation Steps (Chain of Thought):
For this dimension, follow these steps to ensure a thorough and consistent evaluation:

Step 1. Understand the Question and Context:
   - Read the user question carefully.
   - Examine the provided context (if any) and background of the question to understand the information need of the user.

Step 2. Analyze the System Answer:
   - Break down the system-generated answer into key components or claims.
   - Compare each component to the question and context for its impact on Coherence, Clarity, and Fluency.

Step 3. Assess Strengths and Weaknesses:
   - Identify specific aspects of the system-generated answer that align well with Coherence, Clarity, and Fluency.
   - Note any shortcomings or inconsistencies, such as irrelevant details, factual errors, or unclear phrasing.

Step 4. Provide Justifications and Scores:
   - Based on your analysis, assign a score (0–4) for Coherence, Clarity, and Fluency.
   - Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

5. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated aspects of the system-generated answer.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""
system_prompt_response_format_coherence_reference = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions. Follow these detailed instructions to provide your evaluation of the dimension: Coherence, Clarity, and Fluency.

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the university, the system is instructed to politely decline. Consider this directive when evaluating.

2. Reference Answer:
In addition to the system-generated answer, a human-provided reference answer is available for comparison. Use the reference answer to assess the quality of the system's response. The reference answer represents a reliable benchmark for evaluating correctness, completeness, and appropriateness.

3. Evaluation Dimension:
- Coherence, Clarity, and Fluency: Evaluates the overall readability and presentation of the answer. A response that scores well in this dimension is logically structured, free of grammatical errors, easy to understand, and expressed in a natural, flowing manner.

4. Scoring Guidelines:
For each dimension, assign a score from 0 to 4 and follow these interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

5. Evaluation Steps (Chain of Thought):
For this dimension, follow these steps to ensure a thorough and consistent evaluation:

Step 1. Understand the Question, Context, and Reference Answer:
	- Read the user question carefully.
	- Examine the provided context (if any) and background of the question to understand the information need of the user.
	- Review the reference answer to establish a benchmark for Coherence, Clarity, and Fluency.

Step 2. Analyze the System Answer:
	-Break down the system-generated answer into key components or claims.
	-Compare each component to the question, context, and reference answer for alignment and impact on Coherence, Clarity, and Fluency.

Step 3. Assess Strengths and Weaknesses:
	-Identify specific aspects of the system-generated answer that align well with Coherence, Clarity, and Fluency.
	-Note any shortcomings or inconsistencies, such as irrelevance, factual errors, or unclear phrasing, especially in comparison to the reference answer.

Step 4. Provide Justifications and Scores:
	-Based on your analysis, assign a score (0–4) for Coherence, Clarity, and Fluency.
	-Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

6. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated aspects of the system-generated answer.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""

system_prompt_response_format_context = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions. Follow these detailed instructions to provide your evaluation of the dimension: Context Quality.

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the University, the system is instructed to politely decline. Consider this directive when evaluating.

2. Evaluation Dimension:
- Context Quality: Evaluates the adequacy and relevance of the provided context in supporting the answer. A high-quality context is directly tied to the user’s question, providing all necessary information to formulate a complete and accurate response. If no context is provided, its absence is assessed for its impact on the quality of the answer.

3. Scoring Guidelines:
Assign a score from 0 to 4 based on the following interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

4. Evaluation Steps (Chain of Thought):
For this dimension, follow these steps to ensure a thorough and consistent evaluation:

Step 1. Understand the Question and Context:
   - Read the user question carefully.
   - Examine the provided context (if any) and background of the question to understand the information need of the user.

Step 2. Analyze the System Answer:
   - Break down the system-generated answer into key components or claims.
   - Compare each component to the question and context for its impact on Context Quality.

Step 3. Assess Strengths and Weaknesses:
   - Identify specific aspects of the system-generated answer that align well with Context Quality.
   - Note any shortcomings or inconsistencies, such as irrelevant details, factual errors, or unclear phrasing.

Step 4. Provide Justifications and Scores:
   - Based on your analysis, assign a score (0–4) for Context Quality.
   - Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

5. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated aspects of the system-generated answer.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""
system_prompt_response_format_context_reference = """\
You are an expert evaluator tasked with assessing the quality of system-generated answers to user questions. Follow these detailed instructions to provide your evaluation of the dimension: Context Quality.

1. System Setting:
The system to be evaluated is tasked with answering questions related to Osnabrück University. If a question is unrelated to the university, the system is instructed to politely decline. Consider this directive when evaluating.

2. Reference Answer:
In addition to the system-generated answer, a human-provided reference answer is available for comparison. Use the reference answer to assess the quality of the system's response. The reference answer represents a reliable benchmark for evaluating correctness, completeness, and appropriateness.

3. Evaluation Dimension:
- Context Quality: Evaluates the adequacy and relevance of the provided context in supporting the answer. A high-quality context is directly tied to the user’s question, providing all necessary information to formulate a complete and accurate response. If no context is provided, its absence is assessed for its impact on the quality of the answer.

4. Scoring Guidelines:
For each dimension, assign a score from 0 to 4 and follow these interpretations:
- 0: Very Bad
- 1: Bad
- 2: Neutral
- 3: Good
- 4: Very Good

5. Evaluation Steps (Chain of Thought):
For this dimension, follow these steps to ensure a thorough and consistent evaluation:

Step 1. Understand the Question, Context, and Reference Answer:
	- Read the user question carefully.
	- Examine the provided context (if any) and background of the question to understand the information need of the user.
	- Review the reference answer to establish a benchmark for Context Quality.

Step 2. Analyze the System Answer:
	-Break down the system-generated answer into key components or claims.
	-Compare each component to the question, context, and reference answer for alignment and impact on Context Quality.

Step 3. Assess Strengths and Weaknesses:
	-Identify specific aspects of the system-generated answer that align well with Context Quality.
	-Note any shortcomings or inconsistencies, such as irrelevance, factual errors, or unclear phrasing, especially in comparison to the reference answer.

Step 4. Provide Justifications and Scores:
	-Based on your analysis, assign a score (0–4) for Context Quality.
	-Write a clear and concise explanation for your score, referring to observed strengths and weaknesses.

6. Best Practices:
- Objectivity: Base evaluations strictly on the provided content and guidelines.
- Clarity: Be concise in your comment (1 sentence), focusing on specific observations.
- Consider Ambiguities: For ambiguous or multi-faceted questions, assess based on the most straightforward interpretation unless otherwise stated.
- Context Quality: If context is missing or insufficient, clearly describe its impact, but avoid penalizing unrelated aspects of the system-generated answer.

Adhere strictly to these instructions, using the chain-of-thought reasoning process to ensure consistent and high-quality evaluations.
"""


In [29]:
from pydantic import BaseModel
from typing import List

# Define the schema using pydantic
class DimensionScore(BaseModel):
    score: int  # 0-4
    comment: str

class EvaluationOutput_all(BaseModel):
    hallucination: DimensionScore
    answer_accuracy: DimensionScore
    user_satisfaction: DimensionScore
    coherence_clarity_fluency: DimensionScore
    context_quality: DimensionScore

class EvaluationOutput_hallucination(BaseModel):
    hallucination: DimensionScore

class EvaluationOutput_accuracy(BaseModel):
    answer_accuracy: DimensionScore

class EvaluationOutput_satisfaction(BaseModel):
    user_satisfaction: DimensionScore

class EvaluationOutput_coherence(BaseModel):
    coherence_clarity_fluency: DimensionScore

class EvaluationOutput_context(BaseModel):
    context_quality: DimensionScore
    
def evaluate_with_llm_as_judge_structured(
    context: str,
    user_question: str,
    system_answer: str,
    reference_answer: str,
    client: OpenAI,
    model_name: str,
    evaluation_style: str,
):
    """
    Calls the OpenAI API with structured outputs, enforcing the defined schema.
    
    If evaluation_style == 'together', do a single call with the multi-dimension
    prompt and schema (with or without a reference).
    
    If evaluation_style == 'separate', do five separate calls (one per dimension);
    if reference answers exist for each dimension, use the reference prompt, else
    use the default dimension prompt.
    """

    # -------------------------------------------------------
    # 1) EVALUATION STYLE: "TOGETHER"
    # -------------------------------------------------------
    if evaluation_style == "together":
        # If a reference answer is provided and not empty
        if reference_answer:
            # Use the "all dimensions with reference" prompt
            messages = [
                {
                    "role": "system",
                    "content": system_prompt_response_format_all_reference
                },
                {
                    "role": "user",
                    "content": f"""
                        Context:
                        <context>
                        {context}
                        </context>

                        User Question:
                        <user_question>
                        {user_question}
                        </user_question>

                        Reference Answer:
                        <reference_answer>
                        {reference_answer}
                        </reference_answer>

                        System Answer:
                        <system_answer>
                        {system_answer}
                        </system_answer>
                        """.strip()
                }
            ]
        else:
            # No reference answer => use the default "all-dimensions" prompt
            messages = [
                {
                    "role": "system",
                    "content": system_prompt_response_format_all
                },
                {
                    "role": "user",
                    "content": f"""
                        Context:
                        <context>
                        {context}
                        </context>

                        User Question:
                        <user_question>
                        {user_question}
                        </user_question>

                        System Answer:
                        <system_answer>
                        {system_answer}
                        </system_answer>
                        """.strip()
                }
            ]

        # Single call, enforcing the multi-dimension schema
        completion = client.beta.chat.completions.parse(
            model=model_name,
            messages=messages,
            temperature=0.0,
            response_format=EvaluationOutput_all
        )
        return completion.choices[0].message.parsed, messages

    # -------------------------------------------------------
    # 2) EVALUATION STYLE: "SEPARATE"
    # -------------------------------------------------------
    elif evaluation_style == "separate":
        dimension_prompts_and_schemas_no_ref = {
            "hallucination": (
                system_prompt_response_format_hallucination,
                EvaluationOutput_hallucination
            ),
            "answer_accuracy": (
                system_prompt_response_format_accuracy,
                EvaluationOutput_accuracy
            ),
            "user_satisfaction": (
                system_prompt_response_format_satisfaction,
                EvaluationOutput_satisfaction
            ),
            "coherence_clarity_fluency": (
                system_prompt_response_format_coherence,
                EvaluationOutput_coherence
            ),
            "context_quality": (
                system_prompt_response_format_context,
                EvaluationOutput_context
            ),
        }

        # Reference versions
        dimension_prompts_and_schemas_ref = {
            "hallucination": (
                system_prompt_response_format_hallucination_reference, 
                EvaluationOutput_hallucination
            ),
            "answer_accuracy": (
                system_prompt_response_format_accuracy_reference,
                EvaluationOutput_accuracy
            ),
            "user_satisfaction": (
                system_prompt_response_format_satisfaction_reference,
                EvaluationOutput_satisfaction
            ),
            "coherence_clarity_fluency": (
                system_prompt_response_format_coherence_reference,
                EvaluationOutput_coherence
            ),
            "context_quality": (
                system_prompt_response_format_context_reference,
                EvaluationOutput_context
            ),
        }

        # Decide which dictionary to use
        if reference_answer:
            dimension_prompts_and_schemas = dimension_prompts_and_schemas_ref
        else:
            dimension_prompts_and_schemas = dimension_prompts_and_schemas_no_ref

        combined_dimensions = {}
        all_messages = []

        # Loop over each dimension
        for dim_name, (dim_prompt, dim_schema) in dimension_prompts_and_schemas.items():
            # Build the user content
            user_content = f"""
                    Context:
                    <context>
                    {context}
                    </context>

                    User Question:
                    <user_question>
                    {user_question}
                    </user_question>
                    """.strip()

            # If we do have a reference, include it in the prompt
            if reference_answer:
                user_content += f"""
                    Reference Answer:
                    <reference_answer>
                    {reference_answer}
                    </reference_answer>
                    """.strip()

            # Always include the system answer
            user_content += f"""
                System Answer:
                <system_answer>
                {system_answer}
                </system_answer>
                """.strip()

            messages = [
                {"role": "system", "content": dim_prompt},
                {"role": "user", "content": user_content}
            ]
            #print(messages)
            completion = client.beta.chat.completions.parse(
                model=model_name,
                messages=messages,
                temperature=0.0,
                response_format=dim_schema
            )
            parsed_output = completion.choices[0].message.parsed
            all_messages.append(messages)

            # E.g., parsed_output might be {"hallucination": {"score": ..., "comment": ...}}
            dimension_score_obj = getattr(parsed_output, dim_name)
            combined_dimensions[dim_name] = dimension_score_obj

        # Combine into a single instance of EvaluationOutput_all
        combined_parsed = EvaluationOutput_all(
            hallucination=combined_dimensions["hallucination"],
            answer_accuracy=combined_dimensions["answer_accuracy"],
            user_satisfaction=combined_dimensions["user_satisfaction"],
            coherence_clarity_fluency=combined_dimensions["coherence_clarity_fluency"],
            context_quality=combined_dimensions["context_quality"]
        )

        return combined_parsed, all_messages

    


def compute_weighted_score(evaluation_dict):
    """
    A toy weighting:
    hallucination:              0.3
    answer_accuracy:            0.3
    user_satisfaction:          0.1
    coherence_clarity_fluency:  0.1
    context_quality:            0.2
    """
    h = evaluation_dict["hallucination"]["score"]
    a = evaluation_dict["answer_accuracy"]["score"]
    s = evaluation_dict["user_satisfaction"]["score"]
    c_c_f = evaluation_dict["coherence_clarity_fluency"]["score"]
    c_qual  = evaluation_dict["context_quality"]["score"]

    weighted = (
        0.3 * h +
        0.3 * a +
        0.1 * s +
        0.1 * c_c_f +
        0.2 * c_qual
    )
    return weighted

import tiktoken

def calculate_api_call_cost(input_text: str, output_text: str, encoding: tiktoken, model_name: str) -> float:
    """
    Calculate the cost of an API call based on input and output texts.

    Args:
        input_text (str): The input text to be tokenized.
        output_text (str): The output text to be tokenized.
        model (str): The model name to determine the appropriate tokenizer.

    Returns:
        float: The total cost of the API call.
    """
    # Cast the input and output texts to strings
    input_text = str(input_text)
    output_text = str(output_text)
    # Define the cost per million tokens
    if model_name == "gpt-4o-2024-08-06":
        input_cost_per_million = 2.50
        output_cost_per_million = 10.00
    elif model_name == "gpt-4o-mini-2024-07-18":
        input_cost_per_million = 0.15
        output_cost_per_million = 0.6
    else:
        return "Not defined for this model"

    # Tokenize the input and output texts
    input_tokens = encoding.encode(input_text)
    output_tokens = encoding.encode(output_text)

    # Calculate the number of tokens
    num_input_tokens = len(input_tokens)
    num_output_tokens = len(output_tokens)

    if num_input_tokens > 128000 or num_output_tokens > 128000:
        print(f"Number of input tokens: {num_input_tokens}", f"Number of output tokens: {num_output_tokens} - Too many tokens for this model")

    # Calculate the cost for input and output tokens
    input_cost = (num_input_tokens / 1_000_000) * input_cost_per_million
    output_cost = (num_output_tokens / 1_000_000) * output_cost_per_million

    # Total cost is the sum of input and output costs
    total_cost = input_cost + output_cost

    return total_cost



def run_llm_judge_evaluation_structured(
    df: pd.DataFrame,
    context_col: str,
    question_col: str,
    system_answer_col: str,
    question_id_col: str,
    output_csv_path: str,
    evaluation_style: str,
    client: OpenAI,
    reference_answer_col: str = None,
    model: str = "gpt-4o-2024-08-06"
) -> pd.DataFrame:
    """
    Evaluates each row in `df` by calling the LLM (one time per row) using the provided
    system_prompt. Saves results to `output_csv_path`.

    Args:
        df (pd.DataFrame): DataFrame containing the rows to evaluate.
        context_col (str): The column name in `df` that holds the 'context'.
        question_col (str): The column name for the user question.
        system_answer_col (str): The column name for the system's answer.
        question_id_col (str): The column name for a unique question or row ID.
        output_csv_path (str): Where to save the final evaluations as CSV.
        evaluation_style (str): The evaluation style to use. One of 'together' or 'separate'.
        client (OpenAI): The OpenAI client instance.
        model (str): The model name to use for the evaluation.

    Returns:
        pd.DataFrame: A DataFrame with the evaluation scores & comment, 
                      plus the weighted overall score.
    """
    # Initialize the tokenizer for the specified model
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        raise ValueError(f"Model '{model}' not found. Please provide a valid model name.")

    all_results = []

    for _, row in df.iterrows():
        context = row[context_col]
        user_question = row[question_col]
        system_answer = row[system_answer_col]
        question_id = row[question_id_col]

        if reference_answer_col:
            reference_answer = row[reference_answer_col]
        else:
            reference_answer = None

        # Call the API with structured outputs
        eval_output, input_message = evaluate_with_llm_as_judge_structured(
            context=context,
            user_question=user_question,
            system_answer=system_answer,
            reference_answer=reference_answer,
            client=client,
            model_name=model,
            evaluation_style=evaluation_style
        )
        # Flatten the output into score and comment
        eval_dict = eval_output.dict()  # Convert the Pydantic model to a dictionary
        flattened_results = {}

        for dim, value in eval_dict.items():
            # Ensure nested dictionaries for each dimension are properly handled
            if isinstance(value, dict):  # Check if the value is a dictionary
                flattened_results[f"{dim}_score"] = value.get("score")
                flattened_results[f"{dim}_comment"] = value.get("comment")
            else:
                raise ValueError(f"Unexpected value type for dimension {dim}: {type(value)}")

        # Compute weighted score
        weighted_overall = compute_weighted_score(eval_dict)

        # Calculate API cost
        total_cost = calculate_api_call_cost(input_message, eval_output, encoding, model)

        # Append results
        all_results.append({
            question_id_col: question_id,
            **flattened_results,
            "weighted_overall_score": weighted_overall,
            "api_call_cost": total_cost
        })

    # Convert to DataFrame and save
    df_eval = pd.DataFrame(all_results)
    df_eval.to_csv(output_csv_path, index=False, quoting=1)
    print(f"Saved LLM judge evaluation to: {output_csv_path}")

    return df_eval

In [32]:
# ============================
# Usage Example
# ============================
# Example: Call the function with structured outputs
# Load the environment variables
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# model_name = "gpt-4o-2024-08-06"
model_name = "gpt-4o-mini-2024-07-18"
# evaluation_style = "together"
evaluation_style = "separate"

# Load your dataset
df_en = pd.read_csv("../../data/final_merged_dataset_short_en_2.csv")
df_en = df_en.head(2).copy()  # For testing
df_eval_en = run_llm_judge_evaluation_structured(
    df=df_en,
    context_col="chatbot_context_en",
    question_col="english_question_text_q",
    system_answer_col="chatbot_answer_en",
    question_id_col="question_id_q",
    output_csv_path="../../data/eval/evaluation_llm_judge_chatbot_en.csv",
    evaluation_style=evaluation_style,
    client=client,
    model=model_name,
)


# # Load your dataset
# df_de = pd.read_csv("../../data/final_merged_dataset_short_de_2.csv")
# df_de = df_de.head(2).copy()  # For testing
# df_eval_de = run_llm_judge_evaluation_structured(
#     df=df_de,
#     context_col="chatbot_context_de",
#     question_col="german_question_text_q",
#     system_answer_col="chatbot_answer_de",
#     #reference_answer_col="human_answer_de",
#     question_id_col="question_id_q",
#     output_csv_path="../../data/eval/evaluation_llm_judge_chatbot_de.csv",
#     evaluation_style=evaluation_style,
#     client=client,
#     model=model_name,
# )
# calculate the mean of the evaluation
df_mean = pd.read_csv("../../data/eval/mean_eval.csv")
# English
# mean_llm_en = df_eval_en["weighted_overall_score"].mean()
# df_mean["llm_as_judge_en"] = mean_llm_en
# German
# mean_llm_de = df_eval_de["weighted_overall_score"].mean()
# df_mean["llm_as_judge_de"] = mean_llm_de


# df_mean.to_csv("../../data/eval/mean_eval.csv", index=False)
df_eval_en

Saved LLM judge evaluation to: ../../data/eval/evaluation_llm_judge_chatbot_en.csv


Unnamed: 0,question_id_q,hallucination_score,hallucination_comment,answer_accuracy_score,answer_accuracy_comment,user_satisfaction_score,user_satisfaction_comment,coherence_clarity_fluency_score,coherence_clarity_fluency_comment,context_quality_score,context_quality_comment,weighted_overall_score,api_call_cost
0,153,4,The system answer accurately lists various sch...,4,The system answer accurately and comprehensive...,4,The system answer provides a comprehensive and...,4,"The answer is well-structured, clearly present...",4,The system answer provides a comprehensive and...,4.0,0.002793
1,9,1,The system answer includes a Master's program ...,4,The system answer accurately identifies the re...,4,The system answer effectively addresses the us...,4,"The answer is well-structured, clearly outline...",4,The system answer provides relevant and detail...,3.1,0.003961


In [33]:
print(df_eval_en['hallucination_comment'][1])
df_eval_en["api_call_cost"].sum()

The system answer includes a Master's program in Cognitive Computing that is not mentioned in the provided context, indicating a hallucination.


0.0067536

In [24]:
print(df_eval_de['hallucination_comment'][1])
df_eval_de["api_call_cost"].sum()

The answer accurately reflects the context without introducing any false information.


0.0223275

In [20]:
# code form chatbot to extract text from pdf and html
import requests
import re
import io
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
def extract_html_text(href: str, response) -> str:

    link_soup = BeautifulSoup(response.content, "html.parser")
    div_content = link_soup.find("div", class_="eb2")
    if div_content:
        text = re.sub(r"\n+", "\n", div_content.text.strip())
        content_with_link = ""
        for link in div_content.find_all("a", href=True):
            text_anchor_tag = re.sub(r"\n+", "\n", link.text.strip())
            content_with_link += f" - {text_anchor_tag}: {link['href']}"
        return text + "\nHref found in the text:\n" + content_with_link
    print(f"Failed to fetch html content from: {href}")
    return ""
def read_pdf_from_url(pdf_bytes: bytes, num_pages: int = 7) -> str:
    """
    Read the content of a PDF file from a given byte stream.

    Args:
        pdf_bytes (bytes): Raw bytes of the PDF content.
        num_pages (int, optional): Number of pages to process. If None, process all pages. Defaults to 7.

    Returns:
        str: Extracted text content from the PDF.
    """

    pdf_stream = io.BytesIO(pdf_bytes)

    pdf_text = ""
    with pdf_stream as f:
        reader = PdfReader(f)
        if num_pages is None:
            num_pages = len(reader.pages)
        else:
            num_pages = min(num_pages, len(reader.pages))

        for page_num in range(num_pages):
            page = reader.pages[page_num]
            pdf_text += page.extract_text() or ""

    return pdf_text

def extract_pdf_text(href: str, response) -> str:

    text = read_pdf_from_url(response)
    return re.sub(r"(\n\s*|\n\s+\n\s+)", "\n", text.strip())

def context_from_links(links):
    contents = []
    taken_from = "Information taken from:"
    search_result_text = "Content not found"
    for tag in links:
        href = tag
        try:
            # TODO I/O operation (use async code)
            response = requests.get(href)
            #print(f"Fetching: {href}")
        except:
            print(f"Error while fetching: {href}")
            continue

        if response.status_code == 200:

            if href.endswith(".pdf"):
                print(f"Extracted PDF text from: {href}")
                text = extract_pdf_text(href, response)
            else:
                print(f"Extracted HTML text from: {href}")
                text = extract_html_text(href, response)

            if text:
                text = f"{taken_from}{href}\n{text}"
                contents.append(text)

    return "\n".join(contents) if contents else search_result_text, links

# code from parsing links and getting context
import re
import pandas as pd

def parse_links_from_string(links_str: str):
    """
    Extracts all URLs (including Markdown-style) from a string.
    Returns a list of raw link URLs.
    """
    if not isinstance(links_str, str) or not links_str.strip():
        return []

    # Regex approach:
    # 1) Find Markdown links: [some text](http://some-url.com)
    # 2) Also find bare URLs like http://... or https://...
    pattern = r'\((https?://[^\)]+)\)|(https?://[^\s]+)'
    # Explanation:
    #   \((https?://[^\)]+)\) matches markdown ( capturing URL inside parentheses )
    #   (https?://[^\s]+) captures bare URLs
    matches = re.findall(pattern, links_str)

    # matches is a list of tuples, e.g. [("https://...",""), ("","https://...")]
    # We gather the non-empty group from each tuple
    urls = []
    for md_link, normal_link in matches:
        if md_link:
            urls.append(md_link)
        elif normal_link:
            urls.append(normal_link)
    print(f"FOund Links: {urls}")
    return urls

def gather_context_from_links(links_str: str) -> str:
    """
    Parses the given links_str (which may be plain or Markdown),
    fetches each link, extracts text, and concatenates them into
    a single context string.
    """
    links = parse_links_from_string(links_str)
    if not links:
        return "No links or no content found."

    all_contents = context_from_links(links)

    
    return all_contents

def add_content_to_df(df, links_col, context_col):
    """
    Adds context to the DataFrame by fetching and parsing the content
    from the URLs in the specified column.

    Args:
        df (pd.DataFrame): The DataFrame to update.
        links_col (str): The column name containing the URLs.
        context_col (str): The column name to store the extracted content.

    Returns:
        pd.DataFrame: The updated DataFrame.
    """
    context_rows = []
    for idx, row in df.iterrows():
        links_str = row[links_col]
        context = gather_context_from_links(links_str)
        context_rows.append(context)

    df[context_col] = context_rows
    return df

df_en = pd.read_csv("../../data/final_merged_dataset_short_en.csv")
df_en = df_en.head(3).copy()
df_en = add_content_to_df(df_en, "chatbot_visited_urls_en", "chatbot_context")

df_en.head()

df_en.to_csv("../../data/final_merged_dataset_short_en_testing.csv", index=False)



FOund Links: ['https://www.uni-osnabrueck.de/studieninteressierte/stipendien-und-foerderung/;https://www.uni-osnabrueck.de/studieninteressierte/studieninteressierte-aus-dem-ausland/kosten-und-finanzierung/']
FOund Links: ['https://www.ikw.uni-osnabrueck.de/studieninteressierte/willkommen.html;https://www.uni-osnabrueck.de/studieninteressierte/studiengaenge-a-z/cognitive-science-master-of-science/']
Extracted HTML text from: https://www.ikw.uni-osnabrueck.de/studieninteressierte/willkommen.html;https://www.uni-osnabrueck.de/studieninteressierte/studiengaenge-a-z/cognitive-science-master-of-science/
FOund Links: ['https://www.wiwi.uni-osnabrueck.de/fileadmin/documents/public/1_fachbereich/00_Allgemein/1.04_jahresberichte/Jahresbericht_FB9_2012-2013.pdf;https://www.uni-osnabrueck.de/fileadmin/documents/public/1_universitaet/1.3_organisation/d4_akad._angelegenheiten/d4_akad._angelegenheiten/mitteilungsblaetter/2016-2020/2019-05-09_Nr03.pdf']
