In [53]:
import json
import os
from typing import List, Optional

import openai
import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel, Field

In [54]:
# Pull settings from a local .env file into the process environment.
load_dotenv()

# OpenAI API key as found in the environment (None when the variable is unset).
api_key = os.environ.get("OPENAI_API_KEY")

## Dataset

In [55]:
# Load a JSON file containing the dataset
def load_json_file(file_path: str) -> List[dict]:
    """Read a JSON file and return its decoded content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (a list of dicts for the dataset files
        used in this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; without an explicit encoding,
    # open() falls back to the platform locale and may mis-decode non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [66]:
# Read the three tier files in one pass, then join them low -> moderate -> high.
low_complexity, moderate_complexity, high_complexity = map(
    load_json_file,
    ("low_complexity.json", "moderate_complexity.json", "high_complexity.json"),
)

questions = low_complexity + moderate_complexity + high_complexity
len(questions)

300

In [67]:
# Preview the first five entries of the combined question list
questions[:5]

[{'question': 'The health domain is about staying healthy. What do you do to keep healthy?'},
 {'question': 'Finance is about money and coins. Do people save their money in banks?'},
 {'question': 'Relationships are about how people connect with each other. Do you like making friends?'},
 {'question': 'Events are things that happen like parties or concerts. Do people go to many events?'},
 {'question': 'Leisure is about having fun and relaxing. What is your favorite thing to do for fun?'}]

In [68]:
# Preview the last five entries of the combined question list
questions[-5:]

[{'question': 'Navigating the complexities of global travel involves synchronizing multiple systems such as airlines, hospitality, and transportation, which often utilize disparate data standards. How can we develop a unified data framework that enables real-time, seamless integration across all travel service providers to enhance traveler experience and operational efficiency?'},
 {'question': 'The challenge of optimizing global food distribution networks remains daunting, given the variability in demand and logistical constraints.  \nHow can emerging AI models be integrated into agricultural supply chains to predict pest outbreaks and optimize crop yields under climate change conditions?'},
 {'question': 'Fashion is a constantly evolving industry where cultural, technological, and environmental factors interplay to shape trends and consumer behavior. How can the integration of sustainable practices with advanced manufacturing technologies irreversibly alter global supply chains while

## Cognitive complexity evaluation

In [77]:
# Based on Bloom's Taxonomy
def _bloom_score(description: str):
    """Return a required field bounded to 1..10 that carries `description`."""
    return Field(..., ge=1, le=10, description=description)


class ContentComplexity(BaseModel):
    # One integer score (1-10) per cognitive dimension. Field order is kept
    # as-is because it determines the column order of the DataFrame built
    # from `model_fields`.
    remembering_complexity: int = _bloom_score("Requires recalling or finding facts (1-10)")
    creation_complexity: int = _bloom_score("Requires to formulate new theories, design solutions (1-10)")
    evaluation_complexity: int = _bloom_score("Requires to make and defend judgments (1-10)")
    analysis_complexity: int = _bloom_score("Requires breaking down information and seeing relationships (1-10)")
    synthesis_complexity: int = _bloom_score("Requires information integration from multiple sources, disciplines, or concepts (1-10)")
    applying_complexity: int = _bloom_score("Requires to use knowledge in a new situation (1-10)")
    hypothesis_complexity: int = _bloom_score("Requires hypothetical thinking (1-10)")

In [78]:
# Initialize the OpenAI client used by the evaluation functions in this notebook,
# authenticating with the key read from the environment above.
client = openai.OpenAI(api_key=api_key)

In [87]:
def analyze_content(user_question: str) -> Optional[ContentComplexity]:
    """Score a piece of text on the `ContentComplexity` dimensions.

    Sends the text to the chat model with a structured-output request and
    returns the parsed scores.

    Args:
        user_question: The text to evaluate.

    Returns:
        The parsed `ContentComplexity`, or None when the request raises or
        the response carries no parsed payload. Failures are reported on
        stdout rather than raised, so callers must check for None.
    """

    system_prompt = """
You are an expert content evaluator. Given the following content, evaluate it based on the provided criteria and return a JSON object with integer scores (1-10) for each property. 
Ensure all values are integers between 1 and 10.

Criteria:
- remembering_complexity: Requires recalling or finding facts (1=simple, 10=complex).
- creation_complexity: Requires to formulate new theories, design solutions (1=simple, 10=complex).
- evaluation_complexity: Requires to make and defend judgments (1=simple, 10=complex).
- analysis_complexity: Requires breaking down information and seeing relationships (1=simple, 10=complex).
- synthesis_complexity: Requires information integration from multiple sources, disciplines, or concepts (1=simple, 10=complex).
- applying_complexity: Requires to use knowledge in a new situation (1=simple, 10=complex).
- hypothesis_complexity: Requires hypothetical thinking (1=simple, 10=complex).

Return a JSON object with these fields and integer values between 1 and 10.
"""

    try:
        response = client.beta.chat.completions.parse(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Analyze this content: {user_question}"}
            ],
            response_format=ContentComplexity,
            temperature=0.1
        )

        message = response.choices[0].message
        if message.parsed is None:
            # No structured payload came back (e.g. the model declined to answer);
            # say so instead of silently handing None to the caller.
            refusal = getattr(message, "refusal", None)
            print(f"Error analyzing question: no parsed result (refusal: {refusal})")
        return message.parsed

    except Exception as e:
        # Best-effort by design: one failed request must not abort a batch run.
        print(f"Error analyzing question: {e}")
        return None


In [93]:
# Initialize list to store evaluation data
data = []

# Evaluate each question
for q in questions:
    question_text = q["question"]
    try:
        evaluation = analyze_content(question_text)
        if evaluation is None:
            # analyze_content signals failure by returning None; skip the
            # question explicitly instead of relying on None.model_dump()
            # raising AttributeError and printing a misleading second error.
            continue
        # Convert evaluation to dict and add question text
        eval_dict = evaluation.model_dump()
        eval_dict["question"] = question_text
        data.append(eval_dict)
    except Exception as e:
        print(f"Error evaluating question '{question_text}': {str(e)}")
        continue

# Create DataFrame: question text first, then one column per score field
columns = ["question"] + list(ContentComplexity.model_fields.keys())
df = pd.DataFrame(data, columns=columns)

In [None]:
# Low complexity questions (first five rows)
df.head(5)

Unnamed: 0,question,remembering_complexity,creation_complexity,evaluation_complexity,analysis_complexity,synthesis_complexity,applying_complexity,hypothesis_complexity
0,The health domain is about staying healthy. Wh...,2,2,3,2,1,2,1
1,Finance is about money and coins. Do people sa...,2,1,2,2,1,2,2
2,Relationships are about how people connect wit...,2,1,2,2,1,2,1
3,Events are things that happen like parties or ...,2,1,2,2,1,2,2
4,Leisure is about having fun and relaxing. What...,2,1,2,2,1,1,1


In [None]:
# Moderate complexity questions (rows at positions 150-154)
df.iloc[150:155]

Unnamed: 0,question,remembering_complexity,creation_complexity,evaluation_complexity,analysis_complexity,synthesis_complexity,applying_complexity,hypothesis_complexity
150,Safety is about protecting people from harm an...,3,4,5,5,4,5,4
151,"Culture encompasses the beliefs, customs, art,...",3,5,6,7,6,5,6
152,Sports can really bring people together and bo...,3,5,7,6,4,4,5
153,"Entertainment includes movies, television, mus...",2,1,2,2,1,1,1
154,Politics often involves negotiation and debate...,3,6,7,6,5,4,6


In [None]:
# High complexity questions (last five rows)
df.tail(5)

Unnamed: 0,question,remembering_complexity,creation_complexity,evaluation_complexity,analysis_complexity,synthesis_complexity,applying_complexity,hypothesis_complexity
295,Navigating the complexities of global travel i...,5,9,8,8,9,7,8
296,The challenge of optimizing global food distri...,6,9,8,8,9,8,9
297,Fashion is a constantly evolving industry wher...,5,9,8,8,9,7,8
298,The challenge of capturing the essence of huma...,6,9,8,8,9,7,8
299,The challenge of understanding consciousness p...,6,9,8,8,9,7,9


## Data for the custom model training

In [97]:
# Export the scored questions to train.csv; index=False leaves the row index out of the file
df.to_csv('train.csv', index=False)

## Generic text evaluation

In [19]:
# Every score below is required and shares the same inclusive 1..10 bounds.
_ONE_TO_TEN = {"ge": 1, "le": 10}


class GenericEvaluationModel(BaseModel):
    # Generic, domain-independent ratings of a user question (one integer each).
    clarity: int = Field(..., description="How clear and well-defined the question is (1-10)", **_ONE_TO_TEN)
    intent: int = Field(..., description="Purpose or goal of the question (1-10)", **_ONE_TO_TEN)
    complexity: int = Field(..., description="Level of difficulty or depth required (1-10)", **_ONE_TO_TEN)
    specificity: int = Field(..., description="Level of detail or precision in the question (1-10)", **_ONE_TO_TEN)
    answer_format_preference: int = Field(..., description="Expected format of the response (1-10)", **_ONE_TO_TEN)
    urgency: int = Field(..., description="Time-sensitivity of the question (1-10)", **_ONE_TO_TEN)
    context_dependency: int = Field(..., description="Reliance on prior context (1-10)", **_ONE_TO_TEN)
    ambiguity: int = Field(..., description="Degree of multiple interpretations possible (1-10)", **_ONE_TO_TEN)
    tone_preference: int = Field(..., description="Desired tone of the response (1-10)", **_ONE_TO_TEN)
    explicit_constraints: int = Field(..., description="Presence of specific limitations or requirements (1-10)", **_ONE_TO_TEN)

In [20]:
def analyse_2(question: str) -> GenericEvaluationModel:
    """Rate a user question on the `GenericEvaluationModel` criteria.

    Args:
        question: The user question to evaluate.

    Returns:
        The parsed structured response from the chat model. Errors raised by
        the API call are not caught here and propagate to the caller.
    """
    instructions = """
You are an expert question evaluator. Given the following user question, evaluate it based on the provided criteria and return a JSON object with integer scores (1-10) for each property. Ensure all values are integers between 1 and 10.

Criteria:
- clarity: How clear and well-defined the question is (1=vague, 10=very clear).
- intent: Purpose or goal of the question (1=unclear intent, 10=clear intent).
- complexity: Level of difficulty or depth required (1=simple, 10=complex).
- specificity: Level of detail or precision (1=broad, 10=narrow).
- answer_format_preference: Expected response format (1=unspecified, 10=specific format).
- urgency: Time-sensitivity (1=non-urgent, 10=urgent).
- context_dependency: Reliance on prior context (1=standalone, 10=highly contextual).
- ambiguity: Degree of multiple interpretations (1=unambiguous, 10=highly ambiguous).
- tone_preference: Desired tone of response (1=unspecified, 10=specific tone).
- explicit_constraints: Presence of limitations or requirements (1=none, 10=many).

Return a JSON object with these fields and integer values between 1 and 10.
"""

    # The system turn carries the rubric; the user turn carries the question itself.
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": f"Analyze this question: {question}"},
    ]

    completion = client.beta.chat.completions.parse(
        model="gpt-4.1-nano",
        messages=conversation,
        response_format=GenericEvaluationModel,
        temperature=0.1,
    )

    first_choice = completion.choices[0]
    return first_choice.message.parsed

In [21]:
# Smoke-test analyse_2 on a single question and print the scores as
# indented JSON (2-space indent) for readability.
resp = analyse_2("Hello, how many people live in Europe?")
pretty_json = resp.model_dump_json(indent=2)
print(pretty_json)

{
  "clarity": 8,
  "intent": 7,
  "complexity": 3,
  "specificity": 4,
  "answer_format_preference": 3,
  "urgency": 2,
  "context_dependency": 2,
  "ambiguity": 2,
  "tone_preference": 5,
  "explicit_constraints": 1
}


In [22]:
# Collect one score record per question
data = []

for entry in questions:
    text = entry["question"]
    try:
        scores = analyse_2(text)
        # Score fields first, then the question text (same key order as before)
        record = {**scores.model_dump(), "question": text}
    except Exception as e:
        print(f"Error evaluating question '{text}': {str(e)}")
        continue
    data.append(record)

# Build the DataFrame: question column first, then the model's fields in declaration order
columns = ["question", *GenericEvaluationModel.model_fields.keys()]
df2 = pd.DataFrame(data, columns=columns)

In [23]:
# Display the generic evaluation results.
# NOTE(review): the saved output below lists questions that do not appear in the
# dataset previews earlier in this notebook, and this section's execution counts
# (19-23) are lower than those of the dataset cells (53-97) — presumably the
# output comes from an earlier kernel state; re-run top to bottom to confirm.
df2

Unnamed: 0,question,clarity,intent,complexity,specificity,answer_format_preference,urgency,context_dependency,ambiguity,tone_preference,explicit_constraints
0,What is the capital city of Lithuania?,10,10,1,10,8,1,1,1,1,1
1,How many planets are in our solar system?,10,9,2,8,8,2,1,1,5,1
2,Who wrote the novel 'Pride and Prejudice'?,10,10,2,8,8,1,1,1,1,1
3,What is the chemical symbol for gold?,10,10,2,8,8,2,1,1,5,1
4,In which year did World War II end?,10,9,2,8,8,3,2,1,5,1
5,What are the main causes of the greenhouse eff...,8,9,4,5,3,2,2,2,5,1
6,How does photosynthesis work in plants?,8,9,4,3,2,2,2,2,5,1
7,What are the differences between a democracy a...,8,9,6,4,3,2,2,3,5,1
8,Why is the Mona Lisa considered a significant ...,8,9,4,5,3,2,3,2,4,1
9,What are the primary sources of renewable ener...,9,8,4,6,3,2,2,2,5,1
