# Preprocessing and Complexity Estimation Steps

### Step 1: Create Empty Feature Dataset

In [1]:
import json
import pandas as pd

def create_feature_dataset(num_rows):
    """
    Creates a DataFrame with a predefined feature template for query evaluation.

    Every row is an independent deep copy of the template, so the nested per-model
    score dictionaries are never shared between rows.

    Args:
        num_rows (int): The number of rows to generate in the dataset.

    Returns:
        pandas.DataFrame: A DataFrame containing initialized feature columns for each row.
            The feature columns are present even when num_rows is 0.
    """
    import copy  # function-scope import keeps this cell runnable on its own

    def empty_scores():
        # One zeroed entry per evaluation metric; a new dict on every call.
        return {
            "bart": 0.0,
            "rouge1": 0.0,
            "rouge2": 0.0,
            "rougeL": 0.0,
            "rougeLsum": 0.0,
            "bleu": 0.0,
            "bert": 0.0,
            "bleurt": 0.0,
            "logprobs": 0.0
        }

    feature_template = {
        "id": "",
        "instruction": "",
        "input": "",
        "query": "",
        "category": "",
        "output": "",
        "query_chars_count": 0,
        "query_words_count": 0,
        "query_unique_word_count": 0,
        "query_readability_score": 0.0,
        "query_special_tokens_count": 0,
        "query_keywords_count": 0,
        "query_contains_url": 0,
        "query_complexity_score": 0.0,
        "rt_t5_base_avg": 0.0,
        "rt_t5_large_avg": 0.0,
        "rt_t5_xl_avg": 0.0,
        "r_t5_base_length_avg": 0.0,
        "r_t5_large_length_avg": 0.0,
        "r_t5_xl_length_avg": 0.0,
        "r_t5_base_redundancy_avg": 0.0,
        "r_t5_large_redundancy_avg": 0.0,
        "r_t5_xl_redundancy_avg": 0.0,
        "r_t5_base_trash_count": 0,
        "r_t5_large_trash_count": 0,
        "r_t5_xl_trash_count": 0,
        "r_t5_base_query_repetition_count": 0,
        "r_t5_large_query_repetition_count": 0,
        "r_t5_xl_query_repetition_count": 0,
        "scores_t5_base_avg": empty_scores(),
        "scores_t5_large_avg": empty_scores(),
        "scores_t5_xl_avg": empty_scores(),
        "avg_normalized_score_t5_base": 0.0,
        "avg_normalized_score_t5_large": 0.0,
        "avg_normalized_score_t5_xl": 0.0,
        "discrepancy_base_vs_large": 0.0,
        "discrepancy_large_vs_xl": 0.0,
        "discrepancy_base_vs_xl": 0.0,
        "evaluation_model_label": 0
    }

    # `[feature_template] * num_rows` would store the *same* nested score dicts in every
    # row, so an in-place update of one row would leak into all others.
    rows = [copy.deepcopy(feature_template) for _ in range(num_rows)]

    # Passing the column list keeps the schema intact for an empty dataset.
    return pd.DataFrame(rows, columns=list(feature_template))

# Build the 50k-row placeholder feature dataset
num_rows = 50000
feature_df = create_feature_dataset(num_rows)

# Destination of the serialized dataset
output_file_path = '../datasets/generated/step_1/empty_feature_dataset_50k.jsonl'

# Serialize each row as one compact JSON object per line (JSONL)
with open(output_file_path, 'w') as output_file:
    output_file.writelines(
        json.dumps(entry, separators=(",", ":")) + '\n'
        for entry in feature_df.to_dict(orient='records')
    )

print(f"Dataset with 50k rows created and saved in {output_file_path}.")

Dataset with 50k rows created and saved in ../datasets/generated/step_1/empty_feature_dataset_50k.jsonl.


### Step 2: Create Empty Response Dataset

In [2]:
import json
import pandas as pd

def create_response_dataset(num_rows):
    """
    Creates a DataFrame for storing responses and evaluation scores for different models.

    For each of the models t5_base, t5_large and t5_xl the template holds ten response
    fields (r_<model>_1..10), ten score dictionaries (scores_<model>_1..10) and one
    average response time field (rt_<model>_avg). Every row is an independent deep copy
    of the template, so score dictionaries are never shared between rows.

    Args:
        num_rows (int): The number of rows to generate in the dataset.

    Returns:
        pandas.DataFrame: A DataFrame containing response fields and evaluation scores for each model.
            The columns are present even when num_rows is 0.
    """
    import copy  # function-scope import keeps this cell runnable on its own

    response_template = {"id": ""}
    for model in ["t5_base", "t5_large", "t5_xl"]:
        # Add 10 responses and scores per model
        for i in range(1, 11):
            response_template[f"r_{model}_{i}"] = ""
            response_template[f"scores_{model}_{i}"] = {
                "bart": 0.0,
                "rouge1": 0.0,
                "rouge2": 0.0,
                "rougeL": 0.0,
                "rougeLsum": 0.0,
                "bleu": 0.0,
                "bert": 0.0,
                "bleurt": 0.0,
                "logprobs": 0.0
            }
        # Add average response time feature
        response_template[f"rt_{model}_avg"] = 0.0

    # `[response_template] * num_rows` would make all rows reference the same score
    # dicts; deep copies keep in-place updates local to one row.
    rows = [copy.deepcopy(response_template) for _ in range(num_rows)]

    # Passing the column list keeps the schema intact for an empty dataset.
    return pd.DataFrame(rows, columns=list(response_template))

# Build the 50k-row placeholder response dataset
num_rows = 50000
response_df = create_response_dataset(num_rows)

# Destination of the serialized dataset
output_file_path = '../datasets/generated/step_2/empty_response_dataset_50k.jsonl'

# Serialize each row as one compact JSON object per line (JSONL)
with open(output_file_path, 'w') as output_file:
    output_file.writelines(
        json.dumps(entry, separators=(",", ":")) + '\n'
        for entry in response_df.to_dict(orient='records')
    )

print(f"Dataset with 50k rows created and saved in {output_file_path}.")


Dataset with 50k rows created and saved in ../datasets/generated/step_2/empty_response_dataset_50k.jsonl.


### Step 3: Reduce MixInstruct Dataset to 50,000 lines

In [5]:
import pandas as pd
import json

# Source (prepared MixInstruct training data) and destination (reduced copy)
input_file = "../datasets/mix-instruct/train_data_prepared.jsonl"
output_file = "../datasets/generated/step_3/train_data_reduced_50k.jsonl"

# Read the complete source dataset (one JSON object per line)
df = pd.read_json(input_file, lines=True)

# Keep only the four columns used downstream, then the first 50k rows
df = df.loc[:, ["id", "instruction", "input", "output"]].iloc[:50000]

# Write the reduced dataset back out as JSONL
with open(output_file, "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in df.to_dict(orient="records"))

print(f"Reduced dataset saved to {output_file}.")


Reduced dataset saved to ../datasets/generated/step_3/train_data_reduced_50k.jsonl.


### Step 4: Categorize Queries
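#### ATTENTION: The raw model output is appended to `train_data_categorized_50k_unformatted.json` and might need manual formatting into `train_data_categorized_50k.jsonl` (one JSON object per line), which is the file Step 5 reads!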

In [None]:
import openai
import time

# Set up your OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable so that the secret does
# not have to be stored in (and committed with) the notebook. The placeholder is only
# used when the variable is not set, which keeps the previous manual workflow working.
import os

client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", 'PLACE_KEY_HERE'))

# File paths
input_file_path = "../datasets/generated/step_3/train_data_reduced_50k.jsonl"
output_file_path = "../datasets/generated/step_4/train_data_categorized_50k_unformatted.json"

# Request template text
request_text_template = """Please categorize following queries (instruction+input) into one of following 14 categories: 
1. Factual Question Answering (QA), 
2. Open-ended/Opinion-based Queries 
3. Creative Text Generation 
4. Instruction Following
5. Mathematics/Problem Solving 
6. Recommendation Systems 
7. Coding/Programming 
8. Summarization/ParaphrasingTasks 
9. Translation Tasks 
10. Reasoning/Logic Questions 
11. Dialogue/Conversational Queries 
12. Comparative/Analytical Queries 
13. Miscellaneous/Other 
14. General Advice/Personal Tasks

by adding a new column "category" to each query. Please only return the query ID and its category number to me in a format like: {"id":"itwgpt4\/33289","category":7}, ... Thanks!"""

def send_gpt_request(queries):
    """
    Asks the chat model ("gpt-4o") to categorize one batch of queries.

    The categorization instructions from `request_text_template` are followed by the
    queries, one per line, and sent as a single user message.

    Args:
        queries (list): Query strings that make up the batch.

    Returns:
        str: The stripped text of the first completion choice, or None if the request
        raised an exception (the error is printed instead of re-raised).
    """
    prompt = request_text_template + "\n" + "\n".join(queries)
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=chat_messages,
            temperature=1,
            max_tokens=4095,
        )
        first_answer = completion.choices[0].message.content
        return first_answer.strip()
    except Exception as e:
        print(f"Error during API request: {e}")
        return None

def load_queries(start_index, num_queries):
    """
    Reads a window of lines from the input JSONL file.

    Lines are numbered from 0. The window covers the lines with numbers
    start_index .. start_index + num_queries - 1; reading stops as soon as the end of
    the window is reached.

    Args:
        start_index (int): The line number at which the window starts.
        num_queries (int): The maximum number of lines to return.

    Returns:
        list: The selected lines with surrounding whitespace removed.
    """
    selected = []
    with open(input_file_path, 'r') as handle:
        stop_index = start_index + num_queries
        for line_number, raw_line in enumerate(handle):
            if line_number >= stop_index:
                # Past the window: nothing further to collect
                break
            if line_number >= start_index:
                selected.append(raw_line.strip())
    return selected

def append_results_to_file(results):
    """
    Adds one batch of categorization output to the end of the output file.

    The text is written as received, followed by a newline; existing file content
    is kept because the file is opened in append mode.

    Args:
        results (str): The text returned by the model for one batch.

    Returns:
        None
    """
    with open(output_file_path, 'a') as sink:
        sink.writelines((results, "\n"))

def countdown(seconds):
    """
    Waits for the given number of seconds while showing the remaining time.

    The remaining time is printed once per second on the same console line
    (carriage return instead of newline).

    Args:
        seconds (int): The number of seconds to wait before the next request.

    Returns:
        None
    """
    for remaining in reversed(range(1, seconds + 1)):
        print(f"Next request in {remaining} seconds...", end="\r")
        time.sleep(1)

def process_queries(start_index, num_queries_per_request, total_requests, wait_time_between_requests):
    """
    Processes queries in batches, sends them to the chat model for categorization, and saves the results.

    Batches that return no result (API error or empty answer) are not written to the
    output file. The start index of each such batch is printed so that it can be
    re-run later by calling this function again with that start index.

    Args:
        start_index (int): The starting index in the dataset.
        num_queries_per_request (int): The number of queries to send in each request.
        total_requests (int): The total number of requests to process.
        wait_time_between_requests (int): The wait time (in seconds) between API requests.

    Returns:
        None
    """
    for request_number in range(total_requests):
        print(f"Processing request {request_number + 1}/{total_requests}...")

        # Load the queries for the current batch
        batch_start = start_index + request_number * num_queries_per_request
        queries = load_queries(batch_start, num_queries_per_request)

        if not queries:
            print("No more queries to process.")
            break

        # Send the request to the model
        result = send_gpt_request(queries)
        if result:
            # Append the result to the output file
            append_results_to_file(result)
        else:
            # Previously a failed batch was dropped without any hint which rows were
            # affected; report the position so the gap can be filled afterwards.
            print(
                f"Request {request_number + 1} returned no result; "
                f"{len(queries)} queries starting at index {batch_start} were not categorized."
            )

        # Countdown before the next request
        if request_number < total_requests - 1:
            countdown(wait_time_between_requests)

# Parameters
# start_index is a 0-based line index into the input file (load_queries counts lines
# from 0); to resume an interrupted run, set it to the first line not yet processed.
start_index = 0  # Set the starting index in the dataset
num_queries_per_request = 230  # Number of queries sent in each API request. Higher values risk exceeding token limits.
# 217 full batches cover 49,910 lines; assuming the 50,000-line file from Step 3, the
# 218th request carries the remaining 90 queries.
total_requests = 218  # Total number of API requests needed to process all 50,000 queries (50000 / 230, rounded up).
wait_time_between_requests = 10  # Time in seconds between requests to avoid rate limits.

# Start processing the queries
process_queries(start_index, num_queries_per_request, total_requests, wait_time_between_requests)

# Results are appended to the output file, so earlier content of that file is kept.
print(f"Process completed. Results printed to {output_file_path}.")

### Step 5: Fill empty Feature Dataset with MixInstruct Queries and Calculate Query Features

In [7]:
import pandas as pd
import json
import re
from textstat import textstat

def read_jsonl(file_path):
    """
    Reads a JSONL file and returns a list of dictionaries.

    Blank or whitespace-only lines (for example a trailing empty line, or gaps left
    behind by manual editing of the categorized file) are skipped instead of raising
    a JSON decoding error. The file is read as UTF-8 regardless of the platform default.

    Args:
        file_path (str): The path to the JSONL file.

    Returns:
        list: A list of dictionaries, where each dictionary represents a record from the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def construct_query(instruction, input_text):
    """
    Builds the query text from an instruction and its optional input.

    Parts that are empty are left out; when there is no instruction the result
    consists of the "Input: " part only (even if the input is empty as well).

    Args:
        instruction (str): The instruction part of the query.
        input_text (str): The input text associated with the instruction.

    Returns:
        str: A formatted query string.
    """
    if not instruction:
        return f"Input: {input_text}"
    if input_text:
        return f"Instruction: {instruction} Input: {input_text}"
    return f"Instruction: {instruction}"

# Empty feature dataset produced in Step 1
feature_file = "../datasets/generated/step_1/empty_feature_dataset_50k.jsonl"
feature_df = pd.DataFrame(read_jsonl(feature_file))

# Reduced mix-instruct dataset produced in Step 3
mix_instruct_file = "../datasets/generated/step_3/train_data_reduced_50k.jsonl"
mix_instruct_df = pd.DataFrame(read_jsonl(mix_instruct_file))

# Take over the raw query fields column by column (rows are matched by index)
for copied_column in ("id", "instruction", "input", "output"):
    feature_df[copied_column] = mix_instruct_df[copied_column]

# Fill "query" feature from the instruction/input pair of each row
feature_df["query"] = feature_df.apply(lambda row: construct_query(row["instruction"], row["input"]), axis=1)

# Load categorized data to fill the "category" feature.
# The directory is spelled "../datasets" (lowercase) like every other path in this
# notebook; the former "../Datasets" only resolved on case-insensitive file systems.
category_file = "../datasets/generated/step_4/train_data_categorized_50k.jsonl"
category_df = pd.DataFrame(read_jsonl(category_file))

# Map categories using the predefined mapping
# Keys are the category numbers requested from the model in Step 4; values are the
# category labels stored in the dataset. The labels are the same strings that Step 6
# uses as keys of its category weight table.
category_mapping = {
    1: "Factual Question Answering",
    2: "Open-ended/Opinion-based",
    3: "Creative Text Generation",
    4: "Instruction Following",
    5: "Mathematics/Problem Solving",
    6: "Recommendation Systems",
    7: "Coding/Programming",
    8: "Summarization/Paraphrasing",
    9: "Translation",
    10: "Reasoning/Logic",
    11: "Dialogue/Conversational",
    12: "Comparative/Analytical",
    13: "Miscellaneous/Other",
    14: "General Advice/Personal"
}

# Map the categories
# NOTE(review): the result is assigned by row index, not by "id" - this presumably relies
# on the categorized file listing the queries in the same order as the feature dataset;
# verify after any manual formatting of the Step 4 output.
feature_df["category"] = category_df["category"].map(category_mapping)

# Single characters that are counted as "special tokens" in a query
special_tokens = [
    ":", ";", "=", "+", "-", "_", "/", ".",
    "'", '"', "´", "`", ",",
    "<", ">", "[", "]", "{", "}", "(", ")",
    "?", "!", "*", "&", "$", "#", "@", "%", "^", "~", "|", "\\",
]

# Verbs whose presence is treated as a sign of a more demanding query
keywords = [
    "analyze",
    "synthesize",
    "interpret",
    "evaluate",
    "justify",
    "compare",
    "optimize",
    "hypothesize",
    "formulate",
    "simulate",
    "derive",
    "describe",
    "validate",
    "correlate",
    "quantify",
    "investigate",
    "predict",
    "forecast",
    "prove",
    "assess",
    "criticize",
    "argue",
    "solve",
    "reconstruct",
    "theorize",
    "explore",
    "elaborate",
    "deduce",
    "refute",
    "conceptualize",
    "identify",
    "outline",
    "rationalize",
    "articulate",
    "summarize",
    "innovate",
    "extrapolate",
    "explain",
    "clarify",
]

def remove_urls(text):
    """
    Strips markdown image links and http(s) URLs out of a text.

    A markdown image link (![alt](target)) is removed as a whole; a bare URL is
    removed up to the next whitespace character. Nothing is inserted in their place.

    Args:
        text (str): The input text.

    Returns:
        str: The cleaned text without URLs.
    """
    image_link_or_url = r'!\[.*?\]\(.*?\)|http[s]?://\S+'
    cleaned_text = re.sub(image_link_or_url, '', text)
    return cleaned_text

# Define the feature calculations
# Each entry maps a feature column to a function that computes its value from one row.
# All features except the keyword count work on the "query" text, i.e. including the
# "Instruction: " / "Input: " prefixes added by construct_query.
query_features = {
    # Length of the query in characters
    'query_chars_count': lambda row: len(row['query']),
    # Number of whitespace-separated tokens
    'query_words_count': lambda row: len(row['query'].split()),
    # Number of distinct whitespace-separated tokens (case-sensitive, punctuation included)
    'query_unique_word_count': lambda row: len(set(row['query'].split())),
    # Flesch-Kincaid grade of the query after URLs and image links were removed
    'query_readability_score': lambda row: textstat.flesch_kincaid_grade(remove_urls(row['query'])),
    # Number of characters that appear in the special_tokens list
    'query_special_tokens_count': lambda row: sum(1 for char in row['query'] if char in special_tokens),
    # Whole-word, case-insensitive keyword occurrences; counted in the instruction when
    # there is one, otherwise in the input (never in both)
    'query_keywords_count': lambda row: (
        sum(len(re.findall(r'\b{}\b'.format(re.escape(keyword)), row['instruction'].lower())) for keyword in keywords)
        if row['instruction'] 
        else sum(len(re.findall(r'\b{}\b'.format(re.escape(keyword)), row['input'].lower())) for keyword in keywords)
    ),
    # 1 if the query contains "http://" or "https://", else 0
    'query_contains_url': lambda row: int(bool(re.search(r'http[s]?://', row['query'])))
}

# Apply the feature calculations to the dataset
# One pass over all rows per feature; the computed column replaces the placeholder value.
for feature_name, feature_func in query_features.items():
    feature_df[feature_name] = feature_df.apply(feature_func, axis=1)

# Save the updated dataset
# One compact JSON object per line (JSONL)
output_file_path = "../datasets/generated/step_5/train_data_with_query_features.jsonl"
with open(output_file_path, "w") as output_file:
    for entry in feature_df.to_dict(orient="records"):
        json.dump(entry, output_file, separators=(",", ":"))
        output_file.write("\n")

print(f"Feature dataset created and saved to {output_file_path}.")


Feature dataset created and saved to ../datasets/generated/step_5/train_data_with_query_features.jsonl.


### Step 6: Calculate Complexity Score for each Query

In [8]:
import pandas as pd
import json
import os

def read_jsonl(file_path):
    """
    Reads a JSONL file and returns a list of dictionaries.

    Blank or whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising a JSON decoding error. The file is read as UTF-8 regardless of
    the platform default.

    Args:
        file_path (str): The path to the JSONL file.

    Returns:
        list: A list of dictionaries, where each dictionary represents a record from the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# File paths
input_file_path = "../datasets/generated/step_5/train_data_with_query_features.jsonl"
output_file_path = "../datasets/generated/step_6/train_data_with_query_features_complexity_scores.jsonl"

# Load dataset
feature_df = pd.DataFrame(read_jsonl(input_file_path))

# Normalized category weights based on subjective assessment of complexity
# Keys are the category labels written in Step 5; values lie between 0.15 and 1.0.
# Categories missing from this table get a small default weight in calculate_complexity.
category_weights = {
    "Mathematics/Problem Solving": 1.0,
    "Coding/Programming": 0.9,
    "Reasoning/Logic": 0.85,
    "Summarization/Paraphrasing": 0.75,
    "Comparative/Analytical": 0.7,
    "Creative Text Generation": 0.6,
    "Open-ended/Opinion-based": 0.5,
    "Recommendation Systems": 0.45,
    "General Advice/Personal": 0.4,
    "Dialogue/Conversational": 0.35,
    "Factual Question Answering": 0.3,
    "Instruction Following": 0.25,
    "Translation": 0.2,
    "Miscellaneous/Other": 0.15,
}

# Calculate min and max values for each feature
# Each value is a (min, max) pair taken over the whole dataset and later used for
# min-max scaling. The readability score is the exception: its upper bound is fixed
# at 20 instead of the observed maximum.
feature_min_max = {
    "query_chars_count": (feature_df['query_chars_count'].min(), feature_df['query_chars_count'].max()),
    "query_words_count": (feature_df['query_words_count'].min(), feature_df['query_words_count'].max()),
    "query_unique_word_count": (feature_df['query_unique_word_count'].min(), feature_df['query_unique_word_count'].max()),
    "query_readability_score": (feature_df['query_readability_score'].min(), 20),  # Cap readability at 20
    "query_special_tokens_count": (feature_df['query_special_tokens_count'].min(), feature_df['query_special_tokens_count'].max()),
    "query_keywords_count": (feature_df['query_keywords_count'].min(), feature_df['query_keywords_count'].max()),
}

# Define feature weights for complexity calculation
# Each weight multiplies one component of the complexity score; the weights add up
# to 8.5.
weights = {
    "category": 2.5,
    "length": 1.0,
    "unique": 0.4,
    "readability": 1.2,
    "special_tokens": 0.9,
    "keywords": 1.5,
    "url_presence": 1.0,
}

def min_max_scaling(value, min_value, max_value):
    """
    Rescales a value linearly so that min_value maps to 0 and max_value maps to 1.

    Values outside [min_value, max_value] are not clipped and therefore map outside
    [0, 1]. A constant feature (min_value equal to max_value) always yields 0.

    Args:
        value (float): The feature value to be scaled.
        min_value (float): The minimum value of the feature.
        max_value (float): The maximum value of the feature.

    Returns:
        float: The scaled value.
    """
    value_range = max_value - min_value
    if value_range == 0:
        # Constant feature: avoid dividing by zero
        return 0
    return (value - min_value) / value_range

def calculate_complexity(row):
    """
    Computes the complexity score of one query as a weighted sum of its features.

    The components are: the category weight, the mean of the scaled character and word
    counts, the scaled unique word count, the scaled readability score (clipped at the
    configured upper bound), the scaled special token count, the scaled keyword count
    and the URL flag. Each component is multiplied by its entry in `weights`.

    Args:
        row (pandas.Series): A row from the dataset containing query-related features.

    Returns:
        float: The calculated complexity score for the query, rounded to 4 decimals.
    """
    def _scaled(feature_name, raw_value):
        # Min-max scale a value with the dataset-wide bounds of the given feature
        lower, upper = feature_min_max[feature_name]
        return min_max_scaling(raw_value, lower, upper)

    # Categories without an entry in the weight table get a small default weight
    category_component = category_weights.get(row['category'], 0.03)

    # Query length: average of scaled character count and scaled word count
    chars_component = _scaled('query_chars_count', row['query_chars_count'])
    words_component = _scaled('query_words_count', row['query_words_count'])
    length_component = (chars_component + words_component) / 2

    unique_component = _scaled('query_unique_word_count', row['query_unique_word_count'])

    # Readability is clipped at the configured upper bound before scaling
    readability_cap = feature_min_max['query_readability_score'][1]
    readability_component = _scaled(
        'query_readability_score',
        min(row['query_readability_score'], readability_cap),
    )

    special_tokens_component = _scaled('query_special_tokens_count', row['query_special_tokens_count'])
    keywords_component = _scaled('query_keywords_count', row['query_keywords_count'])

    # The URL flag is already 0 or 1 and is used unscaled
    url_component = row['query_contains_url']

    weighted_terms = [
        weights['category'] * category_component,
        weights['length'] * length_component,
        weights['unique'] * unique_component,
        weights['readability'] * readability_component,
        weights['special_tokens'] * special_tokens_component,
        weights['keywords'] * keywords_component,
        weights['url_presence'] * url_component,
    ]

    # Add the terms one by one in the listed order
    total = weighted_terms[0]
    for term in weighted_terms[1:]:
        total = total + term

    return round(total, 4)

# Apply complexity calculation to each row
feature_df['query_complexity_score'] = feature_df.apply(calculate_complexity, axis=1)

# Calculate and print the min and max range of the complexity score
min_complexity_score = feature_df['query_complexity_score'].min()
max_complexity_score = feature_df['query_complexity_score'].max()

print(f"Min complexity score: {min_complexity_score}")
print(f"Max complexity score: {max_complexity_score}")

# Save the dataset with complexity scores (one compact JSON object per line)
with open(output_file_path, "w") as output_file:
    for entry in feature_df.to_dict(orient="records"):
        json.dump(entry, output_file, separators=(",", ":"))
        output_file.write("\n")

# Report the destination path. `output_file` is the (already closed) file handle of the
# `with` block above; interpolating it printed the handle's repr instead of the path.
print(f"Complexity scores calculated and saved to {output_file_path}")


Min complexity score: 0.5207
Max complexity score: 5.2306
Complexity scores calculated and saved to <_io.TextIOWrapper name='../datasets/generated/step_6/train_data_with_query_features_complexity_scores.jsonl' mode='w' encoding='UTF-8'>


### Step 7: Generate all Responses from all Flan-T5-Models with Evaluation Scripts from "./scripts/response_generation/"
#### ATTENTION: Run each script in its predefined virtual environment on a GPU with enough RAM (40 GB for XL).

### Step 8: Calculate all Metric Scores with Scripts from "./scripts/metric_scoring"

### Step 9: Merge Responses and Metric Scores into Training Datasets

In [9]:
import json

# Define the paths for train files and their corresponding score files
# One entry per model size. Each entry names the response file from Step 7
# ("train_file"), the per-metric score files from Step 8 ("score_files"), the merged
# file to write ("output_file") and the model identifier used inside the column names
# scores_t5_<model_name>_<i>.
# Keep the keys of every "score_files" mapping identical to the keys of score_metrics
# below, and in the same order.
datasets = {
    "base": {
        "train_file": "../datasets/generated/step_7/train_data_with_50k_times_10_t5_base_answers_with_avgtime_max5000.jsonl",
        "score_files": {
            "bart": "../datasets/generated/step_8/bart_scores_t5_base.jsonl",
            "bert": "../datasets/generated/step_8/bert_scores_t5_base.jsonl",
            "bleu": "../datasets/generated/step_8/bleu_scores_t5_base.jsonl",
            "bleurt": "../datasets/generated/step_8/bleurt_scores_t5_base.jsonl",
            "logprobs": "../datasets/generated/step_8/logprobs_scores_t5_base.jsonl",
            "rouge": "../datasets/generated/step_8/rouge_scores_t5_base.jsonl"
        },
        "output_file": "../datasets/generated/step_9/train_data_with_50kx10_t5_base_answers_with_scores.jsonl",
        "model_name": "base"
    },
    "large": {
        "train_file": "../datasets/generated/step_7/train_data_with_50k_times_10_t5_large_answers_with_avgtime_max5000.jsonl",
        "score_files": {
            "bart": "../datasets/generated/step_8/bart_scores_t5_large.jsonl",
            "bert": "../datasets/generated/step_8/bert_scores_t5_large.jsonl",
            "bleu": "../datasets/generated/step_8/bleu_scores_t5_large.jsonl",
            "bleurt": "../datasets/generated/step_8/bleurt_scores_t5_large.jsonl",
            "logprobs": "../datasets/generated/step_8/logprobs_scores_t5_large.jsonl",
            "rouge": "../datasets/generated/step_8/rouge_scores_t5_large.jsonl"
        },
        "output_file": "../datasets/generated/step_9/train_data_with_50kx10_t5_large_answers_with_scores.jsonl",
        "model_name": "large"
    },
    "xl": {
        "train_file": "../datasets/generated/step_7/train_data_with_50k_times_10_t5_xl_answers_with_avgtime_max5000.jsonl",
        "score_files": {
            "bart": "../datasets/generated/step_8/bart_scores_t5_xl.jsonl",
            "bert": "../datasets/generated/step_8/bert_scores_t5_xl.jsonl",
            "bleu": "../datasets/generated/step_8/bleu_scores_t5_xl.jsonl",
            "bleurt": "../datasets/generated/step_8/bleurt_scores_t5_xl.jsonl",
            "logprobs": "../datasets/generated/step_8/logprobs_scores_t5_xl.jsonl",
            "rouge": "../datasets/generated/step_8/rouge_scores_t5_xl.jsonl"
        },
        "output_file": "../datasets/generated/step_9/train_data_with_50kx10_t5_xl_answers_with_scores.jsonl",
        "model_name": "xl"
    }
}

# Define metrics for each score file
# Maps a score file key to the metric names read from that file; the rouge file is the
# only one that contributes more than one metric.
score_metrics = {
    "bart": ["bart"],
    "bert": ["bert"],
    "bleu": ["bleu"],
    "bleurt": ["bleurt"],
    "logprobs": ["logprobs"],
    "rouge": ["rouge1", "rouge2", "rougeL", "rougeLsum"]
}

def load_score_file(score_file, metrics, model_name):
    """
    Reads one JSONL score file into a lookup table keyed by query ID.

    For every record the requested metrics are extracted from each of the ten fields
    scores_t5_<model_name>_1 .. scores_t5_<model_name>_10. If an ID occurs more than
    once, the last record wins.

    Args:
        score_file (str): Path to the JSONL file containing scores.
        metrics (list): List of metric names to extract.
        model_name (str): Model identifier (base, large, xl) used for key structuring.

    Returns:
        dict: Query ID -> {response score key -> {metric name -> value}}.
    """
    response_keys = [f"scores_t5_{model_name}_{i}" for i in range(1, 11)]

    scores = {}
    with open(score_file, "r") as f:
        for line in f:
            record = json.loads(line)
            per_response = {}
            for response_key in response_keys:
                per_response[response_key] = {
                    metric: record[response_key][metric] for metric in metrics
                }
            scores[record["id"]] = per_response
    return scores

def update_train_file(train_file, score_files, output_file, model_name):
    """
    Updates a train file with scores from corresponding score files.

    Each score file is read with the metric list registered under the same key in
    `score_metrics` (e.g. the "rouge" file with the rouge metrics). Records whose ID is
    missing from a score file keep the values they already have for those metrics.

    Args:
        train_file (str): Path to the JSONL file containing training data.
        score_files (dict): Dictionary mapping score types to their file paths.
        output_file (str): Path to save the updated train dataset.
        model_name (str): Model identifier (base, large, xl).

    Returns:
        None: The function writes the updated dataset to the output file.

    Raises:
        KeyError: If a score type in score_files has no entry in score_metrics.
    """
    # Pair every score file with its metric list by key. Pairing the two dictionaries
    # by position only works while both happen to list their keys in the same order.
    unknown_score_types = [name for name in score_files if name not in score_metrics]
    if unknown_score_types:
        raise KeyError(f"No metrics defined in score_metrics for score files: {unknown_score_types}")

    # Load all score files into dictionaries
    score_data = {
        score_type: load_score_file(score_path, score_metrics[score_type], model_name)
        for score_type, score_path in score_files.items()
    }

    # Update train data
    with open(train_file, "r") as train_f, open(output_file, "w") as out_f:
        for train_line in train_f:
            train_data = json.loads(train_line)
            train_id = train_data["id"]

            # Update scores for each response (1 to 10)
            for i in range(1, 11):
                key = f"scores_t5_{model_name}_{i}"
                for metric_scores in score_data.values():
                    if train_id in metric_scores:
                        for sub_metric, value in metric_scores[train_id][key].items():
                            train_data[key][sub_metric] = value

            # Write updated data to the output file
            json.dump(train_data, out_f)
            out_f.write("\n")

    print(f"Train data updated and saved to {output_file}.")

# Merge the scores into the train file of every model size
for model, paths in datasets.items():
    print(f"Processing {model} dataset...")
    update_train_file(paths["train_file"], paths["score_files"], paths["output_file"], paths["model_name"])

print("All datasets successfully updated with scores.")


Processing base dataset...
Train data updated and saved to ../datasets/generated/step_9/train_data_with_50kx10_t5_base_answers_with_scores.jsonl.
Processing large dataset...
Train data updated and saved to ../datasets/generated/step_9/train_data_with_50kx10_t5_large_answers_with_scores.jsonl.
Processing xl dataset...
Train data updated and saved to ../datasets/generated/step_9/train_data_with_50kx10_t5_xl_answers_with_scores.jsonl.
All datasets successfully updated with scores.


### Step 10: Merge 50k Queries File with 10 Responses and Scores into Empty Response Dataset

In [10]:
import json
import pandas as pd

# Paths to files
empty_responses_file = "../datasets/generated/step_2/empty_response_dataset_50k.jsonl"
base_file_path = "../datasets/generated/step_9/train_data_with_50kx10_t5_base_answers_with_scores.jsonl"
large_file_path = "../datasets/generated/step_9/train_data_with_50kx10_t5_large_answers_with_scores.jsonl"
xl_file_path = "../datasets/generated/step_9/train_data_with_50kx10_t5_xl_answers_with_scores.jsonl"
output_file_path = "../datasets/generated/step_10/train_data_with_50kx10_merged_answers_with_scores.jsonl"

def load_jsonl_to_df(file_path):
    """
    Reads a JSONL file (one JSON record per line) with pandas.

    Args:
        file_path (str): The path to the JSONL file.

    Returns:
        pd.DataFrame: A DataFrame containing the JSONL data, one row per line.
    """
    records_frame = pd.read_json(file_path, orient="records", lines=True)
    return records_frame

# Read the empty response dataset and the three scored per-model datasets
empty_responses_df = load_jsonl_to_df(empty_responses_file)
base_df = load_jsonl_to_df(base_file_path)
large_df = load_jsonl_to_df(large_file_path)
xl_df = load_jsonl_to_df(xl_file_path)

# The columns below are copied row by row, so the ID columns must be identical
ids_are_aligned = base_df["id"].equals(large_df["id"]) and base_df["id"].equals(xl_df["id"])
if not ids_are_aligned:
    raise ValueError("ID columns do not match exactly across datasets. Please check the input files.")

# Copy ID column from base_df to empty_responses_df
empty_responses_df["id"] = base_df["id"]

# Source DataFrame of each model, in the order base, large, xl
model_sources = (("t5_base", base_df), ("t5_large", large_df), ("t5_xl", xl_df))

# Copy the ten responses and their score dictionaries for every model
for model_key, model_df in model_sources:
    for i in range(1, 11):
        empty_responses_df[f"r_{model_key}_{i}"] = model_df[f"r_{model_key}_{i}"]
        empty_responses_df[f"scores_{model_key}_{i}"] = model_df[f"scores_{model_key}_{i}"]

# Copy average response time features
for model_key, model_df in model_sources:
    empty_responses_df[f"rt_{model_key}_avg"] = model_df[f"rt_{model_key}_avg"]

# Save the merged dataset as JSONL (one compact JSON object per line)
with open(output_file_path, "w") as output_file:
    output_file.writelines(
        json.dumps(entry, separators=(",", ":")) + "\n"
        for entry in empty_responses_df.to_dict(orient="records")
    )

print(f"Filled dataset with average response times has been written to {output_file_path}")


Filled dataset with average response times has been written to ../datasets/generated/step_10/train_data_with_50kx10_merged_answers_with_scores.jsonl


### Step 11: Compute Average Scores and Additional Response Features for Dataset

In [11]:
import pandas as pd
import json
from tqdm import tqdm
from collections import Counter

# File paths
query_feature_file = "../datasets/generated/step_6/train_data_with_query_features_complexity_scores.jsonl"
scores_file_path = "../datasets/generated/step_10/train_data_with_50kx10_merged_answers_with_scores.jsonl"
output_file_path = "../datasets/generated/step_11/train_data_with_query_and_response_features_with_scores.jsonl"

def read_jsonl(file_path):
    """
    Reads a JSONL file and returns its content as a list of dictionaries.

    Blank or whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising a JSON decoding error. The file is read as UTF-8 regardless of
    the platform default.

    Args:
        file_path (str): The path to the JSONL file.

    Returns:
        list: A list of dictionaries, each representing a JSON object from the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
    
def ensure_float_columns(df, columns):
    """
    Casts the listed columns of a DataFrame to float, modifying it in place.

    Names in ``columns`` that the DataFrame does not contain are ignored.

    Args:
        df (pd.DataFrame): The DataFrame whose columns are converted.
        columns (list): Names of the columns that should hold floats.

    Returns:
        None
    """
    # Lazily filter to the requested names that actually exist in the frame
    present_columns = (name for name in columns if name in df.columns)
    for name in present_columns:
        df[name] = df[name].astype(float)

def calculate_redundancy_score(response):
    """
    Scores how repetitive a response is, on the word level and inside its words.

    Two ratios are computed and the larger one is returned:
      * word level: share of whitespace-separated words that repeat an earlier word;
      * character level: for each word, the share of characters that repeat an
        earlier character of that word, averaged over all words.

    Args:
        response (str): The response text to be analyzed.

    Returns:
        float: The larger of the two redundancy ratios. Empty, whitespace-only
        or falsy responses yield 1, the maximum score.
    """
    # Nothing to analyse: treated as maximally redundant
    if not response or not response.strip():
        return 1

    tokens = response.split()
    token_total = len(tokens)

    # Share of words that are repeats of another word
    if token_total > 0:
        word_level_redundancy = (token_total - len(set(tokens))) / token_total
    else:
        word_level_redundancy = 0

    # Share of repeated characters inside each word, averaged over the words
    per_word_char_redundancy = []
    for token in tokens:
        size = len(token)
        per_word_char_redundancy.append((size - len(set(token))) / size if size > 0 else 0)

    if per_word_char_redundancy:
        avg_word_char_redundancy = sum(per_word_char_redundancy) / len(per_word_char_redundancy)
    else:
        avg_word_char_redundancy = 0

    return max(word_level_redundancy, avg_word_char_redundancy)

def is_trashy(response, redundancy_score, length, response_time,
              max_redundancy_threshold=0.85, max_length_threshold=100,
              max_response_time_threshold=1000):
    """
    Determines whether a response should be considered as "trashy" based on redundancy, length, and response time.

    A response is trashy when it is missing, empty or whitespace-only, or when
    its redundancy score, length and response time ALL exceed their thresholds.

    Args:
        response (str): The response text. ``None`` and other non-string values
            are treated like an empty response.
        redundancy_score (float): The redundancy score of the response.
        length (int): The length of the response in words.
        response_time (float): The response generation time, in the same unit
            as ``max_response_time_threshold``.
        max_redundancy_threshold (float): Redundancy above this value counts as excessive.
        max_length_threshold (int): Length above this value counts as excessive.
        max_response_time_threshold (float): Response time above this value counts as excessive.

    Returns:
        bool: True if the response is considered trashy, False otherwise.
    """
    # A missing response (e.g. JSON null) is as useless as an empty one. Guarding
    # here avoids an AttributeError on ``None.strip()``.
    if not isinstance(response, str) or not response.strip():
        return True

    # All three limits must be exceeded at once; a single excess is not enough.
    return (redundancy_score > max_redundancy_threshold and
            length > max_length_threshold and
            response_time > max_response_time_threshold)

def calculate_query_repetition_count(instruction, input_text, query, response):
    """
    Checks whether a response is an exact repetition of the query, instruction, or input.

    All comparisons ignore leading and trailing whitespace. Missing parts
    (``None`` or any other non-string value) are treated as empty text instead
    of raising an ``AttributeError``.

    Args:
        instruction (str): The instruction part of the query.
        input_text (str): The input part of the query.
        query (str): The full query (instruction + input).
        response (str): The generated response.

    Returns:
        bool: True if the response is a repetition, False otherwise.
    """
    def _clean(text):
        # Non-string values (None, NaN, ...) carry no text to compare against
        return text.strip() if isinstance(text, str) else ""

    answer = _clean(response)
    if not answer:
        return False

    # Instruction and input are compared as their direct concatenation. When one
    # of them is empty this reduces to the other one alone, which covers the
    # "instruction only" and "input only" cases.
    combined = _clean(instruction) + _clean(input_text)
    if combined and answer == combined:
        return True

    # Check if response matches the entire query
    return answer == _clean(query)

# Load the query-level features and the merged responses with their scores
query_feature_df = pd.DataFrame(read_jsonl(query_feature_file))
scores_df = pd.DataFrame(read_jsonl(scores_file_path))

# The averaged feature columns must hold floats before they are overwritten below
model_names = ["t5_base", "t5_large", "t5_xl"]
columns_to_convert = (
    [f"rt_{name}_avg" for name in model_names]
    + [f"r_{name}_length_avg" for name in model_names]
    + [f"r_{name}_redundancy_avg" for name in model_names]
)
ensure_float_columns(query_feature_df, columns_to_convert)

# Aggregate the ten responses of every model, row by row.
# Rows of scores_df and query_feature_df are matched through their index label.
for index, row in tqdm(scores_df.iterrows(), total=scores_df.shape[0], desc="Processing Response Features"):
    instruction = row.get("instruction", "")
    input_text = row.get("input", "")
    query = row.get("query", "")

    for model in model_names:
        # Only an average response time exists per model; it is used for every response
        response_time = row.get(f"rt_{model}_avg", 0.0)

        total_length = 0
        total_redundancy = 0.0
        total_scores = Counter()
        trash_count = 0
        repetition_count = 0

        for sample_no in range(1, 11):
            response = row.get(f"r_{model}_{sample_no}", "")
            length = len(response.split()) if response else 0
            redundancy = calculate_redundancy_score(response)
            total_length += length
            total_redundancy += redundancy

            # Sum every metric over the responses; missing (None) values are skipped
            for metric, value in row.get(f"scores_{model}_{sample_no}", {}).items():
                if value is not None:
                    total_scores[metric] += value

            trash_count += int(is_trashy(response, redundancy, length, response_time))
            repetition_count += int(calculate_query_repetition_count(instruction, input_text, query, response))

        # NOTE(review): the sums are always divided by 10, also for metrics where
        # None values were skipped above - confirm this is the intended average.
        feature_updates = {
            f"rt_{model}_avg": round(row.get(f"rt_{model}_avg", 0.0), 1),
            f"r_{model}_length_avg": round(total_length / 10, 1),
            f"r_{model}_redundancy_avg": round(total_redundancy / 10, 4),
            f"r_{model}_trash_count": trash_count,
            f"r_{model}_query_repetition_count": repetition_count,
            f"scores_{model}_avg": {metric: total / 10 for metric, total in total_scores.items()},
        }
        for column_name, value in feature_updates.items():
            query_feature_df.at[index, column_name] = value

# Write the enriched dataset as JSONL: one compact JSON object per line
with open(output_file_path, "w") as output_file:
    output_file.writelines(
        json.dumps(record, separators=(",", ":")) + "\n"
        for record in query_feature_df.to_dict(orient="records")
    )

print("Process completed. Updated dataset saved to:", output_file_path)


Processing Response Features: 100%|██████| 50000/50000 [03:46<00:00, 220.29it/s]


Process completed. Updated dataset saved to: ../datasets/generated/step_11/train_data_with_query_and_response_features_with_scores.jsonl


### Step 12: Calculate Discrepancies between the Models

In [12]:
import pandas as pd
import numpy as np
import json

# Global constants
# Metric names looked up in each scores_<model>_avg dictionary when ranges and normalized averages are computed.
SCORE_METRICS = ['bart', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'bleu', 'bert', 'bleurt', 'logprobs']
# Model identifiers; they form the column names scores_<llm>_avg and avg_normalized_score_<llm>.
LLMS = ['t5_base', 't5_large', 't5_xl']

# File paths
# Input: the dataset written by Step 11. Output: the same rows plus normalized averages and discrepancies.
input_file_path = "../datasets/generated/step_11/train_data_with_query_and_response_features_with_scores.jsonl"
output_file_path = "../datasets/generated/step_12/train_data_with_query_and_response_features_discrepancies_with_scores.jsonl"

def read_jsonl(file_path):
    """
    Reads a JSONL file and returns its content as a list of dictionaries.

    Lines that are empty or contain only whitespace are ignored, so a trailing
    newline or a stray blank line does not raise ``json.JSONDecodeError``.

    Args:
        file_path (str): The path to the JSONL file.

    Returns:
        list: A list of dictionaries, each representing a JSON object from the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # UTF-8 is requested explicitly instead of relying on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                records.append(json.loads(line))
    return records

def calculate_metric_ranges(df):
    """
    Finds, for every score metric, the smallest and largest value over all three models.

    A metric that is absent from a row's score dictionary is counted as 0.
    The range of each metric is also printed.

    Args:
        df (pd.DataFrame): The DataFrame holding the ``scores_t5_*_avg`` dictionaries.

    Returns:
        dict: A dictionary where keys are metric names and values are tuples (min, max).
    """
    score_columns = ['scores_t5_base_avg', 'scores_t5_large_avg', 'scores_t5_xl_avg']
    metric_ranges = {}

    for metric in SCORE_METRICS:
        def pick_metric(scores, name=metric):
            # Absent metrics fall back to 0
            return scores.get(name, 0)

        # One value series per model, then the extremes across the three models
        per_model_values = [df[column].apply(pick_metric) for column in score_columns]
        min_value = min(values.min() for values in per_model_values)
        max_value = max(values.max() for values in per_model_values)

        metric_ranges[metric] = (min_value, max_value)
        print(f"Metric {metric}: Min = {min_value}, Max = {max_value}")

    return metric_ranges

def normalize_metric(score, metric, metric_ranges):
    """
    Rescales a raw metric score with min-max scaling, on a log scale for some metrics.

    'bart', 'bleu', 'bert', 'bleurt' and 'logprobs' are shifted by
    ``abs(min) + 1`` (``+ 2`` for 'bleu') and log-transformed before scaling;
    every other metric is scaled directly.

    Args:
        score (float): The raw score value to be normalized.
        metric (str): The name of the metric.
        metric_ranges (dict): Dictionary containing the min and max values for each metric.

    Returns:
        float: The min-max scaled score, or 0.0 when the metric's range is degenerate (min equals max).
    """
    lowest, highest = metric_ranges[metric]

    # Offset added before taking logarithms; BLEU gets a larger constant
    offset = abs(lowest) + (2 if metric == 'bleu' else 1)

    log_scaled_metrics = ('bart', 'bleu', 'bert', 'bleurt', 'logprobs')
    if metric not in log_scaled_metrics:
        # Plain min-max scaling (the rouge scores)
        if highest == lowest:
            return 0.0
        return (score - lowest) / (highest - lowest)

    # Min-max scaling performed on the shifted, log-transformed values
    log_score = np.log(score + offset)
    log_lowest = np.log(lowest + offset)
    log_highest = np.log(highest + offset)
    if log_highest == log_lowest:
        return 0.0
    return (log_score - log_lowest) / (log_highest - log_lowest)

def calculate_avg_normalized_scores(df, metric_ranges):
    """
    Adds one ``avg_normalized_score_<llm>`` column per LLM to the DataFrame.

    For every row the normalized values of all metrics that are present (not
    None) are summed and divided by the total number of metrics in
    ``SCORE_METRICS``; the result is rounded to four decimals.

    Args:
        df (pd.DataFrame): The DataFrame containing scores.
        metric_ranges (dict): Dictionary containing the min and max values for each metric.

    Returns:
        pd.DataFrame: The same DataFrame, extended in place with the new columns.
    """
    def average_for_row(row, scores_column):
        # Metrics that are missing or None add nothing to the sum but still
        # count in the divisor.
        model_scores = row[scores_column]
        normalized_total = sum(
            normalize_metric(model_scores.get(metric, 0), metric, metric_ranges)
            for metric in SCORE_METRICS
            if model_scores.get(metric) is not None
        )
        return round(normalized_total / len(SCORE_METRICS), 4)

    for llm in LLMS:
        df[f'avg_normalized_score_{llm}'] = df.apply(
            average_for_row, axis=1, args=(f'scores_{llm}_avg',)
        )
    return df

def calculate_discrepancies(df):
    """
    Adds the pairwise differences between the models' average normalized scores.

    For the pairs base/large, large/xl and base/xl, a column
    ``discrepancy_<first>_vs_<second>`` is added that holds the first model's
    score minus the second model's score, rounded to four decimals.

    Args:
        df (pd.DataFrame): The DataFrame containing average normalized scores.

    Returns:
        pd.DataFrame: The same DataFrame, extended in place with the discrepancy columns.
    """
    model_pairs = (('base', 'large'), ('large', 'xl'), ('base', 'xl'))
    for first, second in model_pairs:
        first_scores = df[f'avg_normalized_score_t5_{first}']
        second_scores = df[f'avg_normalized_score_t5_{second}']
        df[f'discrepancy_{first}_vs_{second}'] = (first_scores - second_scores).round(4)
    return df

# Load the Step 11 output
df = pd.DataFrame(read_jsonl(input_file_path))

# Value range of every metric over all models (also printed per metric)
metric_ranges = calculate_metric_ranges(df)

# Average normalized score per LLM first, then the pairwise discrepancies built on them
df = calculate_discrepancies(calculate_avg_normalized_scores(df, metric_ranges))

# Write the extended dataset as JSONL: one compact JSON object per line
with open(output_file_path, "w") as output_file:
    output_file.writelines(
        json.dumps(record, separators=(",", ":")) + "\n"
        for record in df.to_dict(orient="records")
    )

print("Process completed. Discrepancies and average normalized scores saved to:", output_file_path)


Metric bart: Min = -9.870268821716309, Max = 0.0
Metric rouge1: Min = 0.0, Max = 1.0
Metric rouge2: Min = 0.0, Max = 1.0
Metric rougeL: Min = 0.0, Max = 1.0
Metric rougeLsum: Min = 0.0, Max = 1.0
Metric bleu: Min = 0.0, Max = 100.00000000000003
Metric bert: Min = -0.24503022432327198, Max = 1.000000238418579
Metric bleurt: Min = -1.9606274127960206, Max = 1.0893199443817132
Metric logprobs: Min = -13.237013339996338, Max = 0.0
Process completed. Discrepancies and average normalized scores saved to: ../datasets/generated/step_12/train_data_with_query_and_response_features_discrepancies_with_scores.jsonl


### Step 13: Calculate Evaluation Model Label (Ground Truth Model Label)

In [None]:
import pandas as pd
import json
from collections import defaultdict

# File paths
# ATTENTION: Change file names according to threshold
# Input: the dataset written by Step 12. Output: the same rows plus 'evaluation_model_label'.
# The log file receives the per-case counts with query IDs and the per-label statistics.
input_file_path = "../datasets/generated/step_12/train_data_with_query_and_response_features_discrepancies_with_scores.jsonl"
output_file_path = "../datasets/generated/step_13/train_data_with_query_and_response_features_discrepancies_with_scores_model_label_dt-0.03_tt3_rt3.jsonl"
log_file_path = "../datasets/generated/step_13/model_label_distribution_t-0.03.txt"

# Thresholds
# A smaller model is still accepted when its discrepancy (smaller minus larger
# normalized score) is at least this value.
discrepancy_threshold = -0.03 # Change to [0, -0.005, -0.01, -0.015, -0.02, -0.025, -0.03]
# A model whose query-repetition count reaches this value is not selected by the rules that check it.
repetition_count_threshold = 3
# A model whose trash count reaches this value is treated as having produced trash.
trash_count_threshold = 3
# The smaller model's average response time may be at most this multiple of the larger model's.
response_time_buffer = 1.5
# The smaller model's average response length may be at most this multiple of the larger model's.
length_buffer = 1.5

# Initialize counters and query ID tracking
# Both are filled by calculate_model_label: how often each case fired and for which query IDs.
case_counters = defaultdict(int)
case_query_ids = defaultdict(list)

def read_jsonl(file_path):
    """
    Reads a JSONL file and returns its content as a list of dictionaries.

    Empty and whitespace-only lines are skipped rather than passed to the JSON
    parser, where they would raise ``json.JSONDecodeError``.

    Args:
        file_path (str): The path to the JSONL file.

    Returns:
        list: A list of dictionaries, each representing a JSON object from the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps the decoding independent of the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        non_blank_lines = (line for line in f if line.strip())
        return [json.loads(line) for line in non_blank_lines]

def calculate_model_label(row):
    """
    Determines the most suitable model (base, large, or XL) for a given query based on response quality metrics.

    The checks run in order and the first one that applies decides the label.
    Every decision is recorded in the module-level ``case_counters`` and
    ``case_query_ids`` under the name of the case that fired.

    A discrepancy is the first model's average normalized score minus the
    second model's, so a negative ``discrepancy_base_vs_large`` means that
    large scored higher than base.

    Args:
        row (pd.Series): A row from the DataFrame containing discrepancy values, trash counts, repetition counts, and response times.

    Returns:
        int: Model label (1 = base, 2 = large, 3 = XL).
    """
    query_id = row['id']

    def assign(case_name, label):
        # Record which case decided this query, then hand the label back
        case_counters[case_name] += 1
        case_query_ids[case_name].append(query_id)
        return label

    # Discrepancies
    discrepancy_base_vs_large = row["discrepancy_base_vs_large"]
    discrepancy_large_vs_xl = row["discrepancy_large_vs_xl"]
    discrepancy_base_vs_xl = row["discrepancy_base_vs_xl"]

    # Trash counts
    trash_count_base = row["r_t5_base_trash_count"]
    trash_count_large = row["r_t5_large_trash_count"]
    trash_count_xl = row["r_t5_xl_trash_count"]

    # Repetition counts
    repetition_count_base = row["r_t5_base_query_repetition_count"]
    repetition_count_large = row["r_t5_large_query_repetition_count"]
    repetition_count_xl = row["r_t5_xl_query_repetition_count"]

    base_is_valid = trash_count_base < trash_count_threshold
    large_is_valid = trash_count_large < trash_count_threshold
    xl_is_valid = trash_count_xl < trash_count_threshold

    # Check 1: All models are trash
    if not base_is_valid and not large_is_valid and not xl_is_valid:
        return assign("case_1", 3)

    # Check 2: XL or large is trash
    if not xl_is_valid or not large_is_valid:
        return assign("case_2", 3)

    # Check 3: All discrepancies are 0 and base is not trash
    if (discrepancy_base_vs_large == 0.0 and discrepancy_large_vs_xl == 0.0 and
            discrepancy_base_vs_xl == 0.0 and base_is_valid):
        return assign("case_3", 1)

    # Check 4: Smallest model outperformed all others and was not trash or query repetition
    if (base_is_valid and
            discrepancy_base_vs_large >= 0 and discrepancy_base_vs_xl >= 0 and
            repetition_count_base < repetition_count_threshold):
        return assign("case_4", 1)

    # Check 5/6: Large outperformed all and all responses valid
    if (base_is_valid and large_is_valid and xl_is_valid and
            discrepancy_base_vs_large < 0 and discrepancy_large_vs_xl >= 0 and
            repetition_count_large < repetition_count_threshold):
        base_close_to_large = (discrepancy_base_vs_large >= discrepancy_threshold and
                               repetition_count_base < repetition_count_threshold)
        if (base_close_to_large and
                row['rt_t5_base_avg'] <= row['rt_t5_large_avg'] * response_time_buffer and
                row['r_t5_base_length_avg'] <= row['r_t5_large_length_avg'] * length_buffer):
            return assign("case_5", 1)  # Base close to large
        # Large scored at least as well as XL here, so it is selected whenever
        # base does not qualify. This includes the case where base is close in
        # score but exceeds the time/length buffers. Previously that case fell
        # through to the XL default.
        return assign("case_6", 2)  # Large selected

    # Check 7/8: XL outperformed large and large outperformed base
    if (xl_is_valid and large_is_valid and base_is_valid and
            discrepancy_large_vs_xl < 0 and discrepancy_base_vs_large < 0 and
            repetition_count_xl < repetition_count_threshold and
            repetition_count_large < repetition_count_threshold and
            repetition_count_base < repetition_count_threshold):
        # NOTE(review): when base is close to XL in score but exceeds the
        # time/length buffers, large is not considered and the query falls
        # through to the XL default (the best-scoring model). Confirm this is intended.
        if discrepancy_base_vs_xl >= discrepancy_threshold:
            if (row['rt_t5_base_avg'] <= row['rt_t5_xl_avg'] * response_time_buffer and
                    row['r_t5_base_length_avg'] <= row['r_t5_xl_length_avg'] * length_buffer):
                return assign("case_7", 1)  # Base close to XL
        elif discrepancy_large_vs_xl >= discrepancy_threshold:
            if (row['rt_t5_large_avg'] <= row['rt_t5_xl_avg'] * response_time_buffer and
                    row['r_t5_large_length_avg'] <= row['r_t5_xl_length_avg'] * length_buffer):
                return assign("case_8", 2)  # Large close to XL

    # Default to XL if no conditions are met
    return assign("case_default", 3)

# Load the Step 12 output
df = pd.DataFrame(read_jsonl(input_file_path))

# Label every query with the model chosen by the rule set
df['evaluation_model_label'] = df.apply(calculate_model_label, axis=1)

# Write the labeled dataset as JSONL: one compact JSON object per line
with open(output_file_path, "w") as output_file:
    output_file.writelines(
        json.dumps(record, separators=(",", ":")) + "\n"
        for record in df.to_dict(orient="records")
    )

# Write the report: first how often each case fired (with the affected query
# IDs), then the per-label statistics.
with open(log_file_path, "w") as report_file:
    for case in case_counters:
        report_file.write(f"{case}: {case_counters[case]}\nQuery IDs: {case_query_ids[case]}\n\n")

    # Number of queries per label, plus mean and median complexity score per label
    complexity_by_label = df.groupby('evaluation_model_label')['query_complexity_score']
    label_counts = df['evaluation_model_label'].value_counts().sort_index()
    avg_complexity_scores = complexity_by_label.mean()
    median_complexity_scores = complexity_by_label.median()

    report_file.write("\nTotal Label Counts and Complexity Scores:\n")

    # The same three-line summary per label goes to the report file and to stdout
    label_summaries = [
        f"Label {label}: {label_counts[label]}\n"
        f"Average complexity score: {avg_complexity_scores[label]:.4f}\n"
        f"Median complexity score: {median_complexity_scores[label]:.4f}\n\n"
        for label in label_counts.index
    ]
    report_file.writelines(label_summaries)

# Echo the label statistics
print("\nLabel Counts and Complexity Scores:")
print("".join(label_summaries), end="")

print(f"Processing complete. Case distribution and label counts written to {log_file_path}.")
