# Testing the Reasoning Models 

In [1]:
from utils.compute_clients import create_clients
import pandas as pd
import os
from utils.compute_clients import LLMClient

In [2]:
df = pd.read_json("/dccstor/shanmukh/sravani_internship/benchmark_experiments/benchmark_dataset/benchmark_v10_v2.jsonl", lines=True)
user_prompts = df["combined_instruction"].tolist()

In [None]:
import os

# Phi-4-reasoning endpoint, reached with client_type "rits".
# The API key comes from the environment so it never lives in the notebook.
model_id = "microsoft/Phi-4-reasoning"
base_url = "https://inference-3scale-apicast-production.apps.rits.fmaas.res.ibm.com/phi-4-reasoning/v1"
api_key = os.getenv("RITS_API_KEY")

# Rebinds the notebook-wide `client`; later cells use whichever client cell ran last.
client = LLMClient(
    client_type="rits",
    model_id=model_id,
    base_url=base_url,
    api_key=api_key,
)

In [None]:
import os

# Qwen3-8B endpoint, reached with client_type "rits".
# The API key comes from the environment so it never lives in the notebook.
model_id = "Qwen/Qwen3-8B"
base_url = "https://inference-3scale-apicast-production.apps.rits.fmaas.res.ibm.com/qwen3-8b/v1"
api_key = os.getenv("RITS_API_KEY")

# Rebinds the notebook-wide `client`; later cells use whichever client cell ran last.
client = LLMClient(
    client_type="rits",
    model_id=model_id,
    base_url=base_url,
    api_key=api_key,
)

In [11]:
import os

# Granite 3.3 8B Instruct endpoint, reached with client_type "rits".
# The API key comes from the environment so it never lives in the notebook.
model_id = "ibm-granite/granite-3.3-8b-instruct"
base_url = "https://inference-3scale-apicast-production.apps.rits.fmaas.res.ibm.com/granite-3-3-8b-instruct/v1"
api_key = os.getenv("RITS_API_KEY")

# Rebinds the notebook-wide `client`; later cells use whichever client cell ran last.
client = LLMClient(
    client_type="rits",
    model_id=model_id,
    base_url=base_url,
    api_key=api_key,
)

In [12]:

def generate_response(user_prompts, client=None, max_new_tokens=8000, temperature=0.1, system_prompt=None):
    """Send a batch of prompts to an LLM client and return its responses.

    Args:
        user_prompts: Prompts forwarded unchanged as ``user_prompts``.
        client: Object exposing ``get_model_response_batch``. When omitted, the
            notebook-level ``client`` variable is looked up at call time, so
            re-running a client-configuration cell takes effect without
            re-defining this function.
        max_new_tokens: Forwarded to the client as ``max_new_tokens``.
        temperature: Forwarded to the client as ``temperature``.
        system_prompt: Forwarded to the client as ``system_prompt``.

    Returns:
        Whatever ``client.get_model_response_batch`` returns.

    Raises:
        ValueError: If no client is passed and no global ``client`` exists.
    """
    if client is None:
        # Late binding: a `client=client` default would freeze whichever client
        # existed when this cell was executed.
        client = globals().get("client")
        if client is None:
            raise ValueError("No client given and no global `client` is defined.")
    return client.get_model_response_batch(
        system_prompt=system_prompt,
        user_prompts=user_prompts,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
    )


In [13]:
prompts = user_prompts
responses = generate_response(prompts, client=client, temperature=0.1)
# for i, response in enumerate(responses):
#     print(f"Prompt {i+1}: {prompts[i]}")
#     print(f"Response {i+1}: {response}")
#     print("\n")

Processing: 100%|██████████| 1489/1489 [12:53<00:00,  1.92it/s] 


In [14]:
# Parser that extracts the content of the <response> tag from each model response
import re
def parse_response(response):
    """Extract the text enclosed in the first ``<response>...</response>`` pair.

    Args:
        response: Raw model output. A value that is not a string (for example
            ``None``) is treated as containing no tag, so one bad entry does
            not abort parsing of a whole batch.

    Returns:
        The tag content with surrounding whitespace stripped, or the literal
        string "No response found" when no complete tag pair is present.
    """
    if not isinstance(response, str):
        return "No response found"
    # DOTALL lets the payload span several lines; the lazy quantifier stops at
    # the first closing tag.
    match = re.search(r'<response>(.*?)</response>', response, re.DOTALL)
    return match.group(1).strip() if match else "No response found"

        

In [15]:
filter_responses = [parse_response(response) for response in responses]
df['response'] = filter_responses
# for i, response in enumerate(responses):
#     print(f"Parsed Response {i+1}: {response}")
#     print("\n")

In [16]:
df.to_json("/dccstor/shanmukh/sravani_internship/benchmark_experiments/outputs/V10_experiments/granite-3.3-8b-instruct_reasoning.jsonl", orient='records', lines=True, force_ascii=False)

# o3-mini model 

In [2]:
import pandas as pd
df = pd.read_json("/dccstor/shanmukh/sravani_internship/benchmark_experiments/benchmark_dataset/benchmark_v10_v2.jsonl", lines=True)
user_prompts = df["combined_instruction"].tolist()

In [3]:
from utils.compute_clients import create_clients
import os
api_key = os.getenv("IBM_OPENAI_API_KEY")
model_id = "Azure/o3-mini"
client = create_clients(mode="GPT-azure", model_id=model_id)


In [4]:

def generate_response(user_prompts, client=None, max_new_tokens=8000, temperature=0.1, system_prompt=None):
    """Send a batch of prompts to an LLM client and return its responses.

    Args:
        user_prompts: Prompts forwarded unchanged as ``user_prompts``.
        client: Object exposing ``get_model_response_batch``. When omitted, the
            notebook-level ``client`` variable is looked up at call time, so
            re-running a client-configuration cell takes effect without
            re-defining this function.
        max_new_tokens: Forwarded to the client as ``max_new_tokens``.
        temperature: Forwarded to the client as ``temperature``.
        system_prompt: Forwarded to the client as ``system_prompt``.

    Returns:
        Whatever ``client.get_model_response_batch`` returns.

    Raises:
        ValueError: If no client is passed and no global ``client`` exists.
    """
    if client is None:
        # Late binding: a `client=client` default would freeze whichever client
        # existed when this cell was executed.
        client = globals().get("client")
        if client is None:
            raise ValueError("No client given and no global `client` is defined.")
    return client.get_model_response_batch(
        system_prompt=system_prompt,
        user_prompts=user_prompts,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
    )


In [7]:
prompts = user_prompts
responses = generate_response(prompts, client=client, temperature=1)
# for i, response in enumerate(responses):
#     print(f"Prompt {i+1}: {prompts[i]}")
#     print(f"Response {i+1}: {response}")
df['response'] = responses
df.to_json("/dccstor/shanmukh/sravani_internship/benchmark_experiments/outputs/V10_experiments/o3-mini_results.jsonl", orient='records', lines=True, force_ascii=False)

Processing: 100%|██████████| 1489/1489 [05:47<00:00,  4.29it/s]


In [None]:
import json
import os
import pandas as pd
import matplotlib.pyplot as plt

def load_metrics(metrics_dir):
    """
    Load SSR metrics from the result files in a directory.

    Only files named <model>_results_correctness.jsonl are read. Despite the
    extension, each file is parsed as ONE JSON document (not line by line), so
    it must hold a single object with the keys "SSR_by_Category" and
    "SSR_by_Instruction_Part".

    Returns a dict keyed by model name (the file name without the suffix), in
    sorted file-name order, each value holding:
      - "SSR_by_Category": the file's value, unchanged
      - "SSR_by_Instruction_Part": the four instruction-part scores under the
        short keys "Extracted", "Newly", "Original" and "Combined"

    Raises KeyError when a file lacks one of the expected keys.
    """
    suffix = "_results_correctness.jsonl"
    metrics = {}
    # os.listdir order is arbitrary; sorting keeps column/legend order stable
    # from one run to the next.
    for fname in sorted(os.listdir(metrics_dir)):
        if not fname.endswith(suffix):
            continue
        # Cut only the trailing suffix; str.replace would also remove the same
        # text if it appeared earlier in the file name.
        model = fname[:-len(suffix)]
        with open(os.path.join(metrics_dir, fname), "r", encoding="utf-8") as f:
            data = json.load(f)
        # Extract SSR by category
        ssr_cat = data["SSR_by_Category"]
        # Extract SSR by instruction part and normalize keys
        ip = data["SSR_by_Instruction_Part"]
        ssr_ip = {
            "Extracted": ip["Extracted from instruction"],
            "Newly": ip["Newly Generated"],
            "Original": ip["Original source: 'Extracted from instruction'"],
            "Combined": ip["Combined/Refined"]
        }
        metrics[model] = {
            "SSR_by_Category": ssr_cat,
            "SSR_by_Instruction_Part": ssr_ip
        }
    return metrics

def plot_line(df, title, ylabel="SSR"):
    """Show a line chart with one marked line per DataFrame column.

    The DataFrame index supplies the x positions and each column name becomes
    a legend entry. The chart is drawn on a new 8x5 figure and displayed
    immediately.
    """
    plt.figure(figsize=(8, 5))
    x_values = df.index
    for series_name in df.columns:
        plt.plot(x_values, df[series_name], marker='o', label=series_name)
    plt.title(title)
    plt.ylabel(ylabel)
    # Slanted, right-aligned tick labels.
    plt.xticks(rotation=45, ha='right')
    plt.legend(loc="best", fontsize='small')
    plt.tight_layout()
    plt.show()

def plot_bar(df, title, ylabel="SSR"):
    """Show a grouped bar chart with one bar group per index entry.

    Each DataFrame column becomes one bar series and one legend entry.

    The figure size is passed to pandas directly. Calling ``plt.figure`` first
    and then ``df.plot.bar()`` without an ``ax`` makes pandas open a second
    figure, which leaves an empty figure behind and ignores the requested size.
    """
    ax = df.plot.bar(figsize=(8, 5))
    # Slanted, right-aligned tick labels.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend(loc="best", fontsize='small')
    ax.figure.tight_layout()
    plt.show()

def main(
    metrics_dir="./metrics_outputs/filtered_final_data",
    closed_source=("phi-4", "Qwen3-8B"),
    reasoning_models=("phi-4", "Qwen3-8B"),
):
    """Load the SSR metrics and draw the eight comparison charts.

    Args:
        metrics_dir: Folder passed to ``load_metrics``.
        closed_source: Model names averaged as the "Closed-Source" group; every
            other loaded model is treated as "Open-Source".
        reasoning_models: Model names averaged as the "Reasoning" group; every
            other loaded model is treated as "Non-Reasoning".

    Raises:
        KeyError: If a name in either group has no loaded metrics.

    NOTE(review): the two defaults are the same list, so with default
    arguments charts 3 and 4 (and 5 and 6) plot identical numbers under
    different labels. Confirm the intended group membership and pass the
    corrected groups explicitly.
    """
    metrics = load_metrics(metrics_dir)

    # Build DataFrames: one column per model, one row per category / instruction part
    df_cat = pd.DataFrame({m: v["SSR_by_Category"] for m, v in metrics.items()})
    df_ip  = pd.DataFrame({m: v["SSR_by_Instruction_Part"] for m, v in metrics.items()})

    # Define model groups. Lists are required for column selection: a tuple
    # would be read as a single column key.
    closed_source    = list(closed_source)
    reasoning_models = list(reasoning_models)
    open_source      = [m for m in metrics if m not in closed_source]
    non_reasoning    = [m for m in metrics if m not in reasoning_models]

    # 1. SSR by Category Across Models
    plot_line(df_cat, "SSR by Category Across Models")

    # 2. SSR by Instruction Part Across Models
    plot_line(df_ip, "SSR by Instruction Part Across Models")

    # 3. Avg SSR by Category: Open vs Closed
    avg_open_cat   = df_cat[open_source].mean(axis=1)
    avg_closed_cat = df_cat[closed_source].mean(axis=1)
    df_avg_cat     = pd.DataFrame({"Open-Source": avg_open_cat, "Closed-Source": avg_closed_cat})
    plot_line(df_avg_cat, "Avg SSR by Category: Open vs Closed")

    # 4. Avg SSR by Category: Reasoning vs Non-Reasoning
    avg_reason_cat = df_cat[reasoning_models].mean(axis=1)
    avg_non_cat    = df_cat[non_reasoning].mean(axis=1)
    df_reason_cat  = pd.DataFrame({"Reasoning": avg_reason_cat, "Non-Reasoning": avg_non_cat})
    plot_line(df_reason_cat, "Avg SSR by Category: Reasoning vs Non-Reasoning")

    # 5. Avg SSR by Instruction Part: Open vs Closed
    avg_open_ip   = df_ip[open_source].mean(axis=1)
    avg_closed_ip = df_ip[closed_source].mean(axis=1)
    df_avg_ip     = pd.DataFrame({"Open-Source": avg_open_ip, "Closed-Source": avg_closed_ip})
    plot_line(df_avg_ip, "Avg SSR by Instruction Part: Open vs Closed")

    # 6. Avg SSR by Instruction Part: Reasoning vs Non-Reasoning
    avg_reason_ip = df_ip[reasoning_models].mean(axis=1)
    avg_non_ip    = df_ip[non_reasoning].mean(axis=1)
    df_reason_ip  = pd.DataFrame({"Reasoning": avg_reason_ip, "Non-Reasoning": avg_non_ip})
    plot_line(df_reason_ip, "Avg SSR by Instruction Part: Reasoning vs Non-Reasoning")

    # 7. Bar chart: Avg SSR by Category (Open vs Closed)
    plot_bar(df_avg_cat, "Bar: Avg SSR by Category", "SSR")

    # 8. Bar chart: Avg SSR by Instruction Part (Open vs Closed)
    plot_bar(df_avg_ip, "Bar: Avg SSR by Instruction Part", "SSR")

if __name__ == "__main__":
    main()


# Visualizations

## CSR Plots

In [None]:
import pandas as pd
# Load the metrics summary (one JSON record per line) and inspect its schema.
# The call must be `read_json`; the previous `read_josn` raised AttributeError.
df = pd.read_json("/dccstor/shanmukh/sravani_internship/benchmark_experiments/metrics_outputs/filtered_final_data/metrics_summary_v4.jsonl", lines=True)
df.info()

# Filtering the data

In [1]:
import pandas as pd
ranked_path = "LLMjudge_outputs/filtered_final_data/ssr_ranked_dataset.jsonl"
inputpath = "/dccstor/shanmukh/sravani_internship/benchmark_experiments/benchmark_dataset/benchmark_v10_v2.jsonl"
output_path = "benchmark_experiments/benchmark_dataset"



In [9]:
import pandas as pd
import os
import ast
import argparse
import pandas as pd
def get_tail_ids(ranked_jsonl: str, drop_top: int) -> set:
    """
    Read the ranked JSONL file, skip its first `drop_top` rows, and
    collect the `id` values of every row that is left into a set.
    """
    ids_in_rank_order = pd.read_json(ranked_jsonl, lines=True)["id"]
    kept_ids = ids_in_rank_order.iloc[drop_top:].tolist()
    return set(kept_ids)

def filter_and_save_all(
    original_folder: str,
    tail_ids: set,
    filtered_folder: str
):
    """Keep only the rows whose ``id`` is in ``tail_ids`` and save them as JSONL.

    Args:
        original_folder: Either the path of the source JSONL file itself (how
            this notebook calls it) or a directory that contains
            ``benchmark_data.jsonl``.
        tail_ids: ``id`` values to keep.
        filtered_folder: Output directory, created when missing. The result is
            written to ``<filtered_folder>/benchmark_data.jsonl``.
    """
    fname = "benchmark_data.jsonl"
    # Accept both a directory (as the parameter name suggests) and a direct file path.
    if os.path.isdir(original_folder):
        in_path = os.path.join(original_folder, fname)
    else:
        in_path = original_folder
    df = pd.read_json(in_path, lines=True)
    filtered = df[df["id"].isin(tail_ids)]

    # to_json does not create missing directories; an empty string means the
    # current directory and needs no creation.
    if filtered_folder:
        os.makedirs(filtered_folder, exist_ok=True)
    out_path = os.path.join(filtered_folder, fname)
    filtered.to_json(out_path, orient="records", lines=True)
    print(f"Filtered {fname}: {len(df)} → {len(filtered)} rows → {out_path}")

In [10]:
if __name__ == "__main__":
    # Despite its name, `input_folder` is the path of the benchmark JSONL file
    # itself; filter_and_save_all reads it directly.
    input_folder = "/dccstor/shanmukh/sravani_internship/benchmark_experiments/benchmark_dataset/benchmark_v10_v2.jsonl"
    output_folder = "benchmark_dataset"
    ranked_filename = "ssr_ranked_dataset.jsonl"
    ranked_path = "LLMjudge_outputs/filtered_final_data/ssr_ranked_dataset.jsonl"

    # Number of leading rows of the ranked file to discard; the recorded run
    # went from 1489 rows to 1000.
    drop_top = 489

    tail_ids = get_tail_ids(ranked_path, drop_top)
    filter_and_save_all(input_folder, tail_ids, output_folder)

Filtered benchmark_data.jsonl: 1489 → 1000 rows → benchmark_dataset/benchmark_data.jsonl


In [12]:
df = pd.read_json("benchmark_dataset/benchmark_data.jsonl",lines=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               1000 non-null   int64  
 1   dataset                          1000 non-null   object 
 2   instruction                      1000 non-null   object 
 3   code                             1000 non-null   object 
 4   test                             362 non-null    object 
 5   relevant_categories              1000 non-null   object 
 6   simplified_instruction           1000 non-null   object 
 7   extracted_constraints            1000 non-null   object 
 8   final_comprehensive_constraints  1000 non-null   object 
 9   filtered_relevant_constraints    1000 non-null   object 
 10  quality_scores                   1000 non-null   object 
 11  relevance_score                  1000 non-null   float64
 12  objectivity_score    

In [13]:
import pandas as pd

# Load your DataFrame (assuming it's already loaded as df)

# Reset index and drop the old index
df.reset_index(drop=True, inplace=True)

# Update the 'id' column with the new index values
df['id'] = df.index

# Optional: Save to CSV or JSONL
# df.to_csv("benchmark_data_with_reset_id.csv", index=False)
# or
# df.to_json("benchmark_data_with_reset_id.jsonl", orient='records', lines=True)


In [14]:
df.tail()

Unnamed: 0,id,dataset,instruction,code,test,relevant_categories,simplified_instruction,extracted_constraints,final_comprehensive_constraints,filtered_relevant_constraints,...,atomicity_score,unified_quality_score,combined_instruction,constraint_wise_presence,constraint_presence_response,final_constraints,instruction_difficulty_labels,difficulty_response,constraint_difficulty_labels,constraint_difficulty_response
995,995,Multilingual-Multimodal-NLP/McEval-Instruct,Write a Python program that simulates a simple...,```python\nimport threading\nimport time\nimpo...,,"['Code Structure and Modularity', 'Input and O...",Write a Python program that simulates a simple...,"[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...",...,5.0,5.0,Write a Python program that simulates a simple...,"[True, True, True, True, True, True, True, Tru...","{\n ""Evaluation"": [\n {\n ""Constraint...","[{'type': 'Code Structure and Modularity', 'co...",medium,"```json\n{\n ""Reason"": ""The task involves usi...","[easy, hard, easy, easy, hard, easy, easy, har...","```json\n{\n ""Classification"": [\n {\n ..."
996,996,Multilingual-Multimodal-NLP/McEval-Instruct,Design a Python class-based system to manage t...,```python\n# -*- coding: utf-8 -*-\nimport imp...,,"['Code Structure and Modularity', 'Input and O...",Design a Python class-based system to manage t...,"[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...",...,4.82,4.88,Design a Python class-based system to manage t...,"[True, True, True, True, True, True, True, Tru...","{\n ""Evaluation"": [\n {\n ""Constraint...","[{'type': 'Code Structure and Modularity', 'co...",hard,"```json\n{\n ""Reason"": ""The task involves des...","[easy, easy, hard, hard, easy, hard, hard, eas...","```json\n{\n ""Classification"": [\n {\n ..."
997,997,Multilingual-Multimodal-NLP/McEval-Instruct,Design a simple MVC (Model-View-Controller) ap...,```python\n# Import necessary packages\nimport...,,"['Code Structure and Modularity', 'Input and O...",Design a simple MVC (Model-View-Controller) ap...,"[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...",...,4.83,4.83,Design a simple MVC (Model-View-Controller) ap...,"[True, True, True, True, True, True]","{\n ""Evaluation"": [\n {\n ""Constraint...","[{'type': 'Code Structure and Modularity', 'co...",medium,"```json\n{\n ""Reason"": ""The task involves des...","[easy, easy, hard, hard, hard, hard]","```json\n{\n ""Classification"": [\n {\n ..."
998,998,Multilingual-Multimodal-NLP/McEval-Instruct,Create a Python script that provides a command...,```python\nimport argparse\nimport fnmatch\nim...,,"['Code Structure and Modularity', 'Input and O...",Create a Python script that provides a command...,"[{'type': 'File and Data Management', 'constra...","[{'type': 'File and Data Management', 'constra...","[{'type': 'File and Data Management', 'constra...",...,5.0,5.0,Create a Python script that provides a command...,"[True, True, True, True, True]","{\n ""Evaluation"": [\n {\n ""Constraint...","[{'type': 'File and Data Management', 'constra...",medium,"```json\n{\n ""Reason"": ""The task involves mul...","[easy, hard, hard, hard, easy]","```json\n{\n ""Classification"": [\n {\n ..."
999,999,Multilingual-Multimodal-NLP/McEval-Instruct,Design a Python class `PluginManager` that man...,```python\n# Import necessary packages\nfrom t...,,"['Code Structure and Modularity', 'Error Handl...",Design a Python class `PluginManager` that man...,"[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...",...,5.0,5.0,Design a Python class `PluginManager` that man...,"[True, True, True, True, True, True, True, True]","{\n ""Evaluation"": [\n {\n ""Constraint...","[{'type': 'Code Structure and Modularity', 'co...",medium,"```json\n{\n ""Reason"": ""The task involves cre...","[easy, easy, easy, easy, easy, easy, easy, easy]","```json\n{\n ""Classification"": [\n {\n ..."


In [16]:
df.columns

Index(['id', 'dataset', 'instruction', 'code', 'test', 'relevant_categories',
       'simplified_instruction', 'extracted_constraints',
       'final_comprehensive_constraints', 'filtered_relevant_constraints',
       'quality_scores', 'relevance_score', 'objectivity_score',
       'atomicity_score', 'unified_quality_score', 'combined_instruction',
       'constraint_wise_presence', 'constraint_presence_response',
       'final_constraints', 'instruction_difficulty_labels',
       'difficulty_response', 'constraint_difficulty_labels',
       'constraint_difficulty_response'],
      dtype='object')

In [17]:
df["dataset"].value_counts()

dataset
Multilingual-Multimodal-NLP/McEval-Instruct    373
ajibawa-2023/Python-Code-23k-ShareGPT          265
bigcode/bigcodebench                           183
xlangai/DS-1000                                179
Name: count, dtype: int64

In [18]:
df = df.drop(columns=['instruction', 'code', 'test', 'relevant_categories',
       'simplified_instruction', 'extracted_constraints',
       'final_comprehensive_constraints', 'filtered_relevant_constraints',
       'quality_scores', 'relevance_score', 'objectivity_score',
       'atomicity_score', 'unified_quality_score',
       'constraint_wise_presence', 'constraint_presence_response',
       'difficulty_response', 'constraint_difficulty_labels',
       'constraint_difficulty_response'])

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   id                             1000 non-null   int64 
 1   dataset                        1000 non-null   object
 2   combined_instruction           1000 non-null   object
 3   final_constraints              1000 non-null   object
 4   instruction_difficulty_labels  1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


In [20]:
df = df.rename(columns= {"dataset" : "source_dataset", "combined_instruction":"instruction", "final_constraints":"constraints","instruction_difficulty_labels":"instruction_difficulty"})

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      1000 non-null   int64 
 1   source_dataset          1000 non-null   object
 2   instruction             1000 non-null   object
 3   constraints             1000 non-null   object
 4   instruction_difficulty  1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


In [22]:
df["constraints"][0]

[{'type': 'Input and Output Handling',
  'constraint': 'Exclude any combination that contains the number 5.',
  'instruction_part': 'Extracted from instruction'},
 {'type': 'Input and Output Handling',
  'constraint': 'Exclude any combination that contains a repeating digit.',
  'instruction_part': 'Extracted from instruction'},
 {'type': 'Code Structure and Modularity',
  'constraint': 'Implement the solution without using any built-in functions or libraries to check for repeating digits.',
  'instruction_part': 'Extracted from instruction'},
 {'type': 'Testing and Debugging',
  'constraint': 'Create unit tests to verify that all combinations generated meet the specified constraints of excluding the number 5 and repeating digits.',
  'instruction_part': 'Newly Generated'},
 {'type': 'Input and Output Handling',
  'constraint': 'Ensure the output format is consistent, such as printing each combination on a new line or in a specified format.',
  'instruction_part': 'Newly Generated'}]

In [23]:
import pandas as pd

# Define a function to remove 'instruction_part' from each constraint dict
def remove_instruction_part(constraints):
    """Return copies of the constraint dicts without their 'instruction_part' key.

    The input list and its dicts are left untouched.
    """
    stripped = []
    for constraint in constraints:
        trimmed = dict(constraint)
        trimmed.pop("instruction_part", None)
        stripped.append(trimmed)
    return stripped

# Apply the function to the entire column
df["constraints"] = df["constraints"].apply(remove_instruction_part)


In [24]:
df["constraints"][0]

[{'type': 'Input and Output Handling',
  'constraint': 'Exclude any combination that contains the number 5.'},
 {'type': 'Input and Output Handling',
  'constraint': 'Exclude any combination that contains a repeating digit.'},
 {'type': 'Code Structure and Modularity',
  'constraint': 'Implement the solution without using any built-in functions or libraries to check for repeating digits.'},
 {'type': 'Testing and Debugging',
  'constraint': 'Create unit tests to verify that all combinations generated meet the specified constraints of excluding the number 5 and repeating digits.'},
 {'type': 'Input and Output Handling',
  'constraint': 'Ensure the output format is consistent, such as printing each combination on a new line or in a specified format.'}]

In [25]:
df.to_json("benchmark_dataset/benchmark_data_final.jsonl",orient="records",lines=True)