# Step 1 - Get and Prepare Input Datasets



In [None]:
# Constants
# Number of records we will process
N = 60
version = "v3" # benchmark version

In [None]:
import sys
print(sys.prefix)


## Get the Test Dataset for constraint generation from github

In [None]:
!wget https://raw.githubusercontent.com/gunnusravani/CIF_Benchmark/refs/heads/main/data/outputs/constraint_category_data.csv -O "constraint_category_initial_data.csv"

In [None]:
!pwd

In [None]:
import pandas as pd
full_df = pd.read_csv("constraint_category_initial_data.csv")
full_df.head(60)

In [None]:
full_df.info()

In [None]:
full_df = full_df[["dataset","instruction","code"]]
full_df.head(60)

In [None]:
test_df = full_df.sample(n=N)
test_df.info()

## Decide Dataset Sampling Ratios Based on Real-World Complexity
- ShareGPT - 25%
- DS-1000 - 15%
- Bigcodebench -25%
- MCeval - 35%

In [None]:
!wget https://raw.githubusercontent.com/gunnusravani/CIF_Benchmark/refs/heads/main/data/outputs/benchmark_data.csv -O "benchmark_data_initial.csv"

In [None]:
import pandas as pd
test_df = pd.read_csv("./benchmark/benchmark_v1_with_stats.csv")
test_df.info()

## Write transformation scripts for each dataset format

## Create a benchmark dataset of 1000–1200 examples



In [None]:
import pandas as pd
benchmark_df = pd.read_csv("benchmark_data_initial.csv")
benchmark_df.info()

## Data Slection For Execution

In [None]:
# df = test_df.copy()
df = benchmark_df.copy()

# Step 2 - Define Constraint Categories

In [None]:
categories = [
    "Code Structure and Modularity",
    "Input and Output Handling",
    "Error Handling and Robustness",
    "Data Processing and Transformation",
    "Performance and Optimization",
    "Library and API Usage",
    "Testing and Debugging",
    "Documentation and Readability",
    "Security and Privacy",
    "Reproducibility and Consistency",
    "Mathematical Computation",
    "File and Data Management",
    "UI and Interaction",
]
categories_str = "\n".join(categories)

# Utility Functions

In [None]:
import json

def extract_json(string,col_name="Constraints"):
    try:
        json_string = string.strip().replace('```json\n', '', 1).replace('\n```', '', 1)
        constraint_json = json.loads(json_string)
        return constraint_json.get(col_name, [])
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e} in string: {string}")
        return []
    except AttributeError as e:
        print(f"Attribute error: {e} in string: {string}")
        return []

# Step 3 Generating Constraints

In [None]:
!pwd

In [None]:
import pandas as pd
import time
from openai import OpenAI
import os

if os.path.exists("/content"):
    from google.colab import userdata
    openai_api_key = userdata.get('OPENAPI_KEY')
else:
    from dotenv import load_dotenv
    load_dotenv()
    openai_api_key = os.getenv("OPENAI_API_KEY")
    print("Running in local environment, loaded key from .env")

# Check if key was actually found
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is not set. Please check your environment or .env file.")

# Initialize OpenAI client
client = OpenAI(api_key=openai_api_key)


In [None]:
SYSTEM_PROMPT = f"""
You are a helpful assistant. You will be given a programming instruction and the corresponding code.
"""

In [None]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def get_model_response_batch(user_prompts=None, system_prompt=SYSTEM_PROMPT):
    messages_list = [[{
                "role": "system",
                "content": system_prompt
            }, {
                "role": "user",
                "content": user_prompt
            }] for user_prompt in user_prompts]
    with ThreadPoolExecutor(max_workers=1500) as executor:
            response_texts = list(tqdm(
                executor.map(lambda messages: get_response_v2(messages), messages_list),
                total=len(messages_list),
                desc="Processing"
            ))
    return response_texts

def get_response_v2(messages, max_retries=1):
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            temperature=0
            )
            # print(response)
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error on attempt {attempt+1}: {e}")
            time.sleep(2)
    return "[]"

def get_response(user_prompt,system_prompt=SYSTEM_PROMPT, max_retries=1):
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0
            )
            # print(response)
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error on attempt {attempt+1}: {e}")
            time.sleep(2)
    return "[]"

## Stage 1: Relevant Category Selection

In [None]:
SYSTEM_PROMPT_STAGE1 = """
You are a categorization expert for programming tasks. Your goal is to accurately map a programming instruction and its associated code to all applicable high-level constraint categories from a predefined list.
"""

def get_prompt_stage1_select_categories(instruction: str, code: str, all_constraint_categories: list[str]) -> str:
    categories_str = "\n".join(all_constraint_categories)
    prompt = f"""
    "task": "Classify the natural language `instruction` and its corresponding `code` into all relevant high-level constraint categories from the provided comprehensive list.",
    "context": "You are given an `instruction` that describes a coding task, `code` for context, and a list of `all_constraint_categories`.",
    "goal": "Select **all** relevant categories from the `all_constraint_categories` list that apply directly, indirectly, or even potentially to the given `instruction` and `code`.
    For each category, think carefully about both explicit requirements and implicit expectations that might arise in real-world programming tasks.
    If there is **any plausible reason**—whether due to the instruction's wording, the code's structure, edge cases, potential extensions, or real-world usage scenarios—that the category might impose constraints, then **include it**.

    Be exhaustive on the side of inclusion. The goal is to **maximize coverage** of all semantically or practically relevant constraint categories, not to limit selection to a minimal subset.
    Your output must be a JSON object containing a single field: 'relevant_categories'.
    "JSON Response Format": {{
        "relevant_categories": [
            "List of selected relevant categories from 'all_constraint_categories'. Each string should be the exact name of the matching supercategory."
        ]
    }},
    "Inputs Required": {{
        "instruction": {instruction}
        "code": {code}
        "all_constraint_categories": {categories_str}
    }}
    """
    return prompt
prompt = get_prompt_stage1_select_categories(full_df.iloc[26]["instruction"],full_df.iloc[26]["code"],categories)
print(prompt)

In [None]:
print(full_df.iloc[26]["instruction"])

In [None]:
relevant_categories = get_response(prompt,SYSTEM_PROMPT_STAGE1)
print(relevant_categories)
relevant_categories = extract_json(relevant_categories,"relevant_categories")
print(relevant_categories)

## Stage 2: Instruction Simplification & Implicit Constraint Extraction


In [None]:
SYSTEM_PROMPT_STAGE2 = """
You are an expert natural language processor and code instruction analyst. Your task is to meticulously read a programming instruction, identify and extract any embedded constraints, separate them from the core problem description, and map them to provided high-level categories. You must preserve any provided starter code or solution snippets.
"""

def get_prompt_stage2_extract_and_simplify(instruction: str, relevant_categories: list[str]) -> str:
    relevant_categories_str = ", ".join(relevant_categories)

    prompt = f"""
    "task": "Carefully analyze the provided programming `instruction`. Your main goal is to separate the core problem description from any explicit constraints or directives embedded within the instruction's text. You must return the core problem description as 'simplified_instruction' and all identified explicit constraints as 'extracted_constraints'.",

    "context": "You are provided with an `instruction` that describes a coding task and a list of `relevant_categories` that apply to this instruction. The `instruction` might contain specific rules, formatting requirements, or implementation details interwoven into its narrative. You need to pull these out.",

    "goal": "Perform the following steps:
    1. **Identify and Extract Explicit Constraints:** Read the `instruction` and pinpoint all **explicit** constraints, directives, or specific requirements (e.g., about function names, return types, error handling, algorithms to use, variable naming, documentation, etc.).
    2. **Split Compound Constraints into Atomic Ones:** If any sentence or clause contains multiple distinct requirements, split it into **multiple entries**, ensuring **each extracted constraint refers to exactly one requirement or condition**.
       Do not combine multiple checks (e.g., multiple `if` conditions or causes for `ValueError`) into one constraint.
    3. **Simplify Instruction:** Create a 'simplified_instruction' by removing only these identified explicit constraints, leaving behind the core problem explanation.
        **CRITICAL:** If the original `instruction` contains any starter code (e.g., code in ```python blocks) or any solution snippets demarcated by 'BEGIN SOLUTION', you **MUST NOT** modify or remove these from the 'simplified_instruction'.
        **Important:** If the original instruction contains any explanation or examples that are not constraints, you should keep them in the 'simplified_instruction' as they are part of the core problem description.
    4. **Format Extracted Constraints with Categories:** For each extracted constraint:
        - Include a separate item in the 'extracted_constraints' list.
        - Set its 'type' to one of the categories from `relevant_categories`.
        - Set 'instruction_part' to 'Extracted from instruction'.

    Example:
    - Original compound constraint: "Raise ValueError if input is None or if it's an empty list."
    - Extracted as:
        {{
            "type": "Error Handling and Robustness",
            "constraint": "Raise ValueError if input is None.",
            "instruction_part": "Extracted from instruction"
        }},
        {{
            "type": "Error Handling and Robustness",
            "constraint": "Raise ValueError if input is an empty list.",
            "instruction_part": "Extracted from instruction"
        }}

    Your output must be a JSON object with two fields: 'simplified_instruction' and 'extracted_constraints'.
    ",

    "JSON Response Format": {{
        "simplified_instruction": "The core problem description from the original instruction, with explicit constraints/directives removed. Starter code (e.g., ```python blocks) and solution snippets (e.g., marked by 'BEGIN SOLUTION') must be preserved.",
        "extracted_constraints": [
            {{
                "type": "Constraint_Category_Name",  // MUST be one of the provided relevant_categories
                "constraint": "A single, atomic extracted constraint.",
                "instruction_part": "Extracted from instruction"
            }}
        ]
    }},

    "Inputs Required": {{
        "instruction": {instruction}
        "relevant_categories": {relevant_categories_str}
    }}
    """
    return prompt


prompt_stage2 = get_prompt_stage2_extract_and_simplify(full_df.iloc[26]["instruction"],relevant_categories)
print(prompt_stage2)

In [None]:
instruction2 = "Write a function that sorts a list of numbers using merge sort with time complexity O(n log n)."
code2 = "def merge_sort(arr):\n    if len(arr) > 1:\n        mid = len(arr) // 2\n        left_half = arr[:mid]\n        right_half = arr[mid:]\n\n        merge_sort(left_half)\n        merge_sort(right_half)\n\n        i = j = k = 0\n\n        while i < len(left_half) and j < len(right_half):\n            if left_half[i] < right_half[j]:\n                arr[k] = left_half[i]\  i += 1\n            else:\n                arr[k] = right_half[j]\  j += 1\n            k += 1\n\n        while i < len(left_half):\n            arr[k] = left_half[i]\  i += 1\  k += 1\n\n        while j < len(right_half):\n            arr[k] = right_half[j]\  j += 1\  k += 1\n\n    return arr"
example_prompt2 = get_prompt_stage2_extract_and_simplify(instruction2,relevant_categories)
print(example_prompt2)

In [None]:
response = get_response(example_prompt2,SYSTEM_PROMPT_STAGE2)
print(response)

In [None]:
instruction = """Write a Python function `rotate_text` that takes a string `text` and an integer `rotation` as its parameters and returns a new string where each letter in the original string is rotated by the given rotation amount through the alphabet. The rotation should maintain the case of the original letters (i.e., uppercase letters remain uppercase, and lowercase letters remain lowercase) and should leave non-alphabetic characters unchanged.

For example, with a rotation of 1, 'a' becomes 'b', 'z' becomes 'a', 'A' becomes 'B', and 'Z' becomes 'A'. The function should handle negative rotation values as well, which rotate the letters in the opposite direction.
"""
relevant_categories = ['Input and Output Handling', 'Code Structure and Modularity', 'Scalability and Maintainability']
example_prompt = get_prompt_stage2_extract_and_simplify(instruction,relevant_categories)
response = get_response(example_prompt,SYSTEM_PROMPT_STAGE2)
print(response)

In [None]:
instruction = """Edit the given code to fix the bug and increase the difficulty:

```python
def calculate_tax(price):
    tax_rate = 0.20
    total = price + (price * tax_rate)
    
    # Apply a discount of 10% if the price is greater than $1000
    if price > 1000:
        discount = price * 0.10
        total -= discount
    
    # Round the total to the nearest cent
    total = round(total, 2)
    
    # Add a surcharge of 5% if the total is greater than $500 and less than $1000
    if total > 500 and total < 1000:
        surcharge = total * 0.05
        total += surcharge
    
    # Add a service fee of $50 if the total is less than $100
    if total < 100:
        total += 50
    
    return total
```

Explanation of changes:

1. Added an additional condition to check if the total is greater than $500 and less than $1000. If it is, a surcharge of 5% is added to the total.
2. This additional condition adds complexity to the function, requiring the programmer to consider multiple scenarios and perform additional calculations."""
relevant_categories = ['Code Structure and Modularity', 'Input and Output Handling', 'Documentation and Readability']
example_prompt = get_prompt_stage2_extract_and_simplify(instruction,relevant_categories)
response = get_response(example_prompt,SYSTEM_PROMPT_STAGE2)
print(response)

In [None]:
instruction = """Problem:
I have two csr_matrix, c1, c2.

I want a new matrix Feature = [c1, c2]. But if I directly concatenate them horizontally this way, there's an error that says the matrix Feature is a list. How can I achieve the matrix concatenation and still get the same type of matrix, i.e. a csr_matrix?

And it doesn't work if I do this after the concatenation: Feature = csr_matrix(Feature) It gives the error:

Traceback (most recent call last):
  File "yelpfilter.py", line 91, in <module>
    Feature = csr_matrix(Feature)
  File "c:\python27\lib\site-packages\scipy\sparse\compressed.py", line 66, in __init__
    self._set_self( self.__class__(coo_matrix(arg1, dtype=dtype)) )
  File "c:\python27\lib\site-packages\scipy\sparse\coo.py", line 185, in __init__
    self.row, self.col = M.nonzero()
TypeError: __nonzero__ should return bool or int, returned numpy.bool_

A:
<code>
from scipy import sparse
c1 = sparse.csr_matrix([[0, 0, 1, 0], [2, 0, 0, 0], [0, 0, 0, 0]])
c2 = sparse.csr_matrix([[0, 3, 4, 0], [0, 0, 0, 5], [6, 7, 0, 8]])
</code>
Feature = ... # put solution in this variable
BEGIN SOLUTION
<code>"""
relevant_categories = ['Code Structure and Modularity', 'Input and Output Handling', 'Error Handling and Robustness', 'Data Processing and Transformation', 'Library and API Usage', 'Testing and Debugging', 'Documentation and Readability', 'Reproducibility and Consistency', 'Mathematical Computation']
example_prompt = get_prompt_stage2_extract_and_simplify(instruction,relevant_categories)
response = get_response(example_prompt,SYSTEM_PROMPT_STAGE2)
print(response)

In [None]:
response_stage2 = get_response(prompt_stage2,SYSTEM_PROMPT_STAGE2)
print(response_stage2)
simplified_instruction = extract_json(response_stage2,"simplified_instruction")
extracted_constraints = extract_json(response_stage2,"extracted_constraints")
print(simplified_instruction)
print(extracted_constraints)

##   Stage 3: New Constraint Generation & Merging


In [None]:

SYSTEM_PROMPT_STAGE3_NEW = """
You are a highly skilled constraint curator and generator for code development tasks. Your job is to combine existing constraints with newly generated ones, ensuring the final list is comprehensive, free of conflicts and duplicates, and adheres to strict quality and relevance standards.
"""

def get_prompt_stage3_generate_and_merge_constraints(
    instruction: str,
    code: str,
    relevant_categories: list[str],
    extracted_constraints: list # These are from Stage 2 output
) -> str:
    relevant_categories_str = ", ".join(relevant_categories)
    extracted_constraints_str = json.dumps(extracted_constraints, indent=2)

    prompt = f"""
    "task": "Generate additional high-quality constraints, combine them with provided `extracted_constraints`, and refine the final comprehensive list based on quality principles and category alignment.",

    "context": "You are provided with a `instruction` (core problem), `code` for context, a list of `relevant_categories` for this task, and `extracted_constraints` that were found directly in the original instruction. Your job is to build the definitive, final set of constraints for this benchmark entry.",

    "goal": "Perform the following steps:
    1.  **Generate New Constraints:** Based on the `instruction`, `code`, and `relevant_categories`, generate 5 to 10 **new** natural language constraints. These new constraints must be specific, objective, and leave little room for subjective interpretation. Each should pose a tangible challenge and be clearly verifiable. For each newly generated constraint, set its `instruction_part` to 'Newly Generated'.
        **Note:** If the combined total with extracted constraints exceeds 15, generate fewer new ones; if it falls short of 5, generate more to meet the minimum.
    2.  **Combine and Curate:** Create a single preliminary list of all constraints by merging the `extracted_constraints` with your newly generated constraints.
    3.  **Refine and Resolve Conflicts/Duplicates:**
        * Review the combined list for any duplicate constraints (exact text or very similar meaning) or any subtle contradictions between extracted and newly generated constraints.
        * **Crucially, if a newly generated constraint is a duplicate of, or is semantically covered by, an 'Extracted from instruction' constraint, you MUST discard the newly generated constraint in favor of the extracted one. Do NOT change or tweak the 'Extracted from instruction' constraints.**
        * Resolve other conflicts by refining or merging existing constraints, prioritizing clarity and adhering to the 'Prioritize Modification over Deletion' principle.
    4.  **Finalize Count (5-15 Constraints):** Ensure the final list contains between **5 and 15** unique, valid constraints.
        * If the curated list has less than 5, identify which of the `relevant_categories` are underrepresented and create *additional* high-quality constraints for those categories to reach the minimum.
        * If the curated list has more than 15, select the most critical and impactful constraints that best cover the `relevant_categories` to reduce the list to 15, without losing essential information.
    5.  **Validate Category Alignment:** For every constraint in the final list, ensure its 'type' field is one of the categories explicitly listed in the `relevant_categories` list. If a type does not perfectly match, correct it to the most appropriate category from `relevant_categories`. There should be a perfect overlap.

    **Principles for All Constraints (Extracted & Newly Generated):**
        * **Actionable, Precise, Objective:** Ensure all constraints are specific, measurable, and unambiguous.
        * **Accept Requirements for Additional Code:** If meeting a constraint primarily requires generating *additional* code or functionality (rather than fundamentally altering the core solution), ensure the constraint is phrased to encourage this.
        * **No Unresolved Directives:** Ensure the final generated constraints do not contain any unreplaced directive keywords (e.g., `{{keyword}}`).

    Your output should be a JSON object containing a single field: 'final_comprehensive_constraints'.
    ",

    "JSON Response Format": {{
        "final_comprehensive_constraints": [
            {{
                "type": "Constraint_Category_Name", // MUST be from the 'relevant_categories' list
                "constraint": "Final, specific, objective, and atomic constraint statement.",
                "instruction_part": "Original source: 'Extracted from instruction' or 'Newly Generated' or 'Combined/Refined'"
            }},
            // ... list of 5 to 15 curated constraints
        ]
    }},

    "Inputs Required": {{
        "instruction": {instruction},
        "code": {code},
        "relevant_categories": {relevant_categories_str},
        "extracted_constraints": {extracted_constraints_str}
    }}
    """
    return prompt


prompt_stage3 = get_prompt_stage3_generate_and_merge_constraints(full_df.iloc[26]["instruction"],full_df.iloc[26]["code"],relevant_categories,extracted_constraints)
print(prompt_stage3)

In [None]:
response_stage3 = get_response(prompt_stage3,SYSTEM_PROMPT_STAGE3_NEW)
print(response_stage3)
final_comprehensive_constraints = extract_json(response_stage3,"final_comprehensive_constraints")
print(final_comprehensive_constraints)

## Stage 4: Final Relevance Validation

In [None]:
SYSTEM_PROMPT_STAGE4_FINAL_RELEVANCE_FILTER = """
You are a meticulous validator of programming task specifications. Your role is to critically assess each constraint for relevance to a simplified instruction’s core intent, and then provide a refined list of only the relevant constraints.

You must generate a reasoning statement for each constraint first, and then decide whether the constraint is relevant based on that reasoning. Do not make a binary decision without a clear explanation.
"""


def get_prompt_stage4_final_relevance_and_filter(instruction: str, final_comprehensive_constraints: list) -> str:
    import json
    constraints_str = json.dumps(final_comprehensive_constraints, indent=2)

    prompt = f"""
"task": "Critically evaluate each constraint in the `final_comprehensive_constraints` list against the `instruction` to determine its direct relevance. For each constraint, first write a reasoning statement explaining your judgment. Then, based on the reasoning, assign a boolean `is_relevant` value. Finally, provide a new list containing only the relevant constraints.",

"context": "You are provided with a `instruction` and a list of `final_comprehensive_constraints` that are intended to guide code generation for this task. Your job is to assess each constraint's logical consistency and direct relevance to the simplified core problem.",

"goal": "Perform the following steps:
1. **Understand the Instruction:** Read and understand the core task described in the `instruction`.
2. **Evaluate Constraints with Justification:**
   a. For each constraint:
     - Write a reasoning paragraph explaining whether and how the constraint supports, complements, or conflicts with the `instruction`.
     - Then, based on your reasoning, assign a boolean value `is_relevant`.
     - If the constraint is vague, off-topic, too generic, or in conflict with the instruction, mark it as false.
     - If the constraint is about documentation, include it only if it directly supports the core problem or is necessary for understanding the code's functionality. 
3. **Filter the Relevant Ones:** Build a new list containing only those constraints with `is_relevant: true`.
4. **Explain Exclusions:** At the end, explain which constraints were removed and why (summary).

Output Format:

Return a single JSON object with the following structure:

{{
  "evaluated_constraints": [
    {{
      "constraint": "The original constraint text.",
      "reasoning": "Detailed explanation of why this constraint is or is not relevant.",
      "is_relevant": true // or false, based on the reasoning
    }},
    // one per constraint
  ],
  "filtered_relevant_constraints": [
    {{
      "type": "Constraint_Category_Name",
      "constraint": "Relevant constraint statement.",
      "instruction_part": "Original source: 'Extracted from instruction', 'Newly Generated', or 'Combined/Refined'"
    }}
    // only constraints marked relevant
  ],
  "reasoning_for_removal": "Summary of why certain constraints were removed."
}}

Inputs Required:
- instruction: {instruction}
- final_comprehensive_constraints: {constraints_str}
"""
    return prompt


prompt_stage4 = get_prompt_stage4_final_relevance_and_filter(full_df.iloc[26]["instruction"],final_comprehensive_constraints)
print(prompt_stage4)

In [None]:
response_stage4 = get_response(prompt_stage4,SYSTEM_PROMPT_STAGE4_FINAL_RELEVANCE_FILTER)
print(response_stage4)
final_final_constraints = extract_json(response_stage4,"filtered_relevant_constraints")
print(final_final_constraints)

## Implemeting the 4 step process

### dataset

In [None]:
!pwd

In [None]:

from pathlib import Path
import os
if os.path.exists("/content"):
    output_path = "./"
else:
    root = "../../"
    output_path = root+"data/benchmark/"+version+"/"
    folder = Path(output_path)
    folder.mkdir(parents=True, exist_ok=True)
print(output_path)

### Implementing stage 1

In [None]:
# Run on first 20 rows (you can change this limit)
def map_categories(df,output_pth,input_col1,input_col2,output_col,categories=categories):
    results = []
    prompts = []
    for i, row in df.iterrows():
        # print(f"Processing row {i}")
        prompt = get_prompt_stage1_select_categories(row[input_col1], row[input_col2], categories)
        prompts.append(prompt)
    categories_list = get_model_response_batch(prompts,SYSTEM_PROMPT_STAGE1)

    
    categories_list = [extract_json(categories,"relevant_categories") for categories in categories_list]
    results = categories_list

    df[output_col] = results
    df.to_csv(output_pth, index=False)
    print("Saved the file successfully")
    return df


In [None]:
map_categories(df,output_path+"step1_with_categories.csv","instruction","code","relevant_categories")

In [None]:
from collections import Counter
import ast

all_categories = [category for sublist in df['relevant_categories'] for category in sublist]
category_counts = Counter(all_categories)
category_df = pd.DataFrame.from_dict(category_counts, orient='index', columns=['Count'])
category_df = category_df.sort_values(by='Count', ascending=False)
display(category_df)

### Implementing stage 2

In [None]:
df.info()

In [None]:
def extract_and_simplify(df,output_pth,input_col1,input_col2):
    extracted_constraints = []
    prompts = []
    for _,row in df.iterrows():
        # print(f"Processing row {i}")
        prompt = get_prompt_stage2_extract_and_simplify(row[input_col1], row[input_col2])
        prompts.append(prompt)
    responses = get_model_response_batch(prompts,SYSTEM_PROMPT_STAGE2)
    simplified_instructions = [extract_json(response,"simplified_instruction") for response in responses]
    extracted_constraints = [extract_json(response,"extracted_constraints") for response in responses]
    df["simplified_instruction"] = simplified_instructions
    df["extracted_constraints"] = extracted_constraints

    df.to_csv(output_pth, index=False)
    print("Saved the file successfully")
    return df

In [None]:
extract_and_simplify(df,output_path+"step2_with_simplified_instruction_and_constraints.csv","instruction","relevant_categories")

### Implementing stage 3


In [None]:
def generate_new_constraints(df,output_pth,input_col1,input_col2,input_col3,input_col4):
    prompts = []
    for _, row in df.iterrows():
        # print(f"Processing row {i}")
        prompt = get_prompt_stage3_generate_and_merge_constraints(row[input_col1], row[input_col2], row[input_col3], row[input_col4])
        prompts.append(prompt)
    responses = get_model_response_batch(prompts,SYSTEM_PROMPT_STAGE3_NEW)
    final_constraints = [extract_json(response,"final_comprehensive_constraints") for response in responses]
    df["final_comprehensive_constraints"] = final_constraints
    df.to_csv(output_pth, index=False)
    print("Saved the file successfully")
    return df

In [None]:
generate_new_constraints(df,output_path+"step3_with_final_comprehensive_constraints.csv","instruction","code","relevant_categories","extracted_constraints")

### Impementing the Stage 4

In [None]:
def final_relevance_filter(df,output_pth,input_col1,input_col2):
    results = []
    prompts = []
    for _, row in df.iterrows():
        # print(f"Processing row {i}")
        prompt = get_prompt_stage4_final_relevance_and_filter(row[input_col1], row[input_col2])
        prompts.append(prompt)
    responses = get_model_response_batch(prompts,SYSTEM_PROMPT_STAGE4_FINAL_RELEVANCE_FILTER)
    df["filtered_relevant_constraints"] = [extract_json(response,"filtered_relevant_constraints") for response in responses]
    df.to_csv(output_pth, index=False)
    print("Saved the file successfully")
    return df

In [None]:
final_relevance_filter(df,output_path+"step4_with_filtered_relevant_constraints.csv","instruction","final_comprehensive_constraints")

# Validating the Constraints

In [None]:
# SYSTEM_PROMPT for the Quality Judge LLM
SYSTEM_PROMPT_QUALITY_JUDGE = """
You are an expert in meticulously evaluating the quality of programming constraints.
Your task is to objectively assess a provided list of generated constraints based on strict quality criteria.
For each constraint, you will provide specific scores for Atomicity, Relevance, and Objectivity, along with detailed reasoning and suggestions for improvement.
Finally, you will synthesize these individual evaluations into a unified quality score and an overall analysis for the entire set of constraints.
Your judgment must be impartial and directly tied to the definitions provided.
"""

# User-facing prompt for the Quality Judge LLM
def get_quality_judge_prompt(original_instruction, original_code, generated_constraint_list):
    return f"""
Original Instruction: {original_instruction}

Original Code (for context, if available):
```python
{original_code}
```

List of Generated Constraints to Evaluate:
{generated_constraint_list}

Constraint Quality Criteria:
Each score ranges from 1 (poor) to 5 (excellent).

Atomicity Score (1–5):
Definition: Measures whether a constraint expresses exactly one indivisible requirement.
An atomic constraint should not contain multiple independent rules or directives.
Examples:
- 1 (Non-Atomic): "Return a float and raise ValueError for invalid input."
- 5 (Atomic): "Raise ValueError for invalid input."

Relevance Score (1–5):
Definition: Measures how well the constraint aligns with the core task described in the instruction.
A relevant constraint addresses functionality, behavior, or structure directly related to the task.
Examples:
- 1 (Off-topic): "Avoid using global variables."
- 5 (Directly relevant): "Raise ValueError if the input DataFrame is empty."

Objectivity Score (1–5):
Definition: Measures whether the constraint can be evaluated without personal interpretation or judgment.
An objective constraint uses measurable, observable, or clearly definable criteria.
Examples:
- 1 (Subjective): "The code should be intuitive and clean."
- 5 (Objective): "The function must return a list of integers."

Output Format:
Your response must be a single JSON object.

{{
  "constraint_evaluations": [
    {{
      "constraint_text": "The exact text of the constraint from the input list.",
      "atomicity_score": int,      // Score from 1 to 5
      "relevance_score": int,      // Score from 1 to 5
      "objectivity_score": int,    // Score from 1 to 5
      "reasoning": "Detailed explanation for each score and, if any scores are low, suggestions for improving atomicity, relevance, or objectivity."
    }}
    // ... for each constraint in the input list
  ],
  "avg_atomicity": float,         // Average atomicity score
  "avg_relevance": float,         // Average relevance score
  "avg_objectivity": float,       // Average objectivity score
  "unified_quality_score": float, // Average of the three above
  "overall_analysis": "A summary of the overall quality of the constraint set, including strengths and weaknesses."
}}"""

prompt = get_quality_judge_prompt(test_df.iloc[509]["instruction"],test_df.iloc[509]["code"],test_df.iloc[509]["final_comprehensive_constraints"])
print(prompt)

In [None]:
response = get_response(prompt,SYSTEM_PROMPT_QUALITY_JUDGE)
print(response)

In [None]:
def get_dict(response):
    try:
        json_string = response.strip().replace('```json\n', '', 1).replace('\n```', '', 1)
        dict_json = json.loads(json_string)
        return dict_json
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e} in string: {response}")
        return []
    except AttributeError as e:
        print(f"Attribute error: {e} in string: {response}")
        return []


In [None]:
# extract the final scores and keep them in separated 4 columns
def extract_quality_scores(df,score_column="quality_scores"):
    df["relevance_score"] = df[score_column].apply(lambda x: x.get("avg_relevance", 0) if isinstance(x, dict) else 0)
    df["objectivity_score"] = df[score_column].apply(lambda x: x.get("avg_objectivity", 0) if isinstance(x, dict) else 0)
    df["atomicity_score"] = df[score_column].apply(lambda x: x.get("avg_atomicity", 0) if isinstance(x, dict) else 0)
    df["unified_quality_score"] = df[score_column].apply(lambda x: x.get("unified_quality_score", 0) if isinstance(x, dict) else 0)
    return df



In [None]:
def measure_constraints(df,output_pth,input_col1,input_col2,input_col3,output_col):
    results = []
    prompts = []
    for _, row in df.iterrows():
        # print(f"Processing row {i}")
        prompt = get_quality_judge_prompt(row[input_col1], row[input_col2], row[input_col3])
        prompts.append(prompt)
    responses = get_model_response_batch(prompts,SYSTEM_PROMPT_QUALITY_JUDGE)
    # print(responses)
    results = [get_dict(response) for response in responses]
    df[output_col] = results
    df = extract_quality_scores(df)
    print(df.head())
    df.to_csv(output_pth, index=False)
    print("Saved the file successfully")
    return df

In [None]:
df = measure_constraints(df,output_path+"step5_with_quality_scores.csv","instruction","code","filtered_relevant_constraints","quality_scores")
df.head()

In [None]:
# df = df.drop(columns=["","quality_scores","unified_quality_score"])
df.info()

In [None]:
df.columns

In [None]:
# df = df.drop(columns=['quality_scores', 'specificity_score', 'objectivity_score',
#        'atomicity_score', 'unified_quality_score', 'relevance_score'])

In [None]:
df.info()

## Combining the Instruction and Constraints

In [None]:
import ast
n = 375
# n=12
# n = 11

original_instruction = df.iloc[n]["instruction"]
new_constraints = df.iloc[n]["filtered_relevant_constraints"]

#filter new_constraints on 'instruction_part': 'Newly Generated'
def filter_new_constraints(constraints):
    constraints = ast.literal_eval(constraints)
    return [c["constraint"] for c in constraints if c.get("instruction_part") == "Newly Generated"]

new_constraints = filter_new_constraints(new_constraints)
print(f"Original Instruction: {original_instruction}")
print("New Constraints:", new_constraints)

In [None]:
def insert_constraints_prompt(original_instruction,new_constraints):
    prompt = f""" Context: I have instruction for LLM to generate response and I also have some set of conditions for the model to follow while generating the response related to the instruction.
    Task: Take the instruction and the conditions provided and insert the conditions into the instruction and return the new instruction.
    Rules for completing the task:
    1. For every condition , find a relevant position in the instruction where this condition could be easily be inserted and still the instruction make sense.
    2. Depending on the need, paraphrase the condition to suit the relevant position in the instruction so that the condition is naturally inserted into the instruction.
    3. Do not delete, reorder, or alter any original content.
    4. Keep all punctuation, line breaks, return types, and code formatting intact.
    5. Solution snippets demarcated by BEGIN SOLUTION, you **MUST NOT** modify or remove these from the new_instruction
    ## Input format:
    {{"Instruction": {original_instruction}
    "Conditions": {new_constraints} }}
    ## response format
    {{"new_instruction": "The new instruction with the conditions merged in a natural way.}}"""
    return prompt

# prompt = insert_constraints_prompt(original_instruction,new_constraints)
# print("prompt: \n")
# print(prompt)
# print("original_instruction:", original_instruction)
# print("------------------")
# print("new_constraints:", new_constraints)
# print("------------------")
# response = get_response(prompt)
# print("response: ...................")
# print(response)
# print("New Instruction:", extract_json(response, "new_instruction"))



In [None]:
def combine_instruction(df,output_pth,input_col1,input_col2,output_col):
    prompts = []
    for _, row in df.iterrows():
        # print(f"Processing row {i}")
        prompt = insert_constraints_prompt(row[input_col1], row[input_col2])
        prompts.append(prompt)
    responses = get_model_response_batch(prompts)
    new_instructions = [extract_json(response, "new_instruction") for response in responses]
    df[output_col] = new_instructions
    df.to_csv(output_pth, index=False)
    print("Saved the file successfully")
    return df
    

In [None]:
combine_instruction(df,output_path+"step6_with_combined_instruction.csv","instruction","filtered_relevant_constraints","combined_instruction")

In [None]:
df.info()

In [None]:
df.tail(100)

In [None]:
df.info()

In [None]:
# df = df.drop(columns = ["combined_instruction"])
# df.info()

## Constraints Statistics

In [None]:
benchmark_v1 = df.copy()
benchmark_v1.info()

In [None]:
benchmark_v1.columns

In [None]:
benchmark_v1.to_csv(root+"benchmark/benchmark_v1.csv",index=True,index_label=["id"])


In [None]:
output_path = "../../benchmark/"
benchmark_v1 = pd.read_csv(output_path+"benchmark_v1.csv")
benchmark_v1.head()

In [None]:
import pandas as pd
root = "../.."
data = pd.read_csv(root+"/benchmark/outputs/benchmark_google_flan-t5-base.csv")
data.head()
data.info()

In [None]:
print(data.iloc[0,17])

In [None]:
benchmark_v1.info()

In [None]:
import pandas as pd
from ast import literal_eval
 
score_columns = ['relevance_score', 'objectivity_score', 'atomicity_score','unified_quality_score']
score_stats = benchmark_v1[score_columns].agg(['max', 'min', 'mean'])
print("Score Statistics:\n", score_stats)

def count_constraints(constraints, source):
    if pd.isna(constraints):
        return 0
    try:
        parsed = literal_eval(constraints)
        return sum(1 for item in parsed if item.get('instruction_part') == source)
    except Exception as e:
        print(f"Error parsing: {constraints}\n{e}")
        return 0

# Create new columns
benchmark_v1['num_extracted_constraints'] = benchmark_v1['filtered_relevant_constraints'].apply(lambda x: count_constraints(x, 'Extracted from instruction'))
benchmark_v1['num_new_constraints'] = benchmark_v1['filtered_relevant_constraints'].apply(lambda x: count_constraints(x, 'Newly Generated'))
benchmark_v1['num_total_constraints'] = benchmark_v1['num_extracted_constraints'] + benchmark_v1['num_new_constraints']


constraint_stats = benchmark_v1[['num_extracted_constraints', 'num_new_constraints', 'num_total_constraints']].mean()
print("\nConstraint Count Averages:\n", constraint_stats)

# benchmark_v1.to_csv(root+'benchmark/benchmark_v1_with_stats.csv', index=False)


In [None]:
benchmark_v1['num_total_constraints'].describe()

In [None]:
benchmark_v1['num_extracted_constraints'].describe()

In [None]:
benchmark_v1['num_new_constraints'].describe()

# Refining the benchmark

In [None]:
!pwd

In [None]:
import pandas as pd
import re

# Load the dataset
df = pd.read_csv("../../benchmark/benchmark_v1.csv")  # or pd.read_parquet("your_file.parquet")

# Regex pattern to detect and remove the solution block
solution_pattern = re.compile(r"BEGIN SOLUTION.*?END SOLUTION", re.DOTALL)

# Function to clean combined_instruction only if it wrongly includes the solution block
def clean_combined_instruction(row):
    instr = row['instruction']
    comb_instr = row['combined_instruction']
    
    instr_has_solution = bool(solution_pattern.search(instr))
    comb_has_solution = bool(solution_pattern.search(comb_instr))
    
    # If BEGIN SOLUTION block exists only in combined_instruction, remove it
    if comb_has_solution and not instr_has_solution:
        print("removing the snippet from combined_solution")
        comb_instr = solution_pattern.sub("", comb_instr).strip()
        row['combined_instruction'] = comb_instr
    
    return row

# Apply the cleaning logic row by row
df = df.apply(clean_combined_instruction, axis=1)

# Save to a new file or overwrite
df.to_csv("../../benchmark/benchmark_v2.csv", index=False)


In [None]:
import ast
import pandas as pd

 # or pd.read_parquet(...)

# Define the 12 valid supercategories
valid_categories = [
    "Code Structure and Modularity",
    "Input and Output Handling",
    "Error Handling and Robustness",
    "Data Processing and Transformation",
    "Performance and Optimization",
    "Library and API Usage",
    "Testing and Debugging",
    "Documentation and Readability",
    "Security and Privacy",
    "Reproducibility and Consistency",
    "Mathematical Computation",
    "File and Data Management",
    "UI and Interaction",
]

# Track rows with invalid types
invalid_entries = []

for i, row in df.iterrows():
    try:
        constraints = row["filtered_relevant_constraints"]
        for c in constraints:
            constraint_type = c.get("type", "")
            if constraint_type not in valid_categories:
                invalid_entries.append({
                    "row_index": i,
                    "invalid_type": constraint_type,
                    "constraint": c.get("constraint", "")
                })
    except Exception as e:
        print(f"Row {i} could not be parsed: {e}")

# Convert to DataFrame for easy inspection
invalid_df = pd.DataFrame(invalid_entries)

invalid_df.info()


In [None]:
invalid_df.head()

In [None]:
import ast
import pandas as pd

# or use read_parquet()

# Convert string representation of lists to actual lists
def is_empty_list(value):
    try:
        parsed = ast.literal_eval(value) if isinstance(value, str) else value
        return isinstance(parsed, list) and len(parsed) == 0
    except Exception:
        return False  # skip invalid entries

# Count empty lists in 'combined_solution' column
empty_list_count = df['combined_instruction'].apply(is_empty_list).sum()

print(f"Number of empty lists in 'combined_solution': {empty_list_count}")


In [None]:
import ast
import pandas as pd

# or read_parquet(...)

# Identify rows where combined_instruction is an empty list string
def is_empty_list_string(value):
    try:
        parsed = ast.literal_eval(value) if isinstance(value, str) else value
        return isinstance(parsed, list) and len(parsed) == 0
    except:
        return False

# Filter the rows that failed previously
mask = df['combined_instruction'].apply(is_empty_list_string)
df_failed = df[mask].copy()

print(f"Found {len(df_failed)} rows with empty combined_instruction")

# Re-run the combine_instruction function only on failed rows
df_failed_updated = combine_instruction(
    df_failed,
    output_pth="re_generated_only.csv",  # optional file
    input_col1="instruction",
    input_col2="filtered_relevant_constraints",
    output_col="combined_instruction"
)

# Replace back the corrected rows in the original DataFrame
df.update(df_failed_updated)

# Save the final cleaned and updated DataFrame
df.to_csv("../../benchmark/benchmark_v3.csv", index=False)
print("All empty combined_instruction rows regenerated and merged back.")


In [None]:
import pandas as pd
import ast


def is_empty_list(value):
    try:
        parsed = ast.literal_eval(value) if isinstance(value, str) else value
        return isinstance(parsed, list) and len(parsed) == 0
    except:
        return False

# Filter out rows with empty list in combined_instruction
df_cleaned = df[~df["combined_instruction"].apply(is_empty_list)]

# (Optional) Save the cleaned DataFrame
df_cleaned.to_csv("benchmark_v3.csv", index=False)

print(f"✅ Removed {len(df) - len(df_cleaned)} rows with empty lists in 'combined_instruction'")


In [None]:
df = df_cleaned.copy()

# Matching the proper categories

In [None]:
def map_category_prompt(constraint, invalid_type, valid_categories):
    category_list = "\n".join(f"- {cat}" for cat in valid_categories)
    return f"""Task: You are given a constraint and an incorrect category label. Your job is to map this constraint to the most appropriate correct category from the list of valid categories.

Constraint: "{constraint}"

Invalid Category: "{invalid_type}"

Valid Categories:
{category_list}

Return your response in the format:
{{"correct_category": "<best matching category from the list>"}}
"""


In [None]:
def correct_invalid_categories(invalid_df, valid_categories):
    prompts = [
        map_category_prompt(row["constraint"], row["invalid_type"], valid_categories)
        for _, row in invalid_df.iterrows()
    ]
    
    responses = get_model_response_batch(prompts)
    corrected = [extract_json(resp, "correct_category") for resp in responses]
    
    invalid_df["corrected_type"] = corrected
    return invalid_df


In [None]:
def apply_category_corrections(df, invalid_df):
    # Convert column to Python objects
    df["filtered_relevant_constraints"] = df["filtered_relevant_constraints"].apply(ast.literal_eval)

    # Group corrections by row_index
    correction_map = {}
    for _, row in invalid_df.iterrows():
        idx = row["row_index"]
        constraint_text = row["constraint"]
        corrected_type = row["corrected_type"]
        if idx not in correction_map:
            correction_map[idx] = {}
        correction_map[idx][constraint_text] = corrected_type

    # Apply corrections
    for idx, corrections in correction_map.items():
        updated = []
        for constraint in df.at[idx, "filtered_relevant_constraints"]:
            text = constraint.get("constraint")
            if text in corrections:
                constraint["type"] = corrections[text]
            updated.append(constraint)
        df.at[idx, "filtered_relevant_constraints"] = updated

    return df


In [None]:
df.to_csv("corrected_constraint_categories.csv", index=False)
print("✅ Saved updated constraints with corrected categories.")


In [None]:
import pandas as pd
import ast

# === Step 1: Define valid categories ===
VALID_CATEGORIES = [
    "Code Structure and Modularity",
    "Input and Output Handling",
    "Error Handling and Robustness",
    "Data Processing and Transformation",
    "Performance and Optimization",
    "Library and API Usage",
    "Testing and Debugging",
    "Documentation and Readability",
    "Security and Privacy",
    "Reproducibility and Consistency",
    "Mathematical Computation",
    "File and Data Management",
    "UI and Interaction",
]

# === Step 2: Prompt for category correction ===
def map_category_prompt(constraint, invalid_type, valid_categories):
    category_list = "\n".join(f"- {cat}" for cat in valid_categories)
    return f"""Task: You are given a constraint and an incorrect category label. Your job is to map this constraint to the most appropriate correct category from the list of valid categories.

Constraint: "{constraint}"

Invalid Category: "{invalid_type}"

Valid Categories:
{category_list}

Return your response in the format:
{{"correct_category": "<best matching category from the list>"}}
"""

# === Step 3: Extract invalid constraint entries ===
def extract_invalid_constraints(df):
    invalid_entries = []
    for i, row in df.iterrows():
        try:
            constraints = ast.literal_eval(row["filtered_relevant_constraints"])
            for c in constraints:
                constraint_type = c.get("type", "")
                if constraint_type not in VALID_CATEGORIES:
                    invalid_entries.append({
                        "row_index": i,
                        "invalid_type": constraint_type,
                        "constraint": c.get("constraint", "")
                    })
        except Exception as e:
            print(f"Row {i} parsing error: {e}")
    return pd.DataFrame(invalid_entries)

# === Step 4: Get corrected categories from model ===
def correct_invalid_categories(invalid_df, valid_categories):
    prompts = [
        map_category_prompt(row["constraint"], row["invalid_type"], valid_categories)
        for _, row in invalid_df.iterrows()
    ]
    
    responses = get_model_response_batch(prompts)
    corrected = [extract_json(resp, "correct_category") for resp in responses]
    
    invalid_df["corrected_type"] = corrected
    return invalid_df


def apply_category_corrections(df, invalid_df):
    df["filtered_relevant_constraints"] = df["filtered_relevant_constraints"].apply(ast.literal_eval)

    correction_map = {}
    for _, row in invalid_df.iterrows():
        idx = row["row_index"]
        constraint_text = row["constraint"]
        corrected_type = row["corrected_type"]
        if idx not in correction_map:
            correction_map[idx] = {}
        correction_map[idx][constraint_text] = corrected_type

    for idx, corrections in correction_map.items():
        updated = []
        for c in df.at[idx, "filtered_relevant_constraints"]:
            if c["constraint"] in corrections:
                c["type"] = corrections[c["constraint"]]
            updated.append(c)
        df.at[idx, "filtered_relevant_constraints"] = updated

    return df


def run_constraint_category_mapping(df, output_file="corrected_categories.csv"):
    print("🔍 Extracting invalid constraint categories...")
    invalid_df = extract_invalid_constraints(df)
    print(f"Found {len(invalid_df)} invalid entries.")

    if invalid_df.empty:
        print("✅ No invalid categories found.")
        return df

    print("🤖 Sending prompts to model for correction...")
    invalid_df = correct_invalid_categories(invalid_df, VALID_CATEGORIES)

    print("🔄 Applying corrections back to DataFrame...")
    df = apply_category_corrections(df, invalid_df)

    print(f"💾 Saving to {output_file} ...")
    df.to_csv(output_file, index=False)
    print("✅ Done.")
    return df


In [None]:
df_cleaned = run_constraint_category_mapping(df)

In [None]:
df = df_cleaned.copy()

In [None]:
df.head()

In [None]:
df_cleaned.to_csv("../../benchmark/benchmark_v4.csv",index=False)


# Code Responses Generation

In [193]:
from openai import OpenAI

# from transformers import AutoTokenizer
from ratelimit import limits, sleep_and_retry
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

NUM_CALLS_PER_MIN = 1500

class LLMClient:

    def __init__(self, api_key,model_id,client_type="rits", base_url=None):
        if client_type == "rits":
            llm = OpenAI(
                api_key=api_key,
                base_url=base_url,
                default_headers={"RITS_API_KEY": api_key},
            )
        elif client_type == "GPT":
            llm = OpenAI(
                api_key=api_key,
                base_url=base_url,)
        self.llm = llm
        self.model_id = model_id
        # self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    def get_model_response(
        self,
        messages=None,
        system_prompt=None,
        user_prompt=None,
        max_new_tokens=2048,
        min_new_tokens=30,
        temperature=0.7,
        top_k=50,
        top_p=0.8,
        repetition_penalty=1.05,
    ):
        # Setup the sampling parameters for generation
        if messages is None:
            if system_prompt:
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ]
            else:
                messages = [{"role": "user", "content": user_prompt}]

        response = self.llm.chat.completions.create(
            model=self.model_id,
            messages=messages,
            max_tokens=max_new_tokens,
            temperature=temperature,
        )

        return response.choices[0].message.content.strip()

    def apply_chat_template(self, messages_list):
        prompts = [
            self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            for messages in messages_list
        ]
        return prompts

    def get_model_response_batch(
        self, system_prompt=None, user_prompts=None, max_new_tokens=400, temperature=0.7
    ):
        non_none_user_prompts = [ele for ele in user_prompts if ele is not None]
        if system_prompt:
            messages_list = [
                [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ]
                for user_prompt in non_none_user_prompts
            ]
        else:
            messages_list = [
                [{"role": "user", "content": user_prompt}]
                for user_prompt in non_none_user_prompts
            ]
        with ThreadPoolExecutor(max_workers=NUM_CALLS_PER_MIN) as executor:
            response_texts = list(
                tqdm(
                    executor.map(
                        lambda messages: self.call_api(
                            messages, max_new_tokens, temperature
                        ),
                        messages_list,
                    ),
                    total=len(messages_list),
                    desc="Processing",
                )
            )
        response_iter = iter(response_texts)
        all_response_texts = [
            next(response_iter) if ele is not None else None for ele in user_prompts
        ]
        return all_response_texts

    @sleep_and_retry
    @limits(calls=1500, period=60)
    def call_api(self, messages, max_new_tokens, temperature):
        response = self.get_model_response(
            messages=messages, max_new_tokens=max_new_tokens, temperature=temperature
        )
        return response


In [None]:
import argparse
import pandas as pd
import json
from tqdm import tqdm
import os
import dotenv
dotenv.load_dotenv()
random_state = 42

def main(input_csv, output_dir, api_key, model_id, base_url, temperature=0.1, system_prompt=None):
    df_ini = pd.read_csv(input_csv)
    df = df_ini.sample(10,random_state=random_state).copy()  # Limit to first 10 rows for testing

    user_prompts = df["combined_instruction"].tolist()
    model_name = model_id.split("/")[-1]
    output_path = os.path.join(output_dir, f"{model_name}_results.jsonl")
    client = LLMClient(
        api_key=api_key,
        model_id=model_id,
        client_type="rits",
        base_url=base_url
    )

    print(f"Generating responses for {len(user_prompts)} prompts.")
    responses = client.get_model_response_batch(
        system_prompt=system_prompt,
        user_prompts=user_prompts,
        temperature=temperature

    )

    output_data = []
    for row, response in zip(df.to_dict(orient="records"), responses):
        row["response"] = response
        output_data.append(row)


    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for entry in output_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"Saved output to: {output_path}")

if __name__ == "__main__":
    input_csv = "./benchmark/benchmark_v4.csv"
    output_dir = "./benchmark/response_outputs"
    api_key = os.getenv("RITS_API_KEY")
    model_id = "meta-llama/Llama-3.1-8B-Instruct"
    print(api_key)
    base_url = "https://inference-3scale-apicast-production.apps.rits.fmaas.res.ibm.com/llama-3-1-8b-instruct/v1"
    temperature = 0.1
    
    main(
        input_csv=input_csv,
        output_dir=output_dir,
        api_key=api_key,
        model_id=model_id,
        base_url=base_url,
        temperature=temperature
    )


# LLM as Judge

## LLM Judge for constraint adherence

In [107]:
def load_model_json_response(response):
    if response.startswith("```json") and response.endswith("```"):
        response = response[7:-3].strip()
    try:
        response_dict = json.loads(response)
        return response_dict
    except:
        pass
    try:
        json_pattern = re.compile(r"\{.*?\}", re.DOTALL)
        match = json_pattern.search(response)
        if match:
            json_block = match.group(0)
            json_block = json_block.replace("\\'", "'")
            response_dict = json.loads(json_block)
            return response_dict
        else:
            print("Not able to extract json data")
            return None
    except:
        print("Not able to extract json data")
        return None

In [108]:
def extract_response_constraint_decision(output):
    if output is None:
        return None
    output_dict = load_model_json_response(output)
    if output_dict:
        return output_dict
    return (None, output)

In [109]:
def function_batch_handler(function_to_batch: callable):
    return lambda *args_lists: [function_to_batch(*args) for args in zip(*args_lists)]

In [110]:
def response_constraint_validator(model_responses: str, constraints_list: list[str], instructions: str,
                                  client: LLMClient):
    """Return a boolean which indicates whether response satisfies all constraints or not."""

    def build_user_prompt(instruction, constraints, response):
        return f"""[Instruction]:
    {instruction}

    [Constraints]:
    {constraints}

    [Response]:
    ```python
    {response}
    ```"""

    system_prompt1 = """You are a verifier. Your task is to evaluate whether a given response satisfies a set of constraints for a specific instruction.

    You will be provided:
    - An instruction
    - A list of constraints
    - A response to the instruction

    Your task:
    - Analyze the response against each constraint independently.
    - For each constraint, determine whether it is satisfied, and provide explanation.
    - Do not make assumptions beyond the constraints. Only base your judgment on what is explicitly written.

    Output your evaluation in a valid JSON format like below:
    {"Evaluation": [
        {
        "Constraint": "<constraint text>",
        "Reason": "explanation",
        "Aligns": [true|false]
        },
        ...
    ],
    "FinalDecision": {
        "Score": "<number of constraints met>/<total number of constraints>",
        "Reason": "<Yes if all constraints were satisfied; No if any were violated, with explanation>",
        "Aligns": [true|false]
    }
    }

    Do not include any text outside this JSON object.
    """

    user_prompts = [
        build_user_prompt(instruction, constraints, model_response) if model_response is not None else None
        for instruction, constraints, model_response in zip(instructions, constraints_list, model_responses)
    ]

    outputs = client.get_model_response_batch(user_prompts=user_prompts,
                                                      system_prompt=system_prompt1,
                                                      temperature=0.3,
                                                      max_new_tokens=1000)
    extract_response_constraint_decision_batch = function_batch_handler(extract_response_constraint_decision)
    is_correct_list = extract_response_constraint_decision_batch(outputs)
    return is_correct_list

In [196]:
import dotenv
import os
dotenv.load_dotenv()
def create_clients(mode="rits"):
    if mode == "rits":
        RITS_KEY = os.getenv("RITS_API_KEY")
        print(RITS_KEY)
        base_url = "https://inference-3scale-apicast-production.apps.rits.fmaas.res.ibm.com/microsoft-phi-4/v1"
        model_id = "microsoft/phi-4"
        client = LLMClient(api_key=RITS_KEY,
                                    model_id=model_id,
                                    client_type="rits",
                                    base_url=base_url,
                                    )
    elif mode == "GPT-azure":
        openai_key = os.getenv("IBM_OPENAI_API_KEY")
        base_url="https://ete-litellm.bx.cloud9.ibm.com"
        client = LLMClient(api_key=openai_key,
                                  model_id="Azure/gpt-4.1-mini",
                                  base_url=base_url,
                                  client_type="GPT")
        
    elif mode == "GPT":
        openai_key = os.getenv("OPENAI_API_KEY")
        client = LLMClient(api_key=openai_key,
                                  model_id="gpt-4o-mini",
                                  client_type="GPT")

    return client

In [112]:
import pandas as pd
res_df = pd.read_json("./benchmark/response_outputs/Llama-3.1-8B-Instruct_results.jsonl",lines=True)
res_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               10 non-null     int64  
 1   dataset                          10 non-null     object 
 2   instruction                      10 non-null     object 
 3   code                             10 non-null     object 
 4   test                             5 non-null      object 
 5   relevant_categories              10 non-null     object 
 6   simplified_instruction           10 non-null     object 
 7   extracted_constraints            10 non-null     object 
 8   final_comprehensive_constraints  10 non-null     object 
 9   filtered_relevant_constraints    10 non-null     object 
 10  quality_scores                   10 non-null     object 
 11  relevance_score                  10 non-null     float64
 12  objectivity_score        

In [None]:
client = create_clients(mode="GPT")
print(client.get_model_response(messages=[
            {"role": "user", "content": "What is the capital of France?"}]))

The capital of France is Paris.


In [113]:
client = create_clients()
print(client.get_model_response(user_prompt="What is the capital of France?"))

48032bfa1e1d782cece12fedb6b3fb40


The capital of France is Paris.


In [114]:
client = create_clients(mode="GPT")
instructions = res_df["instruction"].to_list()
model_responses = res_df["response"].to_list()
constraints_list = res_df["filtered_relevant_constraints"].to_list()

result_list = response_constraint_validator(model_responses,constraints_list,instructions,client)

Processing:   0%|                                                                                                                            | 0/10 [00:00<?, ?it/s]

Processing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.58s/it]


In [115]:
for result in result_list:
    print(result)

{'Evaluation': [{'Constraint': 'The function should output a numpy.ndarray representing the generated complex wave.', 'Reason': "The response includes a return statement that outputs 'wave_windowed', which is a numpy array of complex numbers.", 'Aligns': True}, {'Constraint': 'The function should output a matplotlib.figure.Figure representing the figure object of the plot.', 'Reason': "The response includes a return statement that outputs 'fig', which is the figure object of the plot.", 'Aligns': True}, {'Constraint': 'The function should output a matplotlib.axes.Axes representing the axes object of the plot.', 'Reason': "The response includes a return statement that outputs 'ax', which is the axes object of the plot.", 'Aligns': True}, {'Constraint': 'The plot title must be "Complex Wave with Hann Window".', 'Reason': "The response sets the title of the plot to 'Complex Wave with Hann Window' as required.", 'Aligns': True}, {'Constraint': 'The x-label of the plot must be "Time".', 'Re

In [116]:
def extract_alignment_scores(evaluation_list):
    results = []
    for entry in evaluation_list:
        eval_items = entry.get("Evaluation", [])
        aligns_row = [1 if item.get("Aligns") else 0 for item in eval_items]
        results.append(aligns_row)
    return results


In [117]:
result = extract_alignment_scores(result_list)
print(result)

[[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], [1, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]


In [119]:
import json
import re
import dotenv
import os
import argparse
import pandas as pd
dotenv.load_dotenv()


def main(mode, input_path,response_column, output_dir, temperature=0.7):
    # 1. Load input JSONL
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_json(input_path, lines=True)

    # 2. Create model client
    client = create_clients(mode=mode)

    # 3. Prepare data
    instructions = df["instruction"].tolist()
    model_responses = df[response_column].tolist()
    constraints_list = df["filtered_relevant_constraints"].tolist()

    print("Sending prompts to model...")
    result_list = response_constraint_validator(
        model_responses=model_responses,
        constraints_list=constraints_list,
        instructions=instructions,
        client=client
    )

    print("Extracting alignment scores...")
    result_scores = extract_alignment_scores(result_list)
    df["Constraint_adherence"] = result_scores

    # 4. Save output
    input_filename = os.path.splitext(os.path.basename(input_path))[0]
    output_filename = f"{input_filename}_llmjudge.jsonl"
    output_path = os.path.join(output_dir, output_filename)
    df.to_json(output_path, orient="records", lines=True,force_ascii=False)

    print(f"Results saved to: {output_path}")
if __name__ == "__main__":
    mode = "rits"
    input_path = "./benchmark/response_outputs/Llama-3.1-8B-Instruct_results.jsonl"
    response_column = "response"
    output_dir = "LLMjudge_outputs"
    temperature = 0.1

    # Call main
    main(
        mode=mode,
        input_path=input_path,
        response_column=response_column,
        output_dir=output_dir,
        temperature=temperature
    )




48032bfa1e1d782cece12fedb6b3fb40
Sending prompts to model...


Processing:   0%|                                                                                                                            | 0/10 [00:00<?, ?it/s]

Processing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:20<00:00,  2.08s/it]

Extracting alignment scores...
Results saved to: LLMjudge_outputs/Llama-3.1-8B-Instruct_results_llmjudge.jsonl





## metrics

In [None]:
import os
import argparse
import ast
import pandas as pd

def compute_metrics_from_column(df, column_name):
    # Ensure the column is parsed into lists
    df[column_name] = df[column_name].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # Store per-row CSR and SSR
    csr_values = []
    ssr_values = []

    for r_list in df[column_name]:
        r_list = list(map(int, r_list))
        # CSR: all constraints must be 1
        csr = int(all(r_list))
        # SSR: mean of the binary values
        ssr = sum(r_list) / len(r_list) if r_list else 0
        csr_values.append(csr)
        ssr_values.append(ssr)

    df["CSR_per_row"] = csr_values
    df["SSR_per_row"] = ssr_values

    # Compute overall metrics
    m = len(df)
    overall_csr = sum(csr_values) / m if m > 0 else 0
    overall_ssr = sum(ssr_values) / m if m > 0 else 0

    return df, overall_csr, overall_ssr

def main(input_path, column_name, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(input_path)

    # Compute metrics
    df_with_metrics, csr, ssr = compute_metrics_from_column(df, column_name)

    # Save detailed output with CSR/SSR per row
    input_filename = os.path.splitext(os.path.basename(input_path))[0]
    detailed_output_path = os.path.join(output_dir, f"{input_filename}_metrics.csv")
    df_with_metrics.to_csv(detailed_output_path, index=False)

    # Save summary file with filename, CSR, SSR
    summary_df = pd.DataFrame([{
        "filename": os.path.basename(input_path),
        "CSR": csr,
        "SSR": ssr
    }])
    summary_output_path = os.path.join(output_dir, "metrics_summary.csv")
    summary_df.to_csv(summary_output_path, index=False)

    print(f"Saved row-level metrics to {detailed_output_path}")
    print(f"Saved overall CSR and SSR to {summary_output_path}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Compute CSR and SSR from constraint adherence column")
    parser.add_argument("--input_path", type=str, required=True, help="Path to input CSV")
    parser.add_argument("--column_name", type=str, required=True, help="Name of the column containing list of 1/0 values")
    parser.add_argument("--output_dir", type=str, required=True, help="Folder to save outputs")

    args = parser.parse_args()

    main(
        input_path=args.input_path,
        column_name=args.column_name,
        output_dir=args.output_dir
    )


In [120]:
benchmark_v4 = pd.read_csv("./benchmark/benchmark_v4.csv")
benchmark_v4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1493 entries, 0 to 1492
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               1493 non-null   int64  
 1   dataset                          1493 non-null   object 
 2   instruction                      1493 non-null   object 
 3   code                             1493 non-null   object 
 4   test                             594 non-null    object 
 5   relevant_categories              1493 non-null   object 
 6   simplified_instruction           1493 non-null   object 
 7   extracted_constraints            1493 non-null   object 
 8   final_comprehensive_constraints  1493 non-null   object 
 9   filtered_relevant_constraints    1493 non-null   object 
 10  quality_scores                   1493 non-null   object 
 11  relevance_score                  1493 non-null   float64
 12  objectivity_score   

In [None]:
benchmark_v4 = benchmark_v4.drop(columns = ["id.1"])
benchmark_v4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1493 entries, 0 to 1492
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               1493 non-null   int64  
 1   dataset                          1493 non-null   object 
 2   instruction                      1493 non-null   object 
 3   code                             1493 non-null   object 
 4   test                             594 non-null    object 
 5   relevant_categories              1493 non-null   object 
 6   simplified_instruction           1493 non-null   object 
 7   extracted_constraints            1493 non-null   object 
 8   final_comprehensive_constraints  1493 non-null   object 
 9   filtered_relevant_constraints    1493 non-null   object 
 10  quality_scores                   1493 non-null   object 
 11  relevance_score                  1493 non-null   float64
 12  objectivity_score   

In [None]:
benchmark_v4.to_csv("../../benchmark/benchmark_v4.csv",index=False)

## LLM Judge for Functional Correctness

- With the given simplified instruction and model response the model should evaluate whether the generated code is correct for the instruction
    - if the code is completely satisfies the instruction and there are no syntax or semantic errors
    - if you have any doubts and think almost it is correct but it might not satisy some edge cases or so then mark it partially correct
    - otherwise if it has any syntax or semantic errors then mark it as wrong 

In [200]:

input_path = "./benchmark/LLMjudge_outputs/constraint_adherence/Llama-3.1-8B-Instruct_results_llmjudge.jsonl"
df = pd.read_json(input_path,lines=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               10 non-null     int64  
 1   dataset                          10 non-null     object 
 2   instruction                      10 non-null     object 
 3   code                             10 non-null     object 
 4   test                             5 non-null      object 
 5   relevant_categories              10 non-null     object 
 6   simplified_instruction           10 non-null     object 
 7   extracted_constraints            10 non-null     object 
 8   final_comprehensive_constraints  10 non-null     object 
 9   filtered_relevant_constraints    10 non-null     object 
 10  quality_scores                   10 non-null     object 
 11  relevance_score                  10 non-null     float64
 12  objectivity_score        

- the response might contain  other parts that are not part of the instruction provided, don't worry about that and evaluate only on the simplified instruction

In [161]:
def code_correctness_prompt_v1(instruction,generated_code):
    
    prompt = f"""You are an expert Python developer and code reviewer.  
    Your task is to evaluate whether a given Python code correctly follows the instruction.
    
    Evaluation Criteria:
    - Completely Correct: If the code fully satisfies the instruction, and there are no syntax or semantic errors.
    - Partially Correct: If the code is mostly satisfies the instruction and completely correct with no syntax or semantic errors, but may miss some edge cases or implementation details.
    - Wrong: If the code has syntax errors, semantic errors, or clearly does not follow the instruction.
    
    Output Format:
    Return your final evaluation as a dictionary without any explanation:
    {{"reason": "<Your reason for the evaluation>",
      "correctness" :"Completely Correct/Partially Correct/Wrong",
      }}
    
    Input:
    Instruction:  
    {instruction}
    
    Generated Code:  
    ```python
    {generated_code} """

    return prompt

prompt = code_correctness_prompt_v1(df .iloc[0]["instruction"], df.iloc[0]["response"])
print(prompt)


You are an expert Python developer and code reviewer.  
    Your task is to evaluate whether a given Python code correctly follows the instruction.
    
    Evaluation Criteria:
    - Completely Correct: If the code fully satisfies the instruction, and there are no syntax or semantic errors.
    - Partially Correct: If the code is mostly satisfies the instruction and completely correct with no syntax or semantic errors, but may miss some edge cases or implementation details.
    - Wrong: If the code has syntax errors, semantic errors, or clearly does not follow the instruction.
    
    Output Format:
    Return your final evaluation as a dictionary without any explanation:
    {"reason": "<Your reason for the evaluation>",
      "correctness" :"Completely Correct/Partially Correct/Wrong",
      }
    
    Input:
    Instruction:  
    Generates and plots a complex wave with a specified amplitude and frequency over given time points, applying a Hann window to reduce edge effects. The w

In [None]:
client = create_clients(mode="GPT")
instructions = df["simplified_instruction"].to_list()
model_responses = df["response"].to_list()
prompts = [code_correctness_prompt_v1(instruction, response) for instruction, response in zip(instructions, model_responses)]
correctness_evaluation = client.get_model_response_batch(
    user_prompts=prompts,
    system_prompt=None,
    temperature=0.1
)

Processing:   0%|                                                                                                                            | 0/10 [00:00<?, ?it/s]

Processing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.61it/s]


In [160]:
correctness_evaluation

['{"reason": "The code has a syntax error due to an incomplete line at the end, which prevents it from running correctly.", "correctness": "Wrong"}',
 '{"reason": "The code correctly implements the functionality to remove jQuery files from a specified directory, handles exceptions, and logs the removed files. It also adheres to the specified output format.", "correctness": "Completely Correct"}',
 '{"reason": "The code correctly defines the URL patterns for the Django application as per the instructions, including all required views and handling of media files in development. There are no syntax or semantic errors.", "correctness": "Completely Correct"}',
 '```json\n{"reason": "The code is incomplete and does not implement the full functionality required by the instruction. It lacks the logic for transitioning statuses based on bids, complaints, and auction results, as well as unit tests to verify the transitions.", "correctness": "Wrong"}\n```',
 '{"reason": "The code is incomplete an

In [155]:
correctness_evaluation = [extract_json(response, "correctness") for response in correctness_evaluation]
df["correctness_evaluation"] = correctness_evaluation
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               10 non-null     int64  
 1   dataset                          10 non-null     object 
 2   instruction                      10 non-null     object 
 3   code                             10 non-null     object 
 4   test                             5 non-null      object 
 5   relevant_categories              10 non-null     object 
 6   simplified_instruction           10 non-null     object 
 7   extracted_constraints            10 non-null     object 
 8   final_comprehensive_constraints  10 non-null     object 
 9   filtered_relevant_constraints    10 non-null     object 
 10  quality_scores                   10 non-null     object 
 11  relevance_score                  10 non-null     float64
 12  objectivity_score        

In [156]:
df.head()

Unnamed: 0,id,dataset,instruction,code,test,relevant_categories,simplified_instruction,extracted_constraints,final_comprehensive_constraints,filtered_relevant_constraints,quality_scores,relevance_score,objectivity_score,atomicity_score,unified_quality_score,combined_instruction,response,Constraint_adherence,correctness_evaluation
0,955,bigcode/bigcodebench,Generates and plots a complex wave with a spec...,wave = amplitude * np.exp(1j * 2 * math.pi * f...,import unittest\nimport numpy as np\nimport ma...,"['Code Structure and Modularity', 'Input and O...",Generates and plots a complex wave with a spec...,"[{'type': 'Input and Output Handling', 'constr...","[{'type': 'Input and Output Handling', 'constr...","[{'type': 'Input and Output Handling', 'constr...",{'constraint_evaluations': [{'constraint_text'...,5.0,5.0,5.0,5.0,Generates and plots a complex wave with a spec...,```python\nimport numpy as np\nimport math\nim...,"[1, 1, 1, 1, 1, 1, 1, 1]",Wrong
1,906,bigcode/bigcodebench,Removes all jQuery files (JavaScript files con...,# Configure logging\n logging.basicConfig(f...,import unittest\nfrom unittest.mock import Mag...,"['Code Structure and Modularity', 'Input and O...",Removes all jQuery files (JavaScript files con...,"[{'type': 'File and Data Management', 'constra...","[{'type': 'File and Data Management', 'constra...","[{'type': 'File and Data Management', 'constra...",{'constraint_evaluations': [{'constraint_text'...,4.8,4.6,4.8,4.73,Removes all jQuery files (JavaScript files con...,```python\nimport os\nimport logging\n\ndef ta...,"[1, 1, 1, 1, 1]",Completely Correct
2,1346,Multilingual-Multimodal-NLP/McEval-Instruct,You are tasked with creating a Django applicat...,```python\n# Import necessary Django modules\n...,,"['Code Structure and Modularity', 'Input and O...",You are tasked with creating a Django applicat...,"[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...",{'constraint_evaluations': [{'constraint_text'...,5.0,4.5,4.75,4.75,You are tasked with creating a Django applicat...,Here's a Python script that defines the URL pa...,"[1, 1, 1, 0]",Completely Correct
3,989,Multilingual-Multimodal-NLP/McEval-Instruct,Design a system to automate the transition of ...,```python\n# -*- coding: utf-8 -*-\nimport uni...,,"['Code Structure and Modularity', 'Input and O...",Design a system to automate the transition of ...,"[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...",{'constraint_evaluations': [{'constraint_text'...,4.85,4.23,4.23,4.44,Design a system to automate the transition of ...,**System Design**\n\nWe will design a system u...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]",Wrong
4,964,bigcode/bigcodebench,Train a linear regression model on a given Dat...,"if not isinstance(df, pd.DataFrame):\n ...",import unittest\nimport pandas as pd\nfrom io ...,"['Code Structure and Modularity', 'Input and O...",Train a linear regression model on a given Dat...,"[{'type': 'Error Handling and Robustness', 'co...","[{'type': 'Error Handling and Robustness', 'co...","[{'type': 'Error Handling and Robustness', 'co...",{'constraint_evaluations': [{'constraint_text'...,5.0,5.0,5.0,5.0,Train a linear regression model on a given Dat...,```python\nimport pandas as pd\nfrom sklearn.m...,"[1, 0]",Partially Correct


In [157]:

df_new =df[["simplified_instruction","response","correctness_evaluation"]].copy()
df_new.columns = ["instruction", "response", "correctness_evaluation"]
df_new.info()
df_new.head()
df_new.to_json("./benchmark/LLMjudge_outputs/code_correctness/Llama-3.1-8B-Instruct_results_llmjudge_with_correctness.jsonl", orient="records", lines=True, force_ascii=False)  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   instruction             10 non-null     object
 1   response                10 non-null     object
 2   correctness_evaluation  10 non-null     object
dtypes: object(3)
memory usage: 368.0+ bytes


In [158]:
def print_row(row):
    print(f"Instruction: {row['instruction']}")
    print(f"Response: {row['response']}")
    print(f"Correctness Evaluation: {row['correctness_evaluation']}")
    print("-" * 80)

for _, row in df_new.iterrows():
    print_row(row)
    print("\n")  # Add a newline for better readability between rows

Instruction: Generates and plots a complex wave with a specified amplitude and frequency over given time points, applying a Hann window to reduce edge effects. The wave is represented as a complex number where the real part is the cosine component, and the imaginary part is the sine component. It returns both the wave and the plot object. Note that: The plot title is "Complex Wave with Hann Window". The x-label of the plot is "Time". The y-label of the plot is "Amplitude". The plot displays both the real and imaginary parts of the complex wave. You should write self-contained code starting with:
```
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.signal import get_window
def task_func(amplitude, frequency, time):
```
Response: ```python
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.signal import get_window

def task_func(amplitude, frequency, time):
    """
    Generates and plots a complex wave with a specified amplitude and freque

total = 0
count = 0
num = 1

while count < 20:
    total += num
    num += 2
    count += 1

print(total)


In [None]:
def code_correctness_prompt_v2(simplified_instruction,generated_code):
    prompt = f"""You are an expert Python developer and code reviewer.  
    Your task is to evaluate whether a given Python code correctly follows the simplified instruction, based only on three dimensions:

    1. Syntax Correctness: Is the code free of syntax errors?
    2. Semantic Correctness: Does the code make logical sense and run as intended?
    3. Constraint Correctness: Does the code satisfy the constraints explicitly in the **simplified instruction**?

    Important Notes:
    - Only evaluate based on the **simplified instruction**. 
    - Ignore any extra functionality in the response that is not asked for in the instruction.
    - Do not penalize for unrelated content, as long as it doesn't interfere with satisfying the instruction.

    Evaluation Criteria:
    - Completely Correct: Code has no syntax or semantic errors and satisfies all constraints in the simplified instruction.
    - Partially Correct: Code has no syntax or semantic errors, but may miss one or more constraints in the simplified instruction.
    - Wrong: if the code is not partially also correct then it is wrong 
    Output Format:
    Return your final evaluation as a dictionary without any explanation:
    {{"reason": "<Your reason for the evaluation>",
      "correctness": "Completely Correct/Partially Correct/Wrong"
    }}

    Input:
    Simplified Instruction:  
    {simplified_instruction}

    Generated Code:  
    ```python
    {generated_code}
    ```"""
    return prompt

In [222]:
def code_correctness_prompt_v3(instruction,generated_code):
    prompt = f"""You are an expert Python developer and code reviewer.  
    Your task is to evaluate whether a given Python code correctly follows the instruction, based only on three dimensions:

    1. Syntax Correctness: Is the code free of syntax errors?
    2. Semantic Correctness: Does the code make logical sense and run as intended?
    3. Constraint Correctness: Does the code satisfy the constraints explicitly in the instruction?

    Important Notes:
    - Only evaluate based on the instruction. 
    - Ignore any extra functionality in the response that is not asked for in the instruction.
    - Do not penalize for unrelated content, as long as it doesn't interfere with satisfying the instruction.

    Evaluation Criteria:
    - Completely Correct: Code has no syntax or semantic errors and satisfies all constraints in the instruction.
    - Partially Correct: Code has no syntax or semantic errors, but may miss one or more constraints in the instruction.
    - Wrong: If the code is not even partially correct, then it is considered wrong.

    Output Format:
    Return your final evaluation as a dictionary without any explanation:
    {{"reason": "<Your reason for the evaluation>",
      "correctness": "Completely Correct/Partially Correct/Wrong"
    }}

    Input:
    Instruction:  
    {instruction}

    Generated Code:  
    ```python
    {generated_code}
    ```"""
    return prompt


In [223]:
client = create_clients(mode="GPT-azure")
instructions = df["simplified_instruction"].to_list()
model_responses = df["response"].to_list()
prompts = [code_correctness_prompt_v3(instruction, response) for instruction, response in zip(instructions, model_responses)]
correctness_evaluation = client.get_model_response_batch(
    user_prompts=prompts,
    system_prompt=None,
    temperature=0.1
)

Processing:   0%|                                                                                                                            | 0/10 [00:00<?, ?it/s]

Processing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.54it/s]


In [224]:
correctness_evaluation

['```json\n{\n  "reason": "The code is syntactically correct except for an incomplete example usage line at the end, which is outside the function and can be ignored as it is not part of the required function. The function itself is free of syntax errors. Semantically, the code correctly generates a complex wave with the real part as cosine and imaginary part as sine using np.exp(1j * ...), applies a Hann window, and plots both parts with the correct title and axis labels. The constraint about returning both the wave and the plot object is satisfied. The only minor issue is that hann_window is 1D and wave is 1D complex, so multiplying wave * hann_window[:, None] introduces an unnecessary dimension; it should be wave * hann_window. However, this does not cause an error and the code runs as intended. Therefore, the code is Completely Correct.",\n  "correctness": "Completely Correct"\n}\n```',
 '{"reason": "The code is syntactically correct and logically removes all files containing \'jqu

In [220]:
client = create_clients(mode="GPT-azure")
instructions = df["simplified_instruction"].to_list()
model_responses = df["response"].to_list()
prompts = [code_correctness_prompt_v2(instruction, response) for instruction, response in zip(instructions, model_responses)]
correctness_evaluation_v2 = client.get_model_response_batch(
    user_prompts=prompts,
    system_prompt=None,
    temperature=0.1
)
correctness_level_2 = [extract_json(response, "correctness") for response in correctness_evaluation_v2]
correctness_reason_2= [extract_json(response, "reason") for response in correctness_evaluation_v2]
df["correctness_reason"] = correctness_reason_2
df["correctness_level"] = correctness_level_2
df.info()


Processing:   0%|                                                                                                                            | 0/10 [00:00<?, ?it/s]

Processing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.23it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 20 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               10 non-null     int64  
 1   dataset                          10 non-null     object 
 2   instruction                      10 non-null     object 
 3   code                             10 non-null     object 
 4   test                             5 non-null      object 
 5   relevant_categories              10 non-null     object 
 6   simplified_instruction           10 non-null     object 
 7   extracted_constraints            10 non-null     object 
 8   final_comprehensive_constraints  10 non-null     object 
 9   filtered_relevant_constraints    10 non-null     object 
 10  quality_scores                   10 non-null     object 
 11  relevance_score                  10 non-null     float64
 12  objectivity_score        




In [214]:
correctness_reason_2

['The code has a syntax error due to an incomplete example usage line (missing closing parenthesis). Additionally, the code incorrectly applies the Hann window by using wave * hann_window[:, None], which adds an unnecessary dimension and will cause a broadcasting error since wave is 1D. The correct application should be wave * hann_window. Therefore, the code has both syntax and semantic errors and does not fully satisfy the instruction constraints.',
 "The code is syntactically correct and logically removes all files containing 'jquery' in their name from the specified directory. It returns a tuple with the count and list of removed files as required. The additional logging does not interfere with the instruction and is acceptable. The code also properly handles the case where the directory does not exist by raising an exception, which is reasonable and does not violate constraints.",
 'The code is free of syntax errors and logically defines the required URL patterns for the specified

In [213]:
correctness_level_2

['Wrong',
 'Completely Correct',
 'Completely Correct',
 'Wrong',
 'Wrong',
 'Partially Correct',
 'Completely Correct',
 'Wrong',
 'Partially Correct',
 'Partially Correct']

In [225]:
client = create_clients(mode="GPT-azure")
instructions = df["simplified_instruction"].to_list()
model_responses = df["response"].to_list()
prompts = [code_correctness_prompt_v3(instruction, response) for instruction, response in zip(instructions, model_responses)]
correctness_evaluation_v3 = client.get_model_response_batch(
    user_prompts=prompts,
    system_prompt=None,
    temperature=0.1
)
correctness_level_3 = [extract_json(response, "correctness") for response in correctness_evaluation_v3]
correctness_reason_3= [extract_json(response, "reason") for response in correctness_evaluation_v3]
df["correctness_reason"] = correctness_reason_3
df["correctness_level"] = correctness_level_3
df.info()


Processing:   0%|                                                                                                                            | 0/10 [00:00<?, ?it/s]

Processing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  3.32it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 20 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               10 non-null     int64  
 1   dataset                          10 non-null     object 
 2   instruction                      10 non-null     object 
 3   code                             10 non-null     object 
 4   test                             5 non-null      object 
 5   relevant_categories              10 non-null     object 
 6   simplified_instruction           10 non-null     object 
 7   extracted_constraints            10 non-null     object 
 8   final_comprehensive_constraints  10 non-null     object 
 9   filtered_relevant_constraints    10 non-null     object 
 10  quality_scores                   10 non-null     object 
 11  relevance_score                  10 non-null     float64
 12  objectivity_score        




In [226]:
correctness_level_3

['Partially Correct',
 'Completely Correct',
 'Completely Correct',
 'Wrong',
 'Wrong',
 'Partially Correct',
 'Partially Correct',
 'Partially Correct',
 'Partially Correct',
 'Partially Correct']

In [218]:
correctness_reason_3

['The code has a syntax error due to an incomplete example usage line (missing closing parenthesis). Additionally, the code incorrectly applies the Hann window by using wave * hann_window[:, None], which adds an unnecessary new axis and will cause a shape mismatch error. The Hann window should be applied as wave * hann_window. The code otherwise logically generates the complex wave, applies the Hann window, plots the real and imaginary parts with correct labels and title, and returns the wave and plot objects as required. Therefore, it is not completely correct due to syntax and semantic errors.',
 "The code is syntactically correct and logically removes all files containing 'jquery' in their names from the specified directory, returning the correct tuple as required. It also satisfies the constraint of being self-contained and starting with the specified imports and function definition. The extra logging functionality does not interfere with the requirements.",
 'The code is free of s

In [181]:
df.head()

Unnamed: 0,id,dataset,instruction,code,test,relevant_categories,simplified_instruction,extracted_constraints,final_comprehensive_constraints,filtered_relevant_constraints,quality_scores,relevance_score,objectivity_score,atomicity_score,unified_quality_score,combined_instruction,response,Constraint_adherence,correctness_reason,correctness_level
0,955,bigcode/bigcodebench,Generates and plots a complex wave with a spec...,wave = amplitude * np.exp(1j * 2 * math.pi * f...,import unittest\nimport numpy as np\nimport ma...,"['Code Structure and Modularity', 'Input and O...",Generates and plots a complex wave with a spec...,"[{'type': 'Input and Output Handling', 'constr...","[{'type': 'Input and Output Handling', 'constr...","[{'type': 'Input and Output Handling', 'constr...",{'constraint_evaluations': [{'constraint_text'...,5.0,5.0,5.0,5.0,Generates and plots a complex wave with a spec...,```python\nimport numpy as np\nimport math\nim...,"[1, 1, 1, 1, 1, 1, 1, 1]",The code has a syntax error due to a missing c...,Wrong
1,906,bigcode/bigcodebench,Removes all jQuery files (JavaScript files con...,# Configure logging\n logging.basicConfig(f...,import unittest\nfrom unittest.mock import Mag...,"['Code Structure and Modularity', 'Input and O...",Removes all jQuery files (JavaScript files con...,"[{'type': 'File and Data Management', 'constra...","[{'type': 'File and Data Management', 'constra...","[{'type': 'File and Data Management', 'constra...",{'constraint_evaluations': [{'constraint_text'...,4.8,4.6,4.8,4.73,Removes all jQuery files (JavaScript files con...,```python\nimport os\nimport logging\n\ndef ta...,"[1, 1, 1, 1, 1]","The code has no syntax or semantic errors, but...",Partially Correct
2,1346,Multilingual-Multimodal-NLP/McEval-Instruct,You are tasked with creating a Django applicat...,```python\n# Import necessary Django modules\n...,,"['Code Structure and Modularity', 'Input and O...",You are tasked with creating a Django applicat...,"[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...",{'constraint_evaluations': [{'constraint_text'...,5.0,4.5,4.75,4.75,You are tasked with creating a Django applicat...,Here's a Python script that defines the URL pa...,"[1, 1, 1, 0]",The code is free of syntax errors and logicall...,Partially Correct
3,989,Multilingual-Multimodal-NLP/McEval-Instruct,Design a system to automate the transition of ...,```python\n# -*- coding: utf-8 -*-\nimport uni...,,"['Code Structure and Modularity', 'Input and O...",Design a system to automate the transition of ...,"[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...","[{'type': 'Code Structure and Modularity', 'co...",{'constraint_evaluations': [{'constraint_text'...,4.85,4.23,4.23,4.44,Design a system to automate the transition of ...,**System Design**\n\nWe will design a system u...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]",The code has syntax errors (incomplete functio...,Wrong
4,964,bigcode/bigcodebench,Train a linear regression model on a given Dat...,"if not isinstance(df, pd.DataFrame):\n ...",import unittest\nimport pandas as pd\nfrom io ...,"['Code Structure and Modularity', 'Input and O...",Train a linear regression model on a given Dat...,"[{'type': 'Error Handling and Robustness', 'co...","[{'type': 'Error Handling and Robustness', 'co...","[{'type': 'Error Handling and Robustness', 'co...",{'constraint_evaluations': [{'constraint_text'...,5.0,5.0,5.0,5.0,Train a linear regression model on a given Dat...,```python\nimport pandas as pd\nfrom sklearn.m...,"[1, 0]",The code has a syntax error due to an incomple...,Wrong


In [221]:
def print_row(row):
    print(f"Instruction: {row['instruction']}")
    print(f"Response: {row['response']}")
    print("-"*80)
    print(f"Correctness Level: {row['correctness_level']}")
    print(f"Correctness Reason: {row['correctness_reason']}")
    print("-" * 80)

for _, row in df.iterrows():
    print_row(row)
    print("\n")  # Add a newline for better readability between rows

Instruction: Generates and plots a complex wave with a specified amplitude and frequency over given time points, applying a Hann window to reduce edge effects. The wave is represented as a complex number where the real part is the cosine component, and the imaginary part is the sine component. It returns both the wave and the plot object.
Note that: Notes: The plot title is "Complex Wave with Hann Window". The x-label of the plot is "Time". The y-label of the plot is "Amplitude". The plot displays both the real and imaginary parts of the complex wave.
The function should output with:
    numpy.ndarray: The generated complex wave as a numpy array of complex numbers.
    matplotlib.figure.Figure: The figure object of the plot.
    matplotlib.axes.Axes: The axes object of the plot.
You should write self-contained code starting with:
```
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.signal import get_window
def task_func(amplitude, frequency, time):
```
Response

In [1]:
import os
import argparse
import ast
import pandas as pd
from collections import defaultdict

def compute_metrics_from_column(df, column_name):
    # Parse constraint adherence column
    df[column_name] = df[column_name].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    
    csr_values = []
    ssr_values = []

    for r_list in df[column_name]:
        r_list = list(map(int, r_list))
        csr = int(all(r_list))
        ssr = sum(r_list) / len(r_list) if r_list else 0
        csr_values.append(csr)
        ssr_values.append(ssr)

    df["CSR_per_row"] = csr_values
    df["SSR_per_row"] = ssr_values

    # Compute overall metrics
    m = len(df)
    overall_csr = sum(csr_values) / m if m > 0 else 0
    overall_ssr = sum(ssr_values) / m if m > 0 else 0

    return df, overall_csr, overall_ssr

def compute_combination_metrics(df):
    metrics = {}

    # Only keep CSR=1 combinations
    for correctness in ["Completely Correct", "Partially Correct", "Wrong"]:
        key = f"{correctness.replace(' ', '_')}_CSR1"
        metrics[key] = len(df[(df["correctness_level"] == correctness) & (df["CSR_per_row"] == 1)])

    # Avg SSR for each correctness level
    avg_ssr = (
        df.groupby("correctness_level")["SSR_per_row"]
        .mean()
        .rename(lambda x: f"Avg_SSR_{x.replace(' ', '_')}")
        .to_dict()
    )

    metrics.update(avg_ssr)
    return metrics

def main(input_path, column_name, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_json(input_path, lines=True) if input_path.endswith(".jsonl") else pd.read_csv(input_path)

    # Compute row-level CSR and SSR
    df_with_metrics, csr, ssr = compute_metrics_from_column(df, column_name)

    # Compute filtered combination metrics
    combo_metrics = compute_combination_metrics(df_with_metrics)

    # Save row-level data
    input_filename = os.path.splitext(os.path.basename(input_path))[0]
    detailed_output_path = os.path.join(output_dir, f"{input_filename}_metrics.csv")
    df_with_metrics.to_csv(detailed_output_path, index=False)

    # Save metrics summary
    summary_data = {
        "filename": os.path.basename(input_path),
        "Overall_CSR": csr,
        "Overall_SSR": ssr
    }
    summary_data.update(combo_metrics)
    summary_df = pd.DataFrame([summary_data])
    summary_output_path = os.path.join(output_dir, "metrics_summary.csv")
    print(summary_df.info())
    # summary_df.to_csv(summary_output_path, index=False)

    print(f"Saved row-level metrics to {detailed_output_path}")
    print(f"Saved overall + combination metrics to {summary_output_path}")


In [3]:
!pwd

/dccstor/shanmukh/sravani_internship/CIF_Benchmark/scripts/constraints


In [5]:
input_path = "../../benchmark/LLMjudge_outputs/code_correctness/Llama-3.1-8B-Instruct_results_llmjudge_with_correctness.jsonl"
column_name = "Constraint_adherence"
output_dir = "../../benchmark/metrics_output"
main(
input_path=input_path,
column_name=column_name,
output_dir=output_dir
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   filename                    1 non-null      object 
 1   Overall_CSR                 1 non-null      float64
 2   Overall_SSR                 1 non-null      float64
 3   Completely_Correct_CSR1     1 non-null      int64  
 4   Partially_Correct_CSR1      1 non-null      int64  
 5   Wrong_CSR1                  1 non-null      int64  
 6   Avg_SSR_Completely_Correct  1 non-null      float64
 7   Avg_SSR_Partially_Correct   1 non-null      float64
 8   Avg_SSR_Wrong               1 non-null      float64
dtypes: float64(5), int64(3), object(1)
memory usage: 200.0+ bytes
None
Saved row-level metrics to ../../benchmark/metrics_output/Llama-3.1-8B-Instruct_results_llmjudge_with_correctness_metrics.csv
Saved overall + combination metrics to ../../benchmark/metrics_output/metri

In [3]:
import json
data = []
with open("/dccstor/shanmukh/sravani_internship/benchmark_experiments/benchmark_dataset/benchmark_v5.jsonl") as f:
    for line in f:
        data.append(json.loads(line.strip()))

In [4]:
len(data)

100

In [5]:
data[0]

{'id': 1400,
 'dataset': 'Multilingual-Multimodal-NLP/McEval-Instruct',
 'instruction': '### 142. Linked List Cycle II\n\nGiven the `head` of a linked list, return the node where the cycle begins. If there is no cycle, return `null`.\n\nTo represent a cycle in the given linked list, we use an integer `pos` which represents the position (0-indexed) in the linked list where the tail connects to. If `pos` is `-1`, then there is no cycle in the linked list.\n\n**Note**: Do not modify the linked list.\n\n**Follow up**:\nCan you solve it using `O(1)` (i.e., constant) memory?\n\n**Example**:\n```\nInput: head = [3,2,0,-4], pos = 1\nOutput: The node with value 2\nExplanation: There is a cycle in the linked list, where the tail connects to the second node.\n```',
 'code': '```python\nfrom ListNode import *\n\n# Definition for singly-linked list.\n# class ListNode(object):\n#     def __init__(self, x):\n#         self.val = x\n#         self.next = None\n\nclass Solution(object):\n    def detect

In [9]:
eval(data[0]["filtered_relevant_constraints"])

[{'type': 'Input and Output Handling',
  'constraint': 'Return the node where the cycle begins.',
  'instruction_part': 'Extracted from instruction'},
 {'type': 'Input and Output Handling',
  'constraint': 'Return null if there is no cycle.',
  'instruction_part': 'Extracted from instruction'},
 {'type': 'Input and Output Handling',
  'constraint': 'Use an integer pos to represent the position (0-indexed) in the linked list where the tail connects to.',
  'instruction_part': 'Extracted from instruction'},
 {'type': 'Input and Output Handling',
  'constraint': 'If pos is -1, then there is no cycle in the linked list.',
  'instruction_part': 'Extracted from instruction'},
 {'type': 'Data Processing and Transformation',
  'constraint': 'Do not modify the linked list.',
  'instruction_part': 'Extracted from instruction'},
 {'type': 'Performance and Optimization',
  'constraint': 'Solve it using O(1) (i.e., constant) memory.',
  'instruction_part': 'Extracted from instruction'},
 {'type': '

In [10]:
import json
fail = 0
for idx, ele in enumerate(data):
    if len(eval(ele['filtered_relevant_constraints'])) == len(ele['constraint_wise_presence']):
        pass
    else:
        fail += 1
        print(idx)