In [19]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [22]:
import pandas as pd
from openai import OpenAI
import json
import time
import os

# Read the API key from the OPENAI_API_KEY environment variable instead of
# embedding it in the notebook, so the key is never saved or shared with the file.
# If the variable is unset, None is passed and the SDK reports the missing key.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def generate_prompt(context, question, answer):
    """Build the perturbation instruction prompt for one QA sample.

    The prompt asks the model to rewrite the context and question (synonym
    replacement and paraphrasing) while keeping the answer unchanged, and to
    reply with a JSON object holding ``perturbed_context`` and
    ``perturbed_question``.

    Args:
        context: Passage text inserted after ``**Context**``.
        question: Question text inserted after ``**Question**``.
        answer: Answer text inserted after ``**Answer**``.

    Returns:
        The complete prompt string (it starts with a newline).
    """
    # Doubled braces are literal braces for str.format; only the three named
    # fields below are substituted.
    template = """
You are tasked with perturbing a given context and question while ensuring the answer remains intact. Use the following techniques:

1. **Synonym Replacement**: Replace words in the context and question with appropriate synonyms where possible.
2. **Paraphrasing**: Rephrase sentences in the context and question to make them different but preserve their meaning.

The perturbation must:
- Maintain the overall meaning of the context and question.
- Ensure the answer remains present and unchanged in the context.
- Make the question and context look slightly different from the original.

### Input:
- **Context**: {context}
- **Question**: {question}
- **Answer**: {answer}

### Output Format:
Provide the output in the following json format:
```
{{
  "perturbed_context": "Your perturbed context here",
  "perturbed_question": "Your perturbed question here"
}}
```
"""
    return template.format(context=context, question=question, answer=answer)

def call_gpt4(prompt):
    """Send one prompt to the chat-completions API and decode the JSON reply.

    Args:
        prompt: Text sent as the single user message.

    Returns:
        The value decoded from the reply by ``json.loads``, or ``None`` if
        anything in the request or the decoding raised (the error is printed).
    """
    try:
        user_message = {"role": "user", "content": prompt}
        reply = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[user_message],
            response_format={"type": "json_object"},
        )
        # Only the first choice is used; surrounding whitespace is dropped
        # before decoding.
        raw_text = reply.choices[0].message.content
        return json.loads(raw_text.strip())
    except Exception as e:
        print(f"Error: {e}")
        return None



def process_single_sample(row):
    """Perturb one dataset row and package the result for output.

    Args:
        row: Mapping-like record (e.g. a pandas Series) providing the keys
            ``id``, ``title``, ``context``, ``question``, ``answer`` and
            ``is_impossible``.

    Returns:
        A dict with the original and perturbed context/question plus the
        pass-through fields, or ``None`` when no usable response was obtained.
    """
    context = row['context']
    question = row['question']
    answer = row['answer']

    response = call_gpt4(generate_prompt(context, question, answer))

    # call_gpt4 returns None on failure; an empty object carries no data either.
    if not response:
        print("No response received. Skipping...")
        return None

    # The reply is already-decoded JSON, so nothing below can raise
    # json.JSONDecodeError. What can still go wrong is a non-object payload
    # (e.g. a list), which has no .get(); skip it instead of aborting the run.
    if not isinstance(response, dict):
        print("Response is not a JSON object. Skipping...")
        return None

    return {
        "id": row['id'],
        "title": row['title'],
        "original_context": context,
        # Missing keys fall back to an empty string rather than failing.
        "perturbed_context": response.get("perturbed_context", ""),
        "original_question": question,
        "perturbed_question": response.get("perturbed_question", ""),
        "answer": answer,
        "is_impossible": row['is_impossible']
    }

def process_dataset(input_file, output_file, max_iterations=1000, delay_seconds=1):
    """Perturb rows of a CSV dataset and append the results to an output CSV.

    Rows are read from ``input_file`` and passed one at a time to
    ``process_single_sample``; each successful result is appended to
    ``output_file`` immediately, so partial progress survives an interruption.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to append to. It is created with a
            header row if it does not exist yet.
        max_iterations: Stop after this many rows have been written
            successfully (skipped rows do not count).
        delay_seconds: Pause after each attempted row, in seconds.

    Returns:
        None. A summary line is printed when the loop ends.
    """
    # Field order of the records returned by process_single_sample. The header
    # must use these names, not the input file's columns, otherwise the header
    # does not line up with the appended rows.
    output_columns = [
        "id",
        "title",
        "original_context",
        "perturbed_context",
        "original_question",
        "perturbed_question",
        "answer",
        "is_impossible",
    ]

    data = pd.read_csv(input_file)

    # Write the header only once; later runs append to the existing file.
    if not os.path.exists(output_file):
        pd.DataFrame(columns=output_columns).to_csv(output_file, index=False)

    processed_count = 0

    for _, row in data.iterrows():
        if processed_count >= max_iterations:
            break

        result = process_single_sample(row)

        if result:
            # Pin the column order so every row matches the header.
            pd.DataFrame([result], columns=output_columns).to_csv(
                output_file, mode='a', header=False, index=False
            )
            processed_count += 1

        # Pause between requests.
        time.sleep(delay_seconds)

    print(f"Processed {processed_count} samples. Perturbed data saved to {output_file}")

    

In [23]:
# Source CSV (Kaggle input mount) and the CSV the perturbed rows are appended to.
input_file, output_file = (
    '/kaggle/input/squad-small/smallset.csv',
    "perturbed_dataset_english.csv",
)

# Run the whole batch with the default limits, bracketed by progress messages.
print("Starting batch perturbation process...")
process_dataset(input_file=input_file, output_file=output_file)
print("Batch perturbation complete!")

Starting batch perturbation process...
Processed 1000 samples. Perturbed data saved to perturbed_dataset_english.csv
Batch perturbation complete!
