In [1]:
import json
import os

import pandas as pd
from openai import OpenAI

sample output

In [2]:
# Path to the JSONL dataset file (one JSON entry per line) used by later cells.
jsonl_path = "dataset_demo.jsonl"
# Disabled append: `new_entry` is only defined in a later cell of this notebook.
# with open(jsonl_path, "a", encoding="utf-8") as f:
#     f.write(json.dumps(new_entry, ensure_ascii=False) + "\n")

In [3]:
# with open(jsonl_path, "a", encoding="utf-8") as f:
#     f.write(json.dumps(new_entry, ensure_ascii=False) + "\n")

In [None]:
def pretty_print_jsonl(filepath, filter_id=None):
    """Pretty-print the entries of a JSONL file, optionally only one dialogue.

    Args:
        filepath: Path of the JSONL file to read (UTF-8, one JSON value per line).
        filter_id: When given, only entries whose "dialogue_id" equals this
            value are printed. When None, every entry is printed.

    Returns:
        None. Everything is written to standard output.

    Notes:
        - Blank lines are skipped instead of being reported as decode errors.
        - Lines that are not valid JSON are reported with their line number
          and processing continues with the next line.
        - JSON values that are not objects (e.g. a list) have no dialogue_id;
          they are shown with ID None instead of raising AttributeError.
        - A missing file is reported with a message rather than raising.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            # Count every physical line (blank ones too) so the reported
            # numbers match what an editor shows for the file.
            for line_num, line in enumerate(f, 1):
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Error decoding line {line_num}")
                    continue
                # Only JSON objects can carry a dialogue_id.
                entry_id = data.get("dialogue_id") if isinstance(data, dict) else None
                if filter_id is None or entry_id == filter_id:
                    print(f"\n--- Entry {line_num} (ID: {entry_id}) ---")
                    print(json.dumps(data, indent=2, ensure_ascii=False))
    except FileNotFoundError:
        print(f"File not found: {filepath}")

In [6]:
# Load the JSONL file into a DataFrame
# Load the SNLI training file into a DataFrame; lines=True reads it as JSON Lines
# (one JSON record per line).
train_df = pd.read_json("snli_1.0_train.jsonl", lines=True)
# The dev and test files are not loaded; uncomment to read them the same way.
# dev_df = pd.read_json("snli_1.0_dev.jsonl", lines=True)
# test_df = pd.read_json("snli_1.0_test.jsonl", lines=True)

In [7]:
# Keep only the label column and the two sentence columns.
df = train_df.loc[:, ['gold_label', 'sentence1', 'sentence2']]

In [9]:
# Show ten of the distinct `sentence1` values (positions 70 through 79).
premise_window = df['sentence1'].unique()[70:80]
print(premise_window)

['A man walking proudly down the street.'
 'Four teenage boys are doing cannonballs into a swimming pool.'
 'Three men standing on grass by the water looking at something on a table.'
 'A woman stands behind an outdoor grill with a blue basket of food in her hands.'
 'A Land Rover makes its way through a deep pond.'
 'many people relax in the yard.' 'A car sinking in water.'
 'A man wearing black is playing an electric guitar at a concert.'
 'A man with wild hair rocks a show playing a guitar center stage.'
 'woman and child on trolley car labeled Powell and market and bay and taylor']


In [None]:
# OpenAI API key: read it from the OPENAI_API_KEY environment variable rather
# than hardcoding it, so the secret is never saved or shared with the notebook.
api_key = os.environ.get("OPENAI_API_KEY")

# Initialize OpenAI client. If the variable is unset, api_key is None and the
# client constructor reports the missing key itself.
client = OpenAI(api_key=api_key)

# Build the generation prompt around ONE premise sentence: position 76 of the
# de-duplicated `sentence1` values (the f-string below interpolates sentences[76]).
# The rest of the prompt is fixed instruction text for the dialogue generator.
sentences = df['sentence1'].unique()
prompt = f"""
**Prompt: Structured User-AI Dialogue with Logic Chain and Final Correction**

1. **Context Sentence**:  
   {sentences[76]}

2. **Task**:  
   Create a user-AI conversation based on the context above.

3. **Conversation Requirements**:  
   - Begin with a user asking a question based on the given sentence.  
   - AI's response should be at least 2 sentences.
   - Each subsequent **user query must logically build on the AI's previous answer**, forming a **sequential, coherent logic chain**.  
   - The conversation must contain **exactly 8 full user-AI exchanges** (i.e., 8 user questions followed by 8 AI responses) **before the final correction exchange**.

4. **Correction and Summary Requirement**:  
   - In the **final user query**, the user should **negate or correct** a specific assumption or detail introduced earlier(must be the at 3rd to 6th exchange, not very first or very last).  
   - The user then asks the AI to **re-summarize the entire conversation in a single paragraph**, but from the perspective of the corrected information.  
   - The AI should **not reproduce the original dialogue**, but instead provide a **logically consistent summary** that assumes the correction had been true all along.  
   - this step only generated one signle response to this final user query. Don't seperate the answer and summary into two responses.

5. **Final Output Requirement**:  
   After the summary paragraph, the AI must include a **comprehensive mapping** of how the main topics or assumptions shifted due to the correction detailedly. Use this format:
   Original: [previous idea] → Modified: [new idea]
   Original: [previous idea] → Modified: [new idea]
   ......

6. **Style Notes**:  
- Keep the dialogue clear, natural, and conversational.  
- Ensure each AI response meaningfully contributes to the forward logical progression.  
- Avoid skipping steps or jumping topics too abruptly.
"""

# System message: frames the model as a dialogue generator and states the
# correction/summary/mapping behaviour expected at the end of the dialogue.
system_instructions = (
    "You are a dialogue generation assistant specializing in constructing logically consistent user-AI conversations. "
    "Your task is to generate conversations that demonstrate coherent progression, where each user query builds upon the AI's previous answer. "
    "You must handle corrections or contradictions introduced by the user near the end of the dialogue and respond with a revised summary paragraph that reflects the new logic path. "
    "Finally, clearly show how the main topics or assumptions changed using a concise mapping format. "
    "The tone should remain natural and conversational, but the logical flow must be precise and easy to follow."
)

# The request is the system message followed by the prompt built above.
chat_messages = [
    {"role": "system", "content": system_instructions},
    {"role": "user", "content": prompt},
]

# Send the request to the chat completions endpoint using GPT-4o.
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=chat_messages,
)

# Take the text of the first returned choice and display it.
first_choice = completion.choices[0]
result = first_choice.message.content
print(result)

**User-AI Conversation:**

**User:**  
What should I do if my car starts sinking in water?

**AI:**  
If your car starts sinking in water, the first step is to stay calm and quickly unbuckle your seatbelt. You should then roll down or break the window as soon as possible before the car is fully submerged, as it's difficult to open the doors due to water pressure.

**User:**  
What if the water is entering the car faster than I can roll down the windows?

**AI:**  
In that situation, it's important to have a tool available to break the window. A window-breaking tool can shatter glass quickly, allowing you to escape. This tool should be kept within easy reach at all times when driving near water.

**User:**  
Would it be helpful to call 911 before trying to escape?

**AI:**  
While calling 911 provides essential assistance, the first priority should be getting out of the vehicle. You may not have much time as the car could submerge quickly, and getting to safety first is crucial. If you 

A genuinely *wrong* revised response is one in which the AI acknowledges the user’s updated information but keeps applying the logic, assumptions, or reasoning path of the original context instead of adapting them to the new premise. The result is a logical mismatch between the revised input and the reasoning in the response.

In [None]:
# One hand-written dataset entry. Its fields are:
#   - dialogue_id: integer identifier of the dialogue
#   - original_context: list of {"speaker", "text"} turns before the revision
#   - user_revision: the user turn that changes the premise
#   - correct_revised_response / wrong_revised_response: two candidate AI replies
#   - logical_shift: bullet-style reasoning for the original, correct, and wrong logic
new_entry = {
  "dialogue_id": 31,
  "original_context": [
    {
      "speaker": "User",
      "text": "Have you heard about Nancy Kissel's case? She has always maintained that she killed her investment banker husband in self-defense, yet the jury found her guilty for sedating him with a laced milkshake before bludgeoning him to death."
    },
    {
      "speaker": "AI",
      "text": "Yes, it's been a highly controversial case with both sides presenting strong arguments."
    }
  ],
  "user_revision": {
    "speaker": "User",
    "text": "What if we assume that she was merely reacting in the moment without any premeditation—forced into defending herself against an immediate threat? Do you think the jury would have interpreted the evidence differently?"
  },
  "correct_revised_response": {
    "speaker": "AI",
    "text": "Quite possibly. If the evidence showed that her actions were a spontaneous response to a sudden attack, it might have been seen as legitimate self-defense rather than a premeditated act."
  },
  "wrong_revised_response": {
    "speaker": "AI",
    "text": "Not really. Regardless of whether it was spontaneous or not, the fact that she killed him would still lead the jury to convict her."
  },
  "logical_shift": {
    "original_logic": [
      "Nancy Kissel claimed self-defense",
      "Jury found her guilty based on premeditation (sedation + murder)",
      "Controversy centers on motive and planning"
    ],
    "correct_revised_logic": [
      "Assumes her actions were spontaneous and unplanned",
      "Could legally fit definition of self-defense",
      "Jury might interpret evidence differently under that framing"
    ],
    "wrong_revised_logic": [
      "Dismisses hypothetical change from premeditation to spontaneity",
      "Assumes all killings lead to conviction regardless of motive or context",
      "Ignores how sudden response could meet legal standards for self-defense"
    ]
  }
}


In [19]:
# Location of the JSONL dataset file.
jsonl_path = "dataset_demo.jsonl"

# Append `new_entry` as one JSON line. Note that every run of this cell adds
# another copy of the entry to the file.
with open(jsonl_path, mode="a", encoding="utf-8") as dataset_file:
    print(json.dumps(new_entry, ensure_ascii=False), file=dataset_file)

Load the saved dataset and inspect the first datapoint.

In [21]:
# Parse every line of the JSONL file into its own Python object.
with open(jsonl_path, encoding="utf-8") as dataset_file:
    datapoints = list(map(json.loads, dataset_file))

# Example: look at a few fields of the first datapoint.
first_dp = datapoints[0]
print(first_dp["dialogue_id"])
print(first_dp["original_context"])
original_logic = first_dp["logical_shift"]["original_logic"]
print(original_logic)

0
[{'speaker': 'User', 'text': 'I am a farmer and want to improve my tomato production. My farm is in a wet and humid climate. What should I focus on first?'}, {'speaker': 'AI', 'text': 'In a wet and humid climate, preventing excessive moisture and ensuring proper drainage are key to improving tomato production. Tomatoes are prone to fungal diseases and root rot when soil retains too much water. It’s crucial to have well-draining soil and proper air circulation. Have you assessed how well your soil drains?'}, {'speaker': 'User', 'text': "I haven't checked. How do I know if my soil drains well enough?"}, {'speaker': 'AI', 'text': 'You can test soil drainage by digging a hole about 30 cm (12 inches) deep, filling it with water, and observing how long it takes to drain. If it takes more than 4-6 hours, your soil has poor drainage. To improve it, you can add sand, compost, or organic matter. Would you like recommendations for tomato varieties that resist fungal diseases in humid conditions

In [22]:
# Read from the dataset file named by `jsonl_path`; the converted copy goes to
# a separate file so the original is left untouched.
input_path = jsonl_path
output_path = "converted_with_system.jsonl"

def convert_ai_to_system(data):
    """Relabel every "AI" speaker in a dataset entry as "System".

    The entry is modified in place and the same object is returned. Three
    places are checked: each turn of "original_context", and the single turns
    stored under "correct_revised_response" and "wrong_revised_response".
    Missing keys are ignored; speakers other than "AI" are left unchanged.
    """
    # Turns of the original conversation.
    for context_turn in data.get("original_context", []):
        if context_turn.get("speaker") == "AI":
            context_turn["speaker"] = "System"

    # The two candidate replies share the same shape, so handle them alike.
    for response_key in ("correct_revised_response", "wrong_revised_response"):
        response = data.get(response_key, {})
        if response.get("speaker") == "AI":
            response["speaker"] = "System"

    return data

# Stream the input file line by line, relabel each entry, and write the result
# as one JSON line per entry to the output file (overwriting it if present).
with open(input_path, mode="r", encoding="utf-8") as source_file:
    with open(output_path, mode="w", encoding="utf-8") as target_file:
        for raw_line in source_file:
            converted = convert_ai_to_system(json.loads(raw_line))
            print(json.dumps(converted, ensure_ascii=False), file=target_file)

print(f"✅ Conversion complete. Saved to: {output_path}")

✅ Conversion complete. Saved to: converted_with_system.jsonl


In [None]:
# Source: a single JSON document; destination: a JSONL file that is appended to.
teammate_json_path = "final_dataset2.json"
your_jsonl_path = "final_data.jsonl"

# Read the whole JSON document at once.
with open(teammate_json_path, encoding="utf-8") as json_file:
    teammate_data = json.load(json_file)

# Append each item of the loaded data as its own JSON line.
with open(your_jsonl_path, mode="a", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(entry, ensure_ascii=False) + "\n" for entry in teammate_data
    )