In [1]:
import json
import matplotlib.pyplot as plt
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from openai import OpenAI
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Path to your JSONL file
file_path = "final_data_reindexed.jsonl"

# List to store the number of exchanges per dialogue
exchange_counts = []
dialogues = []
# Read JSONL file and count exchanges
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        dialogue = json.loads(line)
        num_exchanges = len(dialogue.get("original_context", []))
        exchange_counts.append(num_exchanges/2)

# Count frequency of each exchange count
count_distribution = Counter(exchange_counts[2:])

In [3]:
count_distribution

Counter({3.0: 15,
         6.0: 11,
         7.0: 9,
         1.0: 9,
         4.0: 8,
         5.0: 4,
         8.0: 2,
         2.0: 1,
         10.0: 1})

In [4]:
temp_path = "llama2_generated_responses.jsonl"

In [5]:
# # Plot histogram
# plt.figure(figsize=(10, 6))
# plt.bar(count_distribution.keys(), count_distribution.values(), width=0.6)
# plt.xlabel("Number of Exchanges in Original Context")
# plt.ylabel("Number of Dialogues")
# plt.title("Histogram of Exchange Counts in Dialogues")
# plt.xticks(sorted(count_distribution.keys()))
# plt.grid(axis="y")
# plt.tight_layout()
# plt.show()

In [6]:
# Load model
model_name = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
def generate_response(datapoint, max_new_tokens=200):
    dialogue_str = "\n".join([f"{turn['speaker']}: {turn['text']}" for turn in datapoint['original_context']])

    # prompt = (
    #     # f"Assume this was our past conversation:\n{dialogue_str}\n\n"
    #     # f"You are the 'System' and I am the 'User'. Now here is my next query right after the past conversation:\n"
    #     # f"{datapoint['user_revision']['text']}\n\nSystem: Please respond with a full step-by-step answer tailored to the revised context."
    #     f"Here is our past conversation:\n{dialogue_str}\n\n"
    #     f"You are the 'System' and I am the 'User'. Now here is my next message in the conversation:\n"
    #     f"{datapoint['user_revision']['text']}\n\n"
    #     f"please answer this user query, if you are not sure what to answer, continue the conversation with this user query, don't give answer of 'System:' or 'User:' \n\n"
    # )
    prompt = (
    f"You are a helpful system continuing a conversation with a user.\n\n"
    f"Here is the prior dialogue:\n{dialogue_str}\n\n"
    f"The user now says:\n\"{datapoint['user_revision']['text']}\"\n\n"
    f"Please respond meaningfully to the user's new message, based on the earlier context. "
    f"Write only your reply—do not prefix with 'System:' or 'User:'."
)
    #print(prompt)
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
api_key = "xxx"

# Initialize OpenAI client
client = OpenAI(api_key=api_key)  

In [9]:
with open("sample.txt", "r", encoding="utf-8") as f:
    sample_block = f.read()

In [10]:
def logic_extraction(response):
   print("checker1")
   prompt =  f"""
   Given the following system response, extract the underlying logical points in a list format. Each point should reflect a distinct idea or reasoning step present in the response.

   System Response:
   {response}
   Output the logic points in a bullet list format. Each item should be a standalone logical statement.
   """

   # Call OpenAI API using the updated format
   completion = client.chat.completions.create(
      model="gpt-4o",
      messages=[
         {"role": "system", "content": (
               "You are a dialogue evaluation assistant specializing in extracting logical reasoning from system responses. Focus on capturing distinct logical points in list form."
         )},
         {"role": "user", "content": prompt}
      ]
   )
   
   # Extract and print response
   result = completion.choices[0].message.content
   print(f"extracted logic points: {result}\n")
   return result

In [11]:
def evaluate_t5_response(datapoint, t5_response):
    print("checker2")
    dialogue_str = "\n".join([f"{turn['speaker']}: {turn['text']}" for turn in datapoint['original_context']])
    evaluation_prompt = f"""
    You are evaluating how well a generated system response adapts to a user's revised query and revised logic chain in the context of a past conversation.

    Here is the original conversation:
    {dialogue_str}

    Here is the user's revision:
    {datapoint['user_revision']['text']}

    Here is the T5-generated system response:
    {t5_response}

    Here is the T5-generated system response's logic points:
    {logic_extraction(t5_response)}
    
    Here is the original conversation's logic points:
    {datapoint['logical_shift']['original_logic']}

    Here is the sample correct revised system response:
    {datapoint['correct_revised_response']['text']}

    Here is the sample correct revised system response's logic points:
    {datapoint['logical_shift']['correct_revised_logic']}

    Here is the sample wrong revised system response:
    {datapoint['wrong_revised_response']['text']}

    Here is the sample wrong revised system response's logic points:
    {datapoint['logical_shift']['wrong_revised_logic']}

    Your task: Using the dialogues and sample correct/wrong answers as reference, evalute and determine whether the logic chain of T5-generated response adapts to the user's revision and the new logic based on such revision. Does it reflect the updated logic properly? Is it closer to the correct or incorrect version? The common wrong answer is either failing to update or using updated infomariotn in initial logic chain (causing contradiction).

    Return only a decimal score between 0 and 1. The score is:
    (number of aligned logic points) / (total number of extracted logic points). Then briefly justify your score in 1–2 sentences.

    Your task: Using the dialogue and the sample correct and incorrect responses as reference, evaluate whether the logic chain in the T5-generated system response adapts to the user's revised query appropriately. 

    A correct response should update the logic based on the new user context without contradictions. Common errors include failing to update the logic or inserting the revised information into the original logic without adjustment, which leads to inconsistencies.
    You can use {sample_block} as an example for evalution.

    Return only a decimal score between 0 and 1, representing:
    (number of logically aligned points) / (total number of extracted logic points)

    If the T5-generated system response has no meaning like just "System:", return -1.
    Then briefly justify your score in 1–2 sentences.
    """

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": (
                "You are an evaluation assistant trained to assess whether a generated dialogue response reflects a logically correct update based on a user's modified query."
            )},
            {"role": "user", "content": evaluation_prompt}
        ]
    )

    result = completion.choices[0].message.content.strip()
    return result


In [12]:
def extract_score_via_gpt(evaluation_output, client, model="gpt-4o"):
    """
    Uses GPT to extract only the decimal score from a full evaluation output.
    
    Args:
        evaluation_output (str): The full evaluation text (score + explanation).
        client: OpenAI client instance.
        model (str): The OpenAI model name.
        
    Returns:
        str: The score string, like "0.25"
    """
    system_prompt = "You are an assistant that extracts evaluation scores from the evaluation analysis."
    user_prompt = f"""Here is the evaluation output:
---
{evaluation_output}
---

Your task is to extract and return ONLY the numeric score from the text above. 
The score is a decimal between 0 and 1. Do not return anything else—no punctuation, no explanation, just the number.
"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    
    return response.choices[0].message.content.strip()

evalution

In [13]:
file_path = "final_data_reindexed.jsonl"
data = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))


# temp_path = "llama2_generated_responses.jsonl"
# temp_data = []
# with open(temp_path, "r", encoding="utf-8") as f:
#     for line in f:
#         temp_data.append(json.loads(line))


In [14]:
# scores = []
# responses = []
# for i, datapoint in enumerate(data[2:]):
#     print(f"Processing datapoint {i+1}/{len(data[2:])}")
#     t5_response = generate_response(datapoint)
#     responses.append(t5_response)
#     evaluation_result = evaluate_t5_response(datapoint, t5_response)
#     print(evaluation_result)
#     final_score = extract_score_via_gpt(evaluation_result, client)
#     print(final_score)
#     print("---------------------------------------")
#     scores.append(final_score)

In [15]:
# float_scores = [float(score) for score in scores]
# np.array(float_scores).mean()

In [16]:
# df1 = pd.DataFrame({
#         "dialogue index": [i + 2 for i in range(len(scores))],
#         "test_score1": scores
#     })

In [17]:
df1 = pd.read_csv("evaluation_scores.csv")
# adjusted_counts = exchange_counts[2:]

# # Add as new column
# df1["exchange_count"] = adjusted_counts

# # Define a function to categorize based on exchange count
# def categorize_length(count):
#     if count <= 3:
#         return "short"
#     elif 4 <= count <= 6:
#         return "moderate"
#     else:
#         return "long"

# # Apply it to create the new column
# df1["dialogue_length"] = df1["exchange_count"].apply(categorize_length)
# df1.head()

In [18]:
# df1["test_score3"] = scores

In [19]:
df1.head()

Unnamed: 0,dialogue index,test_score1,exchange_count,dialogue_length,test_score2,test_score3
0,2,0.0,4.0,moderate,0.0,0.0
1,3,0.33,4.0,moderate,0.33,0.5
2,4,0.4,4.0,moderate,0.4,0.25
3,5,0.75,4.0,moderate,0.25,0.5
4,6,0.25,6.0,moderate,0.25,0.25


In [20]:
# df1.to_csv("evaluation_scores.csv", index=False)

fine tuning

In [21]:
train_data, test_data = train_test_split(
    data[2:],
    test_size=20,
    stratify=df1["dialogue_length"],
    random_state=42
)

In [22]:
test_ids = [d["dialogue_id"] for d in test_data]

df_test = df1[df1["dialogue index"].isin(test_ids)]
df_test.head()

Unnamed: 0,dialogue index,test_score1,exchange_count,dialogue_length,test_score2,test_score3
1,3,0.33,4.0,moderate,0.33,0.5
2,4,0.4,4.0,moderate,0.4,0.25
5,7,0.43,6.0,moderate,0.0,0.33
10,12,0.0,7.0,long,0.0,0.0
11,13,0.0,7.0,long,-1.0,-1.0


In [24]:
train_pairs = [
    {
        "input": (
            f"You are a helpful system continuing a conversation with a user.\n\n"
            f"Here is the prior dialogue:\n{"\n".join([f"{turn['speaker']}: {turn['text']}" for turn in item['original_context']])}\n\n"
            f"The user now says:\n\"{item['user_revision']['text']}\"\n\n"
            f"Please respond meaningfully to the user's new message, based on the earlier context. "
            f"Write only your reply—do not prefix with 'System:' or 'User:'."
        ),
        "target": item['correct_revised_response']['text']
    }
    for item in train_data
]

from datasets import Dataset

train_dataset = Dataset.from_list(train_pairs)
train_dataset

Dataset({
    features: ['input', 'target'],
    num_rows: 40
})

In [25]:
train_pairs[0]

{'input': 'You are a helpful system continuing a conversation with a user.\n\nHere is the prior dialogue:\nUser: Hi, I’m curious—what does it mean when someone says \'receptionist fries cake\' in a kitchen setting?\nSystem: Great question! In culinary terms, \'receptionist fries cake\' can be a creative or metaphorical way of describing an action. For example, a receptionist might fries cake as part of food preparation or presentation.\nUser: So when a receptionist friess cake, is that a standard step in a recipe?\nSystem: Sometimes, yes. For instance, chefs may use that expression to describe specific operations in a dish, such as how a receptionist contributes to handling cake in the kitchen.\nUser: Would that affect the flavor or texture of the cake in any way?\nSystem: It could. The way a receptionist friess cake might influence its consistency, taste, or even plating aesthetics.\n\nThe user now says:\n"Wait a second—why is the receptionist frying cake? Isn’t that more of a kitchen

In [26]:
def preprocess(example):
    model_inputs = tokenizer(
        example["input"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        example["target"],
        max_length=256,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(preprocess, batched=False)

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [27]:
from transformers import Seq2SeqTrainingArguments
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True  # this line enables CPU offload
)

model_tune = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,  # e.g., "google/flan-t5-xl"
    quantization_config=bnb_config,
    device_map="auto"
)

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model_tune = prepare_model_for_kbit_training(model_tune)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],  # you can also add "k", "o" optionally
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model_tune = get_peft_model(model_tune, lora_config)

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-finetuned-conversations",
    learning_rate=2e-5,
    per_device_train_batch_size=6,               # fits well in 12GB
    gradient_accumulation_steps=1,               # effective batch size = 4
    num_train_epochs=20,                          # small dataset, more epochs helps
    logging_steps=5,
    save_steps=20,
    evaluation_strategy="no",
    fp16=True,                                   # take advantage of 4070's FP16
    save_total_limit=1
)

from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

trainer = Seq2SeqTrainer(
    model=model_tune,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model_tune)
)

trainer.train()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
5,31.0486
10,29.0349
15,29.9057
20,28.687
25,29.5102
30,31.4157
35,28.7409
40,29.2703
45,28.6855
50,32.0288


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=140, training_loss=29.786612374441965, metrics={'train_runtime': 347.8474, 'train_samples_per_second': 2.3, 'train_steps_per_second': 0.402, 'total_flos': 6865045526937600.0, 'train_loss': 29.786612374441965, 'epoch': 20.0})

In [28]:
from torch import no_grad

def generate_response_tuned(datapoint, model, tokenizer, max_new_tokens=200):
    model.eval()  # ✅ set model to eval mode
    dialogue_str = "\n".join([f"{turn['speaker']}: {turn['text']}" for turn in datapoint['original_context']])

    prompt = (
        f"You are a helpful system continuing a conversation with a user.\n\n"
        f"Here is the prior dialogue:\n{dialogue_str}\n\n"
        f"The user now says:\n\"{datapoint['user_revision']['text']}\"\n\n"
        f"Please respond meaningfully to the user's new message, based on the earlier context. "
        f"Write only your reply—do not prefix with 'System:' or 'User:'."
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with no_grad():  # ✅ disable gradients during inference
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [30]:
tuned_scores = []
tuned_responses = []

# Filter to only include datapoints whose dialogue_id is in test_ids
test_datapoints = [d for d in data if d["dialogue_id"] in test_ids]

for i, datapoint in enumerate(test_datapoints):
    print(f"Processing test datapoint {i+1}/{len(test_datapoints)}")
    
    tuned_t5_response = generate_response_tuned(datapoint, model_tune, tokenizer)  # ← using your fine-tuned model now
    tuned_responses.append(tuned_t5_response)
    print(tuned_t5_response)
    
    evaluation_result = evaluate_t5_response(datapoint, tuned_t5_response)
    print(evaluation_result)
    
    final_score = extract_score_via_gpt(evaluation_result, client)
    print(final_score)
    print("---------------------------------------")
    
    tuned_scores.append(final_score)

Processing test datapoint 1/20




System: The boy is skateboarding on a red bridge, which suggests he's in an urban environment that might be iconic or picturesque due to the distinct color of the bridge.
checker2
checker1
extracted logic points: - The presence of a red bridge suggests the boy is skateboarding in an urban environment.
- The distinct color of the red bridge implies that the environment might be iconic or picturesque.

0.5

The T5-generated response mirrors two points from the old logic chain of the original conversation: the boy is in an urban environment with a bridge of notable red color. However, it fails to incorporate the crucial update that the bridge is part of a major highway, which should replace pedestrian-focused elements of the initial narrative. The response is incomplete regarding the user's revision, though it does not contain any direct contradictions.
0.5
---------------------------------------
Processing test datapoint 2/20
System: The employees, with their bright shirts and smiles, li

Token indices sequence length is longer than the specified maximum sequence length for this model (639 > 512). Running this sequence through the model will result in indexing errors


-1
---------------------------------------
Processing test datapoint 5/20
System: I've updated the text to reflect this.
checker2
checker1
extracted logic points: - The system has made changes to the text.
- The update ensures that the changes are reflected or acknowledged in the text.

0.2

The T5-generated response simply states that the text has been updated but does not address any specific logic point related to the user's revision about instrumental music replacing vocal performances. This response lacks meaningful content regarding the adaptation of the discussion to incorporate the user's updated context. The evaluation lacks alignment with any updated logic points.
0.2
---------------------------------------
Processing test datapoint 6/20
System: Bike boxes are a great way for cyclists to cross intersections without waiting for a light. They are designed to streamline bicycle traffic and enhance safety by separating it from vehicle flow.
checker2
checker1
extracted logic point

In [31]:
original_scores = df_test["test_score1"].mean()
print(f"Original scores: {original_scores}")
float_tuned_scores = [float(score) for score in tuned_scores]
tuned_scores_mean = np.array(float_tuned_scores).mean()
print(f"Tuned scores: {tuned_scores_mean}")

Original scores: 0.2675
Tuned scores: 0.211835


In [32]:
# df2 = pd.DataFrame({
#     "dialogue index": [item["dialogue_id"] for item in test_datapoints],
#     "test_score1": tuned_scores
# })
df3 = pd.read_csv("tuned_evaluation_scores.csv")
# df3["test_score7"] = tuned_scores
# df3["tuned_responses7"] = tuned_responses
# df3.to_csv("tuned_evaluation_scores.csv", index=False)
# df2[df2["test_score1"]!= "-1"]["test_score1"].astype(float).mean()
df3


Unnamed: 0,dialogue index,test_score1,test_score2,tuned_responses,test_score3,test_score4,test_score6,tuned_responses6,test_score7,tuned_responses7
0,3,0.5,0.5,System: The boy is skateboarding on a red brid...,0.5,0.5,0.333,System: The boy is skateboarding on a red brid...,0.4,System: The boy is skateboarding on a red brid...
1,4,0.6,0.6,"System: The employees, with their bright shirt...",0.4,0.4,0.75,"System: The employees, with their bright shirt...",0.25,"System: The employees, with their bright shirt..."
2,7,0.5,0.5,System: The school is organizing a multicultur...,0.43,0.4,0.33,System: The school is organizing a multicultur...,0.43,System: The school is organizing a multicultur...
3,12,0.0,0.0,"System: In areas with severe weather, busing o...",0.0,0.0,0.0,"System: In areas with severe weather, busing o...",-1.0,"System: In areas with severe weather, busing o..."
4,13,0.0,0.0,System: I've updated the text to reflect this.,0.2,0.0,0.0,System: I've updated the text to reflect this.,0.0,System: I've updated the text to reflect this.
5,14,0.0,0.0,System: Bike boxes are a great way for cyclist...,0.0,0.0,0.0,System: Bike boxes are a great way for cyclist...,0.25,System: Bike boxes are a great way for cyclist...
6,19,0.0,0.0,"System: Yes, well-maintained gardens often att...",0.0,0.0,0.0,"System: Yes, well-maintained gardens often att...",0.0,"System: Yes, well-maintained gardens often att..."
7,21,0.25,0.25,System: If your car windows are made of lamina...,0.33,0.25,0.0,System: If your car windows are made of lamina...,0.33,System: If your car windows are made of lamina...
8,24,0.25,0.25,They would have been able to meet Aquash at th...,0.0,0.0,0.0,They would have been able to meet Aquash at th...,0.0,They would have been able to meet Aquash at th...
9,26,0.0,0.0,He made an emotional plea for her acquittal as...,0.0,0.17,0.25,He made an emotional plea for her acquittal as...,0.0,He made an emotional plea for her acquittal as...
