In [32]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer
)


In [33]:
train_df = pd.read_csv(r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\pulse-quest-env26\samsum_train.csv")   # id, dialogue, summary
test_df  = pd.read_csv(r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\pulse-quest-env26\samsum_test.csv")    # id, dialogue


In [34]:
train_df["dialogue"] = train_df["dialogue"].astype(str)
train_df["summary"]  = train_df["summary"].astype(str)

train_df = train_df.dropna(subset=["dialogue", "summary"])
train_df = train_df[train_df["dialogue"].str.strip() != ""]
train_df = train_df[train_df["summary"].str.strip() != ""]


In [35]:
train_dataset = Dataset.from_pandas(train_df)


In [36]:
model_name = "facebook/bart-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [37]:
def preprocess(batch):
    inputs = tokenizer(
        batch["dialogue"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        text_target=batch["summary"],
        max_length=64,
        truncation=True,
        padding="max_length"
    )

    inputs["labels"] = labels["input_ids"]
    return inputs


In [38]:
tokenized_train = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names,
    load_from_cache_file=False
)


Map:   0%|          | 0/10312 [00:00<?, ? examples/s]

In [39]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    logging_steps=100,
    save_total_limit=2,
    report_to=[],
    push_to_hub=False
)


In [40]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
100,4.6669
200,1.122
300,0.936
400,0.9766
500,0.9176
600,0.9487
700,0.8665
800,0.9731
900,0.8672
1000,0.9586




TrainOutput(global_step=15468, training_loss=0.7135738331370801, metrics={'train_runtime': 3249.4819, 'train_samples_per_second': 9.52, 'train_steps_per_second': 4.76, 'total_flos': 9431403536056320.0, 'train_loss': 0.7135738331370801, 'epoch': 3.0})

In [41]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 1️⃣ Load test CSV
test_df = pd.read_csv(r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\pulse-quest-env26\samsum_test.csv")  # contains 'id' and 'dialogue'

# 2️⃣ Load trained model and tokenizer from checkpoint
checkpoint_path = r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\Raj\results\checkpoint-15468"  # replace xxxx with your latest checkpoint number
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
model.eval()  # set to evaluation mode

# 3️⃣ Generate summaries
summaries = []

for text in test_df["dialogue"]:
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )

    # move inputs to GPU if available
    input_ids = inputs["input_ids"].to(model.device)
    
    # generate summary
    outputs = model.generate(
        input_ids,
        num_beams=4,
        max_length=60,
        early_stopping=True
    )

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    summaries.append(summary)

# 4️⃣ Save predictions to CSV
test_df["summary"] = summaries
test_df.to_csv("test_predictions.csv", index=False)

print("✅ Summaries generated and saved to test_predictions.csv")


✅ Summaries generated and saved to test_predictions.csv


In [44]:
import pandas as pd
import re

test_df = pd.read_csv(r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\pulse-quest-env26\samsum_test.csv")             # original dialogues
pred_df = pd.read_csv(r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\Raj\test_predictions.csv") # generated summaries


In [45]:
def get_mapping(dialogue_text):
    """
    Maps placeholders like A, B in summary to actual names in dialogue
    """
    # extract speaker placeholders in dialogue (assume format "A:", "B:", etc.)
    placeholders = re.findall(r'(\b[A-Z]\b):', dialogue_text)
    placeholders = list(dict.fromkeys(placeholders))  # remove duplicates, keep order
    
    # extract actual names (if real names are used, otherwise use same placeholders)
    # For example, if dialogue already has actual names, you can replace this step
    mapping = {}
    for i, sp in enumerate(placeholders):
        mapping[sp] = sp  # or map to actual names if known
    return mapping

# Apply mapping to all test dialogues
test_df['mapping'] = test_df['dialogue'].apply(get_mapping)


In [46]:
final_summaries = []

for summary, mapping in zip(pred_df['summary'], test_df['mapping']):
    for placeholder, actual in mapping.items():
        summary = summary.replace(placeholder, actual)
    final_summaries.append(summary)

pred_df['summary'] = final_summaries
pred_df.to_csv("test_predictions_mapped.csv", index=False)

print("✅ Final summaries saved with proper mapping")


✅ Final summaries saved with proper mapping


In [49]:
import pandas as pd
import re

# Load test dialogues and generated summaries
test_df = pd.read_csv(r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\pulse-quest-env26\samsum_test.csv")          # has 'dialogue' with real names
pred_df = pd.read_csv("test_predictions.csv")  # has 'summary' with placeholders A/B

def get_name_mapping(dialogue):
    """
    Maps placeholder A/B in summary to actual speaker names in the dialogue
    Returns a dict: {'A': 'Alice', 'B': 'Bob'}
    """
    # Find all placeholders A, B, C etc. in dialogue
    placeholders = re.findall(r'(\b[A-Z]\b):', dialogue)  # e.g., ['A', 'B']
    
    # Find all actual names in dialogue (text before ':')
    names = re.findall(r'([A-Za-z]+):', dialogue)         # e.g., ['Alice', 'Bob']
    
    mapping = dict(zip(placeholders, names))
    return mapping

# Store mapping for each dialogue
test_df['mapping'] = test_df['dialogue'].apply(get_name_mapping)


In [50]:
final_summaries = []

for summary, mapping in zip(pred_df['summary'], test_df['mapping']):
    for placeholder, real_name in mapping.items():
        summary = summary.replace(placeholder, real_name)
    final_summaries.append(summary)

pred_df['summary'] = final_summaries
pred_df.to_csv("test_predictions_mapped.csv", index=False)
print("✅ Summaries saved with real names instead of A/B")


✅ Summaries saved with real names instead of A/B


In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate

# 1️⃣ Load validation data
val_df = pd.read_csv(r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\samsum_data\validation\validation.csv")  # must have columns: 'dialogue', 'summary' (reference)

# 2️⃣ Load trained model checkpoint
checkpoint_path = r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\Raj\results\checkpoint-15468"  # replace with your checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 3️⃣ Generate summaries for validation dialogues
generated_summaries = []

for dialogue in val_df['dialogue']:
    inputs = tokenizer(dialogue, return_tensors="pt", truncation=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)

    outputs = model.generate(
        input_ids,
        num_beams=4,
        max_length=60,
        early_stopping=True
    )

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_summaries.append(summary)

# 4️⃣ Compute ROUGE metrics using evaluate
references = val_df['summary'].tolist()
predictions = generated_summaries

rouge = evaluate.load("rouge")
results = rouge.compute(predictions=predictions, references=references)

# 5️⃣ Print ROUGE scores
print("✅ ROUGE scores on validation set:\n")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

# 6️⃣ Optional: save generated summaries
val_df['predicted_summary'] = generated_summaries
val_df.to_csv("validation_predictions.csv", index=False)
print("✅ Generated summaries saved to validation_predictions.csv")


  if not hasattr(np, "object"):


Downloading builder script: 0.00B [00:00, ?B/s]

✅ ROUGE scores on validation set:

rouge1: 0.4550
rouge2: 0.2375
rougeL: 0.3841
rougeLsum: 0.3841
✅ Generated summaries saved to validation_predictions.csv


In [3]:
import pandas as pd
import re

df = pd.read_csv(r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\Raj\test_predictions_mapped.csv")   # id, dialogue, summary


In [4]:
def extract_name_mapping(dialogue):
    # Extract speaker names in order
    names = re.findall(r'^(\w+):', dialogue, flags=re.MULTILINE)

    mapping = {}
    if len(names) >= 1:
        mapping['A'] = names[0]
    if len(names) >= 2:
        mapping['B'] = names[1]

    return mapping


In [5]:
def fix_summary(row):
    summary = row['summary']
    dialogue = row['dialogue']

    mapping = extract_name_mapping(dialogue)

    for k, v in mapping.items():
        summary = summary.replace(k, v)

    return summary

df['summary'] = df.apply(fix_summary, axis=1)


In [6]:
df.to_csv("prediction_fixed.csv", index=False)
print("✅ Names restored successfully")


✅ Names restored successfully


In [7]:
import re

def get_action_speaker(dialogue):
    lines = dialogue.split("\n")
    for line in lines:
        if ":" in line:
            speaker, text = line.split(":", 1)
            text = text.lower()
            if any(p in text for p in ["i want", "i will", "i plan", "i am going to"]):
                return speaker.strip()
    return None


In [8]:
def fix_summary(dialogue, summary):
    speaker = get_action_speaker(dialogue)
    if not speaker:
        return summary

    # replace first name in summary with correct speaker
    words = summary.split()
    words[0] = speaker
    return " ".join(words)


In [9]:
import pandas as pd

df = pd.read_csv(r"C:\Users\rajdw\OneDrive\Desktop\PULSE QUEST\Raj\test_predictions_mapped.csv")

df["fixed_summary"] = df.apply(
    lambda x: fix_summary(x["dialogue"], x["summary"]),
    axis=1
)

df.to_csv("prediction_fixed.csv", index=False)


In [10]:
import pandas as pd

df = pd.read_csv("prediction_fixed.csv")

df = df.drop(columns=["dialogue", "summary"])   # remove 2 columns

df.to_csv("final_submission.csv", index=False)
