In [2]:
!pip install transformers rouge-score nltk sentence-transformers torch


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_

In [8]:
import pandas as pd
import json
from collections import defaultdict

file_path = "final_labels.csv"
df = pd.read_csv(file_path, sep=",", encoding="utf-8", quoting=3, on_bad_lines="skip")

def resolve_duplicates(group):
    return group.sort_values(by=["strength"], ascending=False).iloc[0]

df = df.groupby("entry_id", as_index=False).apply(resolve_duplicates)

df["body"].fillna("[deleted]", inplace=True)
df.dropna(subset=["entry_utc", "entry_id", "link_id"], inplace=True)
df["parent_id"].fillna("root", inplace=True)

df["body"] = df["body"].str.replace(r'[^\x00-\x7F]+', ' ', regex=True)

entry_dict = df.set_index("entry_id").to_dict(orient="index")
threads = defaultdict(list)

for entry_id, details in entry_dict.items():
    parent_id = details["parent_id"]
    if parent_id == "root" or parent_id not in entry_dict:
        threads[entry_id] = []
    else:
        threads[parent_id].append(entry_id)

for parent_id in threads:
    threads[parent_id].sort(key=lambda child_id: entry_dict[child_id]["entry_utc"])

def reconstruct_conversation(entry_id, depth=0):
    if entry_id not in entry_dict:
        return ""

    comment = entry_dict[entry_id]
    indent = "  " * depth

    conversation = f"{indent}[{comment['author']}] {comment['body']} ({comment['entry_utc']})\n"

    for child_id in threads.get(entry_id, []):
        conversation += reconstruct_conversation(child_id, depth + 1)

    return conversation

full_conversations = [
    reconstruct_conversation(entry_id)
    for entry_id in threads if entry_id in entry_dict and (entry_dict[entry_id]["parent_id"] == "root" or entry_dict[entry_id]["parent_id"] not in entry_dict)
]

with open("reconstructed_threads.json", "w", encoding="utf-8") as f:
    json.dump(full_conversations, f, indent=4)

from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
conversation_summaries = []

full_conversations = [conv for conv in full_conversations if len(conv.strip()) > 100]

for conversation in full_conversations:
    summary = summarizer(conversation, min_length=10, do_sample=False)
    conversation_summaries.append({"text": conversation, "summary": summary[0]['summary_text']})

with open("conversation_summaries.json", "w", encoding="utf-8") as f:
    json.dump(conversation_summaries, f, indent=4)

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util
import torch

gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
similarity_model = SentenceTransformer("all-MiniLM-L6-v2")
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def calculate_perplexity(text, epsilon=1e-10):
    tokens = gpt2_tokenizer.encode(text, return_tensors="pt")

    if tokens.shape[1] == 0:
        return float('inf')

    with torch.no_grad():
        output = gpt2_model(tokens, labels=tokens)
        loss = output.loss
        smoothed_loss = torch.clamp(loss, min=0.01) + epsilon

    return torch.exp(smoothed_loss).item()


results = []

for item in conversation_summaries:
    text = item["text"]
    summary = item["summary"]

    reference = [text.split()]
    candidate = summary.split()
    bleu_score = sentence_bleu(reference, candidate)

    rouge_score = scorer.score(text, summary)

    perplexity = calculate_perplexity(summary)

    embedding1 = similarity_model.encode(text, convert_to_tensor=True)
    embedding2 = similarity_model.encode(summary, convert_to_tensor=True)
    semantic_similarity = util.pytorch_cos_sim(embedding1, embedding2).item()

    results.append({
        "text": text,
        "summary": summary,
        "BLEU": bleu_score,
        "ROUGE": rouge_score,
        "Perplexity": perplexity,
        "Semantic Similarity": semantic_similarity
    })

with open("evaluation_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)

print("Evaluation complete. Results saved to evaluation_results.json")


  df = df.groupby("entry_id", as_index=False).apply(resolve_duplicates)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["body"].fillna("[deleted]", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["parent_id"].fillna("root", inplace=True)
Device set to use cuda:0
Your max_length is set to 142, but your input

Evaluation complete. Results saved to evaluation_results.json


In [9]:
import json
import numpy as np

with open("evaluation_results.json", "r", encoding="utf-8") as f:
    results = json.load(f)

def extract_scores(results, key):
    scores = []
    for item in results:
        value = item.get("ROUGE", {}).get(key, [])

        if isinstance(value, list) and len(value) > 0:
            scores.append(np.mean(value))
        else:
            scores.append(0)

    return scores


bleu_scores = [item.get("BLEU", 0) for item in results]
perplexities = [item.get("Perplexity", 0) for item in results]
semantic_similarities = [item.get("Semantic Similarity", 0) for item in results]
rouge1_scores = extract_scores(results, "rouge1")
rouge2_scores = extract_scores(results, "rouge2")
rougeL_scores = extract_scores(results, "rougeL")

average_metrics = {
    "Average BLEU": np.mean(bleu_scores),
    "Average Perplexity": np.mean(perplexities),
    "Average Semantic Similarity": np.mean(semantic_similarities),
    "Average ROUGE-1": np.mean(rouge1_scores),
    "Average ROUGE-2": np.mean(rouge2_scores),
    "Average ROUGE-L": np.mean(rougeL_scores)
}

with open("summary_matrix.json", "w", encoding="utf-8") as f:
    json.dump(average_metrics, f, indent=4)

print("Summary matrix saved to summary_matrix.json")

Summary matrix saved to summary_matrix.json


In [10]:
import shutil
from google.colab import files  # If using Google Colab

# List of all generated files
file_names = [
    "evaluation_results.json",
    "summary_matrix.json",
    "conversation_summaries.json",
    "reconstructed_threads.json"
]

# Copy and download each file
for file_name in file_names:
    shutil.copy(file_name, f"{file_name}.download")
    files.download(file_name)  # Triggers download

print("All files are ready for download.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

All files are ready for download.
