In [None]:
#### Updated/reduced category names, test case names and numbers; added retries, sleep time, and file logging

In [None]:
# -------------------------------
# Base GT - ChatGPT
# -------------------------------

import os
import glob
import csv
import openai
import json
import sys
import time

# NOTE(review): the key is a hard-coded placeholder; supply the real key without committing it to the notebook.
openai.api_key = "placeholder"  # Replace with your actual API key

MAX_RETRIES = 3  # Total API attempts per file (the first call counts as attempt 1)

def get_gpt4_response(prompt, model="gpt-4o", temperature=1, max_tokens=10000):
    """
    Send ``prompt`` as a single user message and return the reply text.

    The request carries a fixed system message ("You are a helpful assistant.")
    followed by the user prompt. The text of the first returned choice is
    stripped of surrounding whitespace before it is returned. Any exception
    raised by the client call or by reading the response propagates to the caller.

    NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface - confirm
    the installed openai version still provides it.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model=model,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

def process_csv_files(folder_path, output_json_file):
    """
    Process all CSV files in a folder and checkpoint the GPT-4o output as JSON.

    For each CSV (sorted by path and numbered from 1):
      1) Read the "Description" column in order.
      2) Construct a single prompt.
      3) Call GPT-4o, making up to MAX_RETRIES attempts.
      4) Save progress in the JSON file so the script can be resumed.

    Results are keyed by the file's 1-based position, as a string. An entry that
    holds an "Error after ..." marker from an earlier failed run counts as
    unfinished and is attempted again instead of being skipped.

    Args:
        folder_path: Directory searched (non-recursively) for "*.csv" files.
        output_json_file: Path of the JSON checkpoint/result file.

    Raises:
        SystemExit: With code 1 once every attempt for a file has failed; the
            error marker for that file is written to the JSON file first.
    """
    error_prefix = "Error after "

    def is_finished(key):
        # A stored error marker is not a result; such files must be retried.
        if key not in results:
            return False
        stored = results[key]
        return not (isinstance(stored, str) and stored.startswith(error_prefix))

    def save_results():
        # Write to a sibling temp file and swap it in, so an interrupted write
        # cannot leave a truncated JSON file (which would reset all progress).
        tmp_path = f"{output_json_file}.tmp"
        with open(tmp_path, "w", encoding='utf-8') as json_file:
            json.dump(results, json_file, indent=4, ensure_ascii=False)
        os.replace(tmp_path, output_json_file)

    # 1) Load existing results (if any) so we can resume from partial progress.
    if os.path.exists(output_json_file):
        with open(output_json_file, "r", encoding="utf-8") as jf:
            try:
                results = json.load(jf)
            except json.JSONDecodeError:
                # If the file is empty or invalid, start with an empty dictionary
                results = {}
        print(f"Loaded existing results from '{output_json_file}'.")
    else:
        results = {}
        print(f"No existing '{output_json_file}' found. Starting fresh.")

    # 2) Get all CSV files
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    print(f"Found {len(csv_files)} CSV file(s) in folder '{folder_path}'.")

    prompt_header = (
        "I have a list of scenes describing a movie step by step. "
        "Please transform them into a single paragraph, preserving the chronological order. "
        "Include every detail from each scene without adding or omitting any information. "
        "Use only straightforward, factual wording, and avoid any new or descriptive language beyond what is provided in the scene notes.\n\n"
        "Input: \tDescription\n"
    )

    # 3) Process each CSV file
    for index, csv_file in enumerate(csv_files, start=1):
        key = str(index)
        if is_finished(key):
            print(f"Skipping file {index} ('{csv_file}') - already in JSON results.")
            continue
        if key in results:
            print(f"Retrying file {index} ('{csv_file}') - previous run recorded an error.")

        # Read the CSV and extract the "Description" column
        descriptions = []
        with open(csv_file, newline='', encoding='utf-8') as f:
            # Adjust delimiter if needed ("," or "\t", etc.)
            reader = csv.DictReader(f, delimiter=",")
            for row_number, row in enumerate(reader, start=1):
                if "Description" in row:
                    descriptions.append(row["Description"])
                else:
                    print(
                        f"Warning: 'Description' column not found in row {row_number} of {csv_file}"
                    )

        # Build the prompt: fixed instructions followed by one numbered line per scene.
        prompt = prompt_header + "".join(
            f"{i}\t{desc}\n" for i, desc in enumerate(descriptions, start=1)
        )

        # 4) Call GPT-4o with retries
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = get_gpt4_response(prompt)
                break
            except Exception as e:
                if attempt < MAX_RETRIES:
                    time.sleep(3)  # short delay before retry
                    continue
                # Out of attempts: record the failure, checkpoint, and stop the run.
                results[key] = f"{error_prefix}{MAX_RETRIES} attempts: {e}"
                save_results()
                print(f"File {index} ('{csv_file}') failed after {MAX_RETRIES} attempts: {e}")
                sys.exit(1)

        # 5) Store result and 6) save partial progress after each file
        results[key] = response
        save_results()
        print(f"Progress saved after file {index}.")

# Entry point for the GPT-4o pass over the raw ground-truth CSV folder.
if __name__ == "__main__":
    folder_path = "/home/jacks.local/hdubey/VADv6/experiment/rawGT3"  # Update to your folder
    # Relative path, so the result/checkpoint file is resolved against the current working directory.
    output_json_file = "GT.json"

    print("Starting CSV processing and GPT-4o querying...")
    process_csv_files(folder_path, output_json_file)
    print("Processing complete.")

In [None]:
# -------------------------------------------
# Base GT - Gemini
# -------------------------------------------

import os
import glob
import csv
import openai
import json
import sys
import time

# NOTE(review): the key is a hard-coded placeholder; supply the real key without committing it to the notebook.
openai.api_key = "placeholder"  # Replace with your actual API key

MAX_RETRIES = 3  # Total API attempts per file (the first call counts as attempt 1)

def get_gemini_response(prompt, model="gemini", temperature=1, max_tokens=10000):
    """
    Send ``prompt`` as a single user message and return the reply text.

    The request carries a fixed system message followed by the user prompt; the
    text of the first returned choice is stripped of surrounding whitespace.
    Any exception raised by the client call propagates to the caller.

    NOTE(review): the call goes through the openai client with model="gemini";
    presumably an OpenAI-compatible gateway is configured elsewhere - confirm.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model=model,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

def process_csv_files(folder_path, output_json_file):
    """
    Process all CSV files in a folder and checkpoint the Gemini output as JSON.

    For each CSV (sorted by path and numbered from 1):
      1) Read the 'Description' column in order.
      2) Construct a single prompt.
      3) Call Gemini, making up to MAX_RETRIES attempts.
      4) Save progress in the JSON file so the script can be resumed.

    Results are keyed by the file's 1-based position, as a string. An entry that
    holds an "Error after ..." marker from an earlier failed run counts as
    unfinished and is attempted again instead of being skipped.

    Args:
        folder_path: Directory searched (non-recursively) for "*.csv" files.
        output_json_file: Path of the JSON checkpoint/result file.

    Raises:
        SystemExit: With code 1 once every attempt for a file has failed; the
            error marker for that file is written to the JSON file first.
    """
    error_prefix = "Error after "

    def is_finished(key):
        # A stored error marker is not a result; such files must be retried.
        if key not in results:
            return False
        stored = results[key]
        return not (isinstance(stored, str) and stored.startswith(error_prefix))

    def save_results():
        # Write to a sibling temp file and swap it in, so an interrupted write
        # cannot leave a truncated JSON file (which would reset all progress).
        tmp_path = f"{output_json_file}.tmp"
        with open(tmp_path, "w", encoding='utf-8') as json_file:
            json.dump(results, json_file, indent=4, ensure_ascii=False)
        os.replace(tmp_path, output_json_file)

    # 1) Load existing results (if any) so we can resume from partial progress.
    if os.path.exists(output_json_file):
        with open(output_json_file, "r", encoding="utf-8") as jf:
            try:
                results = json.load(jf)
            except json.JSONDecodeError:
                # Empty or invalid file: start over with no results.
                results = {}
        print(f"Loaded existing results from '{output_json_file}'.")
    else:
        results = {}
        print(f"No existing '{output_json_file}' found. Starting fresh.")

    # 2) Get all CSV files
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    print(f"Found {len(csv_files)} CSV file(s) in folder '{folder_path}'.")

    # Prompt wording is kept identical across the per-model cells.
    prompt_header = (
        "I have a list of scenes describing a movie step by step. "
        "Please transform them into a single paragraph, preserving the chronological order. "
        "Include every detail from each scene without adding or omitting any information. "
        "Use only straightforward, factual wording, and avoid any new or descriptive language beyond what is provided in the scene notes.\n\n"
        "Input: \tDescription\n"
    )

    # 3) Process each CSV file
    for index, csv_file in enumerate(csv_files, start=1):
        key = str(index)
        if is_finished(key):
            print(f"Skipping file {index} ('{csv_file}') - already in JSON results.")
            continue
        if key in results:
            print(f"Retrying file {index} ('{csv_file}') - previous run recorded an error.")

        descriptions = []
        with open(csv_file, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f, delimiter=",")
            for row_number, row in enumerate(reader, start=1):
                if "Description" in row:
                    descriptions.append(row["Description"])
                else:
                    print(
                        f"Warning: 'Description' column not found in row {row_number} of {csv_file}"
                    )

        # Build the prompt: fixed instructions followed by one numbered line per scene.
        prompt = prompt_header + "".join(
            f"{i}\t{desc}\n" for i, desc in enumerate(descriptions, start=1)
        )

        # 4) Call Gemini with retries
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = get_gemini_response(prompt)
                break
            except Exception as e:
                if attempt < MAX_RETRIES:
                    time.sleep(3)  # short delay
                    continue
                # Out of attempts: record the failure, checkpoint, and stop the run.
                results[key] = f"{error_prefix}{MAX_RETRIES} attempts: {e}"
                save_results()
                print(f"File {index} ('{csv_file}') failed after {MAX_RETRIES} attempts: {e}")
                sys.exit(1)

        # 5) Store result and 6) save partial progress
        results[key] = response
        save_results()
        print(f"Progress saved after file {index}.")

# Entry point for the Gemini pass over the raw ground-truth CSV folder.
if __name__ == "__main__":
    folder_path = "/home/jacks.local/hdubey/VADv6/experiment/rawGT3"  # Update to your folder
    # Relative path, so the result/checkpoint file is resolved against the current working directory.
    output_json_file = "GT_gemini.json"

    print("Starting CSV processing and Gemini querying...")
    process_csv_files(folder_path, output_json_file)
    print("Processing complete.")


In [None]:
# -------------------------------------------
# Base GT - Claude
# -------------------------------------------

import os
import glob
import csv
import openai
import json
import sys
import time

# NOTE(review): the key is a hard-coded placeholder; supply the real key without committing it to the notebook.
openai.api_key = "placeholder"  # Replace with your actual API key

MAX_RETRIES = 3  # Total API attempts per file (the first call counts as attempt 1)

def get_claude_response(prompt, model="claude", temperature=1, max_tokens=10000):
    """
    Send ``prompt`` as a single user message and return the reply text.

    The request carries a fixed system message followed by the user prompt; the
    text of the first returned choice is stripped of surrounding whitespace.
    Any exception raised by the client call propagates to the caller.

    NOTE(review): the call goes through the openai client with model="claude";
    presumably an OpenAI-compatible gateway is configured elsewhere - confirm.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model=model,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

def process_csv_files(folder_path, output_json_file):
    """
    Process all CSV files in a folder and checkpoint the Claude output as JSON.

    For each CSV (sorted by path and numbered from 1):
      1) Read the 'Description' column in order.
      2) Construct a single prompt.
      3) Call Claude, making up to MAX_RETRIES attempts.
      4) Save progress in the JSON file so the script can be resumed.

    Results are keyed by the file's 1-based position, as a string. An entry that
    holds an "Error after ..." marker from an earlier failed run counts as
    unfinished and is attempted again instead of being skipped.

    Args:
        folder_path: Directory searched (non-recursively) for "*.csv" files.
        output_json_file: Path of the JSON checkpoint/result file.

    Raises:
        SystemExit: With code 1 once every attempt for a file has failed; the
            error marker for that file is written to the JSON file first.
    """
    error_prefix = "Error after "

    def is_finished(key):
        # A stored error marker is not a result; such files must be retried.
        if key not in results:
            return False
        stored = results[key]
        return not (isinstance(stored, str) and stored.startswith(error_prefix))

    def save_results():
        # Write to a sibling temp file and swap it in, so an interrupted write
        # cannot leave a truncated JSON file (which would reset all progress).
        tmp_path = f"{output_json_file}.tmp"
        with open(tmp_path, "w", encoding='utf-8') as json_file:
            json.dump(results, json_file, indent=4, ensure_ascii=False)
        os.replace(tmp_path, output_json_file)

    # Load existing results (if any) so the run resumes from partial progress.
    if os.path.exists(output_json_file):
        with open(output_json_file, "r", encoding="utf-8") as jf:
            try:
                results = json.load(jf)
            except json.JSONDecodeError:
                # Empty or invalid file: start over with no results.
                results = {}
        print(f"Loaded existing results from '{output_json_file}'.")
    else:
        results = {}
        print(f"No existing '{output_json_file}' found. Starting fresh.")

    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    print(f"Found {len(csv_files)} CSV file(s) in folder '{folder_path}'.")

    prompt_header = (
        "I have a list of scenes describing a movie step by step. "
        "Please transform them into a single paragraph, preserving the chronological order. "
        "Include every detail from each scene without adding or omitting any information. "
        "Use only straightforward, factual wording, and avoid any new or descriptive language beyond what is provided in the scene notes.\n\n"
        "Input: \tDescription\n"
    )

    for index, csv_file in enumerate(csv_files, start=1):
        key = str(index)
        if is_finished(key):
            print(f"Skipping file {index} ('{csv_file}') - already in JSON results.")
            continue
        if key in results:
            print(f"Retrying file {index} ('{csv_file}') - previous run recorded an error.")

        descriptions = []
        with open(csv_file, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f, delimiter=",")
            for row_number, row in enumerate(reader, start=1):
                if "Description" in row:
                    descriptions.append(row["Description"])
                else:
                    print(
                        f"Warning: 'Description' column not found in row {row_number} of {csv_file}"
                    )

        # Build the prompt: fixed instructions followed by one numbered line per scene.
        prompt = prompt_header + "".join(
            f"{i}\t{desc}\n" for i, desc in enumerate(descriptions, start=1)
        )

        # Call Claude with retries
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = get_claude_response(prompt)
                break
            except Exception as e:
                if attempt < MAX_RETRIES:
                    time.sleep(3)
                    continue
                # Out of attempts: record the failure, checkpoint, and stop the run.
                results[key] = f"{error_prefix}{MAX_RETRIES} attempts: {e}"
                save_results()
                print(f"File {index} ('{csv_file}') failed after {MAX_RETRIES} attempts: {e}")
                sys.exit(1)

        results[key] = response
        save_results()
        print(f"Progress saved after file {index}.")

# Entry point for the Claude pass over the raw ground-truth CSV folder.
if __name__ == "__main__":
    folder_path = "/home/jacks.local/hdubey/VADv6/experiment/rawGT3"
    # Relative path, so the result/checkpoint file is resolved against the current working directory.
    output_json_file = "GT_claude.json"

    print("Starting CSV processing and Claude querying...")
    process_csv_files(folder_path, output_json_file)
    print("Processing complete.")


In [None]:
# -------------------------------------------
# Base GT - Mistral
# -------------------------------------------

import os
import glob
import csv
import openai
import json
import sys
import time

# NOTE(review): the key is a hard-coded placeholder; supply the real key without committing it to the notebook.
openai.api_key = "placeholder"  # Replace with your actual API key

MAX_RETRIES = 3  # Total API attempts per file (the first call counts as attempt 1)

def get_mistral_response(prompt, model="mistral", temperature=1, max_tokens=10000):
    """
    Send ``prompt`` as a single user message and return the reply text.

    The request carries a fixed system message followed by the user prompt; the
    text of the first returned choice is stripped of surrounding whitespace.
    Any exception raised by the client call propagates to the caller.

    NOTE(review): the call goes through the openai client with model="mistral";
    presumably an OpenAI-compatible gateway is configured elsewhere - confirm.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model=model,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

def process_csv_files(folder_path, output_json_file):
    """
    Process all CSV files in a folder and checkpoint the Mistral output as JSON.

    For each CSV (sorted by path and numbered from 1):
      1) Read the 'Description' column in order.
      2) Construct a single prompt.
      3) Call Mistral, making up to MAX_RETRIES attempts.
      4) Save progress in the JSON file so the script can be resumed.

    Results are keyed by the file's 1-based position, as a string. An entry that
    holds an "Error after ..." marker from an earlier failed run counts as
    unfinished and is attempted again instead of being skipped.

    Args:
        folder_path: Directory searched (non-recursively) for "*.csv" files.
        output_json_file: Path of the JSON checkpoint/result file.

    Raises:
        SystemExit: With code 1 once every attempt for a file has failed; the
            error marker for that file is written to the JSON file first.
    """
    error_prefix = "Error after "

    def is_finished(key):
        # A stored error marker is not a result; such files must be retried.
        if key not in results:
            return False
        stored = results[key]
        return not (isinstance(stored, str) and stored.startswith(error_prefix))

    def save_results():
        # Write to a sibling temp file and swap it in, so an interrupted write
        # cannot leave a truncated JSON file (which would reset all progress).
        tmp_path = f"{output_json_file}.tmp"
        with open(tmp_path, "w", encoding='utf-8') as json_file:
            json.dump(results, json_file, indent=4, ensure_ascii=False)
        os.replace(tmp_path, output_json_file)

    # Load existing results (if any) so the run resumes from partial progress.
    if os.path.exists(output_json_file):
        with open(output_json_file, "r", encoding="utf-8") as jf:
            try:
                results = json.load(jf)
            except json.JSONDecodeError:
                # Empty or invalid file: start over with no results.
                results = {}
        print(f"Loaded existing results from '{output_json_file}'.")
    else:
        results = {}
        print(f"No existing '{output_json_file}' found. Starting fresh.")

    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    print(f"Found {len(csv_files)} CSV file(s) in folder '{folder_path}'.")

    prompt_header = (
        "I have a list of scenes describing a movie step by step. "
        "Please transform them into a single paragraph, preserving the chronological order. "
        "Include every detail from each scene without adding or omitting any information. "
        "Use only straightforward, factual wording, and avoid any new or descriptive language beyond what is provided in the scene notes.\n\n"
        "Input: \tDescription\n"
    )

    for index, csv_file in enumerate(csv_files, start=1):
        key = str(index)
        if is_finished(key):
            print(f"Skipping file {index} ('{csv_file}') - already in JSON results.")
            continue
        if key in results:
            print(f"Retrying file {index} ('{csv_file}') - previous run recorded an error.")

        descriptions = []
        with open(csv_file, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f, delimiter=",")
            for row_number, row in enumerate(reader, start=1):
                if "Description" in row:
                    descriptions.append(row["Description"])
                else:
                    print(
                        f"Warning: 'Description' column not found in row {row_number} of {csv_file}"
                    )

        # Build the prompt: fixed instructions followed by one numbered line per scene.
        prompt = prompt_header + "".join(
            f"{i}\t{desc}\n" for i, desc in enumerate(descriptions, start=1)
        )

        # Call Mistral with retries
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = get_mistral_response(prompt)
                break
            except Exception as e:
                if attempt < MAX_RETRIES:
                    time.sleep(3)
                    continue
                # Out of attempts: record the failure, checkpoint, and stop the run.
                results[key] = f"{error_prefix}{MAX_RETRIES} attempts: {e}"
                save_results()
                print(f"File {index} ('{csv_file}') failed after {MAX_RETRIES} attempts: {e}")
                sys.exit(1)

        results[key] = response
        save_results()
        print(f"Progress saved after file {index}.")

# Entry point for the Mistral pass over the raw ground-truth CSV folder.
if __name__ == "__main__":
    folder_path = "/home/jacks.local/hdubey/VADv6/experiment/rawGT3"
    # Relative path, so the result/checkpoint file is resolved against the current working directory.
    output_json_file = "GT_mistral.json"

    print("Starting CSV processing and Mistral querying...")
    process_csv_files(folder_path, output_json_file)
    print("Processing complete.")


In [None]:
# ======================================
# Summarization
# ======================================

import os
import json
import openai
import time


# Input ground-truth JSON file and the folder that receives the iterative summaries
# (one "<gt_id>.json" file per ground-truth entry).
GT_FILE_PATH = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/GroundTruths/GT_part1_25.json"  # Ground Truth File Path
OUTPUT_DIR = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/summarization"  # <-- UPDATE THIS PATH

# Chat-completion settings sent with every summarization request.
MODEL = "gpt-4o"  # or "gpt4o" if that is your model name
TEMPERATURE = 0.9
MAX_TOKENS = 5000

# Iterative summarization parameters:
TARGET_PERCENT = 90  # Change this to 80 (or any other value) to target a different word count ratio.
NUM_ITERATIONS = 6   # Number of iterative summarization cycles

# ============================
# PROMPT TEMPLATE
# ============================

# Prompt template with two placeholders: {target_percent} (e.g., 90) and {text} (the description
# to summarize). It is filled with str.format in get_summary.
# NOTE(review): the worked example's word counts (89 -> 80) are fixed text and correspond to
# roughly 90%; they do not change with {target_percent}, so the example contradicts the
# instruction for other targets - confirm whether that is acceptable.
PROMPT_TEMPLATE = (
    "Summarize the video description below so that it’s about {target_percent}% of the original word count. "
    "First, identify the core plot by highlighting the main characters, their motivations, and the central conflict. "
    "Next, pinpoint the essential events such as the inciting incident, climax, and resolution. "
    "Remove or condense any less crucial details—if there are lengthy descriptions or minor character moments that don’t significantly affect the main narrative, trim them down. "
    "If subplots influence the protagonist’s choices, include them briefly. Combine related sentences into a few succinct lines and use direct, concise language by eliminating filler words and repetition. "
    "Ensure the summary flows logically and maintains cohesion without any gaps that might confuse the reader. "
    "Finally, check the word count and aim for around {target_percent}% of the original, making slight adjustments as needed.\n\n"
    "For example, consider the following original description (89 words):\n"
    "Amelia, a dedicated environmental activist, embarks on a perilous journey to rescue her coastal hometown from looming ecological ruin. "
    "When a powerful corporation’s reckless actions threaten to pollute pristine shores and decimate marine life, she unites a diverse coalition of locals, scientists, and activists. "
    "As protests escalate and sacrifices mount, unexpected alliances form and old rivalries resurface. "
    "Amelia’s unwavering determination to secure a sustainable future becomes a beacon of hope amid growing chaos and environmental despair. "
    "Her journey tests both her courage and her commitment to the cause.\n\n"
    "A transformed summary at approximately {target_percent}% of the original word count (80 words) might read:\n"
    "Amelia, an environmental activist, sets out on a risky mission to save her coastal hometown from impending ecological collapse. "
    "Faced with a corporation’s reckless actions endangering pristine shores and marine life, she forms a coalition of locals, scientists, and fellow activists. "
    "As protests escalate and sacrifices increase, unexpected alliances and revived rivalries emerge. "
    "Her steadfast determination to achieve sustainability shines as a hopeful signal amid chaos, remarkably testing her courage and commitment to the cause while inspiring community action.\n\n"
    "Video Description:\n{text}"
)

# ============================
# FUNCTION DEFINITIONS
# ============================

def get_summary(text, target_percent):
    """
    Request one summary of ``text`` aimed at ``target_percent`` of its word count.

    The prompt is PROMPT_TEMPLATE filled with both arguments and sent as a single
    user message using the module-level MODEL, TEMPERATURE and MAX_TOKENS.

    Returns:
        The stripped text of the first choice, or None when the API call (or
        reading its response) raises; the error is printed in that case.
    """
    filled_prompt = PROMPT_TEMPLATE.format(target_percent=target_percent, text=text)
    try:
        request = {
            "model": MODEL,
            "messages": [{"role": "user", "content": filled_prompt}],
            "temperature": TEMPERATURE,
            "max_tokens": MAX_TOKENS,
        }
        completion = openai.ChatCompletion.create(**request)
        return completion.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: report the failure and let the caller decide what to do.
        print(f"Error during API call: {e}")
        return None

def process_ground_truths(gt_file_path, output_dir, target_percent, num_iterations):
    """
    Iteratively summarize every ground truth and save one JSON file per entry.

    The ground truth file is a JSON object mapping an id to its text. For each
    entry the text is summarized ``num_iterations`` times, each pass asking for
    about ``target_percent`` of the previous text's word count. The original text
    and every iteration are written to "<gt_id>.json" in ``output_dir``.

    When a pass fails, an "Error generating summary at iteration N" marker is
    stored for that iteration and the next pass starts again from the last text
    that was produced successfully; the marker itself is never sent to the model.

    Args:
        gt_file_path: Path of the ground truth JSON file.
        output_dir: Directory for the per-entry result files (created if missing).
        target_percent: Target size of each summary, in percent of its input.
        num_iterations: Number of summarization passes per entry.

    Returns:
        None. Load and save errors are printed instead of raised.
    """
    # Load the ground truth file.
    try:
        with open(gt_file_path, 'r', encoding='utf-8') as f:
            ground_truths = json.load(f)
    except Exception as e:
        print(f"Error loading ground truth file: {e}")
        return

    # Ensure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    # Process each ground truth.
    for gt_id, original_text in ground_truths.items():
        print(f"Processing GT ID: {gt_id}")
        iterations = {}  # Iteration number (as a string) -> summary or error marker.
        current_text = original_text  # Begin with the original text.
        for i in range(1, num_iterations + 1):
            print(f"  Iteration {i}: Summarizing to {target_percent}% of current text length...")
            summary = get_summary(current_text, target_percent)
            if summary is None:
                # Record the failure but keep current_text unchanged, so the next
                # pass summarizes real text rather than the error message.
                iterations[str(i)] = f"Error generating summary at iteration {i}"
            else:
                iterations[str(i)] = summary
                current_text = summary  # Use the summary for the next iteration.
            time.sleep(1)  # Optional pause between API calls to respect rate limits.

        result = {
            "original": original_text,
            "iterations": iterations,
        }

        # Save the result for this GT in a JSON file (e.g., "1.json" for GT with key "1").
        output_path = os.path.join(output_dir, f"{gt_id}.json")
        try:
            with open(output_path, 'w', encoding='utf-8') as out_f:
                json.dump(result, out_f, indent=2, ensure_ascii=False)
            print(f"  Saved iterative summaries to {output_path}\n")
        except Exception as e:
            print(f"Error saving file {output_path}: {e}")

# ============================
# MAIN EXECUTION
# ============================

if __name__ == "__main__":
    # Run the iterative summarization with the module-level configuration constants.
    process_ground_truths(GT_FILE_PATH, OUTPUT_DIR, TARGET_PERCENT, NUM_ITERATIONS)


In [8]:
# ======================================
# Addition
# ======================================

import os
import json
import math
import random
import torch
import re
from wtpsplit import SaT  # <-- from your snippet

# ----------------------------
# Configuration
# ----------------------------
# Input ground-truth JSON file and the base folder for the "addition" outputs.
GT_FILE_PATH = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/GroundTruths/MPII_GT_10.json"
BASE_OUTPUT_DIR = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/addition"

NUM_ITERATIONS = 7
# Pool of off-topic scene descriptions; the add_unrelated_segments_* helpers draw from it
# with random.sample, so one call never inserts the same sentence twice and can insert at
# most len(UNRELATED_SEGMENTS) sentences.
UNRELATED_SEGMENTS = [
    "In the neon-lit back alleys of a dystopian metropolis, a rogue hacker races against time while being chased by relentless cyborg enforcers, their footsteps echoing off rain-soaked pavement.",
    "Under the relentless desert sun, a lone gunslinger confronts a notorious outlaw at a deserted crossroads, both men exchanging steely glances as swirling dust blurs the horizon.",
    "On a foggy evening in Victorian London, an intrepid detective uncovers a hidden conspiracy among the city’s elite, with gaslit streets setting the stage for a race against a mounting darkness.",
    "High above a sprawling city skyline, a daredevil pilot executes death-defying maneuvers in a vintage biplane, narrowly evading enemy fire as the ground morphs into a dizzying mosaic below.",
    "Within the creaking walls of an old countryside manor, an estranged family gathers for a mysterious inheritance, their hushed whispers and secret glances weaving a tapestry of betrayal and suspense.",
    "Amidst the roaring tempest of a stormy ocean, a weathered sailor battles nature’s fury on a creaking vessel, his resolve as steadfast as the crashing waves that threaten to engulf him.",
    "On a bustling 1960s New York street, a passionate artist finds unexpected inspiration in the chaotic interplay of urban life, capturing moments of love, loss, and defiant hope on a rain-soaked canvas.",
    "Deep in the heart of an enchanted forest, a brave knight embarks on a perilous quest to rescue a captive princess, facing mythical creatures and treacherous traps with unwavering courage.",
    "During a heated political rally in a futuristic city, a charismatic leader stokes both hope and dissent as holographic banners illuminate the night sky and fervent crowds surge with anticipation.",
    "In the eerie silence of a post-apocalyptic wasteland, a hardened survivor scavenges through abandoned ruins, haunted by the echoes of a once-thriving civilization now reduced to dust.",
    "Beneath the shimmering surface of an underwater kingdom, a rebellious mermaid defies ancient traditions to explore forbidden coral reefs and secret caverns brimming with forgotten lore.",
    "In the quiet confines of a suburban attic, an unassuming teenager stumbles upon a mysterious portal that thrusts him into a surreal world where the very fabric of time and space unravels.",
    "On the rugged highlands of Scotland, a stoic warrior faces rival clans and ancient curses alike, his fierce battle cry resonating across mist-covered moors and turbulent skies.",
    "Amid the vibrant chaos of a South American carnival, a fearless dancer twirls through streets bursting with color and rhythm, each graceful move challenging the rigid boundaries of tradition.",
    "During a clandestine meeting in a snow-covered mountain lodge, two rival spies exchange cryptic messages hinting at an imminent global conspiracy, their whispered words carried off by the howling wind.",
    "Inside a lavish 1920s speakeasy, a troubled jazz musician pours his soul into a melancholic melody, every note echoing the bittersweet memories of lost love and shattered dreams.",
    "In a futuristic space station orbiting a distant planet, an intrepid crew confronts bizarre alien phenomena and an enigmatic interstellar threat, pushing the limits of human endurance.",
    "Amidst the chaos of a raging civil war, a compassionate medic races through bombed-out streets, his every desperate step a race against time to save lives in the midst of utter devastation.",
    "On a sun-dappled afternoon in a quaint European village, an eccentric inventor unveils a groundbreaking contraption that defies conventional physics, igniting wonder and skepticism in equal measure.",
    "In a bustling marketplace in ancient Persia, a cunning street thief navigates labyrinthine alleyways and vibrant bazaars, his calculated every move a daring escape from the ever-watchful royal guards."
]
# Share of the text's segment count to insert as unrelated segments (the helpers round up).
PERCENTAGE = 10.0  # i.e. 10%

# ----------------------------------------------------
# 1) Load SaT model (from your snippet)
# ----------------------------------------------------
# Instantiate the SaT segmenter once at module level.
sat_adapted = SaT("sat-12l-sm")
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): half precision is applied even when device is "cpu" - confirm that is intended.
sat_adapted.half().to(device)

# ----------------------------------------------------
# 2) Helper Functions (unchanged except for SaT usage)
# ----------------------------------------------------

def ensure_dir(path):
    """Make sure the directory ``path`` exists, creating missing parents as needed."""
    os.makedirs(name=path, exist_ok=True)

def clean_trailing_punctuation(segment: str) -> str:
    """
    Strip ``segment`` and collapse a repeated sentence-ending mark at its end.

    - A trailing run of '.' becomes a single '.' ('...' or '..' -> '.').
    - A trailing run of '!' becomes a single '!', likewise for '?'.
    - The three marks are handled one after another in that order, so a mixed
      ending such as '..!!' only has its final run collapsed ('..!').
    - Punctuation elsewhere in the segment is left untouched.
    """
    cleaned = segment.strip()
    for mark in ('.', '!', '?'):
        # Inside a character class the mark is literal, so no escaping is needed.
        cleaned = re.sub(f'[{mark}]+$', mark, cleaned)
    return cleaned

def join_segments(segments):
    """
    Re-join SaT segments into a single text:
      1) Clean up each segment's trailing punctuation (no '...' or '..').
      2) Drop segments that are empty once stripped, so the result never
         contains doubled spaces or a stray leading/trailing space.
      3) If there's NO '.', '!' or '?' at the end, add one period '.'.
      4) Join them with a single space in between.

    Args:
        segments: Iterable of segment strings.

    Returns:
        The joined text; an empty string when no non-empty segment remains.
    """
    sentences = []
    for seg in segments:
        seg = clean_trailing_punctuation(seg)
        if not seg:
            # Whitespace-only segment: contributes nothing to the text.
            continue
        if not seg.endswith(('.', '!', '?')):
            seg += '.'
        sentences.append(seg)
    return ' '.join(sentences)

def add_unrelated_segments_at_beginning(text, unrelated_segments, sat_model, percentage=10.0):
    """
    Prepend randomly chosen unrelated segments to ``text``.

    1. Split text into segments with sat_model.split.
    2. Draw ceil(segment count * percentage / 100) segments from
       unrelated_segments without repetition (capped at the pool size).
    3. Put them in front of the original segments.
    4. Return the rejoined text; ``text`` is returned unchanged when the split
       yields no segments.

    NOTE(review): math.ceil on the float product can round up one more than
    expected (e.g. 100 segments at 7% gives 8) - confirm whether that matters.
    """
    pieces = sat_model.split(text)
    if pieces:
        quota = math.ceil(len(pieces) * (percentage / 100.0))
        sample_size = min(quota, len(unrelated_segments))
        extras = random.sample(unrelated_segments, k=sample_size)
        return join_segments(extras + pieces)
    return text

def add_unrelated_segments_in_middle(text, unrelated_segments, sat_model, percentage=10.0):
    """
    Insert randomly chosen unrelated segments at the midpoint of `text`.

    The text is split with `sat_model.split`. The number of segments to add
    is `percentage` percent of the current segment count, rounded up and
    capped by the size of `unrelated_segments`. The picks go in as one
    contiguous group at index `len(segments) // 2`.

    Returns the re-joined text, or `text` unchanged when the split yields
    no segments.
    """
    pieces = sat_model.split(text)
    if not pieces:
        return text

    wanted = math.ceil(len(pieces) * (percentage / 100.0))
    sample_size = min(wanted, len(unrelated_segments))
    extras = random.sample(unrelated_segments, k=sample_size)

    half = len(pieces) // 2
    front, back = pieces[:half], pieces[half:]
    return join_segments(front + extras + back)

def add_unrelated_segments_at_end(text, unrelated_segments, sat_model, percentage=10.0):
    """
    Append randomly chosen unrelated segments to `text`.

    The text is split with `sat_model.split`. The number of segments to add
    is `percentage` percent of the current segment count, rounded up and
    capped by the size of `unrelated_segments`. The picks are placed after
    the existing segments.

    Returns the re-joined text, or `text` unchanged when the split yields
    no segments.
    """
    pieces = sat_model.split(text)
    if not pieces:
        return text

    wanted = math.ceil(len(pieces) * (percentage / 100.0))
    sample_size = min(wanted, len(unrelated_segments))
    extras = random.sample(unrelated_segments, k=sample_size)

    return join_segments(pieces + extras)

def add_unrelated_segments_randomly(text, unrelated_segments, sat_model, percentage=10.0):
    """
    Scatter randomly chosen unrelated segments through `text`.

    1. Split text into semantic segments with sat_model.
    2. Pick ceil(percentage% of the segment count) segments from
       unrelated_segments without replacement, capped by the pool size.
    3. Draw one distinct insertion slot per picked segment (slot i means
       "before original segment i"; the last slot means "after the end").
    4. Insert the picks and return the rejoined text.

    Returns `text` unchanged when the split yields no segments.
    """
    current_segments = sat_model.split(text)
    if not current_segments:
        return text

    num_to_add = math.ceil(len(current_segments) * (percentage / 100.0))
    chosen_segments = random.sample(unrelated_segments, k=min(num_to_add, len(unrelated_segments)))

    # Size the slot sample from what was actually chosen. Using num_to_add
    # here overran chosen_segments (IndexError) whenever the unrelated pool
    # was smaller than the requested count. Whenever the old code did not
    # crash, this k is the same value, so seeded runs reproduce.
    slot_count = len(current_segments) + 1
    insertion_points = sorted(
        random.sample(range(slot_count), k=min(len(chosen_segments), slot_count))
    )

    final_segments = list(current_segments)
    # Slots are ascending, so every earlier insertion shifts later targets
    # right by one; `shift` compensates for that.
    for shift, (insert_point, segment) in enumerate(zip(insertion_points, chosen_segments)):
        final_segments.insert(insert_point + shift, segment)

    return join_segments(final_segments)


# ----------------------------------------------------
# 3) Main Logic
# ----------------------------------------------------

def main():
    """
    Build the insertion test data for every ground truth.

    Seeds `random` with 42, loads the ground truths from GT_FILE_PATH and,
    for each entry, applies each insertion strategy (beginning, middle, end,
    random) NUM_ITERATIONS times cumulatively. Each strategy writes
    '<gt_id>.json' into its own subdirectory of BASE_OUTPUT_DIR, holding the
    original text under "ground_truth" and every intermediate text under
    "iterations". Prints an error and returns early if the ground truth
    file cannot be loaded.
    """
    random.seed(42)  # for reproducible results; remove if you want fully random

    try:
        with open(GT_FILE_PATH, 'r', encoding='utf-8') as f:
            ground_truths = json.load(f)
    except Exception as e:
        print(f"Error loading ground truth file: {e}")
        return

    # Order matters: all strategies draw from the same seeded RNG stream.
    strategies = (
        ("beginning", add_unrelated_segments_at_beginning),
        ("middle", add_unrelated_segments_in_middle),
        ("end", add_unrelated_segments_at_end),
        ("random", add_unrelated_segments_randomly),
    )

    target_dirs = {label: os.path.join(BASE_OUTPUT_DIR, label) for label, _ in strategies}
    for folder in target_dirs.values():
        ensure_dir(folder)

    for gt_id, original_text in ground_truths.items():
        print(f"Processing GT ID: {gt_id}")

        for label, insert_noise in strategies:
            record = {"ground_truth": original_text, "iterations": {}}

            # Each iteration builds on the output of the previous one.
            text = original_text
            for step in range(1, NUM_ITERATIONS + 1):
                text = insert_noise(
                    text,
                    UNRELATED_SEGMENTS,
                    sat_model=sat_adapted,
                    percentage=PERCENTAGE,
                )
                record["iterations"][str(step)] = text

            out_path = os.path.join(target_dirs[label], f"{gt_id}.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(record, f, indent=2, ensure_ascii=False)

    print("\nAll done!\n")


# Run the insertion pipeline only when this code is executed directly.
if __name__ == "__main__":
    main()

Processing GT ID: 1
Processing GT ID: 2
Processing GT ID: 3
Processing GT ID: 4
Processing GT ID: 5
Processing GT ID: 6
Processing GT ID: 7
Processing GT ID: 8
Processing GT ID: 9
Processing GT ID: 10

All done!



In [9]:
# ======================================
# Deletion
# ======================================

import os
import json
import math
import random
import torch
import re
from wtpsplit import SaT  # <-- from your snippet

# ----------------------------
# Configuration
# ----------------------------
# JSON file that main() loads the ground-truth texts from.
GT_FILE_PATH = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/GroundTruths/MPII_GT_10.json"
# Root folder; main() writes into its beginning/middle/end/random subfolders.
BASE_OUTPUT_DIR = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/deletion" # Changed to deletion

# Number of cumulative deletion passes applied to each ground truth.
NUM_ITERATIONS = 7
# Share of the current segments removed per pass.
PERCENTAGE = 10.0  # i.e. 10%

# ----------------------------------------------------
# 1) Load SaT model (from your snippet)
# ----------------------------------------------------
sat_adapted = SaT("sat-12l-sm")
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): .half() is applied even when device is "cpu"; confirm that
# half precision is supported/desired for CPU-only runs.
sat_adapted.half().to(device)

# ----------------------------------------------------
# 2) Helper Functions (unchanged except for SaT usage)
# ----------------------------------------------------

def ensure_dir(path):
    """
    Make sure `path` exists as a directory.

    Missing parent directories are created as well; an already existing
    directory is left untouched.
    """
    os.makedirs(name=path, exist_ok=True)

def clean_trailing_punctuation(segment: str) -> str:
    """
    Normalize the punctuation run at the very end of a segment.

    Surrounding whitespace is stripped first. A trailing run of one repeated
    mark ('.', '!' or '?') is then reduced to a single mark, so '...'
    becomes '.' and '??' becomes '?'. Mixed endings such as '?!' and all
    other punctuation (commas, quotes, etc.) stay untouched.
    """
    # The backreference keeps the run homogeneous: only the mark that really
    # ends the text is collapsed.
    return re.sub(r'([.!?])\1+$', r'\1', segment.strip())

def join_segments(segments):
    """
    Re-join SaT segments into a single text.

    For every segment:
      1) Trailing punctuation is cleaned with clean_trailing_punctuation
         (no '...' or '..').
      2) Segments that are empty after cleaning (e.g. whitespace-only) are
         skipped, so the output never contains doubled spaces.
      3) If the segment does not end in '.', '!' or '?', a '.' is appended.

    Returns the processed segments joined by exactly one space.
    """
    finished = []
    for raw in segments:
        sentence = clean_trailing_punctuation(raw)
        if not sentence:
            # An empty piece would contribute nothing but an extra separator.
            continue
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'
        finished.append(sentence)
    return ' '.join(finished)

def delete_segments_at_beginning(text, sat_model, percentage=10.0):
    """
    Drop leading segments from `text`.

    The text is split with `sat_model.split`. `percentage` percent of the
    segments (rounded up) are removed from the front, but at least one
    segment is always kept.

    Returns the re-joined remainder, or `text` unchanged when the split
    yields no segments.
    """
    pieces = sat_model.split(text)
    if not pieces:
        return text

    total = len(pieces)
    # Cap at total - 1 so the text is never emptied completely.
    drop = min(math.ceil(total * (percentage / 100.0)), total - 1)
    return join_segments(pieces[drop:])

def delete_segments_in_middle(text, sat_model, percentage=10.0):
    """
    Drop a contiguous run of segments centred on the middle of `text`.

    The text is split with `sat_model.split`. `percentage` percent of the
    segments (rounded up, but never all of them) are removed from a window
    centred on index `len(segments) // 2`.

    Returns the re-joined remainder, or `text` unchanged when the split
    yields no segments.
    """
    pieces = sat_model.split(text)
    if not pieces:
        return text

    total = len(pieces)
    # Cap at total - 1 so the text is never emptied completely.
    drop = min(math.ceil(total * (percentage / 100.0)), total - 1)

    # Centre the window on the midpoint, then clamp both edges into range.
    window_start = total // 2 - drop // 2
    window_end = min(total, window_start + drop)
    window_start = max(0, window_start)

    return join_segments(pieces[:window_start] + pieces[window_end:])

def delete_segments_at_end(text, sat_model, percentage=10.0):
    """
    Drop trailing segments from `text`.

    1. Split text into semantic segments with sat_model.
    2. Compute ceil(percentage% of the segment count), clamped to the range
       [0, count - 1] so that at least one segment always survives.
    3. Delete that many segments from the end.
    4. Return the rejoined text.

    Returns `text` unchanged when the split yields no segments.
    """
    current_segments = sat_model.split(text)
    if not current_segments:
        return text

    total = len(current_segments)
    num_to_delete = math.ceil(total * (percentage / 100.0))
    num_to_delete = max(0, min(num_to_delete, total - 1))  # Avoid deleting all segments

    # Slice by the number to keep. The previous `[:-num_to_delete]` turned
    # into `[:0]` when nothing was to be deleted (single-segment text or
    # percentage 0) and wiped out the whole text.
    segments_to_keep = current_segments[:total - num_to_delete]
    return join_segments(segments_to_keep)

def delete_segments_randomly(text, sat_model, percentage=10.0):
    """
    Drop randomly selected segments from `text`.

    The text is split with `sat_model.split`. `percentage` percent of the
    segments (rounded up, but never all of them) are chosen without
    replacement via `random.sample` and removed; the survivors keep their
    original order.

    Returns the re-joined remainder, or `text` unchanged when the split
    yields no segments.
    """
    pieces = sat_model.split(text)
    if not pieces:
        return text

    total = len(pieces)
    # Cap at total - 1 so the text is never emptied completely.
    drop = min(math.ceil(total * (percentage / 100.0)), total - 1)

    doomed = set(random.sample(range(total), k=drop))
    survivors = [piece for position, piece in enumerate(pieces) if position not in doomed]

    return join_segments(survivors)


# ----------------------------------------------------
# 3) Main Logic
# ----------------------------------------------------

def main():
    """
    Build the deletion test data for every ground truth.

    Seeds `random` with 42, loads the ground truths from GT_FILE_PATH and,
    for each entry, applies each deletion strategy (beginning, middle, end,
    random) NUM_ITERATIONS times cumulatively. Each strategy writes
    '<gt_id>.json' into its own subdirectory of BASE_OUTPUT_DIR, holding the
    original text under "ground_truth" and every intermediate text under
    "iterations". Prints an error and returns early if the ground truth
    file cannot be loaded.
    """
    random.seed(42)  # for reproducible results; remove if you want fully random

    try:
        with open(GT_FILE_PATH, 'r', encoding='utf-8') as f:
            ground_truths = json.load(f)
    except Exception as e:
        print(f"Error loading ground truth file: {e}")
        return

    # Order matters: the random strategy consumes the seeded RNG stream.
    strategies = (
        ("beginning", delete_segments_at_beginning),
        ("middle", delete_segments_in_middle),
        ("end", delete_segments_at_end),
        ("random", delete_segments_randomly),
    )

    target_dirs = {label: os.path.join(BASE_OUTPUT_DIR, label) for label, _ in strategies}
    for folder in target_dirs.values():
        ensure_dir(folder)

    for gt_id, original_text in ground_truths.items():
        print(f"Processing GT ID: {gt_id}")

        for label, delete_some in strategies:
            record = {"ground_truth": original_text, "iterations": {}}

            # Each iteration shrinks the output of the previous one.
            text = original_text
            for step in range(1, NUM_ITERATIONS + 1):
                text = delete_some(
                    text,
                    sat_model=sat_adapted,
                    percentage=PERCENTAGE,
                )
                record["iterations"][str(step)] = text

            out_path = os.path.join(target_dirs[label], f"{gt_id}.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(record, f, indent=2, ensure_ascii=False)

    print("\nAll done!\n")


# Run the deletion pipeline only when this code is executed directly.
if __name__ == "__main__":
    main()

Processing GT ID: 1
Processing GT ID: 2
Processing GT ID: 3
Processing GT ID: 4
Processing GT ID: 5
Processing GT ID: 6
Processing GT ID: 7
Processing GT ID: 8
Processing GT ID: 9
Processing GT ID: 10

All done!



In [3]:
import os
import json
import openai
import time

# ----------------------------
# Configuration
# ----------------------------
# Model identifier passed to openai.ChatCompletion.create in generate_sequel.
MODEL = "gpt-4o"  # or "gpt4o" if that's your model identifier
# Sampling temperature passed to the API call.
TEMPERATURE = 0.9
MAX_TOKENS = 4000  # Adjust as needed; shorter than summarization since the prompt is simpler

# ----------------------------
# Prompt Template (Unchanged Except for the Original Text Placeholder)
# ----------------------------
# `{original_text}` is the only placeholder; generate_sequel fills it in with
# str.format, so any literal braces added to this template must be doubled.
SEQUEL_PROMPT_TEMPLATE = (
    "Transform the video description below into a sequel or continuation (Part 2) that mirrors the same setting, participants, objects, and sequence of actions—even using very similar wording—yet conveys a different story or perspective. The result should have:\n\n"
    "High textual overlap: The same subjects, objects, and actions should appear in a similar order with similar phrases.\n"
    "A different storyline: Despite the overlap, the narrative or intent behind these actions must differ in some meaningful way (e.g., altered motivations, different emotional undercurrent, changed outcomes).\n"
    "No contradictions: The sequel shouldn't conflict with facts from the original; instead, it reuses the same framework with a fresh direction or twist in the story.\n"
    "Avoid adding completely new characters or new major actions—focus on reframing or recontextualizing the existing elements to produce a distinct Part 2.\n\n"
    "Example\n"
    "Original Description (Part 1):\n"
    "He steps off the bus and checks his watch, noticing it's exactly 3 PM. Inside the nearby coffee shop, the barista waves at him. He smiles in response and orders a latte, taking a seat at the counter.\n\n"
    "Transformed Sequel (Part 2, Same Actors/Objects/Sequence, Different Narrative):\n"
    "He steps off the bus again at precisely 3 PM, this time with a furrowed brow as he checks his watch. Inside the same coffee shop, the barista waves at him, but he barely returns the gesture and seems preoccupied. He orders a latte as usual, yet his tone carries a hint of impatience. Taking a seat at the counter, he taps his foot nervously, clearly anticipating something beyond the familiar comfort of his daily routine.\n\n"
    "Notice how Part 2 reuses:\n\n"
    'The same subjects ("He," the barista)\n'
    "The same objects (the bus, the coffee shop, a latte, a seat)\n"
    "A similar order of actions (steps off the bus, checks watch, goes inside, orders latte, sits)\n"
    'Nearly identical or synonymous wording ("exactly 3 PM" → "precisely 3 PM," "waves at him" → "the barista waves at him," etc.)\n\n'
    "…but establishes a different narrative tone and emotional context, resulting in a new story.\n\n Output Format: Only reply back with the transformed video description and nothing else.\n\n"
    "Original Description (Part 1):\n"
    "{original_text}\n"
)


def generate_sequel(text):
    """
    Ask the chat model for a 'Part 2' of `text`.

    The text is substituted into SEQUEL_PROMPT_TEMPLATE and sent as a single
    user message using MODEL, TEMPERATURE and MAX_TOKENS.

    Returns the stripped content of the first choice, or None if anything
    goes wrong during the request (the error is printed, not raised).
    """
    prompt = SEQUEL_PROMPT_TEMPLATE.format(original_text=text)
    try:
        request = {
            "model": MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": TEMPERATURE,
            "max_tokens": MAX_TOKENS,
        }
        completion = openai.ChatCompletion.create(**request)
        return completion.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: report the failure and let the caller decide.
        print(f"Error during API call: {e}")
        return None

def process_ground_truths(
    gt_file_path,
    output_dir,
    n_versions=4,
    selected_gt_ids=None
):
    """
    Loads the ground truth file from 'gt_file_path' and, for each GT in that file:
      1. Create n sequel transformations (versions) in an iterative way (each
         version is built from the most recent successfully generated text).
      2. Save the results in a file named '{gt_id}.json' in 'output_dir'.

    If a generation fails, that version is stored as the placeholder
    "Error generating sequel at iteration <i>", and the next version is
    built from the last good text rather than from the placeholder.

    Args:
        gt_file_path (str): Path to the ground truth JSON file.
        output_dir (str): Directory to save the resulting JSON files.
        n_versions (int): Number of transformations to generate per ground truth.
        selected_gt_ids (list or None): If provided, only process these GT IDs. If None or empty, process all.

    Returns:
        None. Load and save errors are printed, not raised.
    """
    # 1. Load the ground-truth JSON file
    try:
        with open(gt_file_path, 'r', encoding='utf-8') as f:
            ground_truths = json.load(f)
    except Exception as e:
        print(f"Error loading ground truth file '{gt_file_path}': {e}")
        return

    # If the user did not specify GT IDs to process, default to all
    if not selected_gt_ids:
        selected_gt_ids = list(ground_truths)

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # 2. Iterate through each selected GT
    for gt_id in selected_gt_ids:
        if gt_id not in ground_truths:
            print(f"GT ID '{gt_id}' not found in the JSON file. Skipping.")
            continue

        original_text = ground_truths[gt_id]
        print(f"Processing GT ID: {gt_id}")

        # Prepare the structure to save
        result = {
            "ground_truth": original_text,
            "versions": {}
        }

        current_text = original_text
        # 3. Generate n versions, each sequel built from the previous
        for i in range(1, n_versions + 1):
            print(f"  Generating version {i} of {n_versions}...")
            sequel_text = generate_sequel(current_text)
            if sequel_text:
                # Only a real sequel may seed the next iteration.
                current_text = sequel_text
            else:
                # Record the failure, but do not feed the placeholder back
                # to the model as if it were a video description.
                sequel_text = f"Error generating sequel at iteration {i}"
            result["versions"][str(i)] = sequel_text
            time.sleep(1)  # Respect rate limits, adjust or remove as needed

        # 4. Save results to a file named "<gt_id>.json"
        output_path = os.path.join(output_dir, f"{gt_id}.json")
        try:
            with open(output_path, 'w', encoding='utf-8') as out_f:
                json.dump(result, out_f, indent=2, ensure_ascii=False)
            print(f"  Saved sequel transformations to {output_path}\n")
        except Exception as e:
            print(f"Error saving file '{output_path}': {e}")

# ----------------------------
# Example usage (if running as a script):
# ----------------------------
# Script entry point: runs the sequel generation with the example settings below.
if __name__ == "__main__":
    # Example parameters (change these as needed)
    GT_FILE_PATH = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/GroundTruths/MPII_GT_10.json"
    OUTPUT_DIR = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/Versions"
    N_VERSIONS = 4  # produce 4 sequel versions
    # Only these GT IDs are processed; an empty list or None means "all".
    SELECTED_GT_IDS = ["1"]

    process_ground_truths(
        gt_file_path=GT_FILE_PATH,
        output_dir=OUTPUT_DIR,
        n_versions=N_VERSIONS,
        selected_gt_ids=SELECTED_GT_IDS
    )


Processing GT ID: 1
  Generating version 1 of 4...
  Generating version 2 of 4...
  Generating version 3 of 4...
  Generating version 4 of 4...
  Saved sequel transformations to /mmfs1/scratch/jacks.local/mali9292/VAD/data/Versions/1.json



In [None]:
# ============================================
# Rotation with Head-Selection Truncation
# Using sat_segmenter from your vad_lib
# ============================================
import os
import json
import math
import re
from vad_lib import sat_segmenter

# --------------------------------
# Global Config
# --------------------------------
# JSON file that main() loads the ground-truth texts from.
GT_FILE_PATH = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/GroundTruths/MPII_GT_2.json"
# Root folder; main() writes into its beginning/ and end/ subfolders.
BASE_OUTPUT_DIR = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/rotation"

# Number of successive rotations recorded per ground truth and direction.
NUM_ITERATIONS = 12
NUMBER_OF_SEGMENTS = 12  # how many total segments to keep from the head
ROTATE_SEGMENTS = 1      # how many segments to rotate each iteration (was %)

# --------------------------------
# Helper Functions
# --------------------------------

def ensure_dir(path):
    """
    Make sure `path` exists as a directory.

    Missing parent directories are created as well; an already existing
    directory is left untouched.
    """
    os.makedirs(name=path, exist_ok=True)

def clean_trailing_punctuation(segment: str) -> str:
    """
    Normalize the punctuation run at the very end of a segment.

    Surrounding whitespace is stripped first. If the segment then ends in a
    run of the same mark ('.', '!' or '?'), the run is reduced to a single
    mark ('...' -> '.', '!!' -> '!'). Mixed endings such as '?!' are left
    as they are.
    """
    trimmed = segment.strip()
    for mark in ".!?":
        # Only the mark that actually ends the text can match.
        if trimmed.endswith(mark):
            trimmed = trimmed.rstrip(mark) + mark
    return trimmed

def join_segments(segments):
    """
    Re-join segments into a single text.

    For every segment:
      1) Trailing punctuation is cleaned with clean_trailing_punctuation.
      2) Segments that are empty after cleaning (e.g. whitespace-only) are
         skipped, so the output never contains doubled spaces.
      3) If the segment does not end in '.', '!' or '?', a '.' is appended.

    Returns the processed segments joined by exactly one space.
    """
    finished = []
    for raw in segments:
        sentence = clean_trailing_punctuation(raw)
        if not sentence:
            # An empty piece would contribute nothing but an extra separator.
            continue
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'
        finished.append(sentence)
    return ' '.join(finished)

def limit_to_fixed_segments(original_text, num_segments):
    """
    Truncate a text to its first `num_segments` segments.

    The text is split with sat_segmenter; if it has more than
    `num_segments` segments only the leading ones are kept. The kept
    segments are re-joined with join_segments and returned.
    """
    pieces = sat_segmenter(original_text)
    head = pieces[:num_segments] if len(pieces) > num_segments else pieces
    return join_segments(head)

def rotate_text_from_beginning(text, rotate_count=1):
    """
    Rotate the text's segments to the left.

    The text is split with sat_segmenter and the first `rotate_count`
    segments (at most all of them) are moved to the end. The text is
    returned unchanged when there are no segments or `rotate_count` is not
    positive.
    """
    pieces = sat_segmenter(text)
    if not pieces or rotate_count <= 0:
        return text

    # Never rotate more segments than exist.
    shift = min(rotate_count, len(pieces))
    return join_segments(pieces[shift:] + pieces[:shift])

def rotate_text_from_end(text, rotate_count=1):
    """
    Rotate the text's segments to the right.

    The text is split with sat_segmenter and the last `rotate_count`
    segments (at most all of them) are moved to the front. The text is
    returned unchanged when there are no segments or `rotate_count` is not
    positive.
    """
    pieces = sat_segmenter(text)
    if not pieces or rotate_count <= 0:
        return text

    # Index where the tail that moves to the front begins.
    pivot = len(pieces) - min(rotate_count, len(pieces))
    return join_segments(pieces[pivot:] + pieces[:pivot])

# --------------------------------
# Main Logic
# --------------------------------

def main():
    """
    Build the rotation test data for every ground truth.

    Loads the ground truths from GT_FILE_PATH. Each text is truncated to
    NUMBER_OF_SEGMENTS segments, then rotated NUM_ITERATIONS times by
    ROTATE_SEGMENTS segments, once from the beginning and once from the
    end. Each direction writes '<gt_id>.json' into its own subdirectory of
    BASE_OUTPUT_DIR, holding the truncated text under "ground_truth" and
    every rotation step under "iterations". Prints an error and returns
    early if the ground truth file cannot be loaded.
    """
    try:
        with open(GT_FILE_PATH, 'r', encoding='utf-8') as f:
            ground_truths = json.load(f)
    except Exception as e:
        print(f"Error loading GT file: {e}")
        return

    directions = (
        ("beginning", rotate_text_from_beginning),
        ("end", rotate_text_from_end),
    )

    target_dirs = {label: os.path.join(BASE_OUTPUT_DIR, label) for label, _ in directions}
    for folder in target_dirs.values():
        ensure_dir(folder)

    for gt_id, original_text in ground_truths.items():
        print(f"Processing GT ID: {gt_id}")

        # Both directions start from the same truncated text.
        truncated_gt = limit_to_fixed_segments(original_text, NUMBER_OF_SEGMENTS)

        for label, rotate in directions:
            record = {"ground_truth": truncated_gt, "iterations": {}}

            # Each iteration rotates the output of the previous one.
            text = truncated_gt
            for step in range(1, NUM_ITERATIONS + 1):
                text = rotate(text, rotate_count=ROTATE_SEGMENTS)
                record["iterations"][str(step)] = text

            out_path = os.path.join(target_dirs[label], f"{gt_id}.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(record, f, indent=2, ensure_ascii=False)

    print("\nFinished rotation with truncation!\n")

# Run the rotation pipeline only when this code is executed directly.
if __name__ == "__main__":
    main()
