# In [3]:  (Jupyter cell prompt left over from the notebook export)
# Claude (Anthropic)
# -----------------
# Dependencies:
#   pip install anthropic
#   (Built-in libraries: os, glob, csv, json, sys, time, logging)

import os
import glob
import csv
import json
import sys
import time
import logging
# from google.colab import drive  # Removed for Jupyter usage

from anthropic import Anthropic

# Send log records at INFO level and above to claude.log; every line carries
# a timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO,
    filename='/mmfs1/scratch/jacks.local/mali9292/VAD/data/claude.log',
)

# Upper bound on API call attempts per file before the run gives up.
MAX_RETRIES = 3

def get_sonnet_response(
    anthropic_client,
    prompt,
    temperature=1,
    max_tokens=5000,
    model="claude-3-5-sonnet-20241022",
):
    """Send *prompt* as a single user message and return the reply text.

    Args:
        anthropic_client: Object exposing ``messages.create(...)``; the script
            passes an ``anthropic.Anthropic`` instance.
        prompt: Text sent as the content of the one user message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens the model may generate.
        model: Model identifier forwarded to the API. Defaults to the value
            that used to be hard-coded here, so existing callers are
            unaffected.

    Returns:
        The ``text`` of the first content block of the response, with leading
        and trailing whitespace removed.

    Raises:
        ValueError: If the response carries no content blocks.
        Exception: Whatever ``messages.create`` raises is propagated unchanged.
    """
    response = anthropic_client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
    )

    # ``content`` is a list of blocks; fail with a clear message rather than a
    # bare IndexError when the list is empty.
    if not response.content:
        raise ValueError("Claude response contained no content blocks.")
    return response.content[0].text.strip()

# Prefix of the marker stored in the results JSON when a file exhausts its
# retries; used on resume to tell failed entries from real responses.
_ERROR_PREFIX = "Error after "


def _load_results(output_json_file):
    """Return previously saved results, or an empty dict if none are usable."""
    if not os.path.exists(output_json_file):
        logging.info(f"No existing '{output_json_file}' found. Starting fresh.")
        return {}

    try:
        with open(output_json_file, "r", encoding="utf-8") as jf:
            results = json.load(jf)
    except json.JSONDecodeError as exc:
        # Keep the unreadable file for inspection instead of letting the next
        # save silently overwrite it.
        backup_path = output_json_file + ".corrupt"
        os.replace(output_json_file, backup_path)
        logging.warning(
            f"Could not parse '{output_json_file}' ({exc}); moved it to "
            f"'{backup_path}' and starting fresh."
        )
        return {}

    logging.info(f"Loaded existing results from '{output_json_file}'.")
    return results


def _save_results(results, output_json_file):
    """Write *results* as JSON, replacing the target file in one step."""
    # Write to a sibling temp file first so an interrupted run cannot leave a
    # half-written progress file behind.
    tmp_path = output_json_file + ".tmp"
    with open(tmp_path, "w", encoding='utf-8') as json_file:
        json.dump(results, json_file, indent=4, ensure_ascii=False)
    os.replace(tmp_path, output_json_file)


def _is_error_entry(value):
    """Return True if *value* is the failure marker written after max retries."""
    return isinstance(value, str) and value.startswith(_ERROR_PREFIX)


def _read_descriptions(csv_file):
    """Return the "Description" column of *csv_file* in row order ([] if absent)."""
    # utf-8-sig also reads BOM-less UTF-8, and keeps a leading BOM from
    # becoming part of the first header name.
    with open(csv_file, newline='', encoding='utf-8-sig') as f:
        reader = csv.DictReader(f, delimiter=",")
        # DictReader gives every row the header's keys, so one check on the
        # header is equivalent to checking each row.
        if "Description" not in (reader.fieldnames or []):
            logging.warning(f"'Description' column not found in {csv_file}")
            return []
        return [row["Description"] for row in reader]


def _build_prompt(descriptions):
    """Return the instruction text followed by one numbered line per scene."""
    # NOTE(review): the instruction text below looks truncated ("without any
    # de") and has an unclosed quotation before "Input:". It is kept
    # byte-for-byte so new outputs stay comparable with results already
    # generated; confirm the intended wording before changing it.
    header = (
        "I have a list of scenes describing a movie in chronological order. Please transform them into a coherent movie description, preserving all events, actions, subjects, and objects. You may apply any or all of the following transformations at your discretion, using your best judgment:  1) Paraphrasing & Varying Language: Use different vocabulary and alter sentence structure from the original text while keeping all information. 2) Active ↔ Passive: Switch active voice to passive or vice versa without losing detail. 3) Brevity: Use fewer words but keep every fact intact. 4) Verbosity: Add descriptive words (no new events) to clarify or emphasize. 5) Micro → Macro / Macro → Micro: Summarize small actions into a bigger overview or break down large actions into step-by-step detail, maintaining the same content. 6) Add or Remove Attributes: Adjust adjectives/adverbs or minor descriptors, but don't remove major content. 7) Express Scenes with Varying Detail: Use fewer words for non-critical moments, and more words for key details. 8) Expand Scenes / Add Emphasis: Elaborate on emotional states or background info, without creating new plot points.  Important: - Maintain chronological order. - Do not omit or alter any essential events or objects. - Feel free to combine transformations if it improves clarity or style. Reply with the output without any de"
        "Please output only the narrative itself. Do not include any introductory or framing sentences such as \"Here's a coherent narrative combining those scenes:"
        "Input: \tDescription\n"
    )
    scene_lines = "".join(
        f"{i}\t{desc}\n" for i, desc in enumerate(descriptions, start=1)
    )
    return header + scene_lines


def process_csv_files(folder_path, output_json_file, anthropic_client):
    """Turn every CSV in *folder_path* into a Claude-written narrative.

    For each ``*.csv`` file (in sorted order) the "Description" column is read
    in row order, combined into a single prompt, and sent to Claude via
    ``get_sonnet_response``. The reply is stored in a JSON object written to
    *output_json_file* after every file, so an interrupted run can resume.

    The JSON key for a file is the part of its name after the last underscore
    and before the extension (``0001_American_Beauty_part1_1.csv`` -> ``"1"``).
    Files whose names end in the same suffix therefore share a key, and only
    the first one encountered is processed.

    Resume rules:
      * A key holding a normal response is skipped.
      * A key holding the "Error after N attempts" marker from an earlier
        failed run is retried.
      * A CSV with no "Description" column or no rows is skipped with a
        warning and no API call; it is not written to the JSON.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.csv`` files.
        output_json_file: Path of the JSON progress/results file.
        anthropic_client: Client object passed through to
            ``get_sonnet_response``.

    Raises:
        SystemExit: With status 1 when a file still fails after
            ``MAX_RETRIES`` attempts; the error marker is saved first.
    """
    # 1) Load existing results (if any) so we can resume from partial progress.
    results = _load_results(output_json_file)

    # 2) Get all CSV files
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    logging.info(f"Found {len(csv_files)} CSV file(s) in folder '{folder_path}'.")

    # 3) Process each CSV file
    for csv_file in csv_files:
        filename_no_ext = os.path.splitext(os.path.basename(csv_file))[0]
        file_id = filename_no_ext.split('_')[-1]

        if file_id in results:
            if not _is_error_entry(results[file_id]):
                logging.info(f"Skipping file with ID '{file_id}' ('{csv_file}') - already in JSON results.")
                continue
            # A stored failure marker must not count as "done".
            logging.info(f"Retrying file with ID '{file_id}' ('{csv_file}') - previous run stored an error.")

        logging.info(f"Processing file with ID '{file_id}': '{csv_file}'")

        descriptions = _read_descriptions(csv_file)
        if not descriptions:
            # Nothing to transform: don't spend an API call on an empty scene
            # list or store whatever the model would invent for it.
            logging.warning(f"No descriptions found in '{csv_file}'; skipping file ID '{file_id}'.")
            continue

        prompt = _build_prompt(descriptions)

        # 4) Call Claude Sonnet with retries
        last_error = None
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = get_sonnet_response(anthropic_client, prompt)
                break
            except Exception as e:  # boundary around a network call; logged below
                last_error = e
                logging.error(f"Error on attempt {attempt} for file ID '{file_id}': {e}")
                if attempt < MAX_RETRIES:
                    logging.info(f"Retrying (attempt {attempt+1}/{MAX_RETRIES})...")
                    time.sleep(3)  # short delay before retry
        else:
            # Every attempt failed (or MAX_RETRIES allows none): record the
            # failure, persist progress, and stop the run.
            results[file_id] = f"{_ERROR_PREFIX}{MAX_RETRIES} attempts: {last_error}"
            _save_results(results, output_json_file)
            logging.error(f"Giving up on file ID '{file_id}' after {MAX_RETRIES} attempts; exiting.")
            sys.exit(1)

        # 5) Store result in dictionary, keyed by the file_id
        results[file_id] = response
        logging.info(f"[File ID '{file_id}'] Stored Claude Sonnet response in results dictionary.")

        # 6) Save partial progress to JSON after each file
        _save_results(results, output_json_file)
        logging.info(f"Progress saved after processing file ID '{file_id}'.")

    logging.info(f"All CSV files processed. Final results have been saved to '{output_json_file}'.")

if __name__ == "__main__":
    # Read the API key from the environment; credentials must never be
    # embedded in source. NOTE(review): a key that was previously committed in
    # this file should be treated as compromised and revoked.
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        logging.error("ANTHROPIC_API_KEY environment variable is not set; aborting.")
        sys.exit("ANTHROPIC_API_KEY environment variable is not set.")

    # Initialize Anthropic client
    client = Anthropic(api_key=api_key)

    # Update paths for your local environment or server
    folder_path = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/NumericTagDataSet/rawGT3_part3"  # Update to your folder
    output_json_file = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/Authors/authors/GT_part3_claude.json"

    logging.info("Starting CSV processing and Claude Sonnet querying...")
    process_csv_files(folder_path, output_json_file, client)
    logging.info("Processing complete.")
