In [3]:
# Grok
# ----
# Dependencies:
#   pip install anthropic
#   (Built-in libraries: os, glob, csv, json, sys, time, logging)

import os
import glob
import csv
import json
import sys
import time
import logging
# from google.colab import drive  # Removed for Jupyter usage

from anthropic import Anthropic

# Configure logging to a file named grok.log
logging.basicConfig(
    filename='/mmfs1/scratch/jacks.local/mali9292/VAD/data/grok.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

# Retrieve the Grok API key (insert your key here)
XAI_API_KEY = "xai-y2v7VfT4eslc2C4QWGehIsFaRvZTTphYk83oStWspRP1tgmkEZak9Er73FXaG4Tn5tPUiFEbQhuN8VvR"

# Initialize the Grok client using the Anthropics SDK with Grok's base URL
client = Anthropic(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai",
)

MAX_RETRIES = 3  # Number of retries on API failure

def get_grok_response(prompt, model="grok-2-latest", temperature=1, max_tokens=5000):
    """
    Call Grok AI's completions endpoint to get a response using the Anthropics SDK.
    """
    completion = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return completion.content[0].text.strip()

def process_csv_files(folder_path, output_json_file):
    """
    Process all CSV files in a folder. For each CSV:
      1) Read the "Description" column in order.
      2) Construct a single prompt.
      3) Call Grok AI with retries on error.
      4) Save partial progress in a JSON file so the script can be resumed.
      
      The JSON key for each file is extracted from the filename
      after the last underscore and before the ".csv" extension.
    """
    # 1) Load existing results (if any)
    if os.path.exists(output_json_file):
        with open(output_json_file, "r", encoding="utf-8") as jf:
            try:
                results = json.load(jf)
            except json.JSONDecodeError:
                results = {}
        logging.info(f"Loaded existing results from '{output_json_file}'.")
    else:
        results = {}
        logging.info(f"No existing '{output_json_file}' found. Starting fresh.")
    
    # 2) Get all CSV files
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    logging.info(f"Found {len(csv_files)} CSV file(s) in folder '{folder_path}'.")

    # 3) Process each CSV file
    for csv_file in csv_files:
        # Extract the file ID from filename
        filename = os.path.basename(csv_file)
        filename_no_ext = os.path.splitext(filename)[0]
        file_id = filename_no_ext.split('_')[-1]

        # Skip if already processed
        if file_id in results:
            logging.info(f"Skipping file with ID '{file_id}' ('{csv_file}') - already in JSON results.")
            continue
        
        logging.info(f"Processing file with ID '{file_id}': '{csv_file}'")

        # Read the CSV and extract the "Description" column
        descriptions = []
        with open(csv_file, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f, delimiter=",")
            for row_number, row in enumerate(reader, start=1):
                if "Description" in row:
                    descriptions.append(row["Description"])
                else:
                    logging.info(f"Warning: 'Description' column not found in row {row_number} of {csv_file}")

        # Build the prompt
        prompt = (
            "I have a list of scenes describing a movie step by step. "
            "Please transform them into a single paragraph, preserving the chronological order. "
            "Include every detail from each scene without adding or omitting any information. "
            "Use only straightforward, factual wording, and avoid any new or descriptive language beyond what is provided in the scene notes.\n\n"
            "Please output only the narrative itself. Do not include any introductory or framing sentences such as \"Here's a coherent narrative combining those scenes:"
            "Input: \tDescription\n"
        )
        for i, desc in enumerate(descriptions, start=1):
            prompt += f"{i}\t{desc}\n"

        # 4) Call Grok AI with retries
        attempt = 0
        while attempt < MAX_RETRIES:
            try:
                response = get_grok_response(prompt)
                break
            except Exception as e:
                attempt += 1
                logging.info(f"Error on attempt {attempt} for file ID '{file_id}': {e}")
                if attempt < MAX_RETRIES:
                    logging.info(f"Retrying (attempt {attempt+1}/{MAX_RETRIES})...")
                    time.sleep(3)
                else:
                    results[file_id] = f"Error after {MAX_RETRIES} attempts: {e}"
                    with open(output_json_file, "w", encoding='utf-8') as json_file:
                        json.dump(results, json_file, indent=4, ensure_ascii=False)
                    sys.exit(1)

        # 5) Store result
        results[file_id] = response
        logging.info(f"[File ID '{file_id}'] Stored Grok AI response in results dictionary.")

        # 6) Save partial progress
        with open(output_json_file, "w", encoding='utf-8') as json_file:
            json.dump(results, json_file, indent=4, ensure_ascii=False)
        logging.info(f"Progress saved after processing file ID '{file_id}'.")

    logging.info(f"All CSV files processed. Final results have been saved to '{output_json_file}'.")

if __name__ == "__main__":
    # Update the folder paths for your local environment
    folder_path = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/NumericTagDataSet/rawGT3_part2"  # Update to your folder
    output_json_file = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/Authors/authors/GT_part2_grok.json"

    logging.info("Starting CSV processing and Grok AI querying...")
    process_csv_files(folder_path, output_json_file)
    logging.info("Processing complete.")


AttributeError: 'tuple' object has no attribute 'tb_frame'

In [4]:
# Grok
# ----
# Dependencies:
#   pip install anthropic
#   (Built-in libraries: os, glob, csv, json, sys, time, logging)

import os
import glob
import csv
import json
import sys
import time
import logging
# from google.colab import drive  # Removed for Jupyter usage

from anthropic import Anthropic

# Configure logging to a file named grok.log
logging.basicConfig(
    filename='/mmfs1/scratch/jacks.local/mali9292/VAD/data/grok.log',  # Unchanged location
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

# Retrieve the Grok API key (insert your key here)
XAI_API_KEY = "xai-y2v7VfT4eslc2C4QWGehIsFaRvZTTphYk83oStWspRP1tgmkEZak9Er73FXaG4Tn5tPUiFEbQhuN8VvR" # Unchanged API Key

# Initialize the Grok client using the Anthropics SDK with Grok's base URL
client = Anthropic(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai",
)

MAX_RETRIES = 3  # Number of retries on API failure

def get_grok_response(prompt, model="grok-2-latest", temperature=1, max_tokens=5000):
    """
    Call Grok AI's completions endpoint to get a response using the Anthropics SDK.
    """
    # MODIFIED PART STARTS HERE
    completion = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system="Process the input text according to the user's exact instructions. Output only the requested transformation.",  # System prompt passed as a separate parameter
        messages=[
            {"role": "user", "content": prompt} # Only user message in the list
        ]
    )
    # MODIFIED PART ENDS HERE
    return completion.content[0].text.strip()

def process_csv_files(folder_path, output_json_file):
    """
    Process all CSV files in a folder. For each CSV:
      1) Read the "Description" column in order.
      2) Construct a single prompt.
      3) Call Grok AI with retries on error.
      4) Save partial progress in a JSON file so the script can be resumed.
      
      The JSON key for each file is extracted from the filename
      after the last underscore and before the ".csv" extension.
    """
    # 1) Load existing results (if any)
    if os.path.exists(output_json_file):
        with open(output_json_file, "r", encoding="utf-8") as jf:
            try:
                results = json.load(jf)
            except json.JSONDecodeError:
                results = {}
        logging.info(f"Loaded existing results from '{output_json_file}'.")
    else:
        results = {}
        logging.info(f"No existing '{output_json_file}' found. Starting fresh.")
    
    # 2) Get all CSV files
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    logging.info(f"Found {len(csv_files)} CSV file(s) in folder '{folder_path}'.")

    # 3) Process each CSV file
    for csv_file in csv_files:
        # Extract the file ID from filename
        filename = os.path.basename(csv_file)
        filename_no_ext = os.path.splitext(filename)[0]
        file_id = filename_no_ext.split('_')[-1]

        # Skip if already processed
        if file_id in results:
            logging.info(f"Skipping file with ID '{file_id}' ('{csv_file}') - already in JSON results.")
            continue
        
        logging.info(f"Processing file with ID '{file_id}': '{csv_file}'")

        # Read the CSV and extract the "Description" column
        descriptions = []
        with open(csv_file, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f, delimiter=",")
            for row_number, row in enumerate(reader, start=1):
                if "Description" in row:
                    descriptions.append(row["Description"])
                else:
                    logging.info(f"Warning: 'Description' column not found in row {row_number} of {csv_file}")

        # Build the prompt (Unchanged prompt structure)
        prompt = (
            "I have a list of scenes describing a movie step by step. "
            "Please transform them into a single paragraph, preserving the chronological order. "
            "Include every detail from each scene without adding or omitting any information. "
            "Use only straightforward, factual wording, and avoid any new or descriptive language beyond what is provided in the scene notes.\n\n"
            "Please output only the narrative itself. Do not include any introductory or framing sentences such as \"Here's a coherent narrative combining those scenes:"
            "Input: \tDescription\n"
        )
        for i, desc in enumerate(descriptions, start=1):
            prompt += f"{i}\t{desc}\n"

        # 4) Call Grok AI with retries
        attempt = 0
        response_text = None # Initialize response_text to handle potential exit before assignment
        while attempt < MAX_RETRIES:
            try:
                response_text = get_grok_response(prompt) # Renamed 'response' to 'response_text' to avoid conflict if needed later
                break
            except Exception as e:
                attempt += 1
                logging.info(f"Error on attempt {attempt} for file ID '{file_id}': {e}")
                if attempt < MAX_RETRIES:
                    logging.info(f"Retrying (attempt {attempt+1}/{MAX_RETRIES})...")
                    time.sleep(3)
                else:
                    results[file_id] = f"Error after {MAX_RETRIES} attempts: {e}"
                    with open(output_json_file, "w", encoding='utf-8') as json_file:
                        json.dump(results, json_file, indent=4, ensure_ascii=False)
                    sys.exit(1) # Exits if all retries fail

        # 5) Store result
        results[file_id] = response_text # Use response_text here
        logging.info(f"[File ID '{file_id}'] Stored Grok AI response in results dictionary.")

        # 6) Save partial progress
        with open(output_json_file, "w", encoding='utf-8') as json_file:
            json.dump(results, json_file, indent=4, ensure_ascii=False)
        logging.info(f"Progress saved after processing file ID '{file_id}'.")

    logging.info(f"All CSV files processed. Final results have been saved to '{output_json_file}'.")

if __name__ == "__main__":
    # Update the folder paths for your local environment
    folder_path = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/NumericTagDataSet/rawGT3_part2"  # Unchanged location
    output_json_file = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/Authors/authors/GT_part2_grok.json" # Unchanged location

    logging.info("Starting CSV processing and Grok AI querying...")
    process_csv_files(folder_path, output_json_file)
    logging.info("Processing complete.")