# In [None]:
# Mistral
# -------
# Dependencies:
#   pip install mistralai
#   (Built-in libraries: os, glob, csv, json, sys, time, logging)

import os
import glob
import csv
import json
import sys
import time
import logging
# from google.colab import drive  # Removed for Jupyter usage

from mistralai import Mistral

# Configure logging to a file named mistral.log
logging.basicConfig(
    filename='/mmfs1/scratch/jacks.local/mali9292/VAD/data/mistral.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

# SECURITY: the API key is read from the MISTRAL_API_KEY environment variable
# instead of being embedded in the source, where anyone with access to the file
# (or its history) could read it. Any key that was previously committed here
# should be treated as compromised and revoked.
api_key = os.environ.get("MISTRAL_API_KEY")
if not api_key:
    # Fail fast with a clear message rather than letting every API call fail
    # (and be retried) with an authentication error later on.
    raise RuntimeError(
        "MISTRAL_API_KEY environment variable is not set; "
        "export your Mistral API key before running this script."
    )
model = "mistral-large-latest"

# Instantiate the Mistral client using the new API style
client = Mistral(api_key=api_key)

MAX_RETRIES = 3  # Number of retries on API failure

def get_mistral_response(prompt, temperature=1, max_tokens=5000):
    """
    Send `prompt` to Mistral as a single user message and return the reply text.

    The request goes through the module-level `client` using the module-level
    `model`; `temperature` and `max_tokens` are forwarded unchanged. The
    content of the first returned choice is stripped of surrounding
    whitespace before being returned.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.complete(
        model=model,
        messages=[user_message],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion.choices[0]
    reply_text = first_choice.message.content
    return reply_text.strip()

# Prefix of the placeholder written to the results JSON when every API attempt
# for a file fails. Entries starting with it are retried on the next run.
_ERROR_PREFIX = "Error after "


def _is_error_entry(value):
    """Return True if `value` is a failure placeholder rather than a real response."""
    return isinstance(value, str) and value.startswith(_ERROR_PREFIX)


def _load_results(output_json_file):
    """Return the results dict saved by an earlier run, or {} if none can be loaded."""
    if not os.path.exists(output_json_file):
        logging.info(f"No existing '{output_json_file}' found. Starting fresh.")
        return {}
    with open(output_json_file, "r", encoding="utf-8") as jf:
        try:
            results = json.load(jf)
        except json.JSONDecodeError as e:
            # Say so explicitly: the file will be overwritten on the next save.
            logging.warning(
                f"Could not parse existing '{output_json_file}' ({e}). Starting fresh."
            )
            return {}
    logging.info(f"Loaded existing results from '{output_json_file}'.")
    return results


def _save_results(results, output_json_file):
    """Write `results` to `output_json_file` via a temporary file and a rename."""
    # Writing to a sibling file first means an interruption mid-dump cannot
    # leave a truncated (unparseable) results file behind.
    tmp_path = f"{output_json_file}.tmp"
    with open(tmp_path, "w", encoding='utf-8') as json_file:
        json.dump(results, json_file, indent=4, ensure_ascii=False)
    os.replace(tmp_path, output_json_file)


def _read_descriptions(csv_file):
    """Return the "Description" column of `csv_file` as a list, in row order."""
    descriptions = []
    # utf-8-sig also reads plain UTF-8, but strips a leading BOM that would
    # otherwise become part of the first header name and hide "Description".
    with open(csv_file, newline='', encoding='utf-8-sig') as f:
        reader = csv.DictReader(f, delimiter=",")
        for row_number, row in enumerate(reader, start=1):
            if "Description" in row:
                descriptions.append(row["Description"])
            else:
                logging.warning(f"Warning: 'Description' column not found in row {row_number} of {csv_file}")
    return descriptions


def _build_prompt(descriptions):
    """Return the instruction text followed by one numbered, tab-separated line per scene."""
    instructions = (
        "I have a list of scenes describing a movie step by step. "
        "Please transform them into a single paragraph, preserving the chronological order. "
        "Include every detail from each scene without adding or omitting any information. "
        "Use only straightforward, factual wording, and avoid any new or descriptive language beyond what is provided in the scene notes.\n\n"
        # The quoted example is closed and separated from the input section;
        # previously it ran straight into "Input:".
        "Please output only the narrative itself. Do not include any introductory or framing sentences such as \"Here's a coherent narrative combining those scenes:\".\n\n"
        "Input:\n"
    )
    scene_lines = "".join(
        f"{i}\t{desc}\n" for i, desc in enumerate(descriptions, start=1)
    )
    return instructions + scene_lines


def process_csv_files(folder_path, output_json_file):
    """
    Process all CSV files in a folder:
      1) Read the "Description" column in order.
      2) Construct a single prompt.
      3) Call Mistral with retries on error.
      4) Save partial progress in a JSON file so the script can be resumed.

      The JSON key for each file is extracted from the filename
      after the last underscore and before the ".csv" extension.

    Files whose ID already has a stored response are skipped. IDs whose stored
    value is a failure placeholder ("Error after ...") are retried. Files that
    yield no descriptions are skipped without calling the API.

    Args:
        folder_path: Directory searched (non-recursively) for "*.csv" files.
        output_json_file: Path of the JSON file mapping file ID to response.

    Raises:
        SystemExit: With status 1 when all MAX_RETRIES attempts fail for a
            file; the failure placeholder is saved before exiting.
    """
    # 1) Load existing results (if any)
    results = _load_results(output_json_file)

    # 2) Get all CSV files in the folder
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    logging.info(f"Found {len(csv_files)} CSV file(s) in folder '{folder_path}'.")

    # 3) Process each CSV file
    for csv_file in csv_files:
        # Extract file ID from filename
        filename_no_ext = os.path.splitext(os.path.basename(csv_file))[0]
        file_id = filename_no_ext.split('_')[-1]

        # Skip if already processed; a stored failure placeholder does not count.
        if file_id in results:
            if not _is_error_entry(results[file_id]):
                logging.info(f"Skipping file with ID '{file_id}' - already in JSON results.")
                continue
            logging.info(f"Retrying file with ID '{file_id}' - previous run stored an error.")

        logging.info(f"Processing file with ID '{file_id}': '{csv_file}'")

        descriptions = _read_descriptions(csv_file)
        if not descriptions:
            # Nothing to summarise: sending an empty scene list would store
            # whatever text the model invents as this file's narrative.
            logging.warning(f"No descriptions found in '{csv_file}'. Skipping file ID '{file_id}'.")
            continue

        prompt = _build_prompt(descriptions)

        # 4) Call Mistral with retries
        last_error = None
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = get_mistral_response(prompt)
                break
            except Exception as e:
                last_error = e
                logging.error(f"Error on attempt {attempt} for file ID '{file_id}': {e}")
                if attempt < MAX_RETRIES:
                    logging.info(f"Retrying (attempt {attempt+1}/{MAX_RETRIES})...")
                    time.sleep(3)
        else:
            # Every attempt failed: record the placeholder and stop the run.
            results[file_id] = f"Error after {MAX_RETRIES} attempts: {last_error}"
            _save_results(results, output_json_file)
            sys.exit(1)

        # 5) Store response in results
        results[file_id] = response
        logging.info(f"[File ID '{file_id}'] Stored Mistral response in results dictionary.")

        # 6) Save progress
        _save_results(results, output_json_file)
        logging.info(f"Progress saved after processing file ID '{file_id}'.")

    logging.info(f"All CSV files processed. Final results have been saved to '{output_json_file}'.")

if __name__ == "__main__":
    # JSON file that accumulates one response per processed CSV file.
    output_json_file = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/Authors/authors/GT_part2_mistral.json"
    # Folder containing the scene CSV files; update to your folder.
    folder_path = "/mmfs1/scratch/jacks.local/mali9292/VAD/data/NumericTagDataSet/rawGT3_part2"

    logging.info("Starting CSV processing and Mistral querying...")
    process_csv_files(
        folder_path=folder_path,
        output_json_file=output_json_file,
    )
    logging.info("Processing complete.")
