In [100]:
import pandas as pd
import json
import os
import re

# Set up file paths
os.chdir('/Users/nsusser/Desktop/Github/happyDB/')

In [101]:
sentences_path = 'dataframes/clean_sentences.csv'
items_path = 'dataframes/scales_clean.csv'
output_files_dir = 'dataframes/tests/gpt40-mini/outputs/'  # Directory containing batch output files
CIT_output_files_dir = 'dataframes/tests/gpt40-mini/CIT/outputs/'  # Directory containing batch output files
failed_output_files_dir = 'dataframes/tests/gpt40-mini/outputs/failed_outputs'  # Directory containing batch output files
#failed_responses_file = 'dataframes/tests/gpt40-mini/failed_responses.csv'  # File to log invalid responses
#CIT_failed_responses_file = 'dataframes/tests/gpt40-mini/CIT/failed_responses.csv'  # File to log invalid responses



In [102]:
# Load sentences and items
sentences = pd.read_csv(sentences_path)
items = pd.read_csv(items_path)

In [103]:
# Clean and sanitize column names
items['Scale'] = items['Scale'].str.strip().str.replace(r"\s+", "_", regex=True)
items['Dimension'] = items['Dimension'].str.strip().str.replace(r"\s+", "_", regex=True)
items['Items'] = items['Items'].str.strip().str.replace(r"\s+", "_", regex=True)

# Create flattened column names
columns = [f"{scale}_{dimension}_{item}" for scale, dimension, item in zip(
    items['Scale'], items['Dimension'], items['Items']
)]

# Initialize the ratings DataFrame
ratings = pd.DataFrame(columns=["hmid", "cleaned_hm"] + columns)
ratings["hmid"] = sentences["hmid"]
ratings["cleaned_hm"] = sentences["cleaned_hm"]

print(ratings.head())
print(ratings.columns)

    hmid                                         cleaned_hm  \
0  27673  I went on a successful date with someone I fel...   
1  27674  I was happy when my son got 90% marks in his e...   
2  27675       I went to the gym this morning and did yoga.   
3  27676  We had a serious talk with some friends of our...   
4  27677  I went with grandchildren to butterfly display...   

  PERMA_Accomplishment_the_speaker_felt_they_were_making_progress_towards_accomplishing_their_goals?  \
0                                                NaN                                                   
1                                                NaN                                                   
2                                                NaN                                                   
3                                                NaN                                                   
4                                                NaN                                                 

In [104]:
# Prepare a DataFrame to log failed responses
failed_responses = []

# Process all batch output files
# Function to extract numeric value from a filename for proper sorting
def extract_numeric(filepath):
    filename = os.path.basename(filepath)  # Get the file name
    match = re.search(r'\d+', filename)   # Find the first numeric value
    return int(match.group()) if match else float('inf')  # Return the number or infinity

# Define directories in the specific processing order
directories = [
    output_files_dir,
    CIT_output_files_dir
]

# Collect files in the correct order and sort them numerically within each directory
output_files = []
for directory in directories:
    files_in_dir = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".jsonl")]
    files_in_dir.sort(key=extract_numeric)  # Sort numerically within the directory
    output_files.extend(files_in_dir)


In [105]:
# Prepare a DataFrame to log failed responses
failed_responses = []

# Process all batch output files
# Collect all files from the directories
output_files = []
for directory in [output_files_dir, failed_output_files_dir, CIT_output_files_dir]:
    output_files += [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".jsonl")]
# Parse files and collect updates
updates = []
failed_responses = []

for output_file in output_files:
    print(f"Processing output file: {output_file}")
    with open(output_file, 'r') as results_file:
        for line in results_file:
            response = json.loads(line)
            try:
                # Extract details
                custom_id = response["custom_id"]
                result_body = response.get("response", {}).get("body", {})
                choice = result_body.get("choices", [{}])[0]
                response_text = choice.get("message", {}).get("content", "").strip()

                # Extract IDs
                sent_id, item_idx = map(int, custom_id.split("-")[1:])
                sentence = sentences.loc[sentences['hmid'] == sent_id, 'cleaned_hm'].values[0]
                item = items.iloc[item_idx]
                scale, dimension, item_text = item["Scale"], item["Dimension"], item["Items"]
                column_name = f"{scale}_{dimension}_{item_text}"

                # Validate and add update
                try:
                    response_number = int(response_text.strip())
                    updates.append((sent_id, column_name, response_number))
                except ValueError:
                    failed_responses.append({
                        "custom_id": custom_id,
                        "hmid": sent_id,
                        "sentence": sentence,
                        "scale": scale,
                        "dimension": dimension,
                        "item": item_text,
                        "response": response_text
                    })
            except Exception as e:
                print(f"Error processing custom_id {custom_id}: {e}")

# Apply updates to the ratings DataFrame
if updates:
    # Convert updates to a DataFrame
    updates_df = pd.DataFrame(updates, columns=["hmid", "column_name", "response_number"])

    # Reshape updates to have one column per column_name
    updates_pivot = updates_df.pivot(index="hmid", columns="column_name", values="response_number")

    # Merge updates with the existing ratings DataFrame
    ratings = ratings.set_index("hmid")
    ratings.update(updates_pivot)
    ratings = ratings.reset_index()

# Save ratings
output_matrix_path = 'dataframes/tests/gpt40-mini/ratings_matrix.csv'
ratings.to_csv(output_matrix_path, index=False)
print(f"Ratings matrix saved to {output_matrix_path}.")

# Save failed responses
if failed_responses:
    failed_responses_df = pd.DataFrame(failed_responses)
    failed_responses_file = 'dataframes/tests/gpt40-mini/failed_responses.csv'
    failed_responses_df.to_csv(failed_responses_file, index=False)
    print(f"Failed responses logged to {failed_responses_file}.")
else:
    print("No failed responses to log.")

Processing output file: dataframes/tests/gpt40-mini/outputs/output_log_batch_requests_2.jsonl.jsonl
Processing output file: dataframes/tests/gpt40-mini/outputs/output_log_batch_requests_1.jsonl.jsonl
Processing output file: dataframes/tests/gpt40-mini/outputs/output_log_batch_requests_4.jsonl.jsonl
Processing output file: dataframes/tests/gpt40-mini/outputs/output_log_batch_requests_3.jsonl.jsonl
Processing output file: dataframes/tests/gpt40-mini/outputs/failed_outputs/output_failed_batch_requests_2.jsonl
Processing output file: dataframes/tests/gpt40-mini/outputs/failed_outputs/output_failed_batch_requests_3.jsonl
Processing output file: dataframes/tests/gpt40-mini/outputs/failed_outputs/output_failed_batch_requests_1.jsonl
Processing output file: dataframes/tests/gpt40-mini/outputs/failed_outputs/output_failed_cit_batch_requests_5.jsonl.jsonl
Processing output file: dataframes/tests/gpt40-mini/CIT/outputs/output_log_batch_requests_2.jsonl.jsonl
Processing output file: dataframes/tes

In [106]:
# Prepare a DataFrame to log failed responses
failed_responses = []

# Process all batch output files
# Function to extract numeric value from a filename for proper sorting
def extract_numeric(filepath):
    filename = os.path.basename(filepath)  # Get the file name
    match = re.search(r'\d+', filename)   # Find the first numeric value
    return int(match.group()) if match else float('inf')  # Return the number or infinity

# Define directories in the specific processing order
directories = [
    failed_output_files_dir,
    
]

# Collect files in the correct order and sort them numerically within each directory
output_files = []
for directory in directories:
    files_in_dir = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".jsonl")]
    files_in_dir.sort(key=extract_numeric)  # Sort numerically within the directory
    output_files.extend(files_in_dir)

In [107]:
# Parse files and collect updates
updates = []
failed_responses = []

for output_file in output_files:
    print(f"Processing output file: {output_file}")
    with open(output_file, 'r') as results_file:
        for line_number, line in enumerate(results_file, start=1):
            try:
                response = json.loads(line.strip())

                # Extract `custom_id`
                custom_id = response.get("custom_id", "")
                if not custom_id or "-" not in custom_id:
                    print(f"Invalid or missing `custom_id` on line {line_number}: {custom_id}")
                    continue

                # Extract IDs
                try:
                    sent_id, item_idx = map(int, custom_id.split("-")[1:])
                except ValueError as e:
                    print(f"Error parsing `custom_id` on line {line_number}: {custom_id}")
                    continue

                # Extract `response_text`
                try:
                    choice = response["response"]["choices"][0]
                    response_text = choice.get("message", {}).get("content", "").strip()
                    if not response_text:
                        print(f"Empty `response_text` on line {line_number}")
                        raise ValueError("Empty response text")
                except KeyError as e:
                    print(f"Invalid structure for response on line {line_number}: {e}")
                    continue

                # Log debug details for this entry
                print(f"Line {line_number}: sent_id={sent_id}, item_idx={item_idx}, response_text='{response_text}'")

                # Skip if item_idx is out of bounds for `items`
                if item_idx >= len(items):
                    print(f"Skipping line {line_number}: item_idx {item_idx} is out of bounds.")
                    continue

                # Extract sentence and item details
                try:
                    sentence = sentences.loc[sentences['hmid'] == sent_id, 'cleaned_hm'].values[0]
                    item = items.iloc[item_idx]
                    scale, dimension, item_text = item["Scale"], item["Dimension"], item["Items"]
                    column_name = f"{scale}_{dimension}_{item_text}"
                except Exception as e:
                    print(f"Failed to match `sent_id` or `item_idx` on line {line_number}: {e}")
                    continue

                # Validate and add update
                try:
                    response_number = int(response_text)
                    updates.append((sent_id, column_name, response_number))
                except ValueError:
                    failed_responses.append({
                        "custom_id": custom_id,
                        "hmid": sent_id,
                        "sentence": sentence,
                        "scale": scale,
                        "dimension": dimension,
                        "item": item_text,
                        "response": response_text
                    })
            except Exception as e:
                print(f"Error processing line {line_number}: {e}")

# Convert updates to a DataFrame
if updates:
    updates_df = pd.DataFrame(updates, columns=["hmid", "column_name", "response_number"])
    updates_pivot = updates_df.pivot(index="hmid", columns="column_name", values="response_number")

    # Merge with the existing ratings DataFrame
    ratings = ratings.set_index("hmid") if 'ratings' in globals() else pd.DataFrame(index=updates_pivot.index)
    ratings.update(updates_pivot, overwrite=True)
    ratings = ratings.reset_index()

    # Save the updated ratings matrix
    output_matrix_path = 'dataframes/tests/gpt40-mini/ratings_matrix.csv'
    ratings.to_csv(output_matrix_path, index=False)
    print(f"Ratings matrix saved to {output_matrix_path}.")
else:
    print("No updates to apply to the ratings matrix.")


Processing output file: dataframes/tests/gpt40-mini/outputs/failed_outputs/output_failed_batch_requests_1.jsonl
Line 1: sent_id=27879, item_idx=49, response_text='6'
Processing output file: dataframes/tests/gpt40-mini/outputs/failed_outputs/output_failed_batch_requests_2.jsonl
Line 1: sent_id=27979, item_idx=18, response_text='Please'
Line 2: sent_id=27979, item_idx=140, response_text='4'
Processing output file: dataframes/tests/gpt40-mini/outputs/failed_outputs/output_failed_batch_requests_3.jsonl
Line 1: sent_id=27979, item_idx=18, response_text='3'
Processing output file: dataframes/tests/gpt40-mini/outputs/failed_outputs/output_failed_cit_batch_requests_5.jsonl.jsonl
Line 1: sent_id=27979, item_idx=165, response_text='4'
Ratings matrix saved to dataframes/tests/gpt40-mini/ratings_matrix.csv.


In [108]:
# Save the ratings matrix as a CSV file
output_matrix_path = 'dataframes/tests/gpt40-mini/ratings_matrix.csv'
ratings.to_csv(output_matrix_path)
print(f"Ratings matrix saved to {output_matrix_path}.")

Ratings matrix saved to dataframes/tests/gpt40-mini/ratings_matrix.csv.
