In [1]:
!pip install openai==0.28



# QA

In [7]:
import pandas as pd
import openai
import os
import time
from tqdm import tqdm
from google.colab import userdata
import re

# Set your OpenAI API key
openai.api_key = userdata.get('OPENAI_API_KEY')


def get_gpt4_evaluation_qa(model_name, norm_type, variant, question, context, reference_answers, generated_answer):
    # Construct the prompt with instructions for commenting on each column
    prompt = f"""
As an expert evaluator, your task is to assess the quality of an answer generated by a question-answering system based on the provided context. Please focus on the following criteria:

1. **Correctness**: Is the answer correct based on the context?
2. **Completeness**: Does the answer fully address the question?
3. **Relevance**: Is the answer relevant to the question and context?
4. **Fluency**: Is the answer well-written with proper grammar and style?
5. **Conciseness**: Is the answer concise and to the point?

For each criterion, provide:
- **Score**: A number from 1 to 5 (where 1 is poor and 5 is excellent).
- **Explanation**: A brief justification for the score.

After evaluating each criterion, provide:
- **Overall Score**: The average of the five scores.
- **Overall Feedback**: A short overall feedback.

**Additionally**, provide a short paragraph commenting on the following columns:
- **Model Name**: {model_name}
- **Norm Type**: {norm_type}
- **Variant**: {variant}
- **Question**: {question}
- **Context**: {context}
- **Reference Answers**: {reference_answers}
- **Generated Answer**: {generated_answer}

**Please present your evaluation in the following structured format:**

```
Model Name: {model_name}

Correctness Score: [1-5]
Correctness Explanation: [Your explanation]

Completeness Score: [1-5]
Completeness Explanation: [Your explanation]

Relevance Score: [1-5]
Relevance Explanation: [Your explanation]

Fluency Score: [1-5]
Fluency Explanation: [Your explanation]

Conciseness Score: [1-5]
Conciseness Explanation: [Your explanation]

Overall Score: [Average score]
Overall Feedback: [Your feedback]

Comments on Columns:
[Your short paragraph commenting on each column]
```

---
**Model Name:**
{model_name}

---
**Question:**
{question}

---
**Context:**
{context}

---
**Reference Answers:**
{reference_answers}

---
**Generated Answer:**
{generated_answer}

---
"""
    # Call the OpenAI API using the ChatCompletion endpoint
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=700,  # Adjusted to accommodate additional comments
            temperature=0.0,  # For deterministic output
        )
        evaluation = response['choices'][0]['message']['content']
        return evaluation
    except openai.error.RateLimitError as e:
        print(f"Rate limit error: {e}")
        retry_after = int(e.headers.get("Retry-After", 5))
        print(f"Retrying after {retry_after} seconds...")
        time.sleep(retry_after)
        # Retry the request
        return get_gpt4_evaluation_qa(model_name, norm_type, variant, question, context, reference_answers, generated_answer)
    except Exception as e:
        print(f"Error during OpenAI API call: {e}")
        return None


def parse_evaluation_qa(evaluation_text):
    """
    Parses GPT-4's evaluation text and extracts scores, explanations, and comments.
    """
    patterns = {
        'Correctness Score': r'Correctness Score:\s*(\d)',
        'Correctness Explanation': r'Correctness Explanation:\s*(.*?)\n\n',
        'Completeness Score': r'Completeness Score:\s*(\d)',
        'Completeness Explanation': r'Completeness Explanation:\s*(.*?)\n\n',
        'Relevance Score': r'Relevance Score:\s*(\d)',
        'Relevance Explanation': r'Relevance Explanation:\s*(.*?)\n\n',
        'Fluency Score': r'Fluency Score:\s*(\d)',
        'Fluency Explanation': r'Fluency Explanation:\s*(.*?)\n\n',
        'Conciseness Score': r'Conciseness Score:\s*(\d)',
        'Conciseness Explanation': r'Conciseness Explanation:\s*(.*?)\n\n',
        'Overall Score': r'Overall Score:\s*([\d\.]+)',
        'Overall Feedback': r'Overall Feedback:\s*(.*?)\n\n',
        'Comments on Columns': r'Comments on Columns:\s*(.*)',  # Captures the paragraph
    }

    result = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, evaluation_text, re.DOTALL)
        if match:
            result[key] = match.group(1).strip()
        else:
            result[key] = None
    return result


def evaluate_qa():
    variants = ["baseModel", "noNorm", "AttnOnly", "FFNonly"]
    norm_types = ["LN", "RMSN"]
    results = []

    for norm_type in norm_types:
        for variant in variants:
            filename = f"./modified_QA-source/{norm_type}_{variant}_gpt4_evaluation_data_modified.csv"
            print(f"Processing file: {filename}")

            if not os.path.exists(filename):
                print(f"File {filename} does not exist. Skipping.")
                continue

            # Read the CSV file
            df = pd.read_csv(filename)
            # Limit to the first 25 rows to manage costs
            df_limited = df.head(25) # <-- no limits placed on QA eval
            evaluations = []

            for idx, row in tqdm(df_limited.iterrows(), total=df_limited.shape[0], desc=f"Evaluating QA for {filename}"):
                model_name = row['model_name']
                norm_type = row['norm_type']
                variant = row['variant']
                question = row['question']
                context = row['context']
                reference_answers = row['reference_answers']
                generated_answer = row['generated_answer']

                # Convert reference_answers from string representation to list if necessary
                if isinstance(reference_answers, str):
                    try:
                        reference_answers = eval(reference_answers)
                        if not isinstance(reference_answers, list):
                            reference_answers = [reference_answers]
                    except:
                        reference_answers = [reference_answers]
                elif not isinstance(reference_answers, list):
                    reference_answers = [reference_answers]

                # Optionally truncate context if too long
                max_context_length = 2000  # Adjust as needed
                if len(context.split()) > max_context_length:
                    context = ' '.join(context.split()[:max_context_length]) + "..."

                # Get GPT-4 evaluation
                evaluation_text = get_gpt4_evaluation_qa(model_name, norm_type, variant, question, context, reference_answers, generated_answer)

                if evaluation_text:
                    parsed = parse_evaluation_qa(evaluation_text)
                    parsed['model_name'] = model_name
                    parsed['norm_type'] = norm_type
                    parsed['variant'] = variant
                    evaluations.append(parsed)
                    time.sleep(1)  # To respect API rate limits
                else:
                    evaluations.append({
                        'model_name': model_name,
                        'norm_type': norm_type,
                        'variant': variant,
                        'Correctness Score': None,
                        'Correctness Explanation': None,
                        'Completeness Score': None,
                        'Completeness Explanation': None,
                        'Relevance Score': None,
                        'Relevance Explanation': None,
                        'Fluency Score': None,
                        'Fluency Explanation': None,
                        'Conciseness Score': None,
                        'Conciseness Explanation': None,
                        'Overall Score': None,
                        'Overall Feedback': None,
                        'Comments on Columns': 'Error or Empty Response'
                    })

            # Save evaluations to a new CSV file
            eval_df = pd.DataFrame(evaluations)
            parsed_eval_filename = f"{norm_type}_{variant}_gpt4_qa_parsed_evaluations.csv"
            eval_df.to_csv(parsed_eval_filename, index=False)
            print(f"Saved parsed evaluations to {parsed_eval_filename}")

if __name__ == "__main__":
    evaluate_qa()

Processing file: ./modified_QA-source/LN_baseModel_gpt4_evaluation_data_modified.csv


Evaluating QA for ./modified_QA-source/LN_baseModel_gpt4_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:16<00:00,  5.47s/it]


Saved parsed evaluations to LN_baseModel_gpt4_qa_parsed_evaluations.csv
Processing file: ./modified_QA-source/LN_noNorm_gpt4_evaluation_data_modified.csv


Evaluating QA for ./modified_QA-source/LN_noNorm_gpt4_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:17<00:00,  5.49s/it]


Saved parsed evaluations to LN_noNorm_gpt4_qa_parsed_evaluations.csv
Processing file: ./modified_QA-source/LN_AttnOnly_gpt4_evaluation_data_modified.csv


Evaluating QA for ./modified_QA-source/LN_AttnOnly_gpt4_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:24<00:00,  5.80s/it]


Saved parsed evaluations to LN_AttnOnly_gpt4_qa_parsed_evaluations.csv
Processing file: ./modified_QA-source/LN_FFNonly_gpt4_evaluation_data_modified.csv


Evaluating QA for ./modified_QA-source/LN_FFNonly_gpt4_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:20<00:00,  5.61s/it]


Saved parsed evaluations to LN_FFNonly_gpt4_qa_parsed_evaluations.csv
Processing file: ./modified_QA-source/RMSN_baseModel_gpt4_evaluation_data_modified.csv


Evaluating QA for ./modified_QA-source/RMSN_baseModel_gpt4_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:19<00:00,  5.56s/it]


Saved parsed evaluations to RMSN_baseModel_gpt4_qa_parsed_evaluations.csv
Processing file: ./modified_QA-source/RMSN_noNorm_gpt4_evaluation_data_modified.csv


Evaluating QA for ./modified_QA-source/RMSN_noNorm_gpt4_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:20<00:00,  5.63s/it]


Saved parsed evaluations to RMSN_noNorm_gpt4_qa_parsed_evaluations.csv
Processing file: ./modified_QA-source/RMSN_AttnOnly_gpt4_evaluation_data_modified.csv


Evaluating QA for ./modified_QA-source/RMSN_AttnOnly_gpt4_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:29<00:00,  6.00s/it]


Saved parsed evaluations to RMSN_AttnOnly_gpt4_qa_parsed_evaluations.csv
Processing file: ./modified_QA-source/RMSN_FFNonly_gpt4_evaluation_data_modified.csv


Evaluating QA for ./modified_QA-source/RMSN_FFNonly_gpt4_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:24<00:00,  5.77s/it]

Saved parsed evaluations to RMSN_FFNonly_gpt4_qa_parsed_evaluations.csv





# Summary BillSum

In [12]:
import pandas as pd
import openai
import os
import time
from tqdm import tqdm
from google.colab import userdata

# Set your OpenAI API key
openai.api_key = userdata.get('OPENAI_API_KEY')


def get_gpt4_evaluation_summary(model_name, norm_type, variant, truncated_input, generated_summary):
    # Construct the prompt for GPT-4 evaluation
    prompt = f"""
As an expert evaluator, your task is to assess the quality of a generated summary based on the provided truncated input text. Please focus on the following criteria:

1. **Relevance**: Does the summary capture the main points of the truncated input?
2. **Conciseness**: Is the summary succinct without unnecessary details?
3. **Fluency**: Is the summary well-written with proper grammar and style?
4. **Accuracy**: Does the summary accurately represent the content of the truncated input without errors?
5. **Coherence**: Is the summary logically organized and easy to understand?

For each criterion, provide:
- **Score**: A number from 1 to 5 (where 1 is poor and 5 is excellent).
- **Explanation**: A brief justification for the score.

After evaluating each criterion, provide:
- **Overall Score**: The average of the five scores.
- **Overall Feedback**: A short overall feedback.

**Additionally**, provide a short paragraph commenting on the following columns:
- **Model Name**: {model_name}
- **Norm Type**: {norm_type}
- **Variant**: {variant}
- **Truncated Input**: [Truncated input is provided above.]
- **Generated Summary**: [Generated summary is provided above.]

**Please present your evaluation in the following structured format:**

```
Model Name: {model_name}

Relevance Score: [1-5]
Relevance Explanation: [Your explanation]

Conciseness Score: [1-5]
Conciseness Explanation: [Your explanation]

Fluency Score: [1-5]
Fluency Explanation: [Your explanation]

Accuracy Score: [1-5]
Accuracy Explanation: [Your explanation]

Coherence Score: [1-5]
Coherence Explanation: [Your explanation]

Overall Score: [Average score]
Overall Feedback: [Your feedback]

Comments on Columns:
[Your short paragraph commenting on each column]
```

---
**Truncated Input:**

{truncated_input}

---
**Generated Summary:**

{generated_summary}

---
"""
    # Call the OpenAI API using the ChatCompletion endpoint
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=700,  # As per your requirement
            temperature=0.0,  # For deterministic output
        )
        evaluation = response['choices'][0]['message']['content']
        return evaluation
    except openai.error.RateLimitError as e:
        print(f"Rate limit error: {e}")
        retry_after = int(e.headers.get("Retry-After", 5))
        print(f"Retrying after {retry_after} seconds...")
        time.sleep(retry_after)
        # Retry the request
        return get_gpt4_evaluation_summary(model_name, norm_type, variant, truncated_input, generated_summary)
    except Exception as e:
        print(f"Error during OpenAI API call: {e}")
        return None

def parse_evaluation_summary(evaluation_text):
    """
    Parses GPT-4's evaluation text and extracts scores, explanations, and comments.
    """
    patterns = {
        'Relevance Score': r'Relevance Score:\s*(\d)',
        'Relevance Explanation': r'Relevance Explanation:\s*(.*?)\n\n',
        'Conciseness Score': r'Conciseness Score:\s*(\d)',
        'Conciseness Explanation': r'Conciseness Explanation:\s*(.*?)\n\n',
        'Fluency Score': r'Fluency Score:\s*(\d)',
        'Fluency Explanation': r'Fluency Explanation:\s*(.*?)\n\n',
        'Accuracy Score': r'Accuracy Score:\s*(\d)',
        'Accuracy Explanation': r'Accuracy Explanation:\s*(.*?)\n\n',
        'Coherence Score': r'Coherence Score:\s*(\d)',
        'Coherence Explanation': r'Coherence Explanation:\s*(.*?)\n\n',
        'Overall Score': r'Overall Score:\s*([\d\.]+)',
        'Overall Feedback': r'Overall Feedback:\s*(.*?)\n\n',
        'Comments on Columns': r'Comments on Columns:\s*(.*)',  # Captures the paragraph
    }

    result = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, evaluation_text, re.DOTALL)
        if match:
            result[key] = match.group(1).strip()
        else:
            result[key] = None
    return result

def evaluate_summaries():
    variants = ["baseModel", "noNorm", "AttnOnly", "FFNonly"]
    norm_types = ["LN", "RMSN"]

    for norm_type in norm_types:
        for variant in variants:
            filename = f"./Summarization/Original/{norm_type}_{variant}_evaluation_data_modified.csv"
            print(f"Processing file: {filename}")

            if not os.path.exists(filename):
                print(f"File {filename} does not exist. Skipping.")
                continue

            # Read the CSV file
            df = pd.read_csv(filename)
            # Limit to the first 25 rows to manage costs
            df_limited = df.head(25)
            evaluations = []

            for idx, row in tqdm(df_limited.iterrows(), total=df_limited.shape[0], desc=f"Evaluating Summaries for {filename}"):
                model_name = row['model_name']
                norm_type = row['norm_type']
                variant = row['variant']
                truncated_input = row['truncated_input']
                generated_summary = row['generated_summary']

                # Optionally truncate input if too long
                max_input_length = 1000  # Adjust as needed
                if len(truncated_input.split()) > max_input_length:
                    truncated_input = ' '.join(truncated_input.split()[:max_input_length]) + "..."

                # Get GPT-4 evaluation
                evaluation_text = get_gpt4_evaluation_summary(
                    model_name,
                    norm_type,
                    variant,
                    truncated_input,
                    generated_summary
                )

                if evaluation_text:
                    parsed = parse_evaluation_summary(evaluation_text)
                    parsed['model_name'] = model_name
                    parsed['norm_type'] = norm_type
                    parsed['variant'] = variant
                    evaluations.append(parsed)
                    time.sleep(1)  # To respect API rate limits
                else:
                    evaluations.append({
                        'model_name': model_name,
                        'norm_type': norm_type,
                        'variant': variant,
                        'Relevance Score': None,
                        'Relevance Explanation': None,
                        'Conciseness Score': None,
                        'Conciseness Explanation': None,
                        'Fluency Score': None,
                        'Fluency Explanation': None,
                        'Accuracy Score': None,
                        'Accuracy Explanation': None,
                        'Coherence Score': None,
                        'Coherence Explanation': None,
                        'Overall Score': None,
                        'Overall Feedback': None,
                        'Comments on Columns': 'Error or Empty Response'
                    })

            # Save evaluations to a new CSV file
            eval_df = pd.DataFrame(evaluations)
            parsed_eval_filename = f"./Summarization/{norm_type}_{variant}_gpt4_summary_parsed_evaluations.csv"
            eval_df.to_csv(parsed_eval_filename, index=False)
            print(f"Saved parsed evaluations to {parsed_eval_filename}")

if __name__ == "__main__":
    evaluate_summaries()

Processing file: ./Summarization/Original/LN_baseModel_evaluation_data_modified.csv


Evaluating Summaries for ./Summarization/Original/LN_baseModel_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:35<00:00,  6.22s/it]


Saved parsed evaluations to ./Summarization/LN_baseModel_gpt4_summary_parsed_evaluations.csv
Processing file: ./Summarization/Original/LN_noNorm_evaluation_data_modified.csv


Evaluating Summaries for ./Summarization/Original/LN_noNorm_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:32<00:00,  6.10s/it]


Saved parsed evaluations to ./Summarization/LN_noNorm_gpt4_summary_parsed_evaluations.csv
Processing file: ./Summarization/Original/LN_AttnOnly_evaluation_data_modified.csv


Evaluating Summaries for ./Summarization/Original/LN_AttnOnly_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:33<00:00,  6.13s/it]


Saved parsed evaluations to ./Summarization/LN_AttnOnly_gpt4_summary_parsed_evaluations.csv
Processing file: ./Summarization/Original/LN_FFNonly_evaluation_data_modified.csv


Evaluating Summaries for ./Summarization/Original/LN_FFNonly_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:33<00:00,  6.13s/it]


Saved parsed evaluations to ./Summarization/LN_FFNonly_gpt4_summary_parsed_evaluations.csv
Processing file: ./Summarization/Original/RMSN_baseModel_evaluation_data_modified.csv


Evaluating Summaries for ./Summarization/Original/RMSN_baseModel_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:36<00:00,  6.27s/it]


Saved parsed evaluations to ./Summarization/RMSN_baseModel_gpt4_summary_parsed_evaluations.csv
Processing file: ./Summarization/Original/RMSN_noNorm_evaluation_data_modified.csv


Evaluating Summaries for ./Summarization/Original/RMSN_noNorm_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:39<00:00,  6.39s/it]


Saved parsed evaluations to ./Summarization/RMSN_noNorm_gpt4_summary_parsed_evaluations.csv
Processing file: ./Summarization/Original/RMSN_AttnOnly_evaluation_data_modified.csv


Evaluating Summaries for ./Summarization/Original/RMSN_AttnOnly_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:41<00:00,  6.44s/it]


Saved parsed evaluations to ./Summarization/RMSN_AttnOnly_gpt4_summary_parsed_evaluations.csv
Processing file: ./Summarization/Original/RMSN_FFNonly_evaluation_data_modified.csv


Evaluating Summaries for ./Summarization/Original/RMSN_FFNonly_evaluation_data_modified.csv: 100%|██████████| 25/25 [02:40<00:00,  6.43s/it]

Saved parsed evaluations to ./Summarization/RMSN_FFNonly_gpt4_summary_parsed_evaluations.csv





### Truncation, removing unneccesary columns for summarisation csv

In [2]:
import pandas as pd
import os
from tqdm import tqdm

def modify_csv_files():
    variants = ["baseModel", "noNorm", "AttnOnly", "FFNonly"]
    norm_types = ["LN", "RMSN"]

    for norm_type in norm_types:
        for variant in variants:
            original_filename = f"{norm_type}_{variant}_evaluation_data.csv"
            modified_filename = f"./modified/{norm_type}_{variant}_evaluation_data_modified.csv"
            print(f"Processing file: {original_filename}")

            if not os.path.exists(original_filename):
                print(f"File {original_filename} does not exist. Skipping.")
                continue

            # Read the original CSV file
            df = pd.read_csv(original_filename)

            # Keep only the required columns
            columns_to_keep = [
                'model_name',
                'norm_type',
                'variant',
                'truncated_input',
                'generated_summary'
            ]

            # Optionally include 'truncated_reference_summary' if you decide to keep it
            # columns_to_keep.append('truncated_reference_summary')

            df_modified = df[columns_to_keep]

            # Save the modified DataFrame to a new CSV file
            df_modified.to_csv(modified_filename, index=False)
            print(f"Modified file saved as: {modified_filename}")

if __name__ == "__main__":
    modify_csv_files()


Processing file: LN_baseModel_evaluation_data.csv
Modified file saved as: ./modified/LN_baseModel_evaluation_data_modified.csv
Processing file: LN_noNorm_evaluation_data.csv
Modified file saved as: ./modified/LN_noNorm_evaluation_data_modified.csv
Processing file: LN_AttnOnly_evaluation_data.csv
Modified file saved as: ./modified/LN_AttnOnly_evaluation_data_modified.csv
Processing file: LN_FFNonly_evaluation_data.csv
Modified file saved as: ./modified/LN_FFNonly_evaluation_data_modified.csv
Processing file: RMSN_baseModel_evaluation_data.csv
Modified file saved as: ./modified/RMSN_baseModel_evaluation_data_modified.csv
Processing file: RMSN_noNorm_evaluation_data.csv
Modified file saved as: ./modified/RMSN_noNorm_evaluation_data_modified.csv
Processing file: RMSN_AttnOnly_evaluation_data.csv
Modified file saved as: ./modified/RMSN_AttnOnly_evaluation_data_modified.csv
Processing file: RMSN_FFNonly_evaluation_data.csv
Modified file saved as: ./modified/RMSN_FFNonly_evaluation_data_modif

### modifying format of SQuAD QA csv file

In [9]:
import pandas as pd
import os
import re
from tqdm import tqdm

def extract_norm_type_variant(model_name):
    """
    Extracts norm_type and variant from the model_name string.

    Assumes the model_name format is:
    shng2025/GPT-Valkyrie_<norm_type>-124m__<variant>__SQuAD

    Example:
    shng2025/GPT-Valkyrie_LN-124m__AttnOnly__SQuAD
    => norm_type: LN
    => variant: AttnOnly
    """
    try:
        # Split the model_name by '__' to separate components
        parts = model_name.split('_')
        if len(parts) < 3:
            raise ValueError("Model name does not have enough parts separated by '__'.")

        # Extract norm_type from the second part (e.g., 'LN-124m')
        config_part = parts[1]  # 'LN-124m'
        norm_type = config_part.split('-')[0]  # 'LN'

        # Extract variant from the third part (e.g., 'AttnOnly')
        variant = parts[3]  # 'AttnOnly'

        return norm_type, variant
    except Exception as e:
        print(f"Error extracting norm_type and variant from model_name '{model_name}': {e}")
        return "Unknown", "Unknown"

def process_qa_csv(input_filepath, output_filepath, max_rows=25):
    """
    Processes a single QA CSV file:
    - Extracts norm_type and variant from model_name
    - Adds them as separate columns
    - Removes unnecessary columns
    - Limits to the first `max_rows` rows
    - Saves the modified DataFrame to a new CSV file
    """
    try:
        # Read the CSV file
        df = pd.read_csv(input_filepath)
        print(f"Processing '{input_filepath}' with {len(df)} rows.")

        # Extract norm_type and variant from model_name
        df['norm_type'], df['variant'] = zip(*df['model_name'].apply(extract_norm_type_variant))

        # Define columns to keep
        columns_to_keep = ['model_name', 'norm_type', 'variant', 'question', 'context', 'reference_answers', 'generated_answer']

        # Check if these columns exist in the DataFrame
        existing_columns = [col for col in columns_to_keep if col in df.columns]
        missing_columns = set(columns_to_keep) - set(existing_columns)
        if missing_columns:
            print(f"Warning: The following expected columns are missing and will be skipped: {missing_columns}")

        # Create a new DataFrame with only the desired columns
        df_modified = df[existing_columns]

        # Limit to the first `max_rows` rows
        df_modified = df_modified.head(max_rows)

        # Save the modified DataFrame to a new CSV file
        df_modified.to_csv(output_filepath, index=False)
        print(f"Saved modified CSV to '{output_filepath}' with {len(df_modified)} rows.\n")

    except Exception as e:
        print(f"Error processing '{input_filepath}': {e}\n")

def modify_all_qa_csvs(input_dir, output_dir, max_rows=25):
    """
    Processes all QA CSV files in the input directory and saves the modified versions to the output directory.

    Parameters:
    - input_dir: Directory containing the original QA CSV files.
    - output_dir: Directory where modified CSV files will be saved.
    - max_rows: Maximum number of rows to keep per CSV.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # List all CSV files in the input directory
    csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]

    if not csv_files:
        print(f"No CSV files found in '{input_dir}'.")
        return

    # Process each CSV file with a progress bar
    for csv_file in tqdm(csv_files, desc="Processing QA CSV files"):
        input_filepath = os.path.join(input_dir, csv_file)
        filename, ext = os.path.splitext(csv_file)
        output_filename = f"{filename}_modified{ext}"
        output_filepath = os.path.join(output_dir, output_filename)

        process_qa_csv(input_filepath, output_filepath, max_rows=max_rows)

if __name__ == "__main__":
    # Define input and output directories
    input_directory = "./QA-source"      # Replace with your actual input directory
    output_directory = "./modified_QA-source"  # Replace with your desired output directory

    # Define maximum number of rows per CSV to keep
    max_rows_per_csv = 100

    # Run the modification process
    modify_all_qa_csvs(input_directory, output_directory, max_rows=max_rows_per_csv)


Processing QA CSV files: 100%|██████████| 8/8 [00:00<00:00, 159.62it/s]

Processing './QA-source/LN_noNorm_gpt4_evaluation_data.csv' with 100 rows.
Saved modified CSV to './modified_QA-source/LN_noNorm_gpt4_evaluation_data_modified.csv' with 100 rows.

Processing './QA-source/LN_AttnOnly_gpt4_evaluation_data.csv' with 100 rows.
Saved modified CSV to './modified_QA-source/LN_AttnOnly_gpt4_evaluation_data_modified.csv' with 100 rows.

Processing './QA-source/RMSN_FFNonly_gpt4_evaluation_data.csv' with 100 rows.
Saved modified CSV to './modified_QA-source/RMSN_FFNonly_gpt4_evaluation_data_modified.csv' with 100 rows.

Processing './QA-source/LN_FFNonly_gpt4_evaluation_data.csv' with 100 rows.
Saved modified CSV to './modified_QA-source/LN_FFNonly_gpt4_evaluation_data_modified.csv' with 100 rows.

Processing './QA-source/RMSN_baseModel_gpt4_evaluation_data.csv' with 100 rows.
Saved modified CSV to './modified_QA-source/RMSN_baseModel_gpt4_evaluation_data_modified.csv' with 100 rows.

Processing './QA-source/LN_baseModel_gpt4_evaluation_data.csv' with 100 rows.





In [None]:
p

# starting out before QA

0.72 USD spent - 3.37 USD final

# starting out before Summarization

3.39 USD spent -