In [None]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
Successfully installed openai-0.28.0


In [None]:
import pandas as pd
import openai
import os
import time
from tqdm import tqdm
from google.colab import userdata
import re

# Set your OpenAI API key.
# The key is looked up by name through `google.colab.userdata` instead of being
# hardcoded here, so it does not end up in the saved notebook.
openai.api_key = userdata.get('OPENAI_API_KEY')

In [None]:
def get_gpt4_evaluation_text(model_name, norm_type, variant, prompt, generated_text, max_retries=5):
    """
    Ask GPT-4o to grade one generated text and return its raw textual answer.

    Args:
        model_name: Identifier of the model that produced the text (echoed in the prompt).
        norm_type: Normalization type of that model (echoed in the prompt).
        variant: Model variant (echoed in the prompt).
        prompt: The prompt the text was generated from.
        generated_text: The text to be graded.
        max_retries: How many times to retry after a rate-limit error before
            giving up. The total number of API calls is at most max_retries + 1.

    Returns:
        The content of the first completion choice, or None when the request
        failed (any non-rate-limit error, or rate limiting that outlasted
        max_retries).
    """
    # Construct the prompt for GPT-4o evaluation
    # NOTE(review): the "Additionally" section says the prompt and generated text
    # are "provided above", but both are appended *below* the format template.
    # The wording is left untouched so new results stay comparable with the
    # evaluations already collected.
    evaluation_prompt = f"""
As an expert evaluator, your task is to assess the quality of a generated text based on the provided prompt. Please focus on the following criteria:

1. **Creativity**: How original and imaginative is the text?
2. **Coherence**: Is the text logically consistent and does it flow well?
3. **Fluency**: Is the text well-written with proper grammar and style?
4. **Relevance**: Does the text stay on topic and relate to the prompt?
5. **Engagement**: Is the text interesting and engaging to read?

For each criterion, provide:
- **Score**: A number from 1 to 5 (1 is poor, 5 is excellent).
- **Explanation**: A brief justification for the score.

After evaluating each criterion, provide:
- **Overall Score**: The average of the five scores.
- **Overall Feedback**: A short overall feedback.

**Additionally**, provide a short paragraph commenting on the following columns:
- **Model Name**: {model_name}
- **Norm Type**: {norm_type}
- **Variant**: {variant}
- **Prompt**: [Prompt is provided above.]
- **Generated Text**: [Generated text is provided above.]

**Please present your evaluation in the following structured format:**

```
Model Name: {model_name}

Creativity Score: [1-5]
Creativity Explanation: [Your explanation]

Coherence Score: [1-5]
Coherence Explanation: [Your explanation]

Fluency Score: [1-5]
Fluency Explanation: [Your explanation]

Relevance Score: [1-5]
Relevance Explanation: [Your explanation]

Engagement Score: [1-5]
Engagement Explanation: [Your explanation]

Overall Score: [Average score]
Overall Feedback: [Your feedback]

Comments on Columns:
[Your short paragraph commenting on each column]
```

---
**Prompt:**

{prompt}

---
**Generated Text:**

{generated_text}

---
"""
    default_wait_seconds = 5
    retries_used = 0

    # Call the OpenAI API using the ChatCompletion endpoint. A loop (instead of
    # recursion) keeps the number of attempts bounded when rate limiting persists.
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",  # Ensure gpt-4o is available to you
                messages=[
                    {"role": "user", "content": evaluation_prompt}
                ],
                max_tokens=700,  # As per your requirement
                temperature=0.0,  # For deterministic output
            )
            return response['choices'][0]['message']['content']
        except openai.error.RateLimitError as e:
            print(f"Rate limit error: {e}")
            if retries_used >= max_retries:
                print(f"Giving up after {max_retries} retries.")
                return None
            retries_used += 1

            # Retry-After may be missing or not a plain integer; fall back to the
            # default wait rather than crashing inside the error handler.
            headers = getattr(e, "headers", None) or {}
            try:
                retry_after = max(0, int(float(headers.get("Retry-After", default_wait_seconds))))
            except (TypeError, ValueError):
                retry_after = default_wait_seconds
            print(f"Retrying after {retry_after} seconds...")
            time.sleep(retry_after)
        except Exception as e:
            print(f"Error during OpenAI API call: {e}")
            return None

def parse_evaluation_text(evaluation_text):
    """
    Parses GPT-4o's evaluation text and extracts scores, explanations, and comments.

    The parser tolerates the common deviations from the requested format:
    fields separated by a single newline instead of a blank line, a field that
    is the last thing in the text, markdown-bold labels (``**Label:**``), and
    the closing code fence the model echoes back from the template.

    Args:
        evaluation_text: Raw evaluation text, or None / empty.

    Returns:
        dict mapping each field name ('Creativity Score',
        'Creativity Explanation', ..., 'Overall Score', 'Overall Feedback',
        'Comments on Columns') to the extracted string, or None when the field
        was not found. Scores are returned as strings, not numbers.
    """
    criteria = ('Creativity', 'Coherence', 'Fluency', 'Relevance', 'Engagement')

    # Field names in the order they should appear in the result.
    keys = []
    for criterion in criteria:
        keys.append(f'{criterion} Score')
        keys.append(f'{criterion} Explanation')
    keys += ['Overall Score', 'Overall Feedback', 'Comments on Columns']

    result = dict.fromkeys(keys)
    if not isinstance(evaluation_text, str) or not evaluation_text.strip():
        return result

    # A label's colon, allowing markdown bold markers on either side of it.
    label_end = r'\**[ \t]*:\**'
    known_labels = '|'.join(re.escape(label) for label in ['Model Name'] + keys)
    # A free-text field ends at a blank line, at the next known label, at a
    # code fence, or at the end of the text - whichever comes first.
    stop = rf'(?=\n\s*\n|\n[ \t>*#-]*(?:{known_labels}){label_end}|\n[ \t]*```|\Z)'

    for key in keys:
        label = re.escape(key) + label_end
        if key == 'Overall Score':
            pattern = label + r'\s*(\d+(?:\.\d+)?)'
        elif key.endswith(' Score'):
            pattern = label + r'\s*(\d)'
        elif key == 'Comments on Columns':
            pattern = label + r'\s*(.*)'  # Captures the paragraph
        else:
            # Allow the value to start on the line after the label, but do not
            # skip over a blank line (that would capture the next field).
            pattern = label + r'[ \t]*\n?[ \t]*(.*?)' + stop

        match = re.search(pattern, evaluation_text, re.DOTALL)
        if not match:
            continue

        value = match.group(1)
        if key == 'Comments on Columns':
            # Drop the closing code fence copied from the prompt template.
            value = re.sub(r'\s*`{3,}\s*\Z', '', value)
        result[key] = value.strip()

    return result

def evaluate_generated_texts(input_dir="generated_texts", output_dir="evaluations",
                             max_rows=25, max_text_length=1000):
    """
    Evaluate generated texts with GPT-4o and write one evaluation CSV per input CSV.

    Every ``*.csv`` file in ``input_dir`` is expected to have the columns
    ``model_name``, ``prompt`` and ``generated_text``. Each row is sent to
    ``get_gpt4_evaluation_text``; the answer is parsed with
    ``parse_evaluation_text`` and stored together with the model name,
    norm type and variant. Rows whose evaluation failed are kept with empty
    scores and 'Error or Empty Response' in 'Comments on Columns'.

    Args:
        input_dir: Directory containing the generated-text CSV files.
        output_dir: Directory the evaluation CSVs are written to (created if missing).
        max_rows: Evaluate only the first ``max_rows`` rows of each file to
            manage costs; None evaluates every row.
        max_text_length: Maximum number of whitespace-separated words of the
            generated text sent for evaluation; longer texts are cut and
            suffixed with "...".

    Returns:
        None. Output files are named ``<norm_type>_<variant>_evaluations.csv``
        after the last evaluated row of the corresponding input file; files
        without rows are skipped.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Fields produced by parse_evaluation_text; used to build the placeholder
    # record for rows whose evaluation failed.
    evaluation_fields = [
        'Creativity Score', 'Creativity Explanation',
        'Coherence Score', 'Coherence Explanation',
        'Fluency Score', 'Fluency Explanation',
        'Relevance Score', 'Relevance Explanation',
        'Engagement Score', 'Engagement Explanation',
        'Overall Score', 'Overall Feedback',
        'Comments on Columns',
    ]

    # List CSV files in the input directory (sorted: os.listdir order is arbitrary)
    csv_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.csv'))

    for csv_file in csv_files:
        input_filepath = os.path.join(input_dir, csv_file)
        print(f"Processing file: {input_filepath}")

        # Read the CSV file
        df = pd.read_csv(input_filepath)

        # Limit the number of rows to manage costs
        df_limited = df if max_rows is None else df.head(max_rows)
        evaluations = []
        norm_type = variant = None

        for idx, row in tqdm(df_limited.iterrows(), total=df_limited.shape[0], desc=f"Evaluating texts in {csv_file}"):
            model_name = row['model_name']
            prompt = row['prompt']
            generated_text = row['generated_text']

            # pandas reads an empty cell as NaN (a float), which has no .split()
            if not isinstance(generated_text, str):
                generated_text = "" if pd.isna(generated_text) else str(generated_text)

            # Extract norm_type and variant from model_name
            norm_type, variant = extract_norm_type_variant(model_name)

            # Truncate generated_text if too long
            words = generated_text.split()
            if len(words) > max_text_length:
                generated_text = ' '.join(words[:max_text_length]) + "..."

            # Get GPT-4o evaluation
            evaluation_text = get_gpt4_evaluation_text(
                model_name,
                norm_type,
                variant,
                prompt,
                generated_text
            )

            if evaluation_text:
                parsed = parse_evaluation_text(evaluation_text)
                parsed['model_name'] = model_name
                parsed['norm_type'] = norm_type
                parsed['variant'] = variant
                evaluations.append(parsed)
                time.sleep(1)  # To respect API rate limits
            else:
                failed = {
                    'model_name': model_name,
                    'norm_type': norm_type,
                    'variant': variant,
                }
                failed.update(dict.fromkeys(evaluation_fields))
                failed['Comments on Columns'] = 'Error or Empty Response'
                evaluations.append(failed)

        # Without rows there is nothing to save and no norm_type/variant to
        # name the output file after.
        if not evaluations:
            print(f"No rows to evaluate in {input_filepath}; skipping.\n")
            continue

        # Save evaluations to a new CSV file
        eval_df = pd.DataFrame(evaluations)
        output_filename = f"{norm_type}_{variant}_evaluations.csv"
        output_filepath = os.path.join(output_dir, output_filename)
        eval_df.to_csv(output_filepath, index=False)
        print(f"Saved evaluations to {output_filepath}\n")

def extract_norm_type_variant(model_name):
    """
    Derive (norm_type, variant) from a model identifier.

    The identifier is expected to look like
    ``shng2025/GPT-Valkyrie_<norm_type>-124m__<variant>__``. When the string
    cannot be taken apart, the problem is printed and
    ("Unknown", "Unknown") is returned.
    """
    try:
        # Keep only the part after the last '/', then cut at the '__' markers.
        repo_name = model_name.split('/')[-1]
        config_part, *remaining = repo_name.split('__')  # config_part e.g. 'GPT-Valkyrie_RMSN-124m'
        variant = remaining[0] if remaining else 'Unknown'
        after_underscore = config_part.split('_')[1]  # e.g. 'RMSN-124m'
        norm_type = after_underscore.split('-')[0]  # e.g. 'RMSN'
    except Exception as e:
        print(f"Error extracting norm_type and variant from model_name '{model_name}': {e}")
        return "Unknown", "Unknown"
    return norm_type, variant

# Entry point of this cell: run the GPT-4o evaluation over the generated-text CSVs.
if __name__ == "__main__":
    evaluate_generated_texts()

Processing file: generated_texts/LN_noNorm_generated_texts.csv


Evaluating texts in LN_noNorm_generated_texts.csv: 100%|██████████| 25/25 [02:01<00:00,  4.85s/it]


Saved evaluations to evaluations/LN_noNorm_evaluations.csv

Processing file: generated_texts/RMSN_noNorm_generated_texts.csv


Evaluating texts in RMSN_noNorm_generated_texts.csv: 100%|██████████| 25/25 [01:59<00:00,  4.76s/it]


Saved evaluations to evaluations/RMSN_noNorm_evaluations.csv

Processing file: generated_texts/RMSN_FFNonly_generated_texts.csv


Evaluating texts in RMSN_FFNonly_generated_texts.csv: 100%|██████████| 25/25 [01:55<00:00,  4.61s/it]


Saved evaluations to evaluations/RMSN_FFNonly_evaluations.csv

Processing file: generated_texts/RMSN_baseModel_generated_texts.csv


Evaluating texts in RMSN_baseModel_generated_texts.csv: 100%|██████████| 25/25 [02:06<00:00,  5.06s/it]


Saved evaluations to evaluations/RMSN_baseModel_evaluations.csv

Processing file: generated_texts/LN_baseModel_generated_texts.csv


Evaluating texts in LN_baseModel_generated_texts.csv: 100%|██████████| 25/25 [02:04<00:00,  4.96s/it]


Saved evaluations to evaluations/LN_baseModel_evaluations.csv

Processing file: generated_texts/LN_AttnOnly_generated_texts.csv


Evaluating texts in LN_AttnOnly_generated_texts.csv: 100%|██████████| 25/25 [01:59<00:00,  4.77s/it]


Saved evaluations to evaluations/LN_AttnOnly_evaluations.csv

Processing file: generated_texts/RMSN_AttnOnly_generated_texts.csv


Evaluating texts in RMSN_AttnOnly_generated_texts.csv: 100%|██████████| 25/25 [01:59<00:00,  4.78s/it]


Saved evaluations to evaluations/RMSN_AttnOnly_evaluations.csv

Processing file: generated_texts/LN_FFNonly_generated_texts.csv


Evaluating texts in LN_FFNonly_generated_texts.csv: 100%|██████████| 25/25 [02:03<00:00,  4.96s/it]

Saved evaluations to evaluations/LN_FFNonly_evaluations.csv






# Collecting Raw Data

In [None]:
# Story-opening prompts used for text generation (25 entries).
# NOTE(review): main() in the next cell defines its own local `prompts` list with
# the same entries, so this module-level list is not read by any code visible here.
prompts = [
    "Once upon a time in a distant galaxy,",
    "The secret to happiness is",
    "In the midst of the bustling city,",
    "As the sun set over the horizon,",
    "The mysterious package arrived on my doorstep,",
    "In a world where robots and humans coexist,",
    "The ancient prophecy foretold that",
    "Amidst the chaos of the storm,",
    "The scientist peered into the microscope and saw",
    "Every morning, she would start her day with",
    "The last thing I expected to find in the attic was",
    "Under the cover of darkness,",
    "With a heavy heart, he decided to",
    "On the eve of the grand festival,",
    "The sound of laughter filled the air as",
    "If time travel were possible,",
    "Deep beneath the ocean waves,",
    "In the quiet village nestled among the hills,",
    "The door creaked open to reveal",
    "Legends speak of a sword that",
    "In the year 2525, humanity has",
    "The aroma of freshly baked bread",
    "She looked into his eyes and knew that",
    "On the first day of school,",
    "The journey to the top of the mountain was",
]


In [None]:
import os
import pandas as pd
from transformers import pipeline
import torch
from tqdm import tqdm

def main(models=None, prompts=None, output_dir="generated_texts"):
    """
    Generate one sampled continuation per prompt for each model and save them as CSV.

    For every model a Hugging Face text-generation pipeline is created (with the
    "gpt2" tokenizer), each prompt is continued once with sampling, and the
    results are written to ``<output_dir>/<norm_type>_<variant>_generated_texts.csv``
    with the columns ``model_name``, ``prompt`` and ``generated_text``.

    Args:
        models: Model identifiers to load. Defaults to the eight
            GPT-Valkyrie RMSN/LN variants.
        prompts: Prompts to continue. Defaults to the built-in list of 25
            story openings.
        output_dir: Directory the CSV files are written to (created if missing).

    Returns:
        None.
    """
    # List of models to evaluate
    if models is None:
        models = [
            "shng2025/GPT-Valkyrie_RMSN-124m__noNorm__",
            "shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__",
            "shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__",
            "shng2025/GPT-Valkyrie_RMSN-124m__baseModel__",
            "shng2025/GPT-Valkyrie_LN-124m__noNorm__",
            "shng2025/GPT-Valkyrie_LN-124m__FFNonly__",
            "shng2025/GPT-Valkyrie_LN-124m__AttnOnly__",
            "shng2025/GPT-Valkyrie_LN-124m__baseModel__"
        ]

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # List of prompts
    if prompts is None:
        prompts = [
            "Once upon a time in a distant galaxy,",
            "The secret to happiness is",
            "In the midst of the bustling city,",
            "As the sun set over the horizon,",
            "The mysterious package arrived on my doorstep,",
            "In a world where robots and humans coexist,",
            "The ancient prophecy foretold that",
            "Amidst the chaos of the storm,",
            "The scientist peered into the microscope and saw",
            "Every morning, she would start her day with",
            "The last thing I expected to find in the attic was",
            "Under the cover of darkness,",
            "With a heavy heart, he decided to",
            "On the eve of the grand festival,",
            "The sound of laughter filled the air as",
            "If time travel were possible,",
            "Deep beneath the ocean waves,",
            "In the quiet village nestled among the hills,",
            "The door creaked open to reveal",
            "Legends speak of a sword that",
            "In the year 2525, humanity has",
            "The aroma of freshly baked bread",
            "She looked into his eyes and knew that",
            "On the first day of school,",
            "The journey to the top of the mountain was",
        ]

    use_gpu = torch.cuda.is_available()

    for model_name in models:
        print(f"Processing model: {model_name}")
        # Load the text generation pipeline
        generator = pipeline(
            'text-generation',
            model=model_name,
            tokenizer="gpt2",
            device=0 if use_gpu else -1
        )
        # Set pad_token to eos_token
        generator.tokenizer.pad_token = generator.tokenizer.eos_token

        generated_texts = []

        for prompt in tqdm(prompts, desc=f"Generating texts for {model_name}"):
            outputs = generator(
                prompt,
                max_length=1024,
                # Explicit truncation avoids the tokenizer's "Truncation was not
                # explicitly activated" warning that max_length otherwise triggers.
                truncation=True,
                num_return_sequences=1,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                temperature=1.0,
                pad_token_id=generator.tokenizer.eos_token_id
            )
            generated_text = outputs[0]['generated_text']
            generated_texts.append({
                "model_name": model_name,
                "prompt": prompt,
                "generated_text": generated_text
            })

        # Save the outputs to a CSV file
        df = pd.DataFrame(generated_texts)
        # Extract norm_type and variant from model_name
        norm_type, variant = extract_norm_type_variant(model_name)
        output_filename = f"{norm_type}_{variant}_generated_texts.csv"
        output_filepath = os.path.join(output_dir, output_filename)
        df.to_csv(output_filepath, index=False)
        print(f"Saved generated texts to {output_filepath}\n")

        # Drop the pipeline before loading the next model so the models do not
        # pile up in (GPU) memory over the course of the loop.
        del generator
        if use_gpu:
            torch.cuda.empty_cache()

def extract_norm_type_variant(model_name):
    """
    Return the (norm_type, variant) pair encoded in a model name.

    Expected layout: ``shng2025/GPT-Valkyrie_<norm_type>-124m__<variant>__``.
    If the name does not fit, the error is printed and
    ("Unknown", "Unknown") is returned instead.
    """
    try:
        # Strip the owner prefix, e.g. 'shng2025/'.
        base_name = model_name.split('/')[-1]
        # Text before the first '__' is the config; text between the first and
        # second '__' is the variant.
        config_part, separator, tail = base_name.partition('__')
        if separator:
            variant = tail.partition('__')[0]
        else:
            variant = 'Unknown'
        # 'GPT-Valkyrie_RMSN-124m' -> 'RMSN-124m' -> 'RMSN'
        norm_type = config_part.split('_')[1].split('-')[0]
        return norm_type, variant
    except Exception as e:
        print(f"Error extracting norm_type and variant from model_name '{model_name}': {e}")
        return "Unknown", "Unknown"

# Entry point of this cell: generate texts for every model and write the CSV files.
if __name__ == "__main__":
    main()


Processing model: shng2025/GPT-Valkyrie_RMSN-124m__noNorm__


Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__:   0%|          | 0/25 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__:  40%|████      | 10/25 [01:37<02:27,  9.81s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__: 100%|██████████| 25/25 [03:59<00:00,  9.56s/it]


Saved generated texts to generated_texts/RMSN_noNorm_generated_texts.csv

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__


Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__:   0%|          | 0/25 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__: 100%|██████████| 25/25 [01:31<00:00,  3.64s/it]


Saved generated texts to generated_texts/RMSN_FFNonly_generated_texts.csv

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__:   0%|          | 0/25 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__: 100%|██████████| 25/25 [04:01<00:00,  9.64s/it]


Saved generated texts to generated_texts/RMSN_AttnOnly_generated_texts.csv

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__baseModel__


config.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__baseModel__:   0%|          | 0/25 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__baseModel__: 100%|██████████| 25/25 [03:02<00:00,  7.32s/it]


Saved generated texts to generated_texts/RMSN_baseModel_generated_texts.csv

Processing model: shng2025/GPT-Valkyrie_LN-124m__noNorm__


config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating texts for shng2025/GPT-Valkyrie_LN-124m__noNorm__:   0%|          | 0/25 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Generating texts for shng2025/GPT-Valkyrie_LN-124m__noNorm__: 100%|██████████| 25/25 [03:40<00:00,  8.80s/it]


Saved generated texts to generated_texts/LN_noNorm_generated_texts.csv

Processing model: shng2025/GPT-Valkyrie_LN-124m__FFNonly__


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating texts for shng2025/GPT-Valkyrie_LN-124m__FFNonly__:   0%|          | 0/25 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Generating texts for shng2025/GPT-Valkyrie_LN-124m__FFNonly__: 100%|██████████| 25/25 [02:11<00:00,  5.25s/it]


Saved generated texts to generated_texts/LN_FFNonly_generated_texts.csv

Processing model: shng2025/GPT-Valkyrie_LN-124m__AttnOnly__


config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating texts for shng2025/GPT-Valkyrie_LN-124m__AttnOnly__:   0%|          | 0/25 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Generating texts for shng2025/GPT-Valkyrie_LN-124m__AttnOnly__: 100%|██████████| 25/25 [03:48<00:00,  9.15s/it]


Saved generated texts to generated_texts/LN_AttnOnly_generated_texts.csv

Processing model: shng2025/GPT-Valkyrie_LN-124m__baseModel__


config.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating texts for shng2025/GPT-Valkyrie_LN-124m__baseModel__:   0%|          | 0/25 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Generating texts for shng2025/GPT-Valkyrie_LN-124m__baseModel__: 100%|██████████| 25/25 [03:01<00:00,  7.28s/it]

Saved generated texts to generated_texts/LN_baseModel_generated_texts.csv






# Base textGen cost

Starting: 6.63 USD; post-run: 9.11 USD (difference: 2.48 USD)