In [None]:
!pip install openai==0.28

In [None]:
import pandas as pd
import openai
import os
import time
from tqdm import tqdm
from google.colab import userdata
import re

# Set your OpenAI API key
openai.api_key = userdata.get('OPENAI_API_KEY')

# Collecting Raw Data

In [1]:
prompts = [
    "Once upon a time in a distant galaxy,",
    "The secret to happiness is",
    "In the midst of the bustling city,",
    "As the sun set over the horizon,",
    "The mysterious package arrived on my doorstep,",
    "In a world where robots and humans coexist,",
    "The ancient prophecy foretold that",
    "Amidst the chaos of the storm,",
    "The scientist peered into the microscope and saw",
    "Every morning, she would start her day with",
    "The last thing I expected to find in the attic was",
    "Under the cover of darkness,",
    "With a heavy heart, he decided to",
    "On the eve of the grand festival,",
    "The sound of laughter filled the air as",
    "If time travel were possible,",
    "Deep beneath the ocean waves,",
    "In the quiet village nestled among the hills,",
    "The door creaked open to reveal",
    "Legends speak of a sword that",
    "In the year 2525, humanity has",
    "The aroma of freshly baked bread",
    "She looked into his eyes and knew that",
    "On the first day of school,",
    "The journey to the top of the mountain was",
]


In [None]:
import os
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm

def load_model_and_tokenizer(model_name):
    """
    Loads the language model and tokenizer from the specified Hugging Face repository.
    """
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(model_name)
    # Ensure model is in evaluation mode
    model.eval()
    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return model, tokenizer, device

def generate_text(model, tokenizer, device, prompt, max_length=1024):
    """
    Generates text using the model and tokenizer for a given prompt.
    """
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    # Ensure the total length does not exceed the model's maximum
    max_total_length = min(tokenizer.model_max_length, max_length + inputs.shape[1])
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_total_length,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=1.0,
            pad_token_id=tokenizer.eos_token_id
        )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

def main():
    # List of models to evaluate
    models = [
        "shng2025/GPT-Valkyrie_RMSN-124m__noNorm__",
        "shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__",
        "shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__",
        "shng2025/GPT-Valkyrie_LN-124m__noNorm__",
        "shng2025/GPT-Valkyrie_LN-124m__FFNonly__",
        "shng2025/GPT-Valkyrie_LN-124m__AttnOnly__",
        "shng2025/GPT-Valkyrie_RMSN-124m__baseModel__",
        "shng2025/GPT-Valkyrie_LN-124m__baseModel__"
    ]

    # Ensure output directory exists
    output_dir = "generated_texts"
    os.makedirs(output_dir, exist_ok=True)

    # List of prompts
    prompts = [
        "Once upon a time in a distant galaxy,",
        "The secret to happiness is",
        "In the midst of the bustling city,",
        "As the sun set over the horizon,",
        "The mysterious package arrived on my doorstep,",
        "In a world where robots and humans coexist,",
        "The ancient prophecy foretold that",
        "Amidst the chaos of the storm,",
        "The scientist peered into the microscope and saw",
        "Every morning, she would start her day with",
        "The last thing I expected to find in the attic was",
        "Under the cover of darkness,",
        "With a heavy heart, he decided to",
        "On the eve of the grand festival,",
        "The sound of laughter filled the air as",
        "If time travel were possible,",
        "Deep beneath the ocean waves,",
        "In the quiet village nestled among the hills,",
        "The door creaked open to reveal",
        "Legends speak of a sword that",
        "In the year 2525, humanity has",
        "The aroma of freshly baked bread",
        "She looked into his eyes and knew that",
        "On the first day of school,",
        "The journey to the top of the mountain was",
    ]

    for model_name in models:
        print(f"Processing model: {model_name}")
        model, tokenizer, device = load_model_and_tokenizer(model_name)
        generated_texts = []

        for prompt in tqdm(prompts, desc=f"Generating texts for {model_name}"):
            generated_text = generate_text(model, tokenizer, device, prompt, max_length=1024)
            generated_texts.append({
                "model_name": model_name,
                "prompt": prompt,
                "generated_text": generated_text
            })

        # Save the outputs to a CSV file
        df = pd.DataFrame(generated_texts)
        # Extract norm_type and variant from model_name
        norm_type, variant = extract_norm_type_variant(model_name)
        output_filename = f"{norm_type}_{variant}_generated_texts.csv"
        output_filepath = os.path.join(output_dir, output_filename)
        df.to_csv(output_filepath, index=False)
        print(f"Saved generated texts to {output_filepath}\n")

def extract_norm_type_variant(model_name):
    """
    Extracts norm_type and variant from the model_name string.
    """
    try:
        # Assuming the model_name format is: shng2025/GPT-Valkyrie_<norm_type>-124m__<variant>__
        base_name = model_name.split('/')[-1]
        parts = base_name.split('__')
        config_part = parts[0]  # e.g., 'GPT-Valkyrie_RMSN-124m'
        variant = parts[1] if len(parts) > 1 else 'Unknown'
        norm_type = config_part.split('_')[1].split('-')[0]  # e.g., 'RMSN'
        return norm_type, variant
    except Exception as e:
        print(f"Error extracting norm_type and variant from model_name '{model_name}': {e}")
        return "Unknown", "Unknown"

if __name__ == "__main__":
    main()


Processing model: shng2025/GPT-Valkyrie_RMSN-124m__noNorm__


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__:   0%|          | 0/25 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Generating texts for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__:  24%|██▍       | 6/25 [00:55<02:55,  9.22s/it]