In [None]:
#!/usr/bin/env python3
"""
LLM Randomness Testing Script for Google Colab
Tests whether language models are truly random for number and word generation
"""

import json
import os
import time
from datetime import datetime
from typing import List, Dict, Any, Optional
from collections import Counter
import re

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from google.colab import drive
import pandas as pd

# Configuration
class Config:
    """Settings for the local (Hugging Face) randomness experiment.

    Every setting is a plain instance attribute whose default is assigned in
    ``__init__``. Individual defaults can be replaced with keyword arguments,
    e.g. ``Config(num_iterations=5, number_range=(1, 100))``.
    """

    def __init__(self, **overrides):
        """Assign the defaults, then apply ``overrides`` on top of them.

        Args:
            **overrides: setting name -> replacement value.

        Raises:
            TypeError: if an override names a setting that does not exist.
        """
        # Model configuration
        self.model_name = "Qwen/Qwen2.5-14B-Instruct"  # You can change this to other Qwen models
        self.use_4bit = False  # Set to True to load the model with 4-bit quantization

        # Task configuration
        self.num_iterations = 50  # Number of times to repeat each task
        self.numbers_per_query = [1, 10]  # How many numbers to request in one query
        self.words_per_query = [1, 10]    # How many words to request in one query

        # Number generation parameters
        self.number_range = (1, 10)  # Inclusive (min, max) range for random numbers

        # Word generation parameters
        # "any" places no restriction on the kind of word; other categories that
        # have been used: "animals", "colors", "foods", "countries", "professions"
        self.word_categories = ["any"]

        # Output configuration
        self.output_dir = "/content/drive/MyDrive/nac/"
        # Captured once at construction time
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Prompt types
        self.use_cot = True  # Whether to test Chain of Thought reasoning

        # Generation parameters
        self.max_new_tokens = 200
        self.temperature = 1.0
        self.do_sample = True
        self.top_p = 0.9

        # Apply caller-supplied overrides last; reject typos instead of silently
        # creating a new, unused attribute.
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError(f"Unknown configuration option: {name!r}")
            setattr(self, name, value)

# Module-level settings object; the functions below read it as a global.
config = Config()

def setup_environment():
    """Prepare the runtime: mount Google Drive, create the output folder, check for a GPU.

    Returns:
        bool: True when all three steps succeed; False when mounting Google
        Drive raises or when no CUDA device is available.
    """
    print("Setting up environment...")

    # Step 1: mount Google Drive (any failure aborts the setup)
    try:
        drive.mount('/content/drive')
        print("✓ Google Drive mounted successfully")
    except Exception as mount_error:
        print(f"✗ Error mounting Google Drive: {mount_error}")
        return False

    # Step 2: make sure the output folder exists
    target_dir = config.output_dir
    os.makedirs(target_dir, exist_ok=True)
    print(f"✓ Output directory created: {target_dir}")

    # Step 3: a CUDA device is mandatory
    if not torch.cuda.is_available():
        print("✗ No GPU available")
        return False

    device_name = torch.cuda.get_device_name(0)
    print(f"✓ GPU available: {device_name}")
    memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"✓ GPU memory: {memory_gb:.1f} GB")
    return True

def load_model():
    """Load the causal language model named in ``config`` together with its tokenizer.

    When ``config.use_4bit`` is set, the model is loaded with an NF4 4-bit
    ``BitsAndBytesConfig``; otherwise no quantization config is passed.

    Returns:
        tuple: ``(model, tokenizer)`` on success, ``(None, None)`` if loading
        raised any exception (the error is printed).
    """
    checkpoint = config.model_name
    print(f"Loading model: {checkpoint}")

    # Optional quantization settings
    quant_settings = None
    if config.use_4bit:
        quant_settings = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
        loaded_model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            quantization_config=quant_settings,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True
        )
    except Exception as load_error:
        print(f"✗ Error loading model: {load_error}")
        return None, None

    print("✓ Model loaded successfully")
    return loaded_model, loaded_tokenizer

def create_prompts():
    """Return the prompt templates, keyed by task and then by prompt type.

    The result maps ``"numbers"`` and ``"words"`` to a dict with a
    ``"normal"`` and a ``"cot"`` template. Templates are ``str.format``
    strings: number templates use ``count``, ``s``, ``min_val`` and
    ``max_val``; word templates use ``count``, ``s`` and ``category``.
    """
    number_templates = {
        "normal": "Generate {count} random number{s} between {min_val} and {max_val}. Only provide the number{s}, no explanation.",
        "cot": "I need {count} random number{s} between {min_val} and {max_val}. Let me think step by step about how to generate truly random numbers, then provide {count} number{s}.",
    }
    word_templates = {
        "normal": "Generate {count} random {category} word{s}. Only provide the word{s}, no explanation.",
        "cot": "I need to generate {count} random {category} word{s}. Let me think step by step about different {category} to ensure randomness, then provide {count} word{s}.",
    }
    return {"numbers": number_templates, "words": word_templates}

def generate_response(model, tokenizer, prompt: str) -> str:
    """Send ``prompt`` to the model as a single user turn and return its reply.

    The prompt is rendered with the tokenizer's chat template, sampled with
    the generation settings from ``config``, and only the newly generated
    tokens are decoded.

    Returns:
        The stripped reply text, or ``""`` if anything raised along the way
        (the error is printed).
    """
    try:
        # Render the single-turn conversation with the chat template
        conversation = [{"role": "user", "content": prompt}]
        rendered = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize and move the tensors to the model's device
        encoded = tokenizer([rendered], return_tensors="pt").to(model.device)

        sampling_options = dict(
            max_new_tokens=config.max_new_tokens,
            temperature=config.temperature,
            do_sample=config.do_sample,
            top_p=config.top_p,
            pad_token_id=tokenizer.eos_token_id
        )
        with torch.no_grad():
            generated = model.generate(**encoded, **sampling_options)

        # Drop the prompt tokens so only the completion is decoded
        prompt_length = encoded['input_ids'].shape[1]
        completion_ids = generated[0][prompt_length:]
        reply = tokenizer.decode(completion_ids, skip_special_tokens=True)
        return reply.strip()

    except Exception as e:
        print(f"✗ Error generating response: {e}")
        return ""

def extract_numbers(text: str, number_range: Optional[tuple] = None) -> List[int]:
    """Return the in-range integers that appear in ``text``, in order of appearance.

    Args:
        text: generated text to scan. Every standalone run of digits counts,
            wherever it appears in the text.
        number_range: inclusive ``(min, max)`` bounds. Defaults to
            ``config.number_range`` when omitted.

    Returns:
        The integers found in ``text`` that fall inside the range; values
        outside it are dropped. Empty list when nothing matches.
    """
    low, high = config.number_range if number_range is None else number_range
    # The pattern only yields digit runs, so int() cannot fail here and no
    # exception handler is needed.
    candidates = (int(token) for token in re.findall(r'\b\d+\b', text))
    return [value for value in candidates if low <= value <= high]

def extract_words(text: str) -> List[str]:
    """Return the content words of ``text``, lower-cased, in order of appearance.

    A word is a run of ASCII letters. Words of one or two letters and the
    stop words listed below are discarded.
    """
    ignored = frozenset((
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have',
        'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
        'may', 'might', 'must', 'can', 'cannot', 'i', 'you', 'he', 'she', 'it',
        'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his',
        'its', 'our', 'their',
    ))

    kept = []
    for token in re.findall(r'\b[a-zA-Z]+\b', text.lower()):
        # Skip very short tokens and stop words
        if len(token) <= 2 or token in ignored:
            continue
        kept.append(token)
    return kept

def test_number_randomness(model, tokenizer, prompts: Dict) -> List[Dict]:
    """Repeatedly ask the model for random numbers and tally what it returns.

    One configuration is run for every combination of
    ``config.numbers_per_query`` and prompt type. The "cot" prompt type is
    only included when ``config.use_cot`` is true. Each configuration is
    queried ``config.num_iterations`` times.

    Args:
        model: model passed through to ``generate_response``.
        tokenizer: tokenizer passed through to ``generate_response``.
        prompts: templates as returned by ``create_prompts``; only
            ``prompts["numbers"]`` is used.

    Returns:
        One summary dict per configuration, holding the frequency
        distribution of the extracted numbers plus, under ``"responses"``,
        the prompt/response record of every iteration.
        ``generation_success_rate`` is extracted/expected; extraction scans
        the whole response, so it can exceed 1.0.
    """
    print("\n=== Testing Number Randomness ===")
    results = []

    # Honour the configuration switch: skip chain-of-thought when disabled.
    prompt_types = ["normal", "cot"] if config.use_cot else ["normal"]
    min_val, max_val = config.number_range

    total_tests = len(config.numbers_per_query) * len(prompt_types) * config.num_iterations
    current_test = 0

    for count in config.numbers_per_query:
        s = 's' if count > 1 else ''
        expected_total = config.num_iterations * count

        for prompt_type in prompt_types:
            print(f"\nTesting {count} number{s} - {prompt_type.upper()}")

            # The prompt does not change between iterations, so build it once.
            prompt = prompts["numbers"][prompt_type].format(
                count=count,
                s=s,
                min_val=min_val,
                max_val=max_val
            )

            generated_numbers = []
            responses_data = []

            for i in range(config.num_iterations):
                current_test += 1

                # Generate response and pull the numbers out of it
                response = generate_response(model, tokenizer, prompt)
                numbers = extract_numbers(response)
                generated_numbers.extend(numbers)

                # Store detailed response data
                responses_data.append({
                    "iteration": i + 1,
                    "prompt": prompt,
                    "response": response,
                    "extracted_numbers": numbers,
                    "expected_count": count,
                    "actual_count": len(numbers)
                })

                # Progress update
                if (i + 1) % 10 == 0:
                    print(f"  Progress: {i + 1}/{config.num_iterations} ({current_test}/{total_tests} total)")

            # Calculate statistics
            number_counts = Counter(generated_numbers)
            total_numbers = len(generated_numbers)

            results.append({
                "task": "numbers",
                "count_per_query": count,
                "prompt_type": prompt_type,
                "total_iterations": config.num_iterations,
                "total_numbers_generated": total_numbers,
                "unique_numbers": len(number_counts),
                "number_distribution": dict(number_counts),
                "most_common": number_counts.most_common(10),
                "expected_total": expected_total,
                # Guard against division by zero when nothing was requested
                "generation_success_rate": total_numbers / expected_total if expected_total > 0 else 0,
                "responses": responses_data
            })
            print(f"  Generated {total_numbers} numbers, {len(number_counts)} unique")

    return results

def test_word_randomness(model, tokenizer, prompts: Dict) -> List[Dict]:
    """Repeatedly ask the model for random words and tally what it returns.

    One configuration is run for every combination of
    ``config.words_per_query``, ``config.word_categories`` and prompt type.
    The "cot" prompt type is only included when ``config.use_cot`` is true.
    Each configuration is queried ``config.num_iterations`` times.

    Args:
        model: model passed through to ``generate_response``.
        tokenizer: tokenizer passed through to ``generate_response``.
        prompts: templates as returned by ``create_prompts``; only
            ``prompts["words"]`` is used.

    Returns:
        One summary dict per configuration, holding the frequency
        distribution of the extracted words plus, under ``"responses"``, the
        prompt/response record of every iteration.
        ``generation_success_rate`` is extracted/expected; extraction scans
        the whole response, so it can exceed 1.0.
    """
    print("\n=== Testing Word Randomness ===")
    results = []

    # Honour the configuration switch: skip chain-of-thought when disabled.
    prompt_types = ["normal", "cot"] if config.use_cot else ["normal"]

    total_tests = (
        len(config.words_per_query)
        * len(config.word_categories)
        * len(prompt_types)
        * config.num_iterations
    )
    current_test = 0

    for count in config.words_per_query:
        s = 's' if count > 1 else ''
        expected_total = config.num_iterations * count

        for category in config.word_categories:
            for prompt_type in prompt_types:
                print(f"\nTesting {count} {category} word{s} - {prompt_type.upper()}")

                # The prompt does not change between iterations, so build it once.
                prompt = prompts["words"][prompt_type].format(
                    count=count,
                    category=category,
                    s=s
                )

                generated_words = []
                responses_data = []

                for i in range(config.num_iterations):
                    current_test += 1

                    # Generate response and pull the words out of it
                    response = generate_response(model, tokenizer, prompt)
                    words = extract_words(response)
                    generated_words.extend(words)

                    # Store detailed response data
                    responses_data.append({
                        "iteration": i + 1,
                        "prompt": prompt,
                        "response": response,
                        "extracted_words": words,
                        "expected_count": count,
                        "actual_count": len(words)
                    })

                    # Progress update
                    if (i + 1) % 10 == 0:
                        print(f"  Progress: {i + 1}/{config.num_iterations} ({current_test}/{total_tests} total)")

                # Calculate statistics
                word_counts = Counter(generated_words)
                total_words = len(generated_words)

                results.append({
                    "task": "words",
                    "category": category,
                    "count_per_query": count,
                    "prompt_type": prompt_type,
                    "total_iterations": config.num_iterations,
                    "total_words_generated": total_words,
                    "unique_words": len(word_counts),
                    "word_distribution": dict(word_counts),
                    "most_common": word_counts.most_common(10),
                    "expected_total": expected_total,
                    # Guard against division by zero when nothing was requested
                    "generation_success_rate": total_words / expected_total if expected_total > 0 else 0,
                    "responses": responses_data
                })
                print(f"  Generated {total_words} words, {len(word_counts)} unique")

    return results

def save_results(results: List[Dict], result_type: str):
    """Write ``results`` to two JSONL files inside ``config.output_dir``.

    * ``<result_type>_summary_<timestamp>.jsonl`` -- one line per result,
      with the bulky ``responses`` list left out.
    * ``<result_type>_responses_<timestamp>.jsonl`` -- one line per individual
      response, tagged with the task, count, prompt type and (when the result
      has one) category it belongs to.
    """
    print(f"\nSaving {result_type} results...")

    out_dir = config.output_dir
    stamp = config.timestamp

    # Summary file: everything except the per-iteration responses
    summary_file = os.path.join(out_dir, f"{result_type}_summary_{stamp}.jsonl")
    with open(summary_file, 'w', encoding='utf-8') as f:
        for result in results:
            condensed = {}
            for key, value in result.items():
                if key != 'responses':
                    condensed[key] = value
            f.write(json.dumps(condensed, ensure_ascii=False) + '\n')

    # Responses file: one line per response
    responses_file = os.path.join(out_dir, f"{result_type}_responses_{stamp}.jsonl")
    with open(responses_file, 'w', encoding='utf-8') as f:
        for result in results:
            has_category = 'category' in result
            for response in result['responses']:
                row = {
                    "task": result['task'],
                    "count_per_query": result['count_per_query'],
                    "prompt_type": result['prompt_type'],
                }
                row.update(response)
                if has_category:
                    row['category'] = result['category']
                f.write(json.dumps(row, ensure_ascii=False) + '\n')

    print("✓ Results saved:")
    print(f"  Summary: {summary_file}")
    print(f"  Responses: {responses_file}")

def print_summary(number_results: List[Dict], word_results: List[Dict]):
    """Print the experiment header followed by one line per number and word result."""
    banner = "=" * 60
    print("\n" + banner)
    print("EXPERIMENT SUMMARY")
    print(banner)

    print(f"Model: {config.model_name}")
    print(f"Timestamp: {config.timestamp}")
    print(f"Total iterations per test: {config.num_iterations}")

    print("\nNumber Generation Results:")
    for entry in number_results:
        label = f"{entry['count_per_query']} numbers ({entry['prompt_type']})"
        generated = entry['total_numbers_generated']
        unique = entry['unique_numbers']
        rate = entry['generation_success_rate']
        print(f"  {label}: {generated} generated, {unique} unique, {rate:.2%} success rate")

    print("\nWord Generation Results:")
    for entry in word_results:
        label = f"{entry['count_per_query']} {entry['category']} ({entry['prompt_type']})"
        generated = entry['total_words_generated']
        unique = entry['unique_words']
        rate = entry['generation_success_rate']
        print(f"  {label}: {generated} generated, {unique} unique, {rate:.2%} success rate")

def main():
    """Run the whole experiment: set up, load the model, test, save, summarise.

    Returns early (without running any test) when the environment setup
    fails or the model cannot be loaded.
    """
    print("LLM Randomness Testing Script")
    print("="*50)

    # Environment and model must both be available before testing
    if not setup_environment():
        return

    model, tokenizer = load_model()
    if model is None or tokenizer is None:
        return

    prompts = create_prompts()

    # Numbers first, then words
    number_results = test_number_randomness(model, tokenizer, prompts)
    word_results = test_word_randomness(model, tokenizer, prompts)

    for label, batch in (("numbers", number_results), ("words", word_results)):
        save_results(batch, label)

    print_summary(number_results, word_results)

    # Record the settings this run used next to its results
    recorded_settings = (
        "model_name",
        "num_iterations",
        "numbers_per_query",
        "words_per_query",
        "number_range",
        "word_categories",
        "use_4bit",
        "max_new_tokens",
        "temperature",
        "do_sample",
        "top_p",
        "timestamp",
    )
    config_data = {setting: getattr(config, setting) for setting in recorded_settings}
    config_file = os.path.join(config.output_dir, f"config_{config.timestamp}.json")

    with open(config_file, 'w', encoding='utf-8') as f:
        json.dump(config_data, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Configuration saved: {config_file}")
    print(f"✓ All files saved to: {config.output_dir}")
    print("\nExperiment completed successfully!")

# Install required packages
def install_requirements():
    """Install transformers, accelerate and bitsandbytes with pip.

    pip is invoked as ``<current interpreter> -m pip`` so the packages land in
    the environment that is running this code. The outcome is reported from
    pip's exit status; nothing is raised on failure.
    """
    import subprocess
    import sys

    print("Installing required packages...")
    command = [
        sys.executable, "-m", "pip", "install", "-q",
        "transformers", "accelerate", "bitsandbytes",
    ]
    completed = subprocess.run(command, check=False)
    if completed.returncode == 0:
        print("✓ Packages installed")
    else:
        print(f"✗ Package installation failed (pip exit code {completed.returncode})")

# Entry point: run the experiment when this code is executed directly.
if __name__ == "__main__":
    # Uncomment the line below if packages are not installed
    # install_requirements()
    main()

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x00000229E5A47ED0>>
Traceback (most recent call last):
  File "C:\Users\Vivobook\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


ModuleNotFoundError: No module named 'google.colab'

In [None]:
import json
import os
import time
import re
from datetime import datetime
from typing import List, Dict, Any, Tuple
from collections import Counter
import google.generativeai as genai
# from google.colab import drive

# Configuration
CONFIG = {
    # Taken from the GEMINI_API_KEY environment variable when it is set, so the
    # real key does not have to be stored in this file. "API_KEY" is only a
    # placeholder and must be replaced if the variable is not used.
    "api_key": os.environ.get("GEMINI_API_KEY") or "API_KEY",
    "model_name": "gemini-2.0-flash",
    "output_dir": "/content/drive/MyDrive/nac/",
    "iterations": 50,  # Number of iterations for each test
    "number_counts": [1, 10],  # How many numbers to request in one query
    "word_counts": [1, 10],    # How many words to request in one query
    "number_range": (1, 10),    # Inclusive (min, max) range for random numbers
    "delay_between_requests": 1, # Delay in seconds between API calls
    "max_retries": 3,           # Maximum attempts per API call
}

class RandomnessTestRunner:
    """Run the Gemini randomness experiment and persist what it produces.

    For every configured count the runner asks the model for random numbers
    and random words, once with a plain prompt and once with a
    chain-of-thought prompt, and accumulates two parallel logs:

    * ``self.results`` -- one entry per successful iteration with the
      extracted items.
    * ``self.reasoning_data`` -- the prompts and raw response text of the
      same iteration.

    Config keys read: ``api_key``, ``model_name``, ``output_dir``,
    ``iterations``, ``number_counts``, ``word_counts``, ``number_range``,
    ``delay_between_requests``, ``max_retries`` and, optionally,
    ``retry_base_delay`` (seconds, default 1) and ``max_retry_delay``
    (seconds, default 60).
    """

    # Quota (429) error messages embed a "retry_delay { seconds: N }" hint.
    _RETRY_HINT = re.compile(r'retry_delay\s*\{\s*seconds:\s*(\d+)')
    # Config keys whose values must never be written to disk.
    _SECRET_KEYS = ("api_key",)

    def __init__(self, config: Dict[str, Any]):
        """Configure the Gemini client and make sure the output directory exists."""
        self.config = config
        self.results = []
        self.reasoning_data = []

        # Initialize Gemini API
        genai.configure(api_key=config["api_key"])
        self.model = genai.GenerativeModel(config["model_name"])

        # Create output directory. NOTE: on Colab, Google Drive must already be
        # mounted (drive.mount('/content/drive')) if output_dir points into it.
        os.makedirs(config["output_dir"], exist_ok=True)

    def generate_prompts(self, task_type: str, count: int, use_cot: bool = False) -> Tuple[str, str]:
        """Build the ``(system_prompt, user_prompt)`` pair for one query.

        Args:
            task_type: ``"number"`` or ``"word"``.
            count: how many items to request.
            use_cot: request step-by-step reasoning before the answer.

        Raises:
            ValueError: if ``task_type`` is neither ``"number"`` nor ``"word"``.
        """
        if task_type == "number":
            low, high = self.config['number_range'][0], self.config['number_range'][1]
            if use_cot:
                system_prompt = (
                    "You are a helpful assistant that generates truly random numbers. \n"
                    "Think step by step about how to ensure randomness in your selection."
                )
                user_prompt = (
                    f"Please generate {count} random number(s) between {low} and {high}.\n"
                    "\n"
                    "Think through your process step by step:\n"
                    "1. Consider what makes a number truly random\n"
                    "2. Avoid patterns or sequences\n"
                    "3. Select the number(s) randomly\n"
                    "\n"
                    "Please provide your reasoning and then give me the final number(s) in this exact format:\n"
                    "NUMBERS: [number1, number2, ...] (if multiple) or NUMBERS: [number] (if single)"
                )
            else:
                system_prompt = "You are a helpful assistant that generates random numbers."
                user_prompt = f"Generate {count} random number(s) between {low} and {high}. Respond with just the number(s) in this format: NUMBERS: [number1, number2, ...] or NUMBERS: [number]"

        elif task_type == "word":
            if use_cot:
                system_prompt = (
                    "You are a helpful assistant that generates truly random words. \n"
                    "Think step by step about how to ensure randomness in your word selection."
                )
                user_prompt = (
                    f"Please generate {count} random word(s) from common English vocabulary.\n"
                    "\n"
                    "Think through your process step by step:\n"
                    "1. Consider what makes word selection truly random\n"
                    "2. Avoid related words or themes\n"
                    "3. Select the word(s) randomly from your vocabulary\n"
                    "\n"
                    "Please provide your reasoning and then give me the final word(s) in this exact format:\n"
                    "WORDS: [word1, word2, ...] (if multiple) or WORDS: [word] (if single)"
                )
            else:
                system_prompt = "You are a helpful assistant that generates random words."
                user_prompt = f"Generate {count} random word(s) from common English vocabulary. Respond with just the word(s) in this format: WORDS: [word1, word2, ...] or WORDS: [word]"

        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(f"Unknown task type: {task_type!r}")

        return system_prompt, user_prompt

    def extract_results(self, response: str, task_type: str) -> List[str]:
        """Extract the answered numbers or words from ``response`` as strings.

        The ``NUMBERS: [...]`` / ``WORDS: [...]`` answer line is preferred.
        When it occurs more than once the last occurrence is used, because the
        prompts ask for reasoning first and the final answer afterwards.
        Without an answer line the method falls back to every standalone
        number in the response, or to its last five words.

        Raises:
            ValueError: if ``task_type`` is neither ``"number"`` nor ``"word"``.
        """
        if task_type == "number":
            answers = re.findall(r'NUMBERS:\s*\[([^\]]+)\]', response)
            if answers:
                # Split by comma, clean up, and drop empty items ("1, 2, ")
                items = (item.strip() for item in answers[-1].split(','))
                return [item for item in items if item]
            # Fallback: extract any numbers from the response
            return re.findall(r'\b\d+\b', response)

        if task_type == "word":
            answers = re.findall(r'WORDS:\s*\[([^\]]+)\]', response)
            if answers:
                # Split by comma, strip whitespace and quotes, drop empty items
                items = (item.strip().strip('"\'') for item in answers[-1].split(','))
                return [item for item in items if item]
            # Fallback: take the last 5 words as potential answers
            return re.findall(r'\b[a-zA-Z]+\b', response)[-5:]

        raise ValueError(f"Unknown task type: {task_type!r}")

    @staticmethod
    def _server_retry_delay(error: Exception) -> int:
        """Return the retry delay in seconds advertised in ``error``'s message, or 0."""
        hint = RandomnessTestRunner._RETRY_HINT.search(str(error))
        return int(hint.group(1)) if hint else 0

    def make_api_call(self, system_prompt: str, user_prompt: str) -> str:
        """Send both prompts as one request and return the response text.

        Failed attempts are retried up to ``max_retries`` times in total. The
        wait before a retry is the larger of the exponential backoff
        (``retry_base_delay * 2**attempt``) and the delay the server asked for
        in its error message, capped at ``max_retry_delay``.

        Raises:
            Exception: the error of the last attempt, once all attempts failed.
            ValueError: if ``max_retries`` is smaller than 1.
        """
        max_retries = self.config["max_retries"]
        if max_retries < 1:
            raise ValueError("max_retries must be at least 1")

        base_delay = self.config.get("retry_base_delay", 1)
        max_delay = self.config.get("max_retry_delay", 60)
        # The system prompt is sent as a prefix of the user prompt.
        full_prompt = f"{system_prompt}\n\n{user_prompt}"

        for attempt in range(max_retries):
            try:
                response = self.model.generate_content(full_prompt)
                return response.text
            except Exception as e:
                print(f"API call failed (attempt {attempt + 1}/{max_retries}): {e}")
                if attempt >= max_retries - 1:
                    raise
                # Waiting less than the server requested only burns the
                # remaining attempts on the same quota error.
                wait = max(base_delay * (2 ** attempt), self._server_retry_delay(e))
                time.sleep(min(wait, max_delay))

    def run_test(self, task_type: str, count: int, use_cot: bool) -> None:
        """Run ``iterations`` queries for one (task, count, prompt style) configuration.

        Each successful iteration appends one entry to ``self.results`` and
        one to ``self.reasoning_data``. A failing iteration is reported and
        skipped.
        """
        test_name = f"{task_type}_{count}_{'cot' if use_cot else 'normal'}"
        print(f"\n{'='*50}")
        print(f"Running test: {test_name}")
        print(f"{'='*50}")

        for i in range(self.config["iterations"]):
            try:
                # Generate prompts
                system_prompt, user_prompt = self.generate_prompts(task_type, count, use_cot)

                # Make API call
                response = self.make_api_call(system_prompt, user_prompt)

                # Extract results
                extracted_results = self.extract_results(response, task_type)

                # Store results
                self.results.append({
                    "test_name": test_name,
                    "iteration": i + 1,
                    "task_type": task_type,
                    "count_requested": count,
                    "use_cot": use_cot,
                    "extracted_results": extracted_results,
                    "count_extracted": len(extracted_results),
                    "timestamp": datetime.now().isoformat()
                })

                # Store reasoning data
                self.reasoning_data.append({
                    "test_name": test_name,
                    "iteration": i + 1,
                    "system_prompt": system_prompt,
                    "user_prompt": user_prompt,
                    "response": response,
                    "timestamp": datetime.now().isoformat()
                })

                # Progress update
                if (i + 1) % 10 == 0:
                    print(f"Progress: {i + 1}/{self.config['iterations']} iterations completed")

                # Delay between requests
                time.sleep(self.config["delay_between_requests"])

            except Exception as e:
                print(f"Error in iteration {i + 1}: {e}")
                continue

        print(f"Completed test: {test_name}")

    def run_all_tests(self) -> None:
        """Run every configured test, then save the results and the summary.

        Saving happens in a ``finally`` block so that whatever was collected
        is written out even when the run is interrupted part-way.
        """
        print("Starting LLM Randomness Testing")
        print(f"Model: {self.config['model_name']}")
        print(f"Iterations per test: {self.config['iterations']}")
        print(f"Output directory: {self.config['output_dir']}")

        start_time = time.time()

        try:
            # Test numbers
            for count in self.config["number_counts"]:
                self.run_test("number", count, False)  # Normal
                self.run_test("number", count, True)   # CoT

            # Test words
            for count in self.config["word_counts"]:
                self.run_test("word", count, False)    # Normal
                self.run_test("word", count, True)     # CoT

            end_time = time.time()
            print(f"\nAll tests completed in {end_time - start_time:.2f} seconds")
        finally:
            # Save results
            self.save_results()
            self.generate_summary()

    def save_results(self) -> None:
        """Write ``self.results`` and ``self.reasoning_data`` to timestamped JSONL files."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save results
        results_file = os.path.join(self.config["output_dir"], f"results_{timestamp}.jsonl")
        with open(results_file, 'w', encoding='utf-8') as f:
            for result in self.results:
                f.write(json.dumps(result, ensure_ascii=False) + '\n')

        # Save reasoning data
        reasoning_file = os.path.join(self.config["output_dir"], f"reasoning_{timestamp}.jsonl")
        with open(reasoning_file, 'w', encoding='utf-8') as f:
            for reasoning in self.reasoning_data:
                f.write(json.dumps(reasoning, ensure_ascii=False) + '\n')

        print(f"\nResults saved to: {results_file}")
        print(f"Reasoning data saved to: {reasoning_file}")

    def generate_summary(self) -> None:
        """Print per-test frequency statistics and save them as a JSON summary.

        The summary file also records the configuration, with secret values
        (the API key) replaced by ``"<redacted>"``.
        """
        print("\n" + "="*50)
        print("SUMMARY STATISTICS")
        print("="*50)

        # Group results by test configuration
        test_groups = {}
        for result in self.results:
            test_groups.setdefault(result["test_name"], []).append(result)

        # Analyze each test group once; the figures feed both the printout
        # and the saved summary.
        group_stats = {}
        for test_name, results in test_groups.items():
            print(f"\nTest: {test_name}")
            print("-" * 30)

            # Collect all extracted results
            all_extracted = []
            for result in results:
                all_extracted.extend(result["extracted_results"])
            counter = Counter(all_extracted)

            group_stats[test_name] = {
                "total_responses": len(results),
                "total_extracted": len(all_extracted),
                "unique_items": len(counter),
                "distribution": dict(counter),
                "most_common": counter.most_common(10)
            }

            if not all_extracted:
                continue

            print(f"Total responses: {len(results)}")
            print(f"Total extracted items: {len(all_extracted)}")
            print(f"Unique items: {len(counter)}")

            # Show most common items
            print(f"Most common items: {counter.most_common(5)}")

            # Calculate distribution metrics
            frequencies = list(counter.values())
            print(f"Min frequency: {min(frequencies)}")
            print(f"Max frequency: {max(frequencies)}")
            print(f"Average frequency: {sum(frequencies) / len(frequencies):.2f}")

        # Never persist credentials alongside the results.
        safe_config = {
            key: ("<redacted>" if key in self._SECRET_KEYS else value)
            for key, value in self.config.items()
        }

        # Save summary
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        summary_file = os.path.join(self.config["output_dir"], f"summary_{timestamp}.json")

        summary_data = {
            "config": safe_config,
            "total_tests": len(test_groups),
            "total_iterations": len(self.results),
            "test_groups": group_stats
        }

        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(summary_data, f, indent=2, ensure_ascii=False)

        print(f"\nSummary saved to: {summary_file}")

def main():
    """Validate the API key, run every test and report where the output went.

    Nothing is run when the API key is empty or still one of the known
    placeholder values.
    """
    # Validate API key: reject an empty value and both placeholder spellings
    # ("API_KEY" is what CONFIG ships with).
    api_key = CONFIG["api_key"]
    if not api_key or api_key in ("API_KEY", "YOUR_GEMINI_API_KEY"):
        print("ERROR: Please set your Gemini API key in the CONFIG dictionary!")
        return

    # Initialize and run tests
    runner = RandomnessTestRunner(CONFIG)
    runner.run_all_tests()

    print("\n" + "="*50)
    if runner.results:
        print("TESTING COMPLETED SUCCESSFULLY!")
    else:
        # Every iteration failed (e.g. quota errors); do not claim success.
        print("TESTING FINISHED, BUT NO RESULTS WERE RECORDED (see errors above)")
    print("="*50)
    print(f"Check the output directory: {CONFIG['output_dir']}")
    print("Files generated:")
    print("- results_[timestamp].jsonl: Raw results data")
    print("- reasoning_[timestamp].jsonl: Full reasoning and responses")
    print("- summary_[timestamp].json: Summary statistics and analysis")

# Entry point: run the Gemini experiment when this code is executed directly.
if __name__ == "__main__":
    main()

Starting LLM Randomness Testing
Model: gemini-2.0-flash
Iterations per test: 50
Output directory: /content/drive/MyDrive/nac/

Running test: number_1_normal
Progress: 10/50 iterations completed
API call failed (attempt 1/3): 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 21
}
]
API call failed (attempt 2/3): 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 19
}
]
API call failed (attempt 3/3): 429 You excee