# Essay Generation for LLM-as-a-Judge Competition

This notebook implements a system for generating essays that maximize disagreement between LLM judges while maintaining high quality. The goal is to explore vulnerabilities in automated evaluation systems and contribute to the development of more robust AI-based assessment tools.

---

## **1. Setup and Imports**

First, we import all necessary libraries and set up logging for tracking progress and errors.

In [1]:
# Import required libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import pandas as pd
import numpy as np
import re
import random
import time
from tqdm import tqdm
import logging
from typing import Dict, List, Optional, Tuple
import json
from difflib import SequenceMatcher

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Initialize logging. basicConfig attaches both handlers to the ROOT logger;
# the module-level logger created below owns no handlers and relies on
# propagation to the root logger to get its records emitted.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),  # Log to console
        logging.FileHandler('essay_generation.log')  # Also log to file
    ]
)
logger = logging.getLogger(__name__)

# Let INFO records through both the logger and every handler.
logger.setLevel(logging.INFO)
# The handlers live on the root logger, so that is where the level has to be
# applied; `logger.handlers` is empty here and looping over it did nothing.
for handler in logging.getLogger().handlers:
    handler.setLevel(logging.INFO)

---

## **2. Competition Configuration**

Define the competition-specific parameters, including target essay length, quality thresholds, and scoring formula components.

In [2]:
# Competition scoring formula:
# final_score = (avg_h × min_v × avg_e) / (avg_s × (9 - avg_q))
#
# Targets and thresholds for generated essays. Essay validation reads
# `target_length` / `length_tolerance` (compared against the word count) and
# duplicate detection reads `max_similarity`.
COMPETITION_CONFIG = dict(
    target_length=100,       # Target essay length
    length_tolerance=15,     # Stricter tolerance
    min_quality_score=7.0,   # Target higher quality
    max_quality_score=8.5,   # Not too perfect to allow disagreement
    target_variance=2.5,     # High variance between judges
    min_english_score=0.95,  # Very high English quality
    max_similarity=0.25,     # Strict similarity threshold
)

---

## **3. Model Configuration**

Configure the model and tokenizer settings for essay generation. We use `gpt2-medium`, a larger and more capable checkpoint than the base `gpt2` model.

In [3]:
# Model settings: which checkpoint to load, the generation defaults handed to
# the text-generation pipeline, and the keyword arguments forwarded when the
# tokenizer is loaded.
MODEL_CONFIG = dict(
    name='gpt2-medium',  # More capable model
    path="gpt2-medium",
    params=dict(
        max_new_tokens=250,  # Longer generation
        temperature=1.1,     # Higher creativity
        top_p=0.95,          # More diversity
        top_k=100,           # More options
        repetition_penalty=1.8,
        do_sample=True,
        no_repeat_ngram_size=4,
        num_return_sequences=1,
        pad_token_id=50256,  # GPT-2 specific
    ),
    tokenizer_config=dict(
        padding=True,
        truncation=True,
        max_length=512,
    ),
)

---

## **4. Style Configurations**

Define different writing styles to maximize disagreement between judges. Each style has a unique tone, structure, and approach.

In [4]:
# Writing styles meant to maximize disagreement between judges.
# The `tone`, `structure` and `approach` of a style are interpolated into the
# generation prompt. The insertion order below is also the order in which the
# batch processor tries the styles.
STYLE_CONFIGS = dict(
    controversial=dict(
        tone='logically provocative',
        structure='clear but challenging',
        approach='present unconventional perspectives',
        prompt_elements=[
            'challenge established norms while maintaining logical coherence',
            'use sophisticated rhetoric to present contentious ideas',
            'balance controversial claims with solid reasoning',
        ],
    ),
    ambiguous=dict(
        tone='deliberately nuanced',
        structure='multi-layered analysis',
        approach='explore multiple valid interpretations',
        prompt_elements=[
            'present valid but competing viewpoints',
            'use precise language that allows multiple readings',
            'maintain logical consistency while allowing ambiguity',
        ],
    ),
    philosophical=dict(
        tone='deeply analytical',
        structure='complex reasoning',
        approach='examine fundamental assumptions',
        prompt_elements=[
            'question core premises while maintaining rigor',
            'use abstract reasoning to challenge intuitions',
            'explore edge cases and counterintuitive conclusions',
        ],
    ),
)

---

## **5. Model Manager**

The `ModelManager` class handles model initialization and GPU memory management.

In [5]:
class ModelManager:
    """Static helpers that load the generation pipeline and free GPU memory."""

    @staticmethod
    def cleanup_models():
        """Empty the CUDA cache and log it; do nothing when CUDA is unavailable."""
        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()
        logger.info("GPU memory cleared")

    @staticmethod
    def initialize_model() -> pipeline:
        """Build the text-generation pipeline described by ``MODEL_CONFIG``.

        Returns:
            A text-generation pipeline wrapping the loaded model and tokenizer,
            with ``MODEL_CONFIG['params']`` passed as keyword arguments.

        Raises:
            Exception: Any failure during loading is logged and re-raised.
        """
        try:
            ModelManager.cleanup_models()

            # Tokenizer: left padding plus the extra kwargs from the config.
            tok = AutoTokenizer.from_pretrained(
                MODEL_CONFIG['path'],
                trust_remote_code=True,
                padding_side='left',
                **MODEL_CONFIG['tokenizer_config']
            )

            # Fall back to the end-of-sequence token when no pad token is set.
            if tok.pad_token is None:
                tok.pad_token = tok.eos_token

            # bfloat16 whenever CUDA is available, float32 otherwise.
            if torch.cuda.is_available():
                weights_dtype = torch.bfloat16
            else:
                weights_dtype = torch.float32

            lm = AutoModelForCausalLM.from_pretrained(
                MODEL_CONFIG['path'],
                device_map="auto",
                torch_dtype=weights_dtype,
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )

            # No explicit device here: placement was requested via device_map.
            text_generator = pipeline(
                task="text-generation",
                model=lm,
                tokenizer=tok,
                trust_remote_code=True,
                **MODEL_CONFIG['params']
            )

            logger.info(f"Model {MODEL_CONFIG['name']} initialized successfully")
            return text_generator

        except Exception as e:
            logger.error(f"Model initialization failed: {str(e)}")
            raise

---

## **6. Prompt Engineering**

The `PromptEngineering` class generates detailed prompts for essay generation, tailored to specific styles.

In [6]:
class PromptEngineering:
    """Builds the text prompts fed to the generation pipeline."""

    @staticmethod
    def get_prompt(topic: str, style_name: str) -> str:
        """Return the essay prompt for ``topic`` written in the given style.

        Args:
            topic: Essay topic, inserted verbatim into the prompt.
            style_name: Key into ``STYLE_CONFIGS``; its ``tone``, ``structure``
                and ``approach`` entries are interpolated into the prompt.

        Returns:
            The prompt text, stripped of surrounding whitespace. Its last line
            is the marker ``Essay:``.

        Raises:
            KeyError: If ``style_name`` is not a key of ``STYLE_CONFIGS``.
        """
        style_config = STYLE_CONFIGS[style_name]
        # Take the word target from the shared config so the prompt cannot
        # drift away from the length that essay validation enforces.
        target_length = COMPETITION_CONFIG['target_length']

        prompt = f"""Write a focused and thought-provoking essay about the topic: "{topic}".

Your essay must be exactly {target_length} words long and should generate diverse opinions among readers.

Writing Requirements:
1. Use sophisticated vocabulary and complex sentence structures
2. Present logical but potentially controversial arguments
3. Maintain perfect grammar and coherence
4. Create 4-5 well-constructed sentences
5. Apply {style_config['tone']} tone throughout
6. Follow {style_config['structure']} format
7. {style_config['approach']}

Focus on quality, clarity, and thought-provoking content.

Begin your essay with "Essay:" followed by your {target_length}-word response:
Essay:"""

        return prompt.strip()

---

## **7. Essay Generator**

The `EssayGenerator` class holds the generation pipeline and post-processes raw model output: it extracts the essay text, cleans it, and validates it with `QualityControl` (defined in section 9).

In [7]:
class EssayGenerator:
    """Holds the generation pipeline and turns raw model output into a vetted essay."""

    def __init__(self):
        """Load the text-generation pipeline through ``ModelManager``."""
        self.model_pipe = ModelManager.initialize_model()
        self.generation_count = 0

    def process_essay(self, generated_text: str) -> Optional[str]:
        """Extract, clean and validate the essay contained in ``generated_text``.

        Args:
            generated_text: Raw text returned by the generation pipeline. It
                may begin with an echo of the prompt.

        Returns:
            The cleaned essay, or ``None`` when the text is empty, has fewer
            than 50 words or fewer than 3 sentences, fails ``QualityControl``
            validation, or an unexpected error occurs (the error is logged).
        """
        try:
            if not generated_text:
                logger.warning("Empty generated text")
                return None

            # Log original text length
            logger.info(f"Processing generated text ({len(generated_text)} chars)")

            # Clean up the text
            text = generated_text.strip()

            # The prompt contains the marker twice: once quoted inside the
            # instructions and once as its final line. Searching from the
            # right skips both, so an echoed prompt never leaks its tail
            # ('" followed by your ... response:') into the essay.
            # Trade-off: if the model emits the marker again itself, only the
            # text after that last marker is kept.
            marker = 'Essay:'
            essay_start = text.rfind(marker)
            if essay_start != -1:
                text = text[essay_start + len(marker):].strip()
                logger.info("Found essay marker")
            else:
                logger.warning("No essay marker found")

            # Basic cleaning: collapse every whitespace run (newlines
            # included) into one space, then drop surrounding quotes.
            text = re.sub(r'\s+', ' ', text)
            text = text.strip('"\'')

            # Log cleaning results
            logger.info(f"After cleaning: {len(text.split())} words")

            # Check minimum length
            if len(text.split()) < 50:
                logger.warning(f"Essay too short: {len(text.split())} words")
                return None

            # Process sentences
            sentences = [s.strip() for s in re.split(r'[.!?]', text) if s.strip()]
            logger.info(f"Found {len(sentences)} sentences")

            if len(sentences) < 3:
                logger.warning("Too few sentences")
                return None

            # Reconstruct with uniform spacing; every sentence ends with a
            # period, whatever terminator it originally had.
            text = '. '.join(sentences) + '.'

            # Validate essay
            metrics = QualityControl.calculate_metrics(text)
            logger.info("Quality metrics calculated:")
            logger.info(f"- Word count: {metrics['word_count']}")
            logger.info(f"- Unique ratio: {metrics['unique_ratio']:.2f}")
            logger.info(f"- Complexity score: {metrics['complexity_score']:.2f}")

            is_valid, feedback = QualityControl.validate_essay(text, metrics)

            if is_valid:
                logger.info("Essay passed validation")
                return text
            else:
                logger.warning(f"Validation failed: {feedback}")
                return None

        except Exception as e:
            logger.error(f"Error processing essay: {str(e)}")
            return None

---

## **8. Batch Processor**

The `BatchProcessor` class handles batch processing of topics, ensuring efficient use of GPU resources.

In [8]:
class BatchProcessor:
    """Generates one essay per topic and accumulates results and run statistics."""

    def __init__(self, batch_size: int = 1):
        """Create the essay generator and reset the result/statistics containers."""
        self.batch_size = batch_size
        self.generator = EssayGenerator()
        self.results = []
        self.stats = dict(
            total_processed=0,
            successful_generations=0,
            failed_generations=0,
            style_usage={},
            generated_essays=set(),
        )

    def is_duplicate(self, essay: str) -> bool:
        """Return True when ``essay`` is empty, already stored, or too similar.

        Similarity is the Jaccard index of the lower-cased word sets; anything
        above ``COMPETITION_CONFIG['max_similarity']`` counts as a duplicate.
        """
        if not essay:
            return True

        accepted = self.stats['generated_essays']
        if essay in accepted:
            return True

        candidate = set(essay.lower().split())
        other_word_sets = (set(prev.lower().split()) for prev in accepted)
        return any(
            len(candidate & other) / len(candidate | other)
            > COMPETITION_CONFIG['max_similarity']
            for other in other_word_sets
        )

    @staticmethod
    def _error_record(topic_id) -> dict:
        """Build the placeholder result row stored for a topic that failed."""
        return {
            'id': topic_id,
            'essay': f"Error generating essay for topic {topic_id}"
        }

    def _try_style(self, topic_id, topic: str, style: str) -> bool:
        """Generate once with ``style``; store the essay and return True if accepted."""
        logger.info(f"Trying style: {style}")
        try:
            prompt = PromptEngineering.get_prompt(topic, style)
            logger.info("Generating essay...")

            # NOTE(review): these call-time values differ from
            # MODEL_CONFIG['params'] (e.g. 200 vs 250 new tokens) — confirm
            # which set is the intended one.
            outputs = self.generator.model_pipe(
                prompt,
                max_new_tokens=200,
                temperature=1.1,
                top_p=0.95,
                do_sample=True,
                num_return_sequences=1
            )
            if not outputs:
                return False

            essay = self.generator.process_essay(outputs[0]['generated_text'])
            if essay and not self.is_duplicate(essay):
                self.stats['generated_essays'].add(essay)
                self.results.append({
                    'id': topic_id,
                    'essay': essay
                })
                usage = self.stats['style_usage']
                usage[style] = usage.get(style, 0) + 1
                self.stats['successful_generations'] += 1
                logger.info(f"Successfully generated essay for topic {topic_id}")
                return True

            logger.warning(f"Invalid or duplicate essay generated for topic {topic_id}")
        except Exception as e:
            logger.error(f"Error with style {style} for topic {topic_id}: {str(e)}")
        return False

    def process_batch(self, topics: List[Tuple[int, str]]) -> None:
        """Generate an essay for every ``(topic_id, topic)`` pair in ``topics``.

        Each topic gets up to three attempts, and each attempt walks through
        the styles in ``STYLE_CONFIGS`` order until one essay is accepted.
        Topics that never succeed receive a placeholder row in
        ``self.results``. Exceptions are logged rather than propagated.
        """
        try:
            if self.stats['total_processed'] % 5 == 0:
                torch.cuda.empty_cache()

            max_attempts = 3
            for topic_id, topic in topics:
                logger.info(f"\nProcessing topic {topic_id}: {topic}")
                essay_generated = False

                for attempt in range(max_attempts):
                    logger.info(f"Attempt {attempt + 1}/{max_attempts}")
                    # any() stops at the first style that yields an essay.
                    essay_generated = any(
                        self._try_style(topic_id, topic, style)
                        for style in STYLE_CONFIGS
                    )
                    if essay_generated:
                        break

                if essay_generated:
                    continue

                logger.error(f"Failed to generate valid essay for topic {topic_id} after {max_attempts} attempts")
                self.stats['failed_generations'] += 1
                self.results.append(self._error_record(topic_id))

            self.stats['total_processed'] += len(topics)

        except Exception as e:
            logger.error(f"Batch processing error: {str(e)}")
            # Make sure every topic of the batch still ends up with a row.
            for topic_id, _ in topics:
                already_recorded = any(row['id'] == topic_id for row in self.results)
                if already_recorded:
                    continue
                self.results.append(self._error_record(topic_id))
                self.stats['failed_generations'] += 1

---

## **9. Quality Control**

The `QualityControl` class calculates and validates essay metrics to ensure they meet competition requirements.

In [9]:
class QualityControl:
    """Computes simple text metrics for an essay and checks them against limits."""

    @staticmethod
    def calculate_metrics(essay: str) -> Optional[Dict]:
        """Compute word- and sentence-level statistics for ``essay``.

        Words are whitespace-separated tokens (punctuation stays attached and
        case is preserved); sentences are the non-empty pieces obtained by
        splitting on ``.``, ``!`` and ``?``.

        Returns:
            ``None`` for an empty/``None`` essay, otherwise a dict with
            ``word_count``, ``unique_ratio``, ``avg_word_length``,
            ``sentence_count``, ``avg_sentence_length`` and
            ``complexity_score`` (average word length times unique ratio).
        """
        if not essay:
            return None

        words = essay.split()
        word_count = len(words)
        unique_words = len(set(words))
        sentences = [s.strip() for s in re.split(r'[.!?]', essay) if s.strip()]

        # Guard the divisions: a whitespace-only essay has zero words.
        avg_word_length = sum(len(w) for w in words) / word_count if word_count > 0 else 0
        unique_ratio = unique_words / word_count if word_count > 0 else 0

        return {
            'word_count': word_count,
            'unique_ratio': unique_ratio,
            'avg_word_length': avg_word_length,
            'sentence_count': len(sentences),
            'avg_sentence_length': word_count / len(sentences) if sentences else 0,
            'complexity_score': avg_word_length * unique_ratio
        }

    @staticmethod
    def validate_essay(essay: str, metrics: Optional[Dict] = None) -> Tuple[bool, str]:
        """Check ``essay`` against the length, variety and complexity limits.

        Args:
            essay: Essay text; only used when ``metrics`` is not supplied.
            metrics: Pre-computed result of ``calculate_metrics``.

        Returns:
            ``(is_valid, feedback)`` where ``feedback`` joins the messages of
            all failed checks with ``'; '`` (empty string when valid).
        """
        if not metrics:
            metrics = QualityControl.calculate_metrics(essay)
        if not metrics:
            # calculate_metrics returns None for an empty essay; report that
            # as a failed validation instead of crashing on None['word_count'].
            return False, "Empty essay"

        target = COMPETITION_CONFIG['target_length']
        tolerance = COMPETITION_CONFIG['length_tolerance']

        checks = [
            (target - tolerance <= metrics['word_count'] <= target + tolerance,
             f"Word count {metrics['word_count']} outside target range {target}±{tolerance}"),
            (metrics['unique_ratio'] >= 0.65,
             f"Low word variety: {metrics['unique_ratio']:.2f}"),
            (3 <= metrics['sentence_count'] <= 5,
             f"Invalid sentence count: {metrics['sentence_count']}"),
            (20 <= metrics['avg_sentence_length'] <= 35,
             f"Invalid average sentence length: {metrics['avg_sentence_length']:.1f}"),
            (metrics['complexity_score'] >= 3.8,
             f"Low complexity score: {metrics['complexity_score']:.2f}")
        ]

        failed = [msg for condition, msg in checks if not condition]
        return not failed, '; '.join(failed)

---

## **10. Submission Creation**

The `create_submission` function generates and saves the final submission file.

In [10]:
def create_submission(processor: BatchProcessor):
    """Write the collected results to ``submission.csv`` and return the frame.

    The rows in ``processor.results`` are sorted by ``id`` and saved locally.
    A second copy is written to a Google Drive path on a best-effort basis (a
    failure there is only logged as a warning), and the frame is then passed
    to ``analyze_submission``.

    Raises:
        Exception: Any failure outside the Drive copy is logged and re-raised.
    """
    try:
        frame = pd.DataFrame(processor.results).sort_values('id')

        # Local copy.
        local_path = 'submission.csv'
        frame.to_csv(local_path, index=False)
        logger.info(f"Submission saved to {local_path}")

        # Optional Drive copy.
        try:
            drive_path = '/content/drive/MyDrive/GenerativeAI/submission.csv'
            frame.to_csv(drive_path, index=False)
            logger.info(f"Submission also saved to Drive at {drive_path}")
        except Exception as e:
            logger.warning(f"Could not save to Drive: {str(e)}")

        analyze_submission(frame)
        return frame

    except Exception as e:
        logger.error(f"Error creating submission: {str(e)}")
        raise

---

## **11. Submission Analysis**

The `analyze_submission` function provides detailed statistics and sample essays from the submission.

In [11]:
def analyze_submission(df: pd.DataFrame):
    """Log summary statistics and up to three sample essays from ``df``.

    A row counts as failed when its ``essay`` is missing or starts with the
    placeholder text written for topics that could not be generated
    ("Error generating essay for topic ..."). Matching on that prefix, rather
    than on any occurrence of the word "Error", keeps genuine essays that
    merely mention the word from being counted as failures.

    Args:
        df: Submission frame with ``id`` and ``essay`` columns.

    Errors raised during the analysis are logged, not propagated.
    """
    try:
        total_essays = len(df)
        if total_essays == 0:
            # Nothing to analyze; also avoids dividing by zero below.
            logger.warning("Submission is empty; nothing to analyze")
            return

        is_placeholder = df['essay'].str.startswith(
            'Error generating essay for topic', na=True
        )
        valid_df = df[~is_placeholder]
        valid_essays = len(valid_df)

        logger.info("\nSubmission Analysis:")
        logger.info(f"Total Essays: {total_essays}")
        logger.info(f"Valid Essays: {valid_essays}")
        logger.info(f"Success Rate: {(valid_essays/total_essays)*100:.1f}%")

        if valid_essays > 0:
            # Sample essays
            logger.info("\nSample Essays:")
            sample_df = valid_df.sample(min(3, valid_essays))
            for _, row in sample_df.iterrows():
                logger.info(f"\nID: {row['id']}")
                logger.info(f"Essay:\n{row['essay']}\n")
                logger.info("-" * 80)

    except Exception as e:
        logger.error(f"Error analyzing submission: {str(e)}")

---

## **12. Main Execution**

The `main` function orchestrates the entire process, from loading data to generating essays and creating the submission.

In [12]:
def main(file_path: str = '/kaggle/input/llms-you-cant-please-them-all/test.csv'):
    """Run the whole pipeline: load topics, generate essays, write the submission.

    Args:
        file_path: CSV file holding the competition topics; the ``id`` and
            ``topic`` columns are read from it. Defaults to the Kaggle input
            location, so pass a local path when running anywhere else.

    Raises:
        ValueError: If no result rows were produced.
        Exception: Any other failure is logged and re-raised.
    """
    try:
        start_time = time.time()

        # Initial setup logging
        logger.info("=" * 80)
        logger.info("Starting essay generation")
        logger.info(f"GPU available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            logger.info(f"Using device: {torch.cuda.get_device_name(0)}")
        logger.info("=" * 80)

        # Load competition data first
        try:
            test_data = pd.read_csv(file_path)
            logger.info(f"Loaded {len(test_data)} topics from {file_path}")
        except Exception as e:
            logger.error(f"Error loading test data from {file_path}: {str(e)}")
            raise

        # Initialize processor
        batch_size = 1  # Process one topic at a time for stability
        processor = BatchProcessor(batch_size)

        # Split the topics into [id, topic] batches of `batch_size` rows
        topic_batches = [
            test_data.iloc[i:i+batch_size][['id', 'topic']].values.tolist()
            for i in range(0, len(test_data), batch_size)
        ]

        # Process with progress bar
        with tqdm(total=len(topic_batches), desc="Processing topics") as pbar:
            for batch_idx, batch in enumerate(topic_batches):
                try:
                    processor.process_batch(batch)

                    # Log progress every second batch
                    if (batch_idx + 1) % 2 == 0:
                        elapsed = time.time() - start_time
                        rate = (batch_idx + 1) / elapsed if elapsed > 0 else 0
                        logger.info(f"""
                        Progress Update:
                        - Completed: {batch_idx + 1}/{len(topic_batches)} topics
                        - Success Rate: {(processor.stats['successful_generations']/(batch_idx+1))*100:.1f}%
                        - Processing Rate: {rate:.2f} topics/second
                        - Time Elapsed: {elapsed:.1f} seconds
                        """)

                except Exception as e:
                    logger.error(f"Error processing batch {batch_idx}: {str(e)}")
                finally:
                    # Advance the bar even when a batch failed so it stays
                    # in step with the number of batches handled.
                    pbar.update(1)

        # Create submission if we have results
        if processor.results:
            submission_df = create_submission(processor)
            logger.info(f"""
            Generation Complete:
            - Total Topics: {len(test_data)}
            - Successful Generations: {processor.stats['successful_generations']}
            - Failed Generations: {processor.stats['failed_generations']}
            - Overall Success Rate: {(processor.stats['successful_generations']/len(test_data))*100:.1f}%
            """)
        else:
            raise ValueError("No essays were generated successfully")

    except Exception as e:
        logger.error(f"Execution failed: {str(e)}")
        raise

---

## **13. Run the Script**

Finally, execute the script by calling the `main` function.

In [13]:
if __name__ == "__main__":
    try:
        # Logging goes to the console and to essay_generation.log.
        logging.basicConfig(
            handlers=[
                logging.StreamHandler(),
                logging.FileHandler('essay_generation.log')
            ],
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=logging.INFO
        )
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)

        # Silence library warnings for the rest of the run.
        import warnings
        warnings.filterwarnings('ignore')

        # Report the hardware the run will use.
        if not torch.cuda.is_available():
            logger.warning("No GPU available, using CPU")
        else:
            torch.cuda.empty_cache()
            device = torch.cuda.get_device_name(0)
            memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            logger.info(f"Using GPU: {device} ({memory:.1f}GB)")

        # Seed every random number generator with the same fixed value.
        random.seed(42)
        np.random.seed(42)
        torch.manual_seed(42)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(42)
            torch.backends.cudnn.deterministic = True

        # Hand over to the pipeline.
        main()

    except Exception as e:
        logger.error(f"Script execution failed: {str(e)}")
        raise

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/llms-you-cant-please-them-all/test.csv'