
# Fine-tune GPT-2 (Decoder-only) for Recipe Generation (Kaggle-ready)

This notebook fine-tunes GPT-2 on the provided Kaggle recipe dataset using LoRA (PEFT) for parameter-efficient training.
Attach the dataset and choose **Run All** in Kaggle. The notebook samples 20% of the full dataset for fine-tuning, splits that sample into train/validation/test sets, and uses checkpoints to resume training if interrupted.

**Notes**
- The notebook installs required libraries (transformers, datasets, accelerate, peft, bitsandbytes if desired).
- Uses mixed precision (fp16) by default; contains a fallback branch if 4-bit quantization is available in the environment.
- Checkpoints are saved in `checkpoints/`.
- Final model `model.pkl` (state_dict + tokenizer files saved separately) will be saved to `/kaggle/working/` and `/kaggle/working/checkpoints/`.
- Do **not** run this on CPU-only without adjusting batch sizes; Kaggle GPU is recommended.


In [None]:

!pip install -q transformers datasets accelerate peft[safe] evaluate sentencepiece
!pip install -q git+https://github.com/huggingface/peft.git@main
try:
    import bitsandbytes as bnb
except Exception:
    pass


In [None]:

# Standard library and numeric stack used throughout the notebook.
import os, sys, math, random, glob, pickle, time
from pathlib import Path
import pandas as pd
import numpy as np
import torch

# Report the accelerator situation up front; `device` is a plain string here.
device = "cuda" if torch.cuda.is_available() else "cpu"
num_gpus = torch.cuda.device_count()
print(f"Using {num_gpus} GPU(s)")


# Hugging Face data / modelling / PEFT entry points used by the training cells.
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, set_seed, logging
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training


# NOTE: `logging` here is transformers.logging, not the stdlib module.
logging.set_verbosity_info()
# Fixed seed so sampling and shuffling are repeatable between runs.
set_seed(42)
# Relative to the notebook's working directory.
os.makedirs('checkpoints', exist_ok=True)


In [None]:

# Collect every CSV/JSON/TXT file below the Kaggle input mount.
input_root = '/kaggle/input'
data_suffixes = ('.csv','.json','.txt')
candidates = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk(input_root)
    for filename in filenames
    if filename.endswith(data_suffixes)
]
# Notebook display: preview the first few hits.
candidates[:20]


In [None]:

# Adjust this path if you know the exact file inside /kaggle/input
files = [p for p in candidates if p.endswith(('.csv', '.json'))]
if not files:
    raise FileNotFoundError('No dataset files found under /kaggle/input. Upload the Kaggle dataset to the notebook.')
if len(files) > 1:
    # The first hit is used; make that choice visible so it can be overridden.
    print(f'Found {len(files)} candidate data files; using the first one. Other candidates:')
    for other in files[1:10]:
        print('  ', other)
data_path = files[0]
print('Using', data_path)
if data_path.endswith('.csv'):
    df = pd.read_csv(data_path)
else:
    try:
        # JSON Lines (one object per line) is tried first, as before.
        df = pd.read_json(data_path, lines=True)
    except ValueError:
        # Fall back to a regular JSON document (e.g. an array of records).
        df = pd.read_json(data_path)
print('Rows:', len(df))
display(df.head())


In [None]:

# Attempt to detect common fields for recipes
def _match_columns(columns, keywords):
    """Return columns whose lower-cased label contains one of *keywords*.

    Matches are ordered by keyword priority first (earlier keywords win), then
    by column order. Labels starting with "unnamed" are skipped: pandas names
    an unlabeled CSV index column "Unnamed: 0", which would otherwise match
    the "name" keyword and be picked as the title column. Labels are passed
    through str() so non-string column labels do not crash the search.
    """
    usable = [c for c in columns if not str(c).lower().startswith('unnamed')]
    matches = []
    for keyword in keywords:
        for c in usable:
            if keyword in str(c).lower() and c not in matches:
                matches.append(c)
    return matches


possible_title = _match_columns(df.columns, ('title', 'name'))
possible_ingredients = _match_columns(df.columns, ('ingredient',))
possible_instructions = _match_columns(df.columns, ('instruction', 'directions', 'steps'))

print('title candidates:', possible_title)
print('ingredients candidates:', possible_ingredients)
print('instructions candidates:', possible_instructions)

title_col = possible_title[0] if possible_title else None
ing_col = possible_ingredients[0] if possible_ingredients else None
inst_col = possible_instructions[0] if possible_instructions else None

if not (ing_col or title_col) or not inst_col:
    # fallback: try columns by manual inspection (first 6 columns)
    print('Automatic detection failed. Showing first 6 columns for manual pick.')
    display(df.iloc[:5,:6])
else:
    print('Detected columns:', title_col, ing_col, inst_col)


In [None]:

# Create a text prompt->target format. If some columns are missing, we craft reasonable fallbacks.
def row_to_pair(r, columns=None):
    """Extract ``(title, ingredients, instructions)`` from one dataset row.

    Args:
        r: A row supporting ``in`` and ``[]`` by column label (e.g. a pandas
            Series from ``DataFrame.iterrows``).
        columns: Optional ``(title_col, ing_col, inst_col)`` labels. When
            omitted, the notebook-level ``title_col``, ``ing_col`` and
            ``inst_col`` are used, as before. A label of ``None`` means the
            field is unavailable.

    Returns:
        A 3-tuple. Missing or null fields become ``''``. A list/tuple of
        ingredients is joined with ``', '``.
    """
    if columns is None:
        columns = (title_col, ing_col, inst_col)

    def _field(col):
        if col is None or col not in r:
            return ''
        value = r[col]
        # Containers must be handled before the null check: pd.isnull on a
        # list returns an array, whose truth value is ambiguous.
        if isinstance(value, (list, tuple)):
            return value
        return '' if pd.isnull(value) else value

    title, ings, inst = (_field(col) for col in columns)
    if isinstance(ings, (list, tuple)):
        # str() each item so numeric or mixed entries do not break the join.
        ings = ', '.join(str(item) for item in ings)
    return title, ings, inst

# Turn every usable row into a single training text: an optional
# "Ingredients:" line, an optional "Title:" line, then "Recipe:" and the body.
pairs = []
for _, record in df.iterrows():
    title, ings, inst = row_to_pair(record)
    # A usable row needs instructions plus at least one of ingredients/title.
    if inst and (ings or title):
        sections = []
        if ings:
            sections.append('Ingredients: ' + str(ings).strip() + '\n')
        if title:
            sections.append('Title: ' + str(title).strip() + '\n')
        sections.append('Recipe:\n')
        sections.append(str(inst).strip())
        pairs.append({'text': ''.join(sections)})

if not pairs:
    raise ValueError('No usable recipe rows found. Please inspect the dataset and set correct columns.')

# Work on a reproducible 20% sample of the usable rows.
df_pairs = pd.DataFrame(pairs)
sample_frac = 0.20
df_sample = df_pairs.sample(frac=sample_frac, random_state=42).reset_index(drop=True)
print('Original rows:', len(df_pairs), 'Sampled (20%):', len(df_sample))

# Contiguous 80/10/10 split of the already shuffled sample; the test split
# takes whatever remains after the integer-truncated train and val sizes.
train_frac, val_frac, test_frac = 0.8, 0.1, 0.1
n = len(df_sample)
n_train = int(n * train_frac)
n_val = int(n * val_frac)
val_end = n_train + n_val
train_df = df_sample.iloc[:n_train]
val_df = df_sample.iloc[n_train:val_end]
test_df = df_sample.iloc[val_end:]
print('Train/Val/Test sizes:', len(train_df), len(val_df), len(test_df))

split_frames = {'train': train_df, 'validation': val_df, 'test': test_df}
dataset_dict = DatasetDict({
    split: Dataset.from_pandas(frame) for split, frame in split_frames.items()
})


In [None]:

model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Register a dedicated padding token when the tokenizer has none.
if tokenizer.pad_token_id is None:
    tokenizer.add_special_tokens({'pad_token':'<|pad|>'})

def preprocess(examples):
    """Tokenize a batch of 'text' entries, truncating to 512 tokens."""
    return tokenizer(examples['text'], truncation=True, max_length=512)

tokenized = dataset_dict.map(preprocess, batched=True, remove_columns=['text'])
# mlm=False selects causal-LM style collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

from datasets import DatasetDict

# ✅ Reduce dataset size to 15% for faster training
def _take_fraction(split, fraction):
    """Shuffle *split* with seed 42 and keep the first `fraction` of its rows."""
    keep = int(fraction * len(split))
    return split.shuffle(seed=42).select(range(keep))

train_dataset = _take_fraction(tokenized["train"], 0.15)
eval_dataset = _take_fraction(tokenized["validation"], 0.15)



In [None]:

# Let accelerate place the model when a GPU is present; plain load otherwise.
device_map = 'auto' if torch.cuda.is_available() else None
# 4-bit loading is not enabled in this notebook. The previous
# `'bnb' in sys.modules` branch could never run (the module registers itself
# as 'bitsandbytes') and passed load_in_4bit=False anyway, so it is removed.
use_4bit = False
try:
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map)
except Exception as e:
    # Best-effort fallback kept from the original, but the reason is reported.
    print(f'Loading with device_map={device_map!r} failed ({type(e).__name__}: {e}); retrying with default placement.')
    model = AutoModelForCausalLM.from_pretrained(model_name)
# Match the embedding matrix to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))
print('Model loaded.')


In [None]:

# Module-name suffixes to wrap with LoRA.
# NOTE(review): GPT-2's attention projection is 'c_attn'; the *_proj names
# look like leftovers for other architectures and presumably match nothing
# here - confirm before trimming the list.
target_modules = ['c_attn','q_proj','v_proj','k_proj']

# Only meaningful for k-bit quantized models; kept as a best-effort step, but
# failures are reported instead of being silently discarded.
try:
    model = prepare_model_for_kbit_training(model)
except Exception as exc:
    print(f'prepare_model_for_kbit_training skipped ({type(exc).__name__}: {exc})')

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.1,
    bias='none',
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, config)
print('LoRA applied. Parameter count (trainable):', sum(p.numel() for p in model.parameters() if p.requires_grad))


In [None]:
from transformers import TrainingArguments
from transformers.trainer_utils import get_last_checkpoint

training_args = TrainingArguments(
    output_dir="checkpoints/gpt2-lora",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    learning_rate=2e-4,
    # fp16 mixed precision needs a CUDA device; fall back to fp32 otherwise
    # instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    save_total_limit=1,
    load_best_model_at_end=False,
    report_to="none",
    ddp_find_unused_parameters=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Resume from the newest `checkpoint-*` folder left by an interrupted run.
# The directory check is needed because get_last_checkpoint lists the folder.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is not None:
    print('Resuming training from', last_checkpoint)

trainer.train(resume_from_checkpoint=last_checkpoint)
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)


In [None]:

import torch
save_path = '/kaggle/working/model.pkl'
# Move every tensor to CPU before pickling so the file can be read on a
# machine without CUDA (the evaluation section below runs on CPU).
to_save = {
    'state_dict': {name: tensor.detach().cpu() for name, tensor in model.state_dict().items()},
    'config': model.config.to_dict(),
}
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f:
    pickle.dump(to_save, f)
print('Saved model.pkl to', save_path)

# Also save tokenizer and peft adapter
tokenizer.save_pretrained('/kaggle/working/tokenizer')
model.save_pretrained('/kaggle/working/peft_model')
print('Saved tokenizer and peft model to /kaggle/working/')


In [None]:

from transformers import pipeline

# `output_dir` was never defined in this notebook, so the original call raised
# NameError. Generate with the in-memory fine-tuned model instead.
model.eval()  # disable LoRA dropout for inference
pipeline_kwargs = {}
if getattr(model, 'hf_device_map', None) is None:
    # Only pick a device when accelerate has not already placed the model.
    pipeline_kwargs['device'] = 0 if torch.cuda.is_available() else -1
gen = pipeline('text-generation', model=model, tokenizer=tokenizer, **pipeline_kwargs)
examples = [
    'Ingredients: egg, flour, sugar\nTitle: Simple Pancakes\nRecipe:\n',
    'Ingredients: chicken, garlic, salt, pepper\nTitle: Garlic Chicken\nRecipe:\n',
]
for ex in examples:
    o = gen(ex, max_length=300, num_return_sequences=1)[0]['generated_text']
    # Drop only the leading prompt; str.replace would also delete any later
    # repetition of the prompt inside the generated text.
    completion = o[len(ex):] if o.startswith(ex) else o
    print('---PROMPT---\n', ex)
    print('---GENERATED---\n', completion)


In [None]:

import evaluate
metric_bleu = evaluate.load('bleu')
# Kept for backward compatibility; compute_bleu no longer needs it.
from nltk.tokenize import word_tokenize

def compute_bleu(preds, refs):
    """Score generated texts against one reference each with BLEU.

    Args:
        preds: Iterable of generated strings.
        refs: Iterable of reference strings, aligned with *preds*.

    Returns:
        The dict returned by the `evaluate` BLEU metric.

    The `evaluate` BLEU metric takes raw strings and tokenizes internally, so
    the texts are passed through untokenized (pre-tokenized lists are rejected).
    """
    return metric_bleu.compute(
        predictions=[str(p) for p in preds],
        references=[[str(r)] for r in refs],
    )

# Simple eval on test set (generate using prompts from test set)
test_texts = [x['text'] for x in dataset_dict['test']]
gen_texts = []
refs = []
for t in test_texts[:20]:
    # Split once at the first marker so prompt and reference always agree.
    head, marker, reference = t.partition('Recipe:\n')
    prompt = head + marker
    out = gen(prompt, max_length=512, num_return_sequences=1)[0]['generated_text']
    gen_texts.append(out[len(prompt):] if out.startswith(prompt) else out)
    refs.append(reference)
bleu_res = compute_bleu(gen_texts, refs)
print('BLEU (sample):', bleu_res)


# **MODEL EVALUATION**

This section is designed to evaluate the fine-tuned GPT-2 model independently. It loads the saved model from disk and performs comprehensive evaluation without depending on the training cells above.

**Features:**
- ✅ Independent model loading from saved files
- ✅ Comprehensive evaluation metrics (BLEU, ROUGE, Perplexity)
- ✅ Sample generation with various prompts
- ✅ Performance analysis and visualization
- ✅ Recipe quality assessment

In [None]:
# 🔧 EVALUATION SETUP - Independent Model Loading (CPU Optimized)
# This cell loads the fine-tuned model independently for evaluation on local CPU

import os
import sys
import pickle
import torch
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Force CPU usage for local environment - no CUDA/TPU
device = torch.device("cpu")
print(f"üñ•Ô∏è Using device: {device} (forced CPU for local evaluation)")
print("‚ö†Ô∏è Note: CPU evaluation will be slower but more compatible")

# Defined up front so the flag exists even when the core imports below fail.
METRICS_AVAILABLE = False

# Load required libraries for evaluation
try:
    from transformers import (
        AutoTokenizer, 
        AutoModelForCausalLM, 
        GenerationConfig
    )
    from peft import PeftModel
    print("‚úÖ Core libraries loaded successfully")
    
    # Try to load evaluation metrics (optional for minimal setup)
    try:
        import evaluate
        print("‚úÖ Evaluation metrics available")
        METRICS_AVAILABLE = True
    except ImportError:
        print("‚ö†Ô∏è Evaluation metrics not available (install with: pip install evaluate)")
        METRICS_AVAILABLE = False
        
except ImportError as e:
    # Reported rather than raised: later cells fail on the missing names.
    print(f"‚ùå Error loading libraries: {e}")
    print("Please install missing packages: pip install transformers peft")

# Locations of the saved artefacts, relative to the current working directory
BASE_PATH = Path(".")
MODEL_PATH = BASE_PATH / "models" / "model.pkl"
TOKENIZER_PATH = BASE_PATH / "tokenizer"
PEFT_MODEL_PATH = BASE_PATH / "peft_model"
TEST_DATA_PATH = BASE_PATH / "models" / "test_data.pkl"

print(f"\nüìÅ File Check:")
for label, location in (
    ("Model path", MODEL_PATH),
    ("Tokenizer path", TOKENIZER_PATH),
    ("PEFT model path", PEFT_MODEL_PATH),
    ("Test data path", TEST_DATA_PATH),
):
    print(f"   ‚Ä¢ {label}: {location}")

# Record which of the expected artefacts are absent
expected_artifacts = {
    "Model": MODEL_PATH,
    "Tokenizer": TOKENIZER_PATH,
    "PEFT Model": PEFT_MODEL_PATH,
    "Test Data": TEST_DATA_PATH,
}
missing_files = []
for path_name, path in expected_artifacts.items():
    if path.exists():
        print(f"‚úÖ Found: {path}")
        continue
    missing_files.append(f"{path_name}: {path}")
    print(f"‚ùå Missing: {path}")

# A PEFT directory is only useful when it actually holds adapter weights
if PEFT_MODEL_PATH.exists():
    adapter_files = list(PEFT_MODEL_PATH.glob("adapter_model.*"))
    if not adapter_files:
        print("‚ö†Ô∏è PEFT directory exists but no adapter weights found")
    else:
        print(f"‚úÖ PEFT adapter weights found: {[f.name for f in adapter_files]}")

if not missing_files:
    print(f"\nüéâ All required files found! Ready for evaluation.")
else:
    print(f"\n‚ö†Ô∏è Warning: Some files are missing. Evaluation will be limited to available files.")
    print("üîß The evaluation will adapt to use whatever files are available.")

# Evaluation always proceeds; later cells adapt to whatever is available
EVALUATION_READY = True

In [None]:
# 🤖 LOAD FINE-TUNED MODEL FOR EVALUATION (CPU Optimized)
# This cell loads the fine-tuned GPT-2 model with multiple fallback strategies

def _load_eval_tokenizer(source):
    """Load a tokenizer from *source*, using EOS as pad token when none is set."""
    tokenizer = AutoTokenizer.from_pretrained(source)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def _read_pickled_checkpoint(path):
    """Read the checkpoint at *path*, trying torch.load first, then pickle.

    NOTE(security): both loaders can execute arbitrary code embedded in the
    file; only use this on checkpoints you produced yourself.

    Raises:
        Exception: whatever pickle.load raised when both loaders fail.
    """
    import pickle

    try:
        data = torch.load(path, map_location='cpu', weights_only=False)
        print("‚úÖ Loaded with torch.load (weights_only=False)")
        return data
    except Exception as torch_error:
        print(f"‚ö†Ô∏è torch.load failed: {str(torch_error)[:100]}...")

    try:
        with open(path, 'rb') as f:
            data = pickle.load(f)
    except Exception as pickle_error:
        print(f"‚ö†Ô∏è pickle.load failed: {str(pickle_error)[:100]}...")
        raise
    print("‚úÖ Loaded with pickle.load")
    return data


def _apply_state_dict(model, state_dict):
    """Load *state_dict* non-strictly and return how many tensors were used.

    strict=False ignores every key that does not exist in the model, so a
    checkpoint with differently named keys "loads" without changing anything.
    The mismatch is reported here instead of passing silently.
    """
    outcome = model.load_state_dict(state_dict, strict=False)
    ignored = len(outcome.unexpected_keys)
    used = len(state_dict) - ignored
    if used == 0:
        sample_key = next(iter(state_dict), None)
        print("Warning: no checkpoint tensor matched the model's parameter names "
              f"(example key: {sample_key!r}); the model weights are unchanged.")
    elif ignored:
        print(f"Warning: {ignored} of {len(state_dict)} checkpoint tensors were ignored (unknown keys).")
    return used


def _to_eval_device(model):
    """Move *model* to the evaluation device and switch it to eval mode."""
    model = model.to(device)
    model.eval()
    return model


def load_finetuned_model():
    """Load the fine-tuned model with CPU optimization and fallback strategies.

    Strategies, in order: (1) LoRA adapters from PEFT_MODEL_PATH on top of base
    GPT-2, (2) model.pkl with the saved tokenizer, (3) model.pkl with the
    default GPT-2 tokenizer, (4) plain base GPT-2.

    Returns:
        ``(model, tokenizer, kind)`` where kind is "PEFT", "Pickle+Tokenizer",
        "Pickle+Default" or "Base"; ``(None, None, None)`` if everything fails.
    """
    # Strategy 1: Try PEFT model if available
    if TOKENIZER_PATH.exists() and PEFT_MODEL_PATH.exists():
        if list(PEFT_MODEL_PATH.glob("adapter_model.*")):
            try:
                print("üîÑ Loading PEFT model (LoRA adapters)...")

                tokenizer = _load_eval_tokenizer(TOKENIZER_PATH)
                print(f"‚úÖ Tokenizer loaded (vocab size: {len(tokenizer)})")

                # Base model is resized to the tokenizer before adapters attach
                base_model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float32)
                base_model.resize_token_embeddings(len(tokenizer))

                model = _to_eval_device(PeftModel.from_pretrained(base_model, PEFT_MODEL_PATH))
                print(f"‚úÖ PEFT model loaded successfully on {device}!")

                total_params = sum(p.numel() for p in model.parameters())
                trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
                print(f"üìà Total parameters: {total_params:,}")
                print(f"üìà Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")

                return model, tokenizer, "PEFT"

            except Exception as e:
                print(f"‚ö†Ô∏è PEFT loading failed: {e}")
                print("üîÑ Falling back to pickle model...")

    # Strategy 2: Try model.pkl with tokenizer
    if MODEL_PATH.exists() and TOKENIZER_PATH.exists():
        try:
            print("üîÑ Loading from model.pkl with custom tokenizer...")

            tokenizer = _load_eval_tokenizer(TOKENIZER_PATH)
            model_data = _read_pickled_checkpoint(MODEL_PATH)

            model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float32)
            model.resize_token_embeddings(len(tokenizer))

            if 'state_dict' in model_data:
                if _apply_state_dict(model, model_data['state_dict']):
                    print("‚úÖ State dict loaded from pickle")

            model = _to_eval_device(model)
            print(f"‚úÖ Model loaded from pickle with custom tokenizer on {device}!")
            return model, tokenizer, "Pickle+Tokenizer"

        except Exception as e:
            print(f"‚ö†Ô∏è Pickle+tokenizer loading failed: {e}")

    # Strategy 3: Try model.pkl with default tokenizer
    if MODEL_PATH.exists():
        try:
            print("üîÑ Loading from model.pkl with default GPT-2 tokenizer...")

            tokenizer = _load_eval_tokenizer("gpt2")
            model_data = _read_pickled_checkpoint(MODEL_PATH)

            model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float32)

            if 'state_dict' in model_data:
                try:
                    if _apply_state_dict(model, model_data['state_dict']):
                        print("‚úÖ State dict loaded from pickle (non-strict)")
                except Exception as load_error:
                    # e.g. tensor shape mismatch, which strict=False does not tolerate
                    print(f"‚ö†Ô∏è State dict loading failed: {load_error}")
                    print("Using base GPT-2 model instead")

            model = _to_eval_device(model)
            print(f"‚úÖ Model loaded from pickle with default tokenizer on {device}!")
            return model, tokenizer, "Pickle+Default"

        except Exception as e:
            print(f"‚ö†Ô∏è Pickle loading failed: {e}")

    # Strategy 4: Fallback to base GPT-2
    try:
        print("üîÑ Using base GPT-2 model (not fine-tuned)...")

        tokenizer = _load_eval_tokenizer("gpt2")
        model = _to_eval_device(AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float32))

        print(f"‚ö†Ô∏è Using base GPT-2 model on {device} (not fine-tuned)")
        return model, tokenizer, "Base"

    except Exception as e:
        print(f"‚ùå All loading strategies failed: {e}")
        return None, None, None

# Run the loader once and record whether a usable model/tokenizer pair came back
eval_model, eval_tokenizer, model_type = load_finetuned_model()
MODEL_LOADED = not (eval_model is None or eval_tokenizer is None)

if not MODEL_LOADED:
    print("\n‚ùå Failed to load any model. Please check file availability.")
else:
    print(f"\nüéâ Model successfully loaded for evaluation! (Type: {model_type})")
    if model_type == "Base":
        print("‚ö†Ô∏è Note: Using base GPT-2. Results may not reflect fine-tuning quality.")
    print("üñ•Ô∏è Ready for CPU-based evaluation (generation will be slower)")

In [None]:
# 📊 LOAD TEST DATA FOR EVALUATION
# Load test data independently for evaluation metrics

def load_test_data():
    """Return the saved test samples, or None when none can be read.

    The pickle at TEST_DATA_PATH is preferred; otherwise models/test_data.csv
    under BASE_PATH is read and returned as a list of row dicts. Any error is
    printed and turned into a None result.
    """
    try:
        if TEST_DATA_PATH.exists():
            print("üîÑ Loading test data from pickle...")
            with open(TEST_DATA_PATH, 'rb') as handle:
                samples = pickle.load(handle)
            print(f"‚úÖ Test data loaded: {len(samples)} samples")
            return samples

        print("‚ö†Ô∏è Test data pickle not found. Trying CSV...")
        csv_path = BASE_PATH / "models" / "test_data.csv"
        if not csv_path.exists():
            print("‚ùå No test data found.")
            return None

        frame = pd.read_csv(csv_path)
        print(f"‚úÖ Test data loaded from CSV: {len(frame)} samples")
        return frame.to_dict('records')

    except Exception as e:
        print(f"‚ùå Error loading test data: {e}")
        return None

# Fetch the test samples and remember whether any were found
test_data = load_test_data()
TEST_DATA_LOADED = test_data is not None

if not TEST_DATA_LOADED:
    print("‚ö†Ô∏è Test data not available. Will create sample prompts for evaluation.")
else:
    print(f"üìã Test data summary:")
    print(f"   ‚Ä¢ Total samples: {len(test_data)}")
    if isinstance(test_data, list) and len(test_data) > 0:
        print(f"   ‚Ä¢ Sample keys: {list(test_data[0].keys())}")
        if 'text' in test_data[0]:
            avg_length = np.mean([len(sample['text']) for sample in test_data])
            print(f"   ‚Ä¢ Average text length: {avg_length:.1f} characters")

# Hand-written prompts in the training format; always defined, whether or not
# test data was found
SAMPLE_PROMPTS = [
    f"Ingredients: {ingredients}\nTitle: {dish}\nRecipe:\n"
    for ingredients, dish in (
        ("eggs, flour, sugar, butter, vanilla", "Classic Vanilla Cake"),
        ("chicken breast, garlic, olive oil, salt, pepper", "Garlic Chicken"),
        ("pasta, tomatoes, basil, parmesan, olive oil", "Pasta Marinara"),
        ("salmon, lemon, dill, butter, salt", "Lemon Dill Salmon"),
        ("rice, vegetables, soy sauce, ginger, garlic", "Vegetable Fried Rice"),
    )
]

print(f"\nüìù Sample prompts prepared: {len(SAMPLE_PROMPTS)} prompts ready for generation")

In [None]:
# 🎯 RECIPE GENERATION FUNCTION (CPU Optimized)
# Create a lightweight recipe generation function optimized for CPU

def generate_recipe_cpu(model, tokenizer, prompt, max_length=200, temperature=0.8, top_p=0.9):
    """
    Generate a recipe continuation for *prompt* by nucleus sampling.

    Args:
        model: Loaded causal LM exposing ``parameters()`` and ``generate()``
        tokenizer: Tokenizer matching the model
        prompt: Input prompt (ingredients + title); truncated to 256 tokens
        max_length: Maximum number of NEW tokens generated after the prompt
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter

    Returns:
        The generated text without the prompt. On any failure the error is
        printed and returned as a string starting with "‚ùå Generation error"
        (callers use that prefix to detect failures).
    """
    try:
        print(f"üîÑ Generating recipe (max_length={max_length})...")

        # Tokenize via __call__ so an attention mask is produced as well
        encoded = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True)
        # Follow the model's own device instead of relying on a global
        model_device = next(model.parameters()).device
        input_ids = encoded["input_ids"].to(model_device)
        attention_mask = encoded["attention_mask"].to(model_device)

        # `pad or eos` would wrongly discard a legitimate pad id of 0
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.eos_token_id

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_length,  # same budget as prompt length + max_length
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=pad_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
                no_repeat_ngram_size=2,  # Prevent repetition
            )

        # Decode only the tokens after the prompt. Slicing the decoded string
        # by len(prompt) is wrong when the prompt was truncated or does not
        # round-trip exactly through the tokenizer.
        new_tokens = outputs[0][input_ids.shape[1]:]
        recipe_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        print(f"‚úÖ Generated {len(recipe_text)} characters")
        return recipe_text

    except Exception as e:
        error_msg = f"‚ùå Generation error: {e}"
        print(error_msg)
        return error_msg

# Smoke-test the generator once when a model is available
if not MODEL_LOADED:
    print("‚ö†Ô∏è Skipping generation test - model not loaded")
else:
    print("üß™ Testing recipe generation on CPU...")
    test_prompt = "Ingredients: eggs, flour, milk\nTitle: Simple Pancakes\nRecipe:\n"

    try:
        import time

        start_time = time.time()
        test_recipe = generate_recipe_cpu(eval_model, eval_tokenizer, test_prompt, max_length=150)
        end_time = time.time()
        generation_time = end_time - start_time

        for line in (
            "‚úÖ Generation function working!",
            f"‚è±Ô∏è Generation time: {generation_time:.1f} seconds",
            f"üìù Sample generation:",
            f"Prompt: {test_prompt.strip()}",
            f"Generated: {test_recipe[:100]}...",
        ):
            print(line)

        # Alias kept so later cells can call the generator by a shorter name
        generate_recipe = generate_recipe_cpu

    except Exception as e:
        print(f"‚ùå Generation test failed: {e}")

# Three short prompts used by the quick CPU evaluation
MINIMAL_SAMPLE_PROMPTS = [
    f"Ingredients: {ingredients}\nTitle: {dish}\nRecipe:\n"
    for ingredients, dish in (
        ("eggs, flour, milk", "Simple Pancakes"),
        ("chicken, garlic, salt", "Garlic Chicken"),
        ("pasta, tomato, cheese", "Simple Pasta"),
    )
]

print(f"\nüìù Prepared {len(MINIMAL_SAMPLE_PROMPTS)} minimal test prompts for CPU evaluation")

In [None]:
# 📈 MINIMAL EVALUATION METRICS (CPU Optimized)
# Calculate basic metrics with minimal computational overhead

def calculate_minimal_evaluation(model, tokenizer, num_samples=3):
    """
    Generate from the first *num_samples* minimal prompts and summarise the run.

    Args:
        model: Model passed through to generate_recipe_cpu
        tokenizer: Tokenizer passed through to generate_recipe_cpu
        num_samples: Upper bound on prompts taken from MINIMAL_SAMPLE_PROMPTS

    Returns:
        Dict with total_samples, successful_generations, success_rate,
        avg_generation_length, avg_generation_time, total_time and the raw
        per-sample "results".

    A sample counts as successful when generation neither raised nor returned
    an error message (prefix "‚ùå") and produced more than 10 non-blank
    characters. The same flag drives the success count and both averages, so
    failed samples no longer leak into the average generation time.
    """
    import time

    print("üî¨ Starting minimal CPU evaluation...")
    print(f"üìä Testing with {num_samples} samples for speed")

    results = {
        "generations": [],
        "prompts": [],
        "generation_lengths": [],
        "generation_times": [],
        "success_count": 0
    }
    success_flags = []

    # Use minimal sample prompts
    test_prompts = MINIMAL_SAMPLE_PROMPTS[:num_samples]
    total = len(test_prompts)  # may be smaller than num_samples

    print("üîÑ Generating test samples...")

    for i, prompt in enumerate(test_prompts, start=1):
        try:
            start_time = time.time()
            # Generate with shorter length for CPU efficiency
            generated = generate_recipe_cpu(model, tokenizer, prompt, max_length=100, temperature=0.7)
            generation_time = time.time() - start_time

            length = len(generated)
            succeeded = not generated.startswith("‚ùå") and len(generated.strip()) > 10
            print(f"‚úÖ Sample {i}/{total} completed in {generation_time:.1f}s")

        except Exception as e:
            print(f"‚ùå Error processing sample {i}: {e}")
            generated, length, generation_time, succeeded = f"Error: {e}", 0, 0, False

        results["prompts"].append(prompt)
        results["generations"].append(generated)
        results["generation_lengths"].append(length)
        results["generation_times"].append(generation_time)
        success_flags.append(succeeded)
        if succeeded:
            results["success_count"] += 1

    # Calculate basic statistics
    print("üî¢ Calculating basic metrics...")

    successful_lengths = [l for l, ok in zip(results["generation_lengths"], success_flags) if ok]
    successful_times = [t for t, ok in zip(results["generation_times"], success_flags) if ok]

    evaluation_report = {
        "total_samples": len(results["generations"]),
        "successful_generations": results["success_count"],
        "success_rate": results["success_count"] / len(results["generations"]) if results["generations"] else 0,
        "avg_generation_length": np.mean(successful_lengths) if successful_lengths else 0,
        "avg_generation_time": np.mean(successful_times) if successful_times else 0,
        "total_time": sum(results["generation_times"]),
        "results": results
    }

    return evaluation_report

# Run the quick evaluation and print its report; EVALUATION_COMPLETED records
# whether it ran to the end
if not MODEL_LOADED:
    print("‚ö†Ô∏è Skipping evaluation - model not loaded")
    EVALUATION_COMPLETED = False
else:
    try:
        print("üöÄ Running minimal evaluation (CPU optimized)...")
        evaluation_results = calculate_minimal_evaluation(eval_model, eval_tokenizer, num_samples=3)

        divider = "="*50
        print("\n" + divider)
        print("üìã MINIMAL EVALUATION REPORT")
        print(divider)
        print(f"üìä Samples tested: {evaluation_results['total_samples']}")
        print(f"‚úÖ Successful generations: {evaluation_results['successful_generations']}")
        print(f"üìà Success rate: {evaluation_results['success_rate']:.1%}")
        print(f"üìè Avg generation length: {evaluation_results['avg_generation_length']:.0f} chars")
        print(f"‚è±Ô∏è Avg generation time: {evaluation_results['avg_generation_time']:.1f} seconds")
        print(f"‚è±Ô∏è Total evaluation time: {evaluation_results['total_time']:.1f} seconds")
        print(divider)

        # Echo each prompt with the first 150 characters of its generation
        print("\nüìù SAMPLE OUTPUTS:")
        raw = evaluation_results['results']
        for number, (prompt, generated) in enumerate(zip(raw['prompts'], raw['generations']), start=1):
            print(f"\nüî∏ Sample {number}:")
            print(f"Prompt: {prompt.strip()}")
            print(f"Generated: {generated[:150]}...")

        EVALUATION_COMPLETED = True

    except Exception as e:
        print(f"‚ùå Minimal evaluation failed: {e}")
        EVALUATION_COMPLETED = False

In [None]:
# 🎨 MINIMAL GENERATION SHOWCASE (CPU Optimized)
# Generate sample recipes with minimal computational overhead

def minimal_showcase_generation(*, max_length: int = 120) -> list:
    """Generate a small showcase of recipes, one per prompt and creativity level.

    Every showcase prompt is run once per creativity level through
    ``generate_recipe_cpu`` using the module-level ``eval_model`` and
    ``eval_tokenizer``. A 100-character preview of each result is printed
    together with the time the call took.

    Args:
        max_length: Forwarded unchanged as ``max_length`` to
            ``generate_recipe_cpu``. Defaults to 120, the value that was
            previously hard-coded (kept short for CPU runs).

    Returns:
        A list with one dict per successful generation, holding the keys
        ``"prompt"``, ``"creativity_level"``, ``"generated"`` and
        ``"generation_time"`` (seconds). Combinations that raise are
        reported on stdout and left out of the list.
    """
    # Imported once per call instead of on every inner-loop iteration.
    import time

    print("üé® MINIMAL RECIPE SHOWCASE (CPU Optimized)")
    print("="*50)

    # Reduced showcase prompts
    showcase_prompts = [
        {
            "prompt": "Ingredients: chicken, garlic, herbs\nTitle: Herb Chicken\nRecipe:\n",
            "description": "üçó Simple Chicken Dish",
        },
        {
            "prompt": "Ingredients: eggs, flour, sugar\nTitle: Basic Cake\nRecipe:\n",
            "description": "? Easy Dessert",
        },
    ]

    # Simplified creativity levels (sampling temperature / nucleus top_p)
    creativity_levels = [
        {"temp": 0.6, "top_p": 0.8, "name": "Conservative"},
        {"temp": 0.9, "top_p": 0.9, "name": "Creative"},
    ]

    showcase_results = []

    for prompt_info in showcase_prompts:
        print(f"\n{prompt_info['description']}")
        print("-" * 30)

        for creativity in creativity_levels:
            try:
                # perf_counter is monotonic, so the measured duration cannot
                # be skewed by wall-clock adjustments the way time.time() can.
                start_time = time.perf_counter()

                recipe = generate_recipe_cpu(
                    eval_model,
                    eval_tokenizer,
                    prompt_info['prompt'],
                    max_length=max_length,
                    temperature=creativity['temp'],
                    top_p=creativity['top_p']
                )

                elapsed = time.perf_counter() - start_time

                print(f"üìù {creativity['name']} ({elapsed:.1f}s):")
                print(f"   {recipe[:100]}...")
                print()

                showcase_results.append({
                    "prompt": prompt_info['prompt'],
                    "creativity_level": creativity['name'],
                    "generated": recipe,
                    "generation_time": elapsed,
                })

            except Exception as e:
                # Deliberately best-effort: one failed combination must not
                # abort the remaining showcase entries.
                print(f"‚ùå Generation failed for {creativity['name']}: {e}")

    return showcase_results

# Generate minimal showcase if model is loaded; SHOWCASE_COMPLETED records
# whether at least one example was produced (read later by the final report).
if MODEL_LOADED:
    try:
        showcase_data = minimal_showcase_generation()
        print(f"\n‚úÖ Generated {len(showcase_data)} showcase examples")

        # Calculate average generation time. Skipped when nothing was
        # generated: np.mean([]) would print "nan" and emit a RuntimeWarning.
        generation_times = [item['generation_time'] for item in showcase_data if 'generation_time' in item]
        if generation_times:
            avg_time = np.mean(generation_times)
            print(f"‚è±Ô∏è Average generation time: {avg_time:.1f} seconds")

        # Individual generation failures are swallowed inside
        # minimal_showcase_generation, so an empty list means every attempt
        # failed; do not report that as a completed showcase.
        SHOWCASE_COMPLETED = bool(showcase_data)
    except Exception as e:
        print(f"‚ùå Showcase generation failed: {e}")
        SHOWCASE_COMPLETED = False
else:
    print("‚ö†Ô∏è Skipping showcase - model not loaded")
    SHOWCASE_COMPLETED = False

In [None]:
# 📊 FINAL EVALUATION REPORT (CPU Optimized)
# Create a concise final report optimized for local CPU evaluation

def _print_model_summary(model_loaded):
    """Print the model section: parameter count, type and device, or a not-loaded notice."""
    print("ü§ñ MODEL PERFORMANCE SUMMARY")
    print("-" * 30)

    if model_loaded:
        try:
            total_params = sum(p.numel() for p in eval_model.parameters())
            print(f"üìà Model: GPT-2 ({model_type})")
            print(f"üìà Total Parameters: {total_params:,}")
            print(f"üìà Device: {device}")
            print("üìà Status: ‚úÖ Loaded Successfully")
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit. Ordinary errors (e.g. `model_type` undefined)
            # still fall back to the reduced summary.
            print("üìà Model: Loaded but parameter count unavailable")
            print(f"üìà Type: {model_type if 'model_type' in globals() else 'Unknown'}")
            print(f"üìà Device: {device}")
    else:
        print("üìà Status: ‚ùå Not Loaded")

    print()


def _print_evaluation_results(evaluation_completed):
    """Print the metrics stored in the global ``evaluation_results``, if the evaluation completed."""
    print("üìä EVALUATION RESULTS")
    print("-" * 30)

    results = globals().get('evaluation_results') if evaluation_completed else None
    if results is not None:
        print(f"üìè Samples Tested: {results['total_samples']}")
        print(f"‚úÖ Success Rate: {results['success_rate']:.1%}")
        print(f"üìè Avg Length: {results['avg_generation_length']:.0f} chars")
        print(f"‚è±Ô∏è Avg Time: {results['avg_generation_time']:.1f} seconds")
        print(f"‚è±Ô∏è Total Time: {results['total_time']:.1f} seconds")

        # Performance assessment, bucketed by average seconds per generation
        avg_time = results['avg_generation_time']
        if avg_time < 5:
            print("? Performance: Good (< 5s per generation)")
        elif avg_time < 15:
            print("‚ö†Ô∏è Performance: Acceptable (5-15s per generation)")
        else:
            print("üêå Performance: Slow (> 15s per generation)")
    else:
        print("‚ö†Ô∏è Evaluation metrics not available")

    print()


def _print_generation_quality(showcase_completed):
    """Print length statistics for the global ``showcase_data``, if the showcase completed."""
    print("üé® GENERATION QUALITY")
    print("-" * 30)

    showcase = globals().get('showcase_data') if showcase_completed else None
    if showcase is not None:
        print(f"‚úÖ Showcase: {len(showcase)} examples generated")

        # Lengths of every entry that actually carries generated text
        lengths = [len(item['generated']) for item in showcase if 'generated' in item]
        if lengths:
            print(f"üìè Avg Recipe Length: {sum(lengths) / len(lengths):.0f} characters")

        # Quality indicator: share of showcase entries longer than 50 chars
        long_enough = sum(length > 50 for length in lengths)
        quality_score = long_enough / len(showcase) if showcase else 0
        print(f"‚úÖ Quality Score: {quality_score:.1%} (recipes > 50 chars)")
    else:
        print("‚ö†Ô∏è Generation showcase not available")

    print()


def _print_system_assessment(model_loaded, evaluation_completed, showcase_completed):
    """Print the four pass/fail checks, the overall score and the verdict."""
    print("üèÜ SYSTEM ASSESSMENT")
    print("-" * 30)

    assessment_score = 0
    max_score = 4

    stage_checks = [
        (model_loaded, "‚úÖ Model Loading: SUCCESS", "‚ùå Model Loading: FAILED"),
        (evaluation_completed, "‚úÖ Basic Evaluation: COMPLETED", "‚ùå Basic Evaluation: FAILED"),
        (showcase_completed, "‚úÖ Generation Test: COMPLETED", "‚ùå Generation Test: FAILED"),
    ]
    for passed, success_line, failure_line in stage_checks:
        if passed:
            assessment_score += 1
            print(success_line)
        else:
            print(failure_line)

    # Check if it's actually generating reasonable content. Only trust
    # `evaluation_results` when the evaluation cell finished: a leftover value
    # from an earlier or partially failed run must not earn the quality point.
    results = globals().get('evaluation_results')
    if (evaluation_completed and results is not None
            and results.get('success_rate', 0) > 0.5):
        assessment_score += 1
        print("‚úÖ Generation Quality: ACCEPTABLE")
    else:
        print("‚ö†Ô∏è Generation Quality: NEEDS IMPROVEMENT")

    print()
    print(f"üéØ Overall Score: {assessment_score}/{max_score} ({100*assessment_score/max_score:.0f}%)")

    if assessment_score >= 3:
        print("üéâ GOOD: System is working well for local CPU evaluation!")
        print("üí° Tip: Generation is slower on CPU but functional")
    elif assessment_score >= 2:
        print("üëç FAIR: Basic functionality working")
        print("üí° Tip: Some features may need attention")
    else:
        print("‚ùå POOR: System needs significant fixes")
        print("üîß Check model files and dependencies")


def create_minimal_evaluation_report():
    """Print a concise final report for the CPU-based evaluation run.

    The report has four sections: model summary, evaluation metrics,
    generation quality and an overall 0-4 system assessment, followed by
    static CPU optimization tips. All input comes from module-level state
    set by earlier cells: the flags ``MODEL_LOADED``,
    ``EVALUATION_COMPLETED`` and ``SHOWCASE_COMPLETED``, and the data in
    ``evaluation_results`` and ``showcase_data``.

    A flag that is not defined (its cell was never run) is treated as
    False, so the report can always be produced instead of raising
    NameError.

    Returns:
        None. The report is written to stdout only.
    """
    module_state = globals()
    model_loaded = bool(module_state.get('MODEL_LOADED', False))
    evaluation_completed = bool(module_state.get('EVALUATION_COMPLETED', False))
    showcase_completed = bool(module_state.get('SHOWCASE_COMPLETED', False))

    print("üìä FINAL EVALUATION REPORT (CPU Mode)")
    print("="*60)

    _print_model_summary(model_loaded)
    _print_evaluation_results(evaluation_completed)
    _print_generation_quality(showcase_completed)
    _print_system_assessment(model_loaded, evaluation_completed, showcase_completed)

    # Performance recommendations
    print("\nüí° CPU OPTIMIZATION TIPS:")
    print("‚Ä¢ Reduce max_length for faster generation")
    print("‚Ä¢ Use temperature 0.7-0.8 for good quality/speed balance")
    print("‚Ä¢ Consider batch processing for multiple recipes")
    print("‚Ä¢ GPU would significantly improve performance")

    print("\n" + "="*60)
    print("üìã CPU EVALUATION COMPLETED")
    print("="*60)

# Generate final report: everything is printed to stdout. The function has no
# return value, so this trailing call adds no extra cell output.
create_minimal_evaluation_report()