# Phase 3: Classify 404 Canonical Poems

Uses Mistral-7B-Instruct-v0.2, with 5 examples sampled from your 52 gold-standard poems as the few-shot prompt.

**Runtime**: ~30-60 minutes on Colab GPU

**Output**: CSV with all 28 classification columns

In [None]:
# Install dependencies
!pip install transformers torch accelerate pandas

In [None]:
# Mount Google Drive.
# The classification loop and the final-save cell both write CSVs under
# /content/drive/MyDrive, so run this cell before them.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Upload your files. The data-loading cell reads them by these exact names,
# so do not rename them:
# 1. gold_standard_52_poems_with_narrative_level.csv
# 2. 448_poems_to_classify.csv
# `files` is used again at the end of the notebook to download the results.
from google.colab import files
uploaded = files.upload()

In [None]:
import csv
import json
import re

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Report whether CUDA is usable and, if so, which device sits at index 0.
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else 'None'
print(f"GPU available: {has_gpu}")
print(f"GPU name: {gpu_name}")

In [None]:
# Load Mistral-7B-Instruct (ungated, no access request needed)
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# float16 weights; device_map="auto" lets accelerate decide weight placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Text-generation pipeline shared by every classification call.
# do_sample=True is required for temperature/top_p to take effect; without it
# generation is greedy and both settings are ignored.
# NOTE(review): sampling makes outputs non-deterministic between runs. If
# reproducible labels matter more than the low-temperature sampling, drop
# temperature/top_p and set do_sample=False instead.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,  # upper bound on generated tokens per poem
    do_sample=True,
    temperature=0.3,
    top_p=0.9,
)

print("✓ Mistral-7B-Instruct loaded")

In [None]:
# Read the hand-labelled gold-standard table and report its size
gold_df = pd.read_csv('gold_standard_52_poems_with_narrative_level.csv')
gold_count = len(gold_df)
print(f"Loaded {gold_count} gold-standard poems")

# Read the table of poems that still need labels and report its size
classify_df = pd.read_csv('448_poems_to_classify.csv')
pending_count = len(classify_df)
print(f"Loaded {pending_count} poems to classify")

# Draw 5 gold-standard rows at random for the few-shot prompt;
# the fixed seed keeps the same rows on every run
few_shot_examples = gold_df.sample(n=5, random_state=42)

In [None]:
# Build few-shot prompt
def build_classification_prompt(poem_title, poem_author, poem_year, few_shot_examples):
    """Assemble the few-shot classification prompt for a single poem.

    Args:
        poem_title: Title of the poem to classify.
        poem_author: Author of the poem to classify.
        poem_year: Year of the poem to classify (interpolated as-is).
        few_shot_examples: DataFrame of labelled poems. Each row must provide
            the columns title, author, year_approx, period, literary_movement,
            register, rhetorical_genre, mode, narrative_level and meter.

    Returns:
        The prompt string: an instruction header, one numbered block per
        example row, the list of JSON fields to return, and the target poem's
        title/author/year followed by a trailing "JSON:" cue.

    Raises:
        KeyError: if an example row lacks one of the columns listed above.
    """
    # State the real number of examples instead of assuming there are always 5.
    prompt = f"""You are a literary scholar classifying English poetry across 28 metadata dimensions.

Here are {len(few_shot_examples)} example classifications:

"""

    # Number examples by position (1..N). The DataFrame index label is not
    # usable here: after .sample() it is an arbitrary row label, which would
    # produce headings such as "EXAMPLE 38".
    for example_number, (_, row) in enumerate(few_shot_examples.iterrows(), start=1):
        prompt += f"""EXAMPLE {example_number}:
Title: {row['title']}
Author: {row['author']}
Year: {row['year_approx']}
Period: {row['period']}
Literary Movement: {row['literary_movement']}
Register: {row['register']}
Rhetorical Genre: {row['rhetorical_genre']}
Mode: {row['mode']}
Narrative Level: {row['narrative_level']}
Meter: {row['meter']}

"""

    prompt += f"""Now classify this poem using the same schema. Return ONLY a JSON object with these fields:
period, literary_movement, register, rhetorical_genre, discursive_structure, discourse_type, 
narrative_level, diegetic_mimetic, focalization, person, deictic_orientation, addressee_type, 
deictic_object, temporal_orientation, temporal_structure, tradition, mode, genre, 
stanza_structure, meter, rhyme

POEM TO CLASSIFY:
Title: {poem_title}
Author: {poem_author}
Year: {poem_year}

JSON:
"""
    return prompt

In [None]:
# Classify all poems
def _parse_classification(response):
    """Turn the model's raw text into a dict of classification fields.

    Tries to parse the whole response as JSON first. If that fails, falls back
    to the first brace-delimited span in the text (flat objects only; nested
    braces are not matched). Returns {} when no such span exists.

    Raises:
        json.JSONDecodeError: if the extracted span is not valid JSON.
        ValueError: if the parsed JSON is not an object.
    """
    try:
        parsed = json.loads(response)
    except json.JSONDecodeError:
        json_match = re.search(r'\{[^}]+\}', response, re.DOTALL)
        if json_match is None:
            return {}
        parsed = json.loads(json_match.group())
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


results = []
checkpoint_interval = 50
# Identity columns always come from the input table, never from the model.
identity_columns = ('title', 'author', 'year_approx')

# `position` counts poems processed (1-based). It drives checkpointing so the
# schedule does not depend on the DataFrame's index labels.
for position, (idx, row) in enumerate(
    tqdm(classify_df.iterrows(), total=len(classify_df)), start=1
):
    try:
        prompt = build_classification_prompt(
            row['title'],
            row['author'],
            row['year_approx'],
            few_shot_examples
        )

        # Generate classification; the pipeline returns prompt + completion,
        # so strip the prompt prefix to keep only the new text.
        output = pipe(prompt)
        response = output[0]['generated_text'][len(prompt):].strip()

        classification = _parse_classification(response)

        # Add poem info. Model keys that collide with the identity columns
        # are dropped so they cannot overwrite the poem's real metadata.
        result = {
            'title': row['title'],
            'author': row['author'],
            'year_approx': row['year_approx'],
            **{k: v for k, v in classification.items() if k not in identity_columns}
        }
        results.append(result)

    except Exception as e:
        # Best-effort run: record the failure and continue with the next poem.
        print(f"Error on poem {idx}: {row['title']} - {e}")
        results.append({
            'title': row['title'],
            'author': row['author'],
            'year_approx': row.get('year_approx'),
            'error': str(e),
        })

    # Checkpoint every 50 poems (save to both temp and Google Drive).
    # Kept outside the try above so a failed write cannot add a duplicate
    # error row for a poem that was already recorded.
    if position % checkpoint_interval == 0:
        checkpoint_df = pd.DataFrame(results)
        try:
            # Save to temp (for download)
            checkpoint_df.to_csv(f'checkpoint_{position}.csv', index=False)
            # Save to Google Drive (persistent - survives runtime disconnects)
            checkpoint_df.to_csv(f'/content/drive/MyDrive/checkpoint_{position}.csv', index=False)
            print(f"✓ Checkpoint saved: {position} poems classified (saved to Drive)")
        except OSError as e:
            print(f"Checkpoint at {position} poems failed: {e}")

print("\n✓ Classification complete!")

In [None]:
# Collect every per-poem record into one table
results_df = pd.DataFrame(results)
row_count = len(results_df)

# Local copy in the working directory
results_df.to_csv('404_poems_classified.csv', index=False)
print(f"✓ Saved {row_count} classifications to 404_poems_classified.csv")

# Second copy on Google Drive
results_df.to_csv('/content/drive/MyDrive/404_poems_classified.csv', index=False)
print("✓ Saved to Google Drive")

# Hand the local copy to the browser as a download
files.download('404_poems_classified.csv')

In [None]:
# Quality check
# A poem counts as successful when the model returned a `period` value.
# reindex() adds any summary column that no row produced as all-NaN, so this
# cell reports the failures instead of raising KeyError when parsing failed
# for every poem.
summary_columns = ['title', 'author', 'period', 'register', 'mode']
summary_df = results_df.reindex(columns=summary_columns)
has_period = summary_df['period'].notna()

print("\nQuality Check:")
print(f"Total poems: {len(results_df)}")
print(f"Successful: {has_period.sum()}")
print(f"Errors: {(~has_period).sum()}")
print("\nSample results:")
print(summary_df.head(10))