In [1]:
# imports
import os
import json
import time
import concurrent.futures
from pathlib import Path
import threading
import requests

import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from ai_tools.tools import LLMQuery
from pokemon_tools import PokemonAPIClient 

In [2]:
# System prompt handed to LLMQuery for every request made by generate_pokemon_data.
# It instructs the model to turn the raw Pokémon JSON supplied in the user prompt into
# a Markdown profile with three fixed sections (Identity & Lore, Combat Profile,
# Evolution & Biology) intended for vector-search (RAG) retrieval.
# The text is runtime data: any edit changes what the model is asked to do.
SYSTEM_PROMPT = """You are an expert Pokémon Data Analyst and Lorekeeper. Your task is to process raw JSON data into a **semantically dense, natural language profile** optimized for Vector Search retrieval (RAG).

### Core Instruction:
Use the provided JSON as the **primary source** for facts (Stats, Moves, Evolution), but use your **World Knowledge** to expand on behavior, competitive viability, and lore. The goal is to maximize the number of relevant searchable keywords (moves, abilities, behaviors) in the output.

### STRICT Output Constraints:
1.  **NO Conversational Filler:** Start immediately with the Markdown headers. Do not say "Here is the profile."
2.  **Naturalize the ID:** Include the Pokémon's ID (e.g., #006) naturally in the first sentence of the Identity section.
3.  **Omit Empty Data:** If a field is empty (e.g., `held_items` or `forms`), skip it entirely.
4.  **Format:** Use Markdown headers (`##`) for sections.

### Processing Guidelines:

**1. Identity & Lore (Expand this)**
* **Variant Detection:** Check the `name` field. If it contains suffixes like `-alola`, `-galar`, `-hisui`, or `-paldea`, explicitly name it as the **[Region]n Form** (e.g., convert `vulpix-alola` to "Alolan Vulpix").
* **Bio Synthesis:** Combine `genus`, `flavor_text`, `habitat`, `generation`, and `color` into a cohesive summary.
* **Form Potential (World Knowledge):** Explicitly mention if this Pokémon has **Mega Evolutions**, **Gigantamax forms**, or **Paradox forms**, even if they are not listed in the JSON.
* **Context:** Mention the specific region associated with the generation (e.g., "Generation I (Kanto)").

**2. Combat Profile (Detailed Analysis)**
* **Stat Block:** You **MUST** list the specific Base Stats in a list format using these standard abbreviations: **HP, Attack, Defense, Sp. Atk, Sp. Def, Speed**.
* **Archetype:** Analyze the stats to assign a competitive role (e.g., "Fast Special Sweeper," "Physical Tank," "Mixed Attacker").
* **Type Matchups:**
    * Explicitly list **Weaknesses** and **Resistances**.
    * **Logic Check:** If a Pokémon has a double weakness (e.g., Fire/Flying vs. Rock), explicitly label it as a **"Double Weakness (4x)."**
* **Smart Move Selection:** Select 4 specific moves that align with the Pokémon's **highest offensive stat**. (e.g., If Sp. Atk > Attack, list *Flamethrower*, not *Fire Punch*). Include Utility moves (Recover, Roost) if relevant.
* **Abilities:** Name the abilities and briefly explain their tactical effect.

**3. Evolution & Acquisition**
* **Detailed Chain:** Narrate the full evolutionary line.
* **Triggers:** You must specify the **method** found in `evolution_details` (e.g., "when exposed to a Thunder Stone," "leveled up with High Friendship," "at Level 36").
* **Breeding:** Mention Egg Groups and Hatch Counters if relevant.

---

### Required Output Structure:

**## Identity & Lore**
[Name] (Pokedex #[ID]) is the [Genus]... [Detailed Description of appearance, habitat, and lore]... [Mention of Mega/G-Max forms if applicable]...

**## Combat Profile**
* **Stats:** HP [Val], Attack [Val], Defense [Val], Sp. Atk [Val], Sp. Def [Val], Speed [Val].
* **Archetype:** [Role Name] (e.g., Fast Special Sweeper).
* **Type Matchups:** It is **weak to [Types]** (mentioning any 4x weaknesses) and **resists [Types]**.
* **Move Selection:** Capitalizing on its stats, key moves include **[Move 1], [Move 2], [Move 3], and [Move 4]**.
* **Abilities:** [Ability Name] ([Effect description])...

**## Evolution & Biology**
[Name] is part of a [Number]-stage evolutionary line. It evolves from [Pre-evo] when [Condition]... It belongs to the [Egg Group] group...

---
**Input Data:**
[JSON Data provided in the user prompt]
"""

In [None]:
# On-disk cache of the Pokemon name list (relative to the notebook's working directory).
LIST_CACHE_FILE = Path("pokemon_list.json")


def get_all_pokemon(timeout=30):
    """Return the names of all Pokemon, preferring the local JSON cache.

    The cache file (``LIST_CACHE_FILE``) is used when it exists and holds a JSON
    list. Otherwise the list is fetched from PokeAPI and the cache is rewritten.

    Args:
        timeout: Seconds to wait for the PokeAPI request before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        A list of Pokemon names, or an empty list if the cache is unusable
        and the API request fails.
    """
    if LIST_CACHE_FILE.exists():
        print("Loading Pokemon list from cache...")
        try:
            with open(LIST_CACHE_FILE, "r", encoding="utf-8") as f:
                cached = json.load(f)
            # Only trust the cache when it has the shape we write below; anything
            # else (e.g. a dict left by another tool) falls through to a refetch.
            if isinstance(cached, list):
                return cached
            print("Error reading cache, fetching fresh list. Error: cache does not contain a list")
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error reading cache, fetching fresh list. Error: {e}")

    print("Fetching list of all Pokemon from API...")
    url = "https://pokeapi.co/api/v2/pokemon?limit=10000"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        pokemon_list = [p["name"] for p in data["results"]]
    except Exception as e:
        print(f"Error fetching Pokemon list: {e}")
        return []

    # Save to cache. A failed write must not discard the list we just fetched.
    try:
        with open(LIST_CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(pokemon_list, f)
        print(f"Saved {len(pokemon_list)} Pokemon to {LIST_CACHE_FILE}")
    except OSError as e:
        print(f"Could not write cache file {LIST_CACHE_FILE}: {e}")
    return pokemon_list

# Show how many Pokemon names get_all_pokemon() returns (from the cache file or the API).
# NOTE(review): the saved output for this cell shows the full name list rather than a
# count, so it looks stale relative to this expression — re-run before sharing.
len(get_all_pokemon())

Loading Pokemon list from cache...


['bulbasaur',
 'ivysaur',
 'venusaur',
 'charmander',
 'charmeleon',
 'charizard',
 'squirtle',
 'wartortle',
 'blastoise',
 'caterpie',
 'metapod',
 'butterfree',
 'weedle',
 'kakuna',
 'beedrill',
 'pidgey',
 'pidgeotto',
 'pidgeot',
 'rattata',
 'raticate',
 'spearow',
 'fearow',
 'ekans',
 'arbok',
 'pikachu',
 'raichu',
 'sandshrew',
 'sandslash',
 'nidoran-f',
 'nidorina',
 'nidoqueen',
 'nidoran-m',
 'nidorino',
 'nidoking',
 'clefairy',
 'clefable',
 'vulpix',
 'ninetales',
 'jigglypuff',
 'wigglytuff',
 'zubat',
 'golbat',
 'oddish',
 'gloom',
 'vileplume',
 'paras',
 'parasect',
 'venonat',
 'venomoth',
 'diglett',
 'dugtrio',
 'meowth',
 'persian',
 'psyduck',
 'golduck',
 'mankey',
 'primeape',
 'growlithe',
 'arcanine',
 'poliwag',
 'poliwhirl',
 'poliwrath',
 'abra',
 'kadabra',
 'alakazam',
 'machop',
 'machoke',
 'machamp',
 'bellsprout',
 'weepinbell',
 'victreebel',
 'tentacool',
 'tentacruel',
 'geodude',
 'graveler',
 'golem',
 'ponyta',
 'rapidash',
 'slowpoke',
 '

In [4]:


# Configuration
# Directory that receives one <id>_<name>.json and one <id>_<name>.md per Pokemon.
OUTPUT_DIR = Path("raw")
# Created eagerly so worker threads can write without checking for the directory.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Model identifier passed to LLMQuery.
MODEL_NAME = "gemini-3-flash-preview"
# Size of the ThreadPoolExecutor used in the main execution block.
MAX_WORKERS = 10
# Number of failed Pokemon after which the whole run is aborted.
ABORT_LIMIT = 20

# Global state
# Running count of failed Pokemon; only modified while holding error_lock.
error_count = 0
# Guards error_count, which is shared by all worker threads.
error_lock = threading.Lock()
# Set once error_count reaches ABORT_LIMIT; workers check it on entry and return early.
abort_event = threading.Event()



def _query_llm_with_retry(llm_client, user_prompt, attempts=3):
    """Query the LLM until it returns a non-empty response or attempts run out.

    Both raised exceptions and empty/None responses count as failed attempts and
    are followed by an exponential backoff (1s, 2s, ...). After the final failed
    attempt the last error is raised, so callers never receive an empty response.
    """
    last_error = None
    for attempt in range(attempts):
        try:
            response = llm_client.query(user_prompt=user_prompt, use_history=False)
            if response:
                return response
            last_error = ValueError("LLM returned an empty response")
        except Exception as e:
            last_error = e
        if attempt < attempts - 1:
            time.sleep(2 ** attempt)
    raise last_error


def generate_pokemon_data(pokemon_name):
    """Fetch data for one Pokemon, ask the LLM for a profile, and save both to disk.

    Writes ``<id>_<name>.json`` (the raw context) and ``<id>_<name>.md`` (the LLM
    output) into ``OUTPUT_DIR``. Nothing is written unless the LLM produced a
    non-empty response, so a failed run never leaves an empty ``.md`` behind that
    would make later runs skip this Pokemon.

    Args:
        pokemon_name: PokeAPI name of the Pokemon (e.g. ``"pikachu"``).

    Returns:
        None. Failures are counted in the global ``error_count``; once it reaches
        ``ABORT_LIMIT`` the global ``abort_event`` is set and later calls return
        immediately.
    """
    global error_count

    if abort_event.is_set():
        return

    # Optimistic existing file check: a previous run already produced the profile.
    if any(OUTPUT_DIR.glob(f"*_{pokemon_name}.md")):
        return

    # One client pair per call, so no client object is shared between worker threads.
    pokemon_client = PokemonAPIClient(enable_cache=False)
    llm_client = LLMQuery(model=MODEL_NAME, system_prompt=SYSTEM_PROMPT)

    # Short pause before each Pokemon; presumably to ease load on the APIs.
    time.sleep(0.5)

    try:
        print(f"Processing {pokemon_name}...")
        details = pokemon_client.get_pokemon_details(pokemon_name)
        if "error" in details:
            raise RuntimeError(f"API Error: {details.get('error')}")

        # Type lookups that report an error are skipped rather than failing the Pokemon.
        type_info = []
        for t in details.get("types", []):
            t_info = pokemon_client.get_type_info(t)
            if "error" not in t_info:
                type_info.append(t_info)

        context = {"pokemon_details": details, "type_info": type_info}
        user_prompt = f"Here is the data for {pokemon_name}:\n```json\n{json.dumps(context, indent=2)}\n```"

        # Raises if every attempt fails or returns nothing, before any file is opened.
        response = _query_llm_with_retry(llm_client, user_prompt)

        p_id = details.get("id", 0)
        file_prefix = f"{p_id:04d}_{pokemon_name}"

        with open(OUTPUT_DIR / f"{file_prefix}.json", "w", encoding="utf-8") as f:
            json.dump(context, f, indent=2)
        with open(OUTPUT_DIR / f"{file_prefix}.md", "w", encoding="utf-8") as f:
            f.write(response)

        print(f"Saved {file_prefix}.md")

    except Exception as e:
        with error_lock:
            error_count += 1
            print(f"Error processing {pokemon_name}: {e} (Count: {error_count})")
            if error_count >= ABORT_LIMIT:
                print("Aborting generation due to error limit.")
                abort_event.set()

# Main Execution
# Runs generate_pokemon_data for every name on a thread pool. Per-Pokemon failures are
# handled inside the worker; only unexpected errors and Ctrl-C reach the handlers here.
try:
    POKEMON_LIST = get_all_pokemon()
    if POKEMON_LIST:
        print(f"Starting generation for {len(POKEMON_LIST)} Pokemon with {MAX_WORKERS} threads...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            try:
                # list() drains the iterator so worker exceptions surface here.
                list(executor.map(generate_pokemon_data, POKEMON_LIST))
            except BaseException:
                # Leaving the `with` block waits for every queued task. Setting the
                # abort flag makes the remaining tasks return immediately, so an
                # interrupt or fatal error actually stops the run.
                abort_event.set()
                raise
    else:
        print("No Pokemon found to process.")
    if abort_event.is_set():
        # The workers hit ABORT_LIMIT; do not report the run as complete.
        print("Generation aborted before completion.")
    else:
        print("Generation complete.")
except KeyboardInterrupt:
    print("Generation interrupted.")
except Exception as e:
    print(f"Fatal error: {e}")

Loading Pokemon list from cache...
Starting generation for 1350 Pokemon with 10 threads...
Processing ralts...
Processing stoutland...
Processing purrloin...
Processing liepard...
Processing pansage...
Processing simisage...
Processing pansear...
Processing panpour...
Processing simipour...
Processing simisear...
Saved 0509_purrloin.md
Saved 0510_liepard.md
Processing munna...
Processing musharna...
Saved 0516_simipour.md
Saved 0280_ralts.md
Saved 0508_stoutland.md
Processing pidove...
Processing tranquill...
Processing unfezant...
Saved 0515_panpour.md
Saved 0514_simisear.md
Processing blitzle...
Processing zebstrika...
Saved 0512_simisage.md
Saved 0511_pansage.md
Processing roggenrola...
Processing boldore...
Saved 0513_pansear.md
Processing gigalith...
Saved 0517_munna.md
Processing woobat...
Saved 0521_unfezant.md
Processing swoobat...
Saved 0518_musharna.md
Saved 0523_zebstrika.md
Saved 0519_pidove.md
Processing drilbur...
Saved 0520_tranquill.md
Processing excadrill...
Processing

In [4]:
# NOTE(review): `llm_client` is only ever assigned as a local variable inside
# generate_pokemon_data, so the name does not exist at notebook scope and this cell
# raises NameError (as its saved output shows). Looks like leftover debugging —
# consider removing it so the notebook survives Restart & Run All.
llm_client.chat_history

NameError: name 'llm_client' is not defined