In [None]:
%%capture
# Dependency installation. `%%capture` (which must stay on the first line of the
# cell) hides all pip output produced by this cell.
import os, re
# Colab is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Extract the "major.minor" part of the installed torch version.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    # Choose the xformers pin from the torch version: 2.9 and 2.8 have their own
    # pins, every other version falls back to 0.0.29.post3.
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps installs only the named packages; presumably this protects the
    # preinstalled torch stack from being replaced -- TODO confirm.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The two pins below run in both branches (Colab and non-Colab).
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [None]:
from unsloth import FastLanguageModel
from transformers import TextStreamer
import pandas as pd
import json
import torch
import re
from tqdm import tqdm

# Configuration
max_seq_length = 4096
dtype = None # None for auto detection
load_in_4bit = True # Use 4bit quantization to reduce memory usage
BATCH_SIZE = 48  # number of prompts sent to model.generate() per batch
SEED = 3407  # arbitrary fixed value; change it to draw a different sample

# The generation cells pass temperature/top_p/top_k to model.generate(), i.e.
# decoding is stochastic. Seeding torch's RNGs up front is intended to make a
# Restart & Run All repeatable on the same hardware/software stack.
torch.manual_seed(SEED)

# Load the 4-bit Qwen3-14B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

print("Model loaded successfully!")

In [None]:
# Source corpus read by the data-loading cell.
INPUT_CSV = "steam_learning_corpus_full_context.csv"

# Artefacts written by the save cell (same annotations, two formats).
OUTPUT_CSV = "annotated_learning_corpus_final.csv"
OUTPUT_JSON = "annotated_learning_corpus.json"

In [None]:
# 3. PROMPT TEMPLATES (ROUTING LOGIC)
#
# Both system prompts share the role, input-format and guideline 1-2 text; they
# differ only in the optional perception guideline and in the output schema.
# They are assembled from shared pieces so the two variants cannot drift apart.
# Plain (non-f) strings are used because nothing is interpolated, which also
# removes the need to escape the JSON braces.

# Shared opening: role, input format, guidelines 1 and 2. Ends with a newline.
_PROMPT_SHARED_GUIDELINES = """<|im_start|>system
# Role
You are an expert qualitative data analyst specializing in game studies and educational psychology. Your task is to analyze a video game review to determine if a specific highlighted verb indicates a genuine instance of "perceived learning."

# Input Format
You will receive a Steam review. A specific verb will be highlighted in double brackets, like this: "I [[realized]] that the factory must grow."

# Analysis Guidelines

1. **Verify Genuine Learning (True/False)**
   Determine if the highlighted verb describes the player acquiring new skills, knowledge, understanding, or insight *within the context of playing the game*.
   * **TRUE Examples:**
       * "I [[learned]] how to optimize my train network."
       * "We [[figured]] out the solution to the puzzle."
       * "I [[realized]] the story was a metaphor for grief."
   * **FALSE Examples:**
       * "I [[learned]] about this game from a friend."
       * "I [[learned]] my lesson: never buy early access."
       * "I [[guess]] it's okay."

2. **Categorize the Learning**
   If Genuine Learning is TRUE, categorize it into one of the following:
   * **SYSTEMS:** Learning mechanics, controls, optimization, physics, or game rules (e.g., automation in Factorio, puzzles in Portal.) Example: "I kept dying to the boss until I [[realized]] that electricity damage stuns him for a few seconds."
   * **NARRATIVE:** Learning plot points, lore, character backstories, or thematic meaning. Example: "By reading the terminal logs, we [[discovered]] that the corporation had been poisoning the planet long before the aliens arrived."
   * **SOCIAL_SELF:** Learning about teamwork, coordination, leadership, patience, or personal capability (common in Co-op). Example: "We kept failing the timer, so we [[figured]] out that one of us needed to just call out commands while the other two executed them."
   * **OTHER:** If none of the previous categories fit.
   * **NONE:** Use this if Genuine Learning is False.
"""

# Guideline 3, only used by the perception prompt. Starts with a blank line.
_PROMPT_PERCEPTION_GUIDELINE = """
3. **Verify Perceptual Link (True/False)**
   Is the learning *directly caused by* or *framed through* an act of perception mentioned in the text?
   * **TRUE:** The player learned *because* they saw/heard/noticed something.
       * *Example:* "I saw the blinking light and [[realized]] the battery was low." (Seeing caused the realization).
       * *Example:* "I noticed the pattern and [[figured]] it out."
   * **FALSE:** Perception words are present but unrelated to the specific learning event.
       * *Example:* "The game looks great. I also [[learned]] how to jump." (Visuals and learning are separate).
"""

# Heading that introduces the schema. Starts with a blank line, ends with one.
_PROMPT_OUTPUT_HEADER = """
# Output Format
Output ONLY a valid JSON object with the following schema:

"""

# Schema shown to the model for perception rows. The last member has no
# trailing comma: a trailing comma is invalid JSON and would contradict the
# "valid JSON" instruction the model is asked to follow.
_PROMPT_SCHEMA_PERCEPTION = """{
  "is_genuine_learning": boolean,
  "learning_category": "SYSTEMS" | "NARRATIVE" | "SOCIAL_SELF" | "OTHER" | "NONE",
  "perception_linked_to_learning": boolean
}
<|im_end|>"""

# Schema shown to the model for cognition-only rows.
_PROMPT_SCHEMA_COGNITION = """{
  "is_genuine_learning": boolean,
  "learning_category": "SYSTEMS" | "NARRATIVE" | "SOCIAL_SELF" | "OTHER" | "NONE"
}
<|im_end|>"""

# PROMPT A: For rows where 'has_perception' == True (contains words like see, hear, notice)
prompt_perception = (
    _PROMPT_SHARED_GUIDELINES
    + _PROMPT_PERCEPTION_GUIDELINE
    + _PROMPT_OUTPUT_HEADER
    + _PROMPT_SCHEMA_PERCEPTION
)

# PROMPT B: For rows where 'has_perception' == False (words like realize, figure out, understand)
prompt_cognition = (
    _PROMPT_SHARED_GUIDELINES
    + _PROMPT_OUTPUT_HEADER
    + _PROMPT_SCHEMA_COGNITION
)

def _flag_is_set(value):
    """Interpret a CSV-derived flag as a strict boolean.

    Plain truthiness is not safe for values read from a CSV: a missing cell
    arrives as float NaN (truthy) and a text column yields strings such as
    "False" (also truthy). Both would wrongly select the perception prompt.
    """
    if isinstance(value, str):
        return value.strip().lower() in ("true", "1", "yes", "y", "t")
    if value is None:
        return False
    if isinstance(value, float) and value != value:  # NaN never equals itself
        return False
    try:
        return bool(value)
    except (TypeError, ValueError):  # objects without a defined truth value
        return False


def create_prompt(review_text, has_perception):
    """Build the full ChatML prompt for one review.

    Args:
        review_text: Review text with the target verb wrapped in [[...]].
        has_perception: Flag selecting the system prompt. True-like values
            (True, 1, "true", ...) pick ``prompt_perception``; False, None,
            NaN and strings like "False" pick ``prompt_cognition``.

    Returns:
        The system prompt, the user turn containing the review, and an open
        assistant turn for the model to complete.
    """
    # Select the system prompt based on the (normalised) boolean flag
    sys_prompt = prompt_perception if _flag_is_set(has_perception) else prompt_cognition

    return f"""{sys_prompt}
<|im_start|>user
Review:
{review_text}
<|im_end|>
<|im_start|>assistant
"""

In [None]:
# 3. DATA LOADING
print(f"Loading data from {INPUT_CSV}...")
try:
    df = pd.read_csv(INPUT_CSV)
except FileNotFoundError as e:
    # Only a missing file falls back to dummy data. A malformed CSV or a
    # missing 'full_review_highlighted' column now raises instead of silently
    # replacing the real corpus with one test row (which the save cell would
    # then write over the output files).
    print(f"Error loading CSV: {e}")
    # Create dummy data for testing if file missing
    df = pd.DataFrame([{
        'full_review_highlighted': "I looked at the water and [[realized]] that I could swim underneath the base.",
        'learning_verb': 'realized',
        'game': 'Subnautica',
        # The sentence contains a perception verb ("looked"), so route it to
        # the perception prompt during the smoke test.
        'has_perception': True,
    }])
    print("Created dummy data for testing.")
else:
    # Filter for valid inputs
    df = df.dropna(subset=['full_review_highlighted'])
    print(f"Loaded {len(df)} rows.")

# Convert to list of dicts for processing
data_records = df.to_dict('records')

In [None]:
# 4. TEST RUN (Single Example)
# Smoke test: push the first record through the model once and show the raw
# completion before committing to the full batch run.
print("\n--- Running Test Example ---")
test_record = data_records[0]
has_perception = test_record.get('has_perception', False)
test_prompt = create_prompt(test_record['full_review_highlighted'], has_perception)

inputs = tokenizer([test_prompt], return_tensors="pt").to("cuda")

# No gradients are needed for generation (same guard as the batch cell).
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        use_cache=True,
        temperature = 0.6, top_p = 0.95, top_k = 20, # Same sampling settings as the batch cell
        pad_token_id=tokenizer.eos_token_id
    )

# Slice off the prompt tokens so only the newly generated text is decoded.
decoded_test = tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
print("Input Snippet:", test_record['full_review_highlighted'][:100] + "...")
print("Model Output:", decoded_test)

# Verify JSON parsing
try:
    json.loads(decoded_test)
    print("✅ JSON Parsing Successful")
except json.JSONDecodeError:
    # Only a parse failure is expected here; any other exception should surface.
    print("❌ JSON Parsing Failed (Will use regex in batch loop)")

In [None]:
# 5. BATCH PROCESSING
# json, re, torch and tqdm are already imported in the notebook's import cell.

_THINK_END = "</think>"
_TRAILING_COMMA = re.compile(r",\s*([}\]])")


def _extract_json_object(generated_text):
    """Return the last JSON object found in a raw model completion.

    Only the text after a closing ``</think>`` tag (if one is present) is
    searched, so braces inside a reasoning preamble cannot corrupt the match.
    Every ``{`` is tried as the start of an object and the last one that
    decodes wins. If strict decoding finds nothing, a second pass retries with
    trailing commas before ``}`` / ``]`` removed.

    Raises:
        ValueError: if no JSON object can be decoded from the text.
    """
    answer = generated_text.rsplit(_THINK_END, 1)[-1]
    decoder = json.JSONDecoder()
    for candidate in (answer, _TRAILING_COMMA.sub(r"\1", answer)):
        found = None
        pos = candidate.find("{")
        while pos != -1:
            try:
                obj, end = decoder.raw_decode(candidate, pos)
            except json.JSONDecodeError:
                # Not a decodable object at this brace; try the next one.
                pos = candidate.find("{", pos + 1)
                continue
            found = obj
            # Continue after the decoded object so nested braces are skipped.
            pos = candidate.find("{", end)
        if found is not None:
            return found
    raise ValueError("No JSON object found in model output")


print(f"\nStarting batch processing of {len(data_records)} records...")
annotated_records = []

# Prepare prompts
# The 'has_perception' flag from each CSV record selects the system prompt.
batch_queue = [
    {
        "original_index": i,
        "prompt": create_prompt(record['full_review_highlighted'], record.get('has_perception', False)),
    }
    for i, record in enumerate(data_records)
]

# Batched generation with a decoder-only model needs left padding so that every
# prompt ends directly before the first generated token. Set it explicitly
# instead of relying on the loader having done so.
tokenizer.padding_side = "left"

# Process in chunks
for i in tqdm(range(0, len(batch_queue), BATCH_SIZE), desc="Annotating"):
    batch = batch_queue[i : i + BATCH_SIZE]
    prompts = [item["prompt"] for item in batch]

    # NOTE(review): truncation=True cuts over-long prompts at the tokenizer's
    # configured limit; if that happens the trailing assistant tag may be lost
    # -- confirm reviews fit within the limit.
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # Budget covers any reasoning text emitted before the JSON answer;
            # lowering it speeds things up but risks cutting the answer off.
            max_new_tokens=2048,
            temperature = 0.6, top_p = 0.95, top_k = 20,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Drop the (padded) prompt tokens; keep only the newly generated part.
    generated_texts = tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

    # Parse and Merge Results
    for j, gen_text in enumerate(generated_texts):
        idx = batch[j]["original_index"]
        original_record = data_records[idx].copy()

        try:
            parsed_json = _extract_json_object(gen_text)

            # Keys follow the output schema given in the system prompts.
            original_record['llm_is_genuine'] = parsed_json.get('is_genuine_learning', False)
            original_record['llm_category'] = parsed_json.get('learning_category', 'NONE')
            original_record['llm_perception_linked'] = parsed_json.get('perception_linked_to_learning', False)
            # 'reasoning' is not part of the prompt schema, so this is normally
            # empty; the column is kept so the output layout stays unchanged.
            original_record['llm_reasoning'] = parsed_json.get('reasoning', '')

        except Exception as e:
            # Best effort: keep the row and record why it could not be parsed.
            original_record['llm_error'] = str(e)
            original_record['raw_llm_output'] = gen_text

        annotated_records.append(original_record)

In [None]:
# 6. SAVE RESULTS
import os

print(f"\nSaving results to {OUTPUT_CSV}...")
final_df = pd.DataFrame(annotated_records)
final_df.to_csv(OUTPUT_CSV, index=False)

# Optional: Save JSON version
# NOTE: json.dump writes float NaN values as the non-standard token NaN
# (allow_nan defaults to True); strict JSON parsers may reject such a file.
with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
    json.dump(annotated_records, f, indent=2)

print("Processing complete. Download your files from the file browser.")

# google.colab only exists inside Colab. The install cell also supports other
# environments, so a missing module must not crash this cell after the files
# have already been written.
try:
    from google.colab import files
except ImportError:
    files = None

# Use the configured paths so the download list cannot drift from what was saved.
files_to_download = [OUTPUT_JSON, OUTPUT_CSV]

if files is None:
    print("Not running in Colab; skipping browser download.")
else:
    for filename in files_to_download:
        if os.path.exists(filename):
            print(f"Downloading {filename}...")
            files.download(filename)
        else:
            print(f"File not found: {filename}. Did you run the previous cells?")