# Generate Noun Property Ratings

This notebook uses a language model to rate every noun on each selected property (on a 1–10 scale) and saves the ratings as JSONL, plus an optional nested JSON export.

**Output:** 
- `noun_curated_property_ratings.jsonl` - JSONL format (one JSON object per line)
- `noun_property_nested.json` - Nested JSON format (nouns with property objects)

## 1. Setup

In [1]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm import tqdm
import os
import re
import json



## 2. Configuration

In [2]:
# File paths (the two input files are read line by line; blank lines are ignored)
NOUNS_FILE = "nouns.txt"
PROPERTIES_FILE = "curated_properties.txt"  # Use curated_properties.txt for 15 properties
OUTPUT_FILE = "noun_curated_property_ratings.jsonl"  # JSONL format (one JSON object per line); ratings are appended here

# How often to save progress: number of ratings buffered in memory between appends to OUTPUT_FILE
SAVE_FREQUENCY = 200

## 3. Load Model

In [3]:
print("Loading model...")

# Single source of truth for the checkpoint used by both the model and the tokenizer.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Pick the best available accelerator instead of hard-coding one, so the
# notebook runs unchanged on NVIDIA GPUs ("cuda"), Apple Silicon ("mps"),
# and CPU-only machines.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch builds
if torch.cuda.is_available():
    DEVICE = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=DEVICE,
    # NOTE: recent transformers releases warn that `torch_dtype` is deprecated
    # in favour of `dtype`; the old keyword is kept here so the cell still
    # works on installs that predate the rename.
    torch_dtype="auto",
    trust_remote_code=False,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Greedy decoding (do_sample=False); only the newly generated text is
# returned, capped at 50 new tokens.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=50,
    do_sample=False,
)

print("Model loaded!")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use mps
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model loaded!


## 4. Load Nouns and Properties

In [4]:
# Read nouns
with open(NOUNS_FILE, 'r') as f:
    nouns = [line.strip() for line in f if line.strip()]

# Read properties
with open(PROPERTIES_FILE, 'r') as f:
    properties = [line.strip() for line in f if line.strip()]

print(f"Loaded {len(nouns)} nouns")
print(f"Loaded {len(properties)} properties")
print(f"Total ratings needed: {len(nouns) * len(properties):,}")
print(f"Estimated time: {len(nouns) * len(properties) * 2 / 3600:.1f} hours")

Loaded 1852 nouns
Loaded 15 properties
Total ratings needed: 27,780
Estimated time: 15.4 hours


## 5. Rating Function

In [5]:
def rate_noun_on_property(noun, property_name, pipe):
    """
    Ask the model to score `noun` on `property_name` on a 1-10 scale.

    The prompt is sent through `pipe` as a single user chat message, and the
    first standalone integer from 1 to 10 found in the reply is returned.

    Returns -1 if the reply contains no such number, or if the call raises
    (the error is printed rather than propagated).
    """
    prompt = f"""Rate the {property_name} of '{noun}' from 1 to 10.
1 = very low {property_name}
10 = very high {property_name}
Respond with only a number 1-10."""

    chat = [{"role": "user", "content": prompt}]

    try:
        reply = pipe(chat)[0]["generated_text"].strip()

        # First whole-word number in the range 1-10, if there is one
        match = re.search(r'\b([1-9]|10)\b', reply)
        if match is None:
            return -1

        value = int(match.group(1))
        return value if 1 <= value <= 10 else -1
    except Exception as e:
        print(f"Error: {noun} - {property_name}: {str(e)}")
        return -1

## 6. Test Rating Function

In [6]:
# Quick test
print("Testing...")
test_rating = rate_noun_on_property("elephant", "size", pipe)
print(f"elephant - size: {test_rating}/10")

Testing...
elephant - size: 10/10


## 7. Generate All Ratings

In [7]:
# Check for existing progress
if os.path.exists(OUTPUT_FILE):
    existing_ratings = []
    with open(OUTPUT_FILE, 'r') as f:
        for line in f:
            existing_ratings.append(json.loads(line))
    completed = set((r['noun'], r['property']) for r in existing_ratings)
    print(f"Found {len(existing_ratings)} existing ratings")
else:
    existing_ratings = []
    completed = set()
    print("Starting fresh")

# Create list of remaining work
all_combinations = [(noun, prop) for noun in nouns for prop in properties]
remaining = [combo for combo in all_combinations if combo not in completed]

print(f"Remaining: {len(remaining):,} ratings")
print("\nStarting...\n")

Found 13800 existing ratings
Remaining: 13,980 ratings

Starting...



In [8]:
# Main loop
results = []

for i, (noun, prop) in enumerate(tqdm(remaining, desc="Rating")):
    # Get rating
    rating = rate_noun_on_property(noun, prop, pipe)
    
    # Store
    results.append({
        'noun': noun,
        'property': prop,
        'rating': rating
    })
    
    # Save periodically
    if (i + 1) % SAVE_FREQUENCY == 0:
        # Append new results to file
        with open(OUTPUT_FILE, 'a') as f:
            for result in results:
                f.write(json.dumps(result) + '\n')
        results = []
        print(f"Saved at {i + 1} ratings")

# Final save
if results:
    with open(OUTPUT_FILE, 'a') as f:
        for result in results:
            f.write(json.dumps(result) + '\n')

print(f"\nDone! Saved to {OUTPUT_FILE}")

Rating:   1%|▏         | 200/13980 [02:45<8:32:09,  2.23s/it] 

Saved at 200 ratings


Rating:   3%|▎         | 400/13980 [05:22<2:03:24,  1.83it/s] 

Saved at 400 ratings


Rating:   4%|▍         | 600/13980 [07:53<1:49:16,  2.04it/s] 

Saved at 600 ratings


Rating:   6%|▌         | 800/13980 [09:32<1:32:53,  2.36it/s]

Saved at 800 ratings


Rating:   7%|▋         | 1000/13980 [12:31<10:34:37,  2.93s/it]

Saved at 1000 ratings


Rating:   9%|▊         | 1200/13980 [14:53<1:33:17,  2.28it/s] 

Saved at 1200 ratings


Rating:  10%|█         | 1400/13980 [16:58<3:50:05,  1.10s/it]

Saved at 1400 ratings


Rating:  11%|█▏        | 1600/13980 [19:00<2:21:28,  1.46it/s]

Saved at 1600 ratings


Rating:  13%|█▎        | 1800/13980 [21:46<1:36:00,  2.11it/s]

Saved at 1800 ratings


Rating:  14%|█▍        | 2000/13980 [24:29<3:36:06,  1.08s/it] 

Saved at 2000 ratings


Rating:  16%|█▌        | 2200/13980 [27:08<1:43:39,  1.89it/s]

Saved at 2200 ratings


Rating:  17%|█▋        | 2400/13980 [29:30<1:23:44,  2.30it/s]

Saved at 2400 ratings


Rating:  19%|█▊        | 2600/13980 [32:21<3:16:32,  1.04s/it] 

Saved at 2600 ratings


Rating:  20%|██        | 2800/13980 [35:38<1:31:34,  2.03it/s]

Saved at 2800 ratings


Rating:  21%|██▏       | 3000/13980 [38:18<1:25:34,  2.14it/s]

Saved at 3000 ratings


Rating:  23%|██▎       | 3200/13980 [41:13<1:10:39,  2.54it/s]

Saved at 3200 ratings


Rating:  24%|██▍       | 3400/13980 [43:35<1:38:21,  1.79it/s]

Saved at 3400 ratings


Rating:  26%|██▌       | 3600/13980 [46:39<1:17:51,  2.22it/s] 

Saved at 3600 ratings


Rating:  27%|██▋       | 3800/13980 [48:56<1:08:57,  2.46it/s]

Saved at 3800 ratings


Rating:  29%|██▊       | 4000/13980 [51:37<1:38:49,  1.68it/s]

Saved at 4000 ratings


Rating:  30%|███       | 4200/13980 [53:36<2:09:50,  1.26it/s]

Saved at 4200 ratings


Rating:  31%|███▏      | 4400/13980 [56:28<2:51:58,  1.08s/it]

Saved at 4400 ratings


Rating:  33%|███▎      | 4600/13980 [58:53<1:35:44,  1.63it/s]

Saved at 4600 ratings


Rating:  34%|███▍      | 4800/13980 [1:00:55<1:13:06,  2.09it/s]

Saved at 4800 ratings


Rating:  36%|███▌      | 5000/13980 [1:02:44<2:14:44,  1.11it/s]

Saved at 5000 ratings


Rating:  37%|███▋      | 5200/13980 [1:04:31<1:48:49,  1.34it/s]

Saved at 5200 ratings


Rating:  39%|███▊      | 5400/13980 [1:06:33<1:03:11,  2.26it/s]

Saved at 5400 ratings


Rating:  40%|████      | 5600/13980 [1:08:30<2:26:33,  1.05s/it]

Saved at 5600 ratings


Rating:  41%|████▏     | 5800/13980 [1:10:22<1:04:12,  2.12it/s]

Saved at 5800 ratings


Rating:  43%|████▎     | 6000/13980 [1:12:35<1:01:27,  2.16it/s]

Saved at 6000 ratings


Rating:  44%|████▍     | 6200/13980 [1:14:58<50:18,  2.58it/s]  

Saved at 6200 ratings


Rating:  46%|████▌     | 6400/13980 [1:17:25<53:36,  2.36it/s]  

Saved at 6400 ratings


Rating:  47%|████▋     | 6600/13980 [1:19:31<1:00:08,  2.05it/s]

Saved at 6600 ratings


Rating:  49%|████▊     | 6800/13980 [1:22:16<2:11:03,  1.10s/it]

Saved at 6800 ratings


Rating:  50%|█████     | 7000/13980 [1:24:23<1:09:58,  1.66it/s]

Saved at 7000 ratings


Rating:  52%|█████▏    | 7200/13980 [1:26:45<48:38,  2.32it/s]  

Saved at 7200 ratings


Rating:  53%|█████▎    | 7400/13980 [1:29:18<1:32:02,  1.19it/s]

Saved at 7400 ratings


Rating:  54%|█████▍    | 7600/13980 [1:31:31<3:56:49,  2.23s/it]

Saved at 7600 ratings


Rating:  56%|█████▌    | 7800/13980 [1:34:10<57:11,  1.80it/s]  

Saved at 7800 ratings


Rating:  57%|█████▋    | 8000/13980 [1:36:27<1:51:07,  1.11s/it]

Saved at 8000 ratings


Rating:  59%|█████▊    | 8200/13980 [1:38:09<1:00:26,  1.59it/s]

Saved at 8200 ratings


Rating:  59%|█████▊    | 8208/13980 [1:38:13<1:09:04,  1.39it/s]


KeyboardInterrupt: 

## 8. View Results

In [None]:
# Load and display results
ratings = []
with open(OUTPUT_FILE, 'r') as f:
    for line in f:
        ratings.append(json.loads(line))

print(f"Total ratings: {len(ratings):,}")
failed = sum(1 for r in ratings if r['rating'] == -1)
print(f"Failed ratings: {failed}")
print("\nFirst 20 ratings:")
for rating in ratings[:20]:
    print(rating)

In [None]:
# Create nested format (optional - easier to work with)
# Structure: { noun: { property: rating, ... }, ... }
nested = {}
for rating in ratings:
    if rating['rating'] != -1:  # Skip failed ratings
        noun = rating['noun']
        if noun not in nested:
            nested[noun] = {}
        nested[noun][rating['property']] = rating['rating']

with open('noun_property_nested.json', 'w') as f:
    json.dump(nested, f, indent=2)

print("Also saved nested format to: noun_property_nested.json")
print(f"Structure: {len(nested)} nouns with properties as nested objects")