# Generate Noun Property Ratings

This notebook rates all nouns on selected properties and saves to JSON/JSONL.

**Output:** 
- `noun_curated_property_ratings.jsonl` - JSONL format (one JSON object per line)
- `noun_property_nested.json` - Nested JSON format (nouns with property objects)

## 1. Setup

In [1]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm import tqdm
import os
import re
import json



## 2. Configuration

In [2]:
# Input/output file paths (relative paths, resolved against the current working directory)
NOUNS_FILE = "nouns.txt"  # one noun per line
PROPERTIES_FILE = "curated_properties.txt"  # one property name per line (the curated list has 15)
OUTPUT_FILE = "noun_curated_property_ratings.jsonl"  # JSONL: one {"noun", "property", "rating"} object per line

# Number of new ratings to accumulate before appending them to OUTPUT_FILE
SAVE_FREQUENCY = 200

## 3. Load Model

In [3]:
print("Loading model...")

# Single source of truth for the checkpoint so model and tokenizer cannot drift apart.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Pick the best available device instead of hard-coding one:
# NVIDIA GPU ("cuda") -> Apple Silicon ("mps") -> CPU fallback.
# `getattr` guards against torch builds that do not ship the MPS backend module.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    DEVICE = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=DEVICE,
    # Recent transformers releases warn that `torch_dtype` is deprecated in favor of
    # `dtype`; the old keyword is kept so the cell still runs on older releases.
    torch_dtype="auto",
    trust_remote_code=False,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Text-generation pipeline used by the rating function:
# - return_full_text=False: only the newly generated reply is returned, not the prompt
# - max_new_tokens=50: caps the reply length
# - do_sample=False: greedy decoding, so repeated runs give the same reply
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=50,
    do_sample=False,
)

print("Model loaded!")

Loading model...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use mps
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model loaded!


## 4. Load Nouns and Properties

In [4]:
def _read_nonblank_lines(path):
    """Return the whitespace-stripped, non-empty lines of the text file at `path`."""
    with open(path, 'r') as handle:
        stripped = (raw.strip() for raw in handle)
        return [text for text in stripped if text]


# Both inputs share the same format: one entry per line, blank lines ignored.
nouns = _read_nonblank_lines(NOUNS_FILE)
properties = _read_nonblank_lines(PROPERTIES_FILE)

print(f"Loaded {len(nouns)} nouns")
print(f"Loaded {len(properties)} properties")
print(f"Total ratings needed: {len(nouns) * len(properties):,}")
# The estimate assumes 2 seconds per rating (the `* 2`), converted to hours.
print(f"Estimated time: {len(nouns) * len(properties) * 2 / 3600:.1f} hours")

Loaded 1852 nouns
Loaded 15 properties
Total ratings needed: 27,780
Estimated time: 15.4 hours


## 5. Rating Function

In [5]:
def rate_noun_on_property(noun, property_name, pipe):
    """
    Ask the model to score `noun` on `property_name` and parse the reply.

    Args:
        noun: Word to rate; inserted into the prompt inside single quotes.
        property_name: Property to rate the noun on; inserted into the prompt.
        pipe: Callable invoked with a single-message chat list. Its result is
            read as ``result[0]["generated_text"]``.

    Returns:
        The first standalone integer from 1 to 10 found in the reply, or -1
        when the reply contains no such number or the call raises (the error
        is printed, not re-raised).
    """
    prompt = f"""Rate the {property_name} of '{noun}' from 1 to 10.
1 = very low {property_name}
10 = very high {property_name}
Respond with only a number 1-10."""

    try:
        reply = pipe([{"role": "user", "content": prompt}])[0]["generated_text"].strip()

        # The pattern only matches whole-word 1..10, so no extra range check is needed.
        found = re.search(r'\b([1-9]|10)\b', reply)
        return int(found.group(1)) if found else -1
    except Exception as e:
        print(f"Error: {noun} - {property_name}: {str(e)}")
        return -1

## 6. Test Rating Function

In [6]:
# Quick test: rate a single (noun, property) pair to confirm the pipeline responds
# and the reply parses before starting the full run.
# A printed value of -1 means no rating could be extracted.
print("Testing...")
test_rating = rate_noun_on_property("elephant", "size", pipe)
print(f"elephant - size: {test_rating}/10")

Testing...
elephant - size: 10/10


## 7. Generate All Ratings

In [7]:
# Check for existing progress so an interrupted run can resume where it stopped
existing_ratings = []
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'r') as f:
        for line in f:
            # Skip blank lines (trailing newline, manual edits): json.loads raises on them
            if not line.strip():
                continue
            existing_ratings.append(json.loads(line))
    print(f"Found {len(existing_ratings)} existing ratings")
else:
    print("Starting fresh")

# (noun, property) pairs already present in the output file.
# NOTE: failed ratings (rating == -1) are stored in the file as well, so they count
# as completed here and are not retried on resume.
completed = {(r['noun'], r['property']) for r in existing_ratings}

# Create list of remaining work: every noun x property pair not yet in the file
all_combinations = [(noun, prop) for noun in nouns for prop in properties]
remaining = [combo for combo in all_combinations if combo not in completed]

print(f"Remaining: {len(remaining):,} ratings")
print("\nStarting...\n")

Found 22000 existing ratings
Remaining: 5,780 ratings

Starting...



In [8]:
# Main loop: rate every remaining (noun, property) pair, checkpointing to OUTPUT_FILE
def _append_results(path, batch):
    """Append each record in `batch` to the file at `path`, one JSON object per line."""
    # Build the whole chunk first and write it once, so a batch is not left half-written
    # if serializing one of its records fails.
    payload = ''.join(json.dumps(record) + '\n' for record in batch)
    with open(path, 'a') as f:
        f.write(payload)


results = []  # ratings produced since the last append to OUTPUT_FILE

try:
    for i, (noun, prop) in enumerate(tqdm(remaining, desc="Rating")):
        # Get rating (-1 when the model reply could not be parsed or the call failed)
        rating = rate_noun_on_property(noun, prop, pipe)

        # Store
        results.append({
            'noun': noun,
            'property': prop,
            'rating': rating
        })

        # Save periodically
        if (i + 1) % SAVE_FREQUENCY == 0:
            _append_results(OUTPUT_FILE, results)
            results = []
            print(f"Saved at {i + 1} ratings")
finally:
    # Final save. Runs on normal completion AND when the loop is interrupted or raises,
    # so ratings computed since the last checkpoint are not lost.
    if results:
        _append_results(OUTPUT_FILE, results)

print(f"\nDone! Saved to {OUTPUT_FILE}")

Rating:   3%|▎         | 200/5780 [02:00<1:03:36,  1.46it/s]

Saved at 200 ratings


Rating:   7%|▋         | 400/5780 [03:38<1:04:05,  1.40it/s]

Saved at 400 ratings


Rating:  10%|█         | 600/5780 [05:25<42:42,  2.02it/s]  

Saved at 600 ratings


Rating:  14%|█▍        | 800/5780 [07:23<30:45,  2.70it/s]  

Saved at 800 ratings


Rating:  17%|█▋        | 1000/5780 [08:55<28:51,  2.76it/s] 

Saved at 1000 ratings


Rating:  21%|██        | 1200/5780 [10:38<35:56,  2.12it/s]  

Saved at 1200 ratings


Rating:  24%|██▍       | 1400/5780 [12:29<31:38,  2.31it/s]  

Saved at 1400 ratings


Rating:  28%|██▊       | 1600/5780 [15:12<25:42,  2.71it/s]  

Saved at 1600 ratings


Rating:  31%|███       | 1800/5780 [16:50<32:57,  2.01it/s]  

Saved at 1800 ratings


Rating:  35%|███▍      | 2000/5780 [19:10<26:23,  2.39it/s]  

Saved at 2000 ratings


Rating:  38%|███▊      | 2200/5780 [21:31<1:10:45,  1.19s/it]

Saved at 2200 ratings


Rating:  42%|████▏     | 2400/5780 [23:22<27:32,  2.05it/s]  

Saved at 2400 ratings


Rating:  45%|████▍     | 2600/5780 [25:14<23:16,  2.28it/s]  

Saved at 2600 ratings


Rating:  48%|████▊     | 2800/5780 [27:10<20:41,  2.40it/s]  

Saved at 2800 ratings


Rating:  52%|█████▏    | 3000/5780 [29:20<30:08,  1.54it/s]  

Saved at 3000 ratings


Rating:  55%|█████▌    | 3200/5780 [31:30<18:39,  2.30it/s]  

Saved at 3200 ratings


Rating:  59%|█████▉    | 3400/5780 [33:35<1:09:14,  1.75s/it]

Saved at 3400 ratings


Rating:  62%|██████▏   | 3600/5780 [35:16<25:21,  1.43it/s]  

Saved at 3600 ratings


Rating:  66%|██████▌   | 3800/5780 [37:14<13:30,  2.44it/s]  

Saved at 3800 ratings


Rating:  69%|██████▉   | 4000/5780 [39:42<49:24,  1.67s/it]  

Saved at 4000 ratings


Rating:  73%|███████▎  | 4200/5780 [41:50<17:10,  1.53it/s]  

Saved at 4200 ratings


Rating:  76%|███████▌  | 4400/5780 [43:53<09:12,  2.50it/s]  

Saved at 4400 ratings


Rating:  80%|███████▉  | 4600/5780 [46:42<08:12,  2.39it/s]  

Saved at 4600 ratings


Rating:  83%|████████▎ | 4800/5780 [48:54<06:39,  2.45it/s]

Saved at 4800 ratings


Rating:  87%|████████▋ | 5000/5780 [51:51<05:54,  2.20it/s]

Saved at 5000 ratings


Rating:  90%|████████▉ | 5200/5780 [54:22<05:01,  1.93it/s]

Saved at 5200 ratings


Rating:  93%|█████████▎| 5400/5780 [57:06<06:20,  1.00s/it]

Saved at 5400 ratings


Rating:  97%|█████████▋| 5600/5780 [59:28<01:45,  1.71it/s]

Saved at 5600 ratings


Rating: 100%|██████████| 5780/5780 [1:02:14<00:00,  1.55it/s]


Done! Saved to noun_curated_property_ratings.jsonl





## 8. View Results

In [9]:
# Load and display results
ratings = []
with open(OUTPUT_FILE, 'r') as f:
    for line in f:
        # Skip blank lines (trailing newline, manual edits): json.loads raises on them
        if line.strip():
            ratings.append(json.loads(line))

print(f"Total ratings: {len(ratings):,}")
# A rating of -1 marks a pair for which no rating could be extracted
failed = sum(1 for r in ratings if r['rating'] == -1)
print(f"Failed ratings: {failed}")
print("\nFirst 20 ratings:")
for rating in ratings[:20]:
    print(rating)

Total ratings: 27,780
Failed ratings: 2000

First 20 ratings:
{'noun': 'aardvark', 'property': 'size', 'rating': 3}
{'noun': 'aardvark', 'property': 'weight', 'rating': 3}
{'noun': 'aardvark', 'property': 'naturality', 'rating': 8}
{'noun': 'aardvark', 'property': 'edibility', 'rating': 1}
{'noun': 'aardvark', 'property': 'intelligence', 'rating': 3}
{'noun': 'aardvark', 'property': 'dangerous', 'rating': 1}
{'noun': 'aardvark', 'property': 'mobility', 'rating': 3}
{'noun': 'aardvark', 'property': 'portability', 'rating': 10}
{'noun': 'aardvark', 'property': 'cost', 'rating': 1}
{'noun': 'aardvark', 'property': 'hardness', 'rating': 1}
{'noun': 'aardvark', 'property': 'temperature', 'rating': 1}
{'noun': 'aardvark', 'property': 'loudness', 'rating': 2}
{'noun': 'aardvark', 'property': 'usefulness', 'rating': 2}
{'noun': 'aardvark', 'property': 'popularity', 'rating': 3}
{'noun': 'aardvark', 'property': 'indoor_outdoor', 'rating': 1}
{'noun': 'acai', 'property': 'size', 'rating': 3}
{'n

In [10]:
# Build the nested view of the ratings (optional - easier to work with)
# Shape: { noun: { property: rating, ... }, ... }
nested = {}
for record in ratings:
    # Failed ratings are marked with -1 and are left out of the nested view
    if record['rating'] == -1:
        continue
    # setdefault creates the per-noun dict the first time a noun is seen
    nested.setdefault(record['noun'], {})[record['property']] = record['rating']

with open('noun_property_nested.json', 'w') as f:
    json.dump(nested, f, indent=2)

print("Also saved nested format to: noun_property_nested.json")
print(f"Structure: {len(nested)} nouns with properties as nested objects")

Also saved nested format to: noun_property_nested.json
Structure: 1852 nouns with properties as nested objects
