In [12]:
from datasets import load_dataset

ds = load_dataset("nvidia/Nemotron-Personas-USA", split="train", streaming=True)
ds = ds.take(2000)

In [6]:
# remove professional_persona sports_persona arts_persona travel_persona culinary_persona skills_and_expertise hobbies_and_interests career_goals_and_ambitions
ds = ds.remove_columns(
    [
        "professional_persona",
        "sports_persona",
        "arts_persona",
        "travel_persona",
        "culinary_persona",
        "skills_and_expertise",
        "hobbies_and_interests",
        "career_goals_and_ambitions",
    ]
)

In [13]:
ds_list = list(ds)  # Convert to list if you need all 100 rows in memory

In [15]:
# Define relationship categories and their distribution per main persona
# Each main persona will have ~100 contacts distributed as follows:

CONTACT_DISTRIBUTION = {
    "family": 8,  # Parents, siblings, spouse, children, in-laws, etc.
    "close_friends": 10,  # Best friends, childhood friends
    "friends": 15,  # Regular friends, acquaintances
    "colleagues": 12,  # Coworkers, boss, professional contacts
    "neighbors": 5,  # People living nearby
    "professionals": 20,  # Service providers (plumber, doctor, mechanic, etc.)
    "businesses": 15,  # Restaurants, stores, delivery services
    "casual": 10,  # Gym buddies, hobby groups, etc.
    "other": 5,  # Misc contacts
}

# Professional service types for realistic SMS interactions
PROFESSIONAL_TYPES = [
    "plumber",
    "electrician",
    "mechanic",
    "doctor",
    "dentist",
    "hairdresser",
    "veterinarian",
    "accountant",
    "lawyer",
    "realtor",
    "contractor",
    "landscaper",
    "tutor",
    "personal_trainer",
    "therapist",
    "babysitter",
    "dog_walker",
    "cleaner",
    "handyman",
    "insurance_agent",
]

# Business types
BUSINESS_TYPES = [
    "pharmacy",
    "restaurant",
    "pizza_delivery",
    "grocery_store",
    "bank",
    "gym",
    "salon",
    "auto_shop",
    "dry_cleaner",
    "pet_store",
    "dentist_office",
    "medical_clinic",
    "school",
    "daycare",
    "veterinary_clinic",
]

print(f"Total contacts per persona: {sum(CONTACT_DISTRIBUTION.values())}")

Total contacts per persona: 100


In [16]:
import random

# Load more personas for the contact pool (we need 10 main + 10*100 contacts = 1010 minimum)
# Loading extra to have variety
ds_full = load_dataset("nvidia/Nemotron-Personas-USA", split="train", streaming=True)
all_personas = list(ds_full.take(1500))

print(f"Loaded {len(all_personas)} personas")

Loaded 1500 personas


In [17]:
# Select 10 diverse main personas (varied ages, locations, professions)
def select_main_personas(personas, n=10):
    """Select diverse main personas based on age and profession variety."""
    # Shuffle and pick first n for now (can add more sophisticated selection later)
    shuffled = random.sample(personas, min(len(personas), 100))
    return shuffled[:n]


main_personas = select_main_personas(all_personas, n=10)

print("Selected main personas:")
for i, p in enumerate(main_personas):
    print(
        f"  {i + 1}. Age: {p.get('age', 'N/A')}, Location: {p.get('location', 'N/A')}"
    )

Selected main personas:
  1. Age: 42, Location: N/A
  2. Age: 16, Location: N/A
  3. Age: 14, Location: N/A
  4. Age: 48, Location: N/A
  5. Age: 67, Location: N/A
  6. Age: 6, Location: N/A
  7. Age: 46, Location: N/A
  8. Age: 56, Location: N/A
  9. Age: 5, Location: N/A
  10. Age: 73, Location: N/A


In [None]:
def create_contact_network(
    main_persona, all_personas, distribution, professional_types, business_types
):
    """
    Create a contact network for a main persona.
    Returns a dict with relationship type -> list of contact personas.
    """
    contacts = {}
    remaining_personas = [p for p in all_personas if p != main_persona]

    for relationship_type, count in distribution.items():
        contacts[relationship_type] = []

        for _ in range(count):
            if not remaining_personas:
                break

            # Select a contact persona
            idx = random.randint(0, len(remaining_personas) - 1)
            contact = remaining_personas.pop(idx)

            # Add metadata based on relationship type
            contact_info = {
                "persona": contact,
                "relationship": relationship_type,
            }

            # Add specific metadata for professionals and businesses
            if relationship_type == "professionals":
                contact_info["service_type"] = random.choice(professional_types)
                contact_info["business_name"] = (
                    f"{contact.get('location', 'Local').split(',')[0]} {contact_info['service_type'].title()} Services"
                )
            elif relationship_type == "businesses":
                contact_info["business_type"] = random.choice(business_types)
                contact_info["business_name"] = (
                    f"{contact.get('location', 'Local').split(',')[0]} {contact_info['business_type'].replace('_', ' ').title()}"
                )

            contacts[relationship_type].append(contact_info)

    return contacts


# Build contact networks for all main personas
contact_networks = {}
for i, main_p in enumerate(main_personas):
    contact_networks[i] = {
        "main_persona": main_p,
        "contacts": create_contact_network(
            main_p,
            all_personas,
            CONTACT_DISTRIBUTION,
            PROFESSIONAL_TYPES,
            BUSINESS_TYPES,
        ),
    }
    total_contacts = sum(len(v) for v in contact_networks[i]["contacts"].values())
    print(f"Persona {i + 1}: {total_contacts} contacts created")

Persona 1: 100 contacts created
Persona 2: 100 contacts created
Persona 3: 100 contacts created
Persona 4: 100 contacts created
Persona 5: 100 contacts created
Persona 6: 100 contacts created
Persona 7: 100 contacts created
Persona 8: 100 contacts created
Persona 9: 100 contacts created
Persona 10: 100 contacts created


In [19]:
# Preview the contact network structure for the first main persona
example_network = contact_networks[0]

print("=" * 60)
print("MAIN PERSONA:")
print(f"  Age: {example_network['main_persona'].get('age')}")
print(f"  Location: {example_network['main_persona'].get('location')}")
print(f"  Persona: {example_network['main_persona'].get('persona', '')[:200]}...")
print("=" * 60)

print("\nCONTACT NETWORK SUMMARY:")
for rel_type, contacts in example_network["contacts"].items():
    print(f"\n{rel_type.upper()} ({len(contacts)} contacts):")
    for c in contacts[:2]:  # Show first 2 of each type
        if "service_type" in c:
            print(f"  - {c['business_name']} ({c['service_type']})")
        elif "business_type" in c:
            print(f"  - {c['business_name']} ({c['business_type']})")
        else:
            print(
                f"  - Age: {c['persona'].get('age')}, Location: {c['persona'].get('location')}"
            )

MAIN PERSONA:
  Age: 42
  Location: None
  Persona: Lucinda Gadson, a calm, artâ€‘driven veterinarian who blends scientific precision with watercolor therapy, finds solace in solitary garden meditation, yet occasionally battles a compulsive need to perfe...

CONTACT NETWORK SUMMARY:

FAMILY (8 contacts):
  - Age: 38, Location: None
  - Age: 56, Location: None

CLOSE_FRIENDS (10 contacts):
  - Age: 60, Location: None
  - Age: 78, Location: None

FRIENDS (15 contacts):
  - Age: 47, Location: None
  - Age: 47, Location: None

COLLEAGUES (12 contacts):
  - Age: 33, Location: None
  - Age: 28, Location: None

NEIGHBORS (5 contacts):
  - Age: 57, Location: None
  - Age: 1, Location: None

PROFESSIONALS (20 contacts):
  - Local Realtor Services (realtor)
  - Local Landscaper Services (landscaper)

BUSINESSES (15 contacts):
  - Local Daycare (daycare)
  - Local Daycare (daycare)

CASUAL (10 contacts):
  - Age: 2, Location: None
  - Age: 32, Location: None

OTHER (5 contacts):
  - Age: 78, Loca

## Next Steps

The `contact_networks` dictionary now contains:
- **10 main personas** - the phone owners
- **~100 contacts each** - distributed across relationship categories

Each contact includes:
- The full persona data from the dataset
- Relationship type (family, friend, professional, etc.)
- For professionals/businesses: service type and business name

### To generate synthetic SMS conversations:
1. Use an LLM to generate realistic conversations based on:
   - Both personas' demographics and interests
   - The relationship type (formal for professionals, casual for friends)
   - Realistic SMS patterns (short messages, emojis, typos, etc.)
2. Generate conversation history with varied message frequencies based on relationship closeness