# Populate Profile Embeddings

This notebook populates the `profile_embedding` field of every `ServiceProviderProfile` record that does not already have one, using OpenAI embeddings.

## 1. Setup Django Environment

In [1]:
import os
import sys
import django
from pathlib import Path

# Locate the Django project relative to the directory the kernel was started in
# (presumably where the `growbal` settings package lives) and give it top
# priority on the import path.
notebook_dir = Path.cwd()
django_path = notebook_dir.joinpath('growbal_django')
sys.path[:0] = [str(django_path)]

# Respect a settings module that is already exported; otherwise use the project default.
if "DJANGO_SETTINGS_MODULE" not in os.environ:
    os.environ["DJANGO_SETTINGS_MODULE"] = "growbal.settings"

# Jupyter executes cells inside a running event loop; Django's async-safety guard
# would reject synchronous ORM calls there unless this flag is set.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Load settings and populate the app registry so models can be imported below.
django.setup()

print("✅ Django environment setup complete")

✅ Django environment setup complete


## 2. Import Required Modules

In [2]:
from accounts.models import ServiceProviderProfile
from accounts.embedding_utils import EmbeddingGenerator, bulk_update_embeddings
from django.db.models import Q
import time

print("✅ Modules imported successfully")

✅ Modules imported successfully


## 3. Check Current Profile Status

In [None]:
# Snapshot of embedding coverage before any generation is attempted.
# Each figure comes from its own .count() call on the model's manager.
profile_manager = ServiceProviderProfile.objects
total_profiles = profile_manager.count()
profiles_with_embeddings = profile_manager.filter(profile_embedding__isnull=False).count()
profiles_without_embeddings = profile_manager.filter(profile_embedding__isnull=True).count()

# Emit the whole report in one call; sep="\n" puts every entry on its own line.
print(
    f"📊 Profile Statistics:",
    f"   Total profiles: {total_profiles}",
    f"   With embeddings: {profiles_with_embeddings}",
    f"   Without embeddings: {profiles_without_embeddings}",
    sep="\n",
)

📊 Profile Statistics:
   Total profiles: 482
   With embeddings: 482
   Without embeddings: 0


: 

## 4. Initialize Embedding Generator

In [4]:
# Build the embedding generator.
# It is expected to pick up the API key from environment variables on its own.
generator = EmbeddingGenerator()

# Report whether a key was found. This only prints a message — a missing key
# does not stop the notebook, so check the output before running the next section.
if not generator.api_key:
    print("❌ OpenAI API key not found! Please set OPENAI_API_KEY in environment variables.")
else:
    print(f"✅ OpenAI API key loaded (using model: {generator.model})")

✅ OpenAI API key loaded (using model: text-embedding-ada-002)


## 5. Populate Embeddings for All Profiles

This will generate embeddings for all profiles that don't have them yet.

In [5]:
# Select only the profiles whose embedding is still NULL; profiles that already
# have one are not passed to the bulk update, so re-running this cell only picks
# up what is still missing.
profiles_to_update = ServiceProviderProfile.objects.filter(profile_embedding__isnull=True)
count_to_update = profiles_to_update.count()

if count_to_update == 0:
    print("✅ All profiles already have embeddings!")
else:
    print(f"🚀 Starting to generate embeddings for {count_to_update} profiles...")
    # NOTE(review): no batch size is passed to bulk_update_embeddings below, so the
    # "10" in this message presumably mirrors that function's default — confirm.
    print("   (Processing in batches of 10)\n")

    # perf_counter() is a monotonic clock meant for measuring intervals; unlike
    # time.time() it cannot be skewed by system clock adjustments during a long run.
    start_time = time.perf_counter()

    # Use the bulk update function
    bulk_update_embeddings(profiles=profiles_to_update)

    elapsed_time = time.perf_counter() - start_time

    print("\n✅ Embedding generation complete!")
    print(f"   Time taken: {elapsed_time:.2f} seconds")
    # count_to_update is non-zero in this branch, so the division is safe.
    # The average is over all selected profiles, including any that failed.
    print(f"   Average time per profile: {elapsed_time/count_to_update:.2f} seconds")

🚀 Starting to generate embeddings for 482 profiles...
   (Processing in batches of 10)

Updated embedding for profile: 111 Group (1/482)
Updated embedding for profile: 360 Chartered Accountants (2/482)
Updated embedding for profile: 91Travel and Immigration (3/482)
Updated embedding for profile: 3E Accounting India (4/482)
Updated embedding for profile: A&A Associate LLC (5/482)
Updated embedding for profile: AEY Group (6/482)
Updated embedding for profile: AKA Management Consultancy (7/482)
Updated embedding for profile: A & H Consultants LLC (8/482)
Updated embedding for profile: AMA Accounting (9/482)
Updated embedding for profile: AAA Associates Immigration Services (10/482)
Updated embedding for profile: A.V. Immigration & Careers Consultancy Pvt. Ltd. (Apex Visas) (11/482)
Updated embedding for profile: ACDORA DMCC (12/482)
Updated embedding for profile: Abu Dhabi Global Market (ADGM) (13/482)
Updated embedding for profile: AKM Global (14/482)
Updated embedding for profile: AKT A

## 6. Verify Results

In [10]:
# Take a fresh snapshot of embedding coverage now that the bulk update has run.
profile_objects = ServiceProviderProfile.objects
total_profiles = profile_objects.count()
profiles_with_embeddings = profile_objects.filter(profile_embedding__isnull=False).count()
profiles_without_embeddings = profile_objects.filter(profile_embedding__isnull=True).count()

# Assemble the report first, then print it in one go.
report_lines = [
    f"📊 Updated Profile Statistics:",
    f"   Total profiles: {total_profiles}",
    f"   With embeddings: {profiles_with_embeddings} ✅",
    f"   Without embeddings: {profiles_without_embeddings}",
]
print("\n".join(report_lines))

# Final verdict: any profile still lacking an embedding is flagged for follow-up.
if profiles_without_embeddings != 0:
    print(f"\n⚠️ {profiles_without_embeddings} profiles still need embeddings.")
    print("   This might be due to errors during generation. Check the output above for error messages.")
else:
    print("\n🎉 Success! All profiles now have embeddings.")

📊 Updated Profile Statistics:
   Total profiles: 482
   With embeddings: 422 ✅
   Without embeddings: 60

⚠️ 60 profiles still need embeddings.
   This might be due to errors during generation. Check the output above for error messages.


## 7. Test Search Functionality (Optional)

Test that the embeddings work correctly for similarity search.

In [None]:
from accounts.embedding_utils import search_profiles_by_text

# Smoke-test the similarity search with a fixed sample query.
test_query = "software development and web design services"
print(f"🔍 Testing search with query: '{test_query}'\n")

# Ask for at most five matches.
results = search_profiles_by_text(test_query, limit=5)

if not results:
    print("No results found. Make sure embeddings have been generated.")
else:
    print(f"Found {len(results)} similar profiles:\n")
    for rank, profile in enumerate(results, start=1):
        # Results may or may not carry a `similarity` attribute; fall back to 'N/A'.
        similarity = getattr(profile, 'similarity', 'N/A')
        # One print per result; the trailing "" yields the blank separator line.
        print(
            f"{rank}. {profile.name}",
            f"   Type: {profile.provider_type}",
            f"   Country: {profile.country or 'Not specified'}",
            f"   Similarity distance: {similarity}",
            "",
            sep="\n",
        )

## 8. Update Single Profile (If Needed)

Use this cell to update the embedding for a specific profile.

In [None]:
# Optional template: regenerate the embedding of one profile, looked up by exact name.
# Everything below is commented out, so running this cell as-is does nothing.
# NOTE(review): the lookup uses .get(name=...), which presumably expects profile
# names to be unique — confirm against the model before relying on it.
#
# Uncomment and modify to update a specific profile
# profile_name = "Your Company Name"
# 
# try:
#     profile = ServiceProviderProfile.objects.get(name=profile_name)
#     generator = EmbeddingGenerator()
#     embedding = generator.update_profile_embedding(profile)
#     print(f"✅ Updated embedding for: {profile.name}")
#     print(f"   Embedding dimensions: {len(embedding)}")
# except ServiceProviderProfile.DoesNotExist:
#     print(f"❌ Profile with name '{profile_name}' not found")
# except Exception as e:
#     print(f"❌ Error: {e}")

## Notes

- **API Rate Limits**: OpenAI has rate limits. The bulk update function processes profiles in batches to avoid hitting these limits.
- **Cost**: Each embedding generation costs tokens. Monitor your OpenAI usage.
- **Text Length**: Very long profile texts (>30,000 chars) are automatically truncated to fit OpenAI's token limits.
- **Error Handling**: The bulk update function will continue even if some profiles fail, printing error messages for debugging.