# Clean ServiceProviderProfile Embeddings

This notebook removes/cleans the `profile_embedding` field contents for all ServiceProviderProfile records in the database.

In [1]:
import os
import sys
import django
from pathlib import Path

# Setup Django path and settings
notebook_dir = Path.cwd()
django_path = notebook_dir / 'growbal_django'
sys.path.insert(0, str(django_path))

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "growbal.settings")
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

django.setup()

print("✅ Django environment setup complete")

✅ Django environment setup complete


In [2]:
# Test database connection
from django.db import connection

try:
    with connection.cursor() as cursor:
        cursor.execute("SELECT 1")
        result = cursor.fetchone()
        print("✅ Database connection successful")
        
        # Check if we can access the ServiceProviderProfile table
        cursor.execute("""
            SELECT COUNT(*) 
            FROM accounts_serviceproviderprofile 
            WHERE profile_embedding IS NOT NULL
        """)
        count = cursor.fetchone()[0]
        print(f"✅ Found {count} profiles with embeddings in database")
        
except Exception as e:
    print(f"❌ Database connection failed: {e}")
    raise

✅ Database connection successful
✅ Found 0 profiles with embeddings in database


In [3]:
# Import the ServiceProviderProfile model
from accounts.models import ServiceProviderProfile

# Get total count of profiles
total_profiles = ServiceProviderProfile.objects.count()
print(f"Total ServiceProviderProfile records: {total_profiles}")

# Count profiles with embeddings
profiles_with_embeddings = ServiceProviderProfile.objects.exclude(profile_embedding__isnull=True).count()
print(f"Profiles with embeddings: {profiles_with_embeddings}")

Total ServiceProviderProfile records: 482
Profiles with embeddings: 482


In [4]:
# Preview profiles with embeddings (show first 5)
profiles_to_clean = ServiceProviderProfile.objects.exclude(profile_embedding__isnull=True)[:5]

print("Preview of profiles with embeddings (first 5):")
print("-" * 50)
for profile in profiles_to_clean:
    print(f"ID: {profile.id}")
    print(f"Name: {profile.name}")
    print(f"Provider Type: {profile.provider_type}")
    print(f"Country: {profile.country}")
    print(f"Has embedding: {profile.profile_embedding is not None}")
    if profile.profile_embedding is not None:
        print(f"Embedding dimensions: {len(profile.profile_embedding) if hasattr(profile.profile_embedding, '__len__') else 'N/A'}")
    print("-" * 50)

Preview of profiles with embeddings (first 5):
--------------------------------------------------
ID: 37
Name: 360 Chartered Accountants
Provider Type: Company
Country: UK
Has embedding: True
Embedding dimensions: 1536
--------------------------------------------------
ID: 39
Name: 3E Accounting India
Provider Type: Company
Country: India
Has embedding: True
Embedding dimensions: 1536
--------------------------------------------------
ID: 40
Name: A&A Associate LLC
Provider Type: Company
Country: UAE
Has embedding: True
Embedding dimensions: 1536
--------------------------------------------------
ID: 41
Name: AEY Group
Provider Type: Company
Country: UAE
Has embedding: True
Embedding dimensions: 1536
--------------------------------------------------
ID: 42
Name: AKA Management Consultancy
Provider Type: Company
Country: UAE
Has embedding: True
Embedding dimensions: 1536
--------------------------------------------------


In [6]:
# Function to clean embeddings with progress tracking
def clean_embeddings(dry_run=True):
    """
    Clean profile_embedding field for all ServiceProviderProfile records.
    
    Args:
        dry_run (bool): If True, only shows what would be cleaned without making changes.
                       If False, actually performs the cleaning.
    """
    profiles_with_embeddings = ServiceProviderProfile.objects.exclude(profile_embedding__isnull=True)
    count = profiles_with_embeddings.count()
    
    if dry_run:
        print(f"DRY RUN: Would clean {count} profile embeddings")
        return count
    
    print(f"Starting to clean {count} profile embeddings...")
    
    # Update all profiles with embeddings to set them to None
    updated = profiles_with_embeddings.update(profile_embedding=None)
    
    print(f"Successfully cleaned {updated} profile embeddings")
    return updated

# First, do a dry run to see what would be cleaned
clean_embeddings(dry_run=True)

DRY RUN: Would clean 482 profile embeddings


482

In [7]:
# ACTUAL CLEANING - Remove embeddings
# WARNING: This will permanently delete all embedding data!
# 
# To perform the actual cleaning, uncomment ONE of the following lines:

# Option 1: Fast bulk cleaning (recommended for large datasets)
cleaned_count = clean_embeddings(dry_run=False)

# Option 2: Detailed progress tracking (shows progress every 10 profiles) 
# cleaned_count = clean_embeddings_with_progress(dry_run=False)

print("⚠️  Ready to clean embeddings. Uncomment one of the lines above to proceed.")

Starting to clean 482 profile embeddings...
Successfully cleaned 482 profile embeddings
⚠️  Ready to clean embeddings. Uncomment one of the lines above to proceed.


In [6]:
# Alternative: Clean embeddings one by one with progress tracking
def clean_embeddings_with_progress(dry_run=True):
    """
    Clean embeddings one by one with detailed progress tracking.
    
    Args:
        dry_run (bool): If True, only shows what would be cleaned without making changes.
    """
    profiles_with_embeddings = ServiceProviderProfile.objects.exclude(profile_embedding__isnull=True)
    total = profiles_with_embeddings.count()
    
    if total == 0:
        print("No profiles with embeddings found.")
        return 0
    
    print(f"{'DRY RUN: ' if dry_run else ''}Processing {total} profiles...")
    
    cleaned = 0
    failed = 0
    
    for i, profile in enumerate(profiles_with_embeddings, 1):
        try:
            if not dry_run:
                profile.profile_embedding = None
                profile.save(update_fields=['profile_embedding'])
            
            cleaned += 1
            
            # Print progress every 10 profiles or at the end
            if i % 10 == 0 or i == total:
                print(f"Progress: {i}/{total} ({(i/total)*100:.1f}%) - Cleaned: {cleaned}, Failed: {failed}")
                
        except Exception as e:
            failed += 1
            print(f"Failed to clean profile ID {profile.id}: {str(e)}")
    
    print(f"\n{'DRY RUN ' if dry_run else ''}Summary:")
    print(f"  Total profiles processed: {total}")
    print(f"  Successfully {'would be ' if dry_run else ''}cleaned: {cleaned}")
    print(f"  Failed: {failed}")
    
    return cleaned

# Dry run with progress tracking
clean_embeddings_with_progress(dry_run=True)

DRY RUN: Processing 482 profiles...
Progress: 10/482 (2.1%) - Cleaned: 10, Failed: 0
Progress: 20/482 (4.1%) - Cleaned: 20, Failed: 0
Progress: 30/482 (6.2%) - Cleaned: 30, Failed: 0
Progress: 40/482 (8.3%) - Cleaned: 40, Failed: 0
Progress: 50/482 (10.4%) - Cleaned: 50, Failed: 0
Progress: 60/482 (12.4%) - Cleaned: 60, Failed: 0
Progress: 70/482 (14.5%) - Cleaned: 70, Failed: 0
Progress: 80/482 (16.6%) - Cleaned: 80, Failed: 0
Progress: 90/482 (18.7%) - Cleaned: 90, Failed: 0
Progress: 100/482 (20.7%) - Cleaned: 100, Failed: 0
Progress: 110/482 (22.8%) - Cleaned: 110, Failed: 0
Progress: 120/482 (24.9%) - Cleaned: 120, Failed: 0
Progress: 130/482 (27.0%) - Cleaned: 130, Failed: 0
Progress: 140/482 (29.0%) - Cleaned: 140, Failed: 0
Progress: 150/482 (31.1%) - Cleaned: 150, Failed: 0
Progress: 160/482 (33.2%) - Cleaned: 160, Failed: 0
Progress: 170/482 (35.3%) - Cleaned: 170, Failed: 0
Progress: 180/482 (37.3%) - Cleaned: 180, Failed: 0
Progress: 190/482 (39.4%) - Cleaned: 190, Failed: 

482

In [None]:
# ACTUAL CLEANING with progress tracking
# WARNING: This will permanently delete all embedding data!
# Uncomment the line below to actually perform the cleaning

# cleaned_count = clean_embeddings_with_progress(dry_run=False)

In [7]:
# Verify cleaning results
def verify_cleaning():
    """
    Verify that embeddings have been cleaned.
    """
    total_profiles = ServiceProviderProfile.objects.count()
    profiles_with_embeddings = ServiceProviderProfile.objects.exclude(profile_embedding__isnull=True).count()
    profiles_without_embeddings = ServiceProviderProfile.objects.filter(profile_embedding__isnull=True).count()
    
    print("Verification Results:")
    print("=" * 50)
    print(f"Total profiles: {total_profiles}")
    print(f"Profiles WITH embeddings: {profiles_with_embeddings}")
    print(f"Profiles WITHOUT embeddings: {profiles_without_embeddings}")
    
    if profiles_with_embeddings == 0:
        print("\n✅ All embeddings have been successfully cleaned!")
    else:
        print(f"\n⚠️  {profiles_with_embeddings} profiles still have embeddings")
    
    return profiles_with_embeddings == 0

verify_cleaning()

Verification Results:
Total profiles: 482
Profiles WITH embeddings: 482
Profiles WITHOUT embeddings: 0

⚠️  482 profiles still have embeddings


False

In [None]:
# EXECUTE CLEANING - Run this cell to actually clean all embeddings
# This cell is ready to run and will clean all embeddings when executed

confirm = input("Are you sure you want to delete ALL embeddings? Type 'yes' to confirm: ")

if confirm.lower() == 'yes':
    print("\n🔄 Starting embedding cleanup...")
    cleaned_count = clean_embeddings(dry_run=False)
    print(f"\n✅ Complete! Cleaned {cleaned_count} embeddings.")
    
    # Verify the results
    print("\n" + "="*50)
    verify_cleaning()
else:
    print("❌ Cleaning cancelled. No changes were made.")

In [None]:
# Optional: Clean embeddings for specific profiles by ID
def clean_specific_profiles(profile_ids, dry_run=True):
    """
    Clean embeddings for specific profiles by their IDs.
    
    Args:
        profile_ids (list): List of profile IDs to clean
        dry_run (bool): If True, only shows what would be cleaned
    """
    profiles = ServiceProviderProfile.objects.filter(id__in=profile_ids)
    found_count = profiles.count()
    
    print(f"Found {found_count} profiles out of {len(profile_ids)} requested IDs")
    
    cleaned = 0
    for profile in profiles:
        if profile.profile_embedding is not None:
            print(f"{'Would clean' if dry_run else 'Cleaning'} embedding for profile ID {profile.id} ({profile.name})")
            if not dry_run:
                profile.profile_embedding = None
                profile.save(update_fields=['profile_embedding'])
            cleaned += 1
        else:
            print(f"Profile ID {profile.id} ({profile.name}) has no embedding to clean")
    
    print(f"\n{'Would clean' if dry_run else 'Cleaned'} {cleaned} embeddings")
    return cleaned

# Example usage (uncomment and modify IDs as needed):
# clean_specific_profiles([1, 2, 3], dry_run=True)