## 📦 Setup

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from dotenv import load_dotenv

# Put the absolute path of the parent of the current working directory at the
# front of sys.path so the `nlp` package imported below can be resolved.
project_root = os.path.abspath('..')
sys.path.insert(0, project_root)

from nlp.embeddings import EmbeddingGenerator

# Runs before the lookups below so any variables it loads are visible to them.
load_dotenv()

# BigQuery coordinates: environment values win, otherwise the defaults apply.
PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'sg-job-market')
DATASET_ID = os.environ.get('BQ_DATASET_ID', 'sg_job_market')

print(
    "‚úÖ Imports successful",
    f"Project: {PROJECT_ID}",
    f"Dataset: {DATASET_ID}",
    sep="\n",
)

## 📊 Load Embeddings from BigQuery

In [None]:
# BigQuery client bound to the configured project.
client = bigquery.Client(project=PROJECT_ID)

# Join every stored embedding with its cleaned job record on (job_id, source).
# NOTE: LIMIT is applied without ORDER BY, so which 1000 rows come back is not
# guaranteed to be the same from run to run.
query = f"""
SELECT 
    e.job_id,
    e.source,
    e.embedding,
    e.model_name,
    e.created_at,
    c.job_title,
    c.company_name,
    c.job_location,
    c.job_classification,
    c.job_salary_mid_sgd_monthly
FROM `{PROJECT_ID}.{DATASET_ID}.job_embeddings` e
JOIN `{PROJECT_ID}.{DATASET_ID}.cleaned_jobs` c
    ON e.job_id = c.job_id AND e.source = c.source
LIMIT 1000
"""

print("Loading embeddings from BigQuery...")
df = client.query(query).to_dataframe()

# Fail fast with an actionable message: an empty result would otherwise
# surface as an opaque IndexError from `.iloc[0]` just below.
if df.empty:
    raise RuntimeError(
        f"No rows returned when joining {PROJECT_ID}.{DATASET_ID}.job_embeddings "
        f"with {PROJECT_ID}.{DATASET_ID}.cleaned_jobs; nothing to validate."
    )

print(f"\n‚úÖ Loaded {len(df):,} jobs with embeddings")
# Dimension is read from the first row only.
print(f"Embedding dimension: {len(df['embedding'].iloc[0])}")
print(f"\nSample data:")
df[['job_title', 'company_name', 'job_classification']].head()

## 🔍 Test Similarity Search

Find jobs similar to a query using cosine similarity.

In [None]:
# Embed one free-text query with the project's EmbeddingGenerator.
query_text = "Senior Data Scientist with Python and machine learning experience"
generator = EmbeddingGenerator()

print(f"Query: {query_text}", "\nGenerating query embedding...", sep="\n")

# embed_texts takes a list of texts; keep the single result for our one query.
batch_embeddings = generator.embed_texts([query_text])
query_embedding = batch_embeddings[0]

print(f"‚úÖ Query embedding shape: {query_embedding.shape}")

In [None]:
# Rank the loaded jobs by cosine similarity to the query embedding.
from sklearn.metrics.pairwise import cosine_similarity

# Stack the per-row embeddings into one 2-D matrix (one row per job).
# assumes every df['embedding'] entry has the same length — TODO confirm
embeddings_matrix = np.vstack(df['embedding'].values)
similarities = cosine_similarity([query_embedding], embeddings_matrix)[0]

# Add similarity scores to dataframe
df['similarity'] = similarities

# Get top 10 similar jobs
top_jobs = df.nlargest(10, 'similarity')[[
    'job_title', 'company_name', 'job_classification',
    'job_location', 'job_salary_mid_sgd_monthly', 'similarity'
]]

print(f"\nüéØ Top 10 Similar Jobs:\n")
for _, row in top_jobs.iterrows():
    # Title, company and salary may be NULL in the warehouse; slicing None or
    # formatting a missing value with `.0f` would raise, so substitute
    # placeholders instead of crashing mid-listing.
    title = str(row['job_title']) if pd.notna(row['job_title']) else ''
    company = str(row['company_name']) if pd.notna(row['company_name']) else ''
    salary = row['job_salary_mid_sgd_monthly']
    salary_text = f"${salary:.0f}" if pd.notna(salary) else "$n/a"
    print(f"{row['similarity']:.3f} | {title[:50]:50} | {company[:30]:30} | {salary_text}")

top_jobs

## 📈 Embedding Quality Checks

In [None]:
# Check embedding statistics
# Stack the per-row embeddings into one 2-D matrix (one row per job).
embeddings_matrix = np.vstack(df['embedding'].values)

print("üìä Embedding Statistics:\n")
print(f"Shape: {embeddings_matrix.shape}")
print(f"Value range: [{embeddings_matrix.min():.3f}, {embeddings_matrix.max():.3f}]")
print(f"Mean: {embeddings_matrix.mean():.3f}")
print(f"Std: {embeddings_matrix.std():.3f}")

# Check normalization (SBERT embeddings should have norm approximately 1)
norms = np.linalg.norm(embeddings_matrix, axis=1)
print(f"\nüìè Vector Norms:")
print(f"Mean: {norms.mean():.3f}")
print(f"Std: {norms.std():.3f}")
print(f"Min: {norms.min():.3f}")
print(f"Max: {norms.max():.3f}")

# Require EVERY vector to be within tolerance of unit length. Testing only the
# mean would report success for norms spread widely around 1.0 (e.g. half at
# 0.5 and half at 1.5).
norm_tolerance = 0.01
all_unit_length = bool(np.all(np.abs(norms - 1.0) < norm_tolerance))

if all_unit_length:
    print("\n‚úÖ Embeddings are properly normalized (unit vectors)")
else:
    print("\n‚ö†Ô∏è Warning: Embeddings may not be normalized")

# Plot norm distribution
plt.figure(figsize=(10, 4))
plt.hist(norms, bins=50, edgecolor='black')
plt.axvline(1.0, color='red', linestyle='--', label='Expected norm = 1.0')
plt.xlabel('Vector Norm')
plt.ylabel('Frequency')
plt.title('Distribution of Embedding Vector Norms')
plt.legend()
plt.show()

## 🎨 Visualize Embeddings with PCA

In [None]:
# Reduce to 2D using PCA
# Report the real input width instead of a hard-coded figure, so the message
# stays correct if the embedding model (and its dimension) changes.
n_dims = embeddings_matrix.shape[1]
print(f"Reducing {n_dims} dimensions to 2D with PCA...")
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings_matrix)

# Keep the projected coordinates next to the job metadata for plotting.
df['pca_1'] = embeddings_2d[:, 0]
df['pca_2'] = embeddings_2d[:, 1]

print(f"‚úÖ Explained variance: {pca.explained_variance_ratio_.sum():.2%}")

# Plot by job classification.
# Use the 10 most frequent classifications (value_counts skips missing values
# and sorts by count) rather than whichever 10 happen to appear first.
plotted_classes = df['job_classification'].value_counts().head(10).index

plt.figure(figsize=(12, 8))
for classification in plotted_classes:
    subset = df[df['job_classification'] == classification]
    plt.scatter(subset['pca_1'], subset['pca_2'],
                label=classification, alpha=0.6, s=30)

plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('Job Embeddings Visualization (PCA)')
# Legend sits outside the axes so it does not cover the points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

## 🎯 Analyze Similarity by Category

In [None]:
# Group by job classification and check intra-cluster similarity
from sklearn.metrics.pairwise import cosine_similarity

def calculate_intra_cluster_similarity(df, classification):
    """Return the mean pairwise cosine similarity for one job classification.

    Rows of ``df`` whose ``job_classification`` equals ``classification`` are
    selected and their ``embedding`` values stacked into a matrix. The result
    is the mean of the strictly-upper-triangular entries of the cosine
    similarity matrix, i.e. each unordered pair counted once and
    self-similarity excluded. Returns ``np.nan`` when fewer than two rows
    match, since no pair exists.
    """
    members = df[df['job_classification'] == classification]
    if len(members) < 2:
        return np.nan

    vectors = np.vstack(members['embedding'].values)
    pairwise = cosine_similarity(vectors)

    # Indices strictly above the diagonal: one entry per distinct pair.
    upper_rows, upper_cols = np.triu_indices_from(pairwise, k=1)
    return pairwise[upper_rows, upper_cols].mean()

# Score cohesion for the ten most frequent job classifications.
top_categories = df['job_classification'].value_counts().head(10).index

# One record per category: its name, row count, and mean intra-category similarity.
similarity_scores = [
    {
        'category': cat,
        'count': len(df[df['job_classification'] == cat]),
        'avg_similarity': calculate_intra_cluster_similarity(df, cat),
    }
    for cat in top_categories
]

# Most cohesive categories first.
sim_df = pd.DataFrame(similarity_scores)
sim_df = sim_df.sort_values(by='avg_similarity', ascending=False)

print("\nüìä Intra-Category Similarity (Higher = More Cohesive):\n")
print(sim_df.to_string(index=False))

# Horizontal bar chart of the per-category averages.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(sim_df['category'], sim_df['avg_similarity'])
ax.set_xlabel('Average Cosine Similarity')
ax.set_title('Embedding Cohesiveness by Job Category')
fig.tight_layout()
plt.show()

## 💾 Summary Statistics

In [None]:
# Final report. Relies on `df`, `norms`, `embeddings_matrix` and `pca` having
# been created by the earlier cells of this notebook.
banner = "=" * 70

print("\n" + banner)
print("üìã EMBEDDING PIPELINE SUMMARY")
print(banner)

print(f"\n‚úÖ Total Jobs with Embeddings: {len(df):,}")
# Dimension and model name are read from the first row only.
print(f"‚úÖ Embedding Dimension: {len(df['embedding'].iloc[0])}")
print(f"‚úÖ Model: {df['model_name'].iloc[0]}")
print(f"‚úÖ Sources: {df['source'].unique().tolist()}")
print(f"\nüìä Job Categories: {df['job_classification'].nunique()}")
print(f"üìä Companies: {df['company_name'].nunique()}")
print(f"üìä Locations: {df['job_location'].nunique()}")

print(f"\nüéØ Quality Metrics:")
print(f"  Vector norm: {norms.mean():.3f} ¬± {norms.std():.3f} (expected: 1.0)")
print(f"  Value range: [{embeddings_matrix.min():.3f}, {embeddings_matrix.max():.3f}]")
print(f"  PCA variance (2D): {pca.explained_variance_ratio_.sum():.2%}")

# Only declare success when the measured norms support it: every vector must
# be within 0.01 of unit length (the expected norm reported above). Printing
# the success banner unconditionally would contradict a failed check.
embeddings_normalized = bool(np.all(np.abs(norms - 1.0) < 0.01))

print("\n" + banner)
if embeddings_normalized:
    print("‚úÖ EMBEDDINGS ARE PRODUCTION-READY!")
else:
    print("WARNING: EMBEDDINGS ARE NOT UNIT-NORMALIZED - REVIEW BEFORE PRODUCTION USE")
print(banner)
print("\nNext steps:")
print("  1. Create vector index: python -m nlp.create_vector_index")
print("  2. Train ML models: python -m ml.train")
print("  3. Build RAG pipeline: python -m genai.rag")