## üì¶ Setup

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from dotenv import load_dotenv

# Add project root to path
sys.path.insert(0, os.path.abspath('..'))

from nlp.embeddings import EmbeddingGenerator

load_dotenv()

# Configuration
PROJECT_ID = os.getenv('GCP_PROJECT_ID', 'sg-job-market')
DATASET_ID = os.getenv('BQ_DATASET_ID', 'sg_job_market')

print("‚úÖ Imports successful")
print(f"Project: {PROJECT_ID}")
print(f"Dataset: {DATASET_ID}")

‚úÖ Imports successful
Project: sg-job-market
Dataset: sg_job_market


## üìä Load Embeddings from BigQuery

In [3]:
client = bigquery.Client(project=PROJECT_ID)

query = f"""
SELECT 
    e.job_id,
    e.source,
    e.embedding,
    e.model_name,
    e.created_at,
    c.job_title,
    c.company_name,
    c.job_location,
    c.job_classification
FROM `{PROJECT_ID}.{DATASET_ID}.job_embeddings` e
JOIN `{PROJECT_ID}.{DATASET_ID}.cleaned_jobs` c
    ON e.job_id = c.job_id AND e.source = c.source
LIMIT 1000
"""

print("Loading embeddings from BigQuery...")
df = client.query(query).to_dataframe()

print(f"\n‚úÖ Loaded {len(df):,} jobs with embeddings")
print(f"Embedding dimension: {len(df['embedding'].iloc[0])}")
print(f"\nSample data:")
df[['job_title', 'company_name', 'job_classification']].head()

Loading embeddings from BigQuery...





‚úÖ Loaded 1,000 jobs with embeddings
Embedding dimension: 384

Sample data:


Unnamed: 0,job_title,company_name,job_classification
0,Legal Counsel (AI Technology),STAFFKING .,Legal
1,APAC Legal Manager,UPS ASIA GROUP .,Legal
2,Process Engineer,DR LASER SINGAPORE .,Precision Engineering
3,"Senior Associate, Real Estate",WEE HUR CAPITAL .,Real Estate / Property Management
4,Event Executive/Planner (Event Company / 5D / ...,NALA EMPLOYMENT .,Entertainment


## üîç Test Similarity Search

Find jobs similar to a query using cosine similarity.

In [8]:
# Generate embedding for query
generator = EmbeddingGenerator()
query_text = "Senior Data Scientist with Python and machine learning experience"

print(f"Query: {query_text}")
print("\nGenerating query embedding...")
query_embedding = generator.embed_texts([query_text])

print(f"> Embedded matrix: {query_embedding}")
print(f"\n> Query embedding shape: {query_embedding.shape}")

# Since we only have one query, we take the first row
query_embedding = query_embedding[0]
print(f"\n> Query embedding vector shape: {query_embedding.shape}")

Query: Senior Data Scientist with Python and machine learning experience

Generating query embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 104.87it/s]

> Embedded matrix: [[-8.14497843e-02  2.61427164e-02  6.16718084e-02  6.67643771e-02
  -7.61197461e-03 -1.44538581e-01 -4.00911644e-02 -3.23459655e-02
  -1.56884685e-01 -4.77438755e-02 -7.87186027e-02 -2.34097075e-02
   4.63907644e-02 -1.95023045e-02  3.21459547e-02  1.08191848e-01
  -1.01614125e-01 -2.46658474e-02  4.78268042e-03 -1.20968275e-01
  -4.70162444e-02 -7.23217847e-03 -1.41038913e-02 -4.45014313e-02
   6.55130595e-02 -3.91232856e-02  3.53042148e-02  6.23796768e-02
  -2.10496597e-02  3.39972600e-02 -2.86967829e-02 -2.79243384e-02
  -1.50212394e-02  5.38915880e-02  1.80311850e-03  4.46798392e-02
   3.91765647e-02 -1.21125979e-02 -1.43404044e-02  5.75038269e-02
  -1.42612681e-02  4.15800698e-03 -2.92154448e-03 -3.96813937e-02
  -1.01660015e-02 -5.27241128e-03  1.32119376e-02 -1.17309138e-01
   4.90341075e-02 -3.39578018e-02 -2.83774007e-02 -3.30944881e-02
  -4.23823996e-03 -9.11882073e-02 -4.46428433e-02 -1.81986819e-04
   7.47091696e-02 -3.39747220e-03  3.03726504e-03 -1.1072




In [14]:
print("> Original embeddings from df (taking only the first 2 elements):")
print(df['embedding'].values[:2])
print(f"\n> Number of embeddings: {len(df['embedding'].values)}")

# We do a vertical stack to convert list of arrays into 2D array
embeddings_matrix = np.vstack(df['embedding'].values)

print ("\n> Embedded matrix from df:")
print(embeddings_matrix)
print(f"\n> Embeddings matrix shape: {embeddings_matrix.shape}")

> Original embeddings from df (taking only the first 2 elements):
[array([-4.98189405e-02, -8.26002844e-03, -1.19957756e-02, -7.47045502e-02,
        -2.42943838e-02,  4.51588146e-02, -2.37924866e-02,  2.65324414e-02,
         7.46911094e-02, -1.71521213e-04, -9.39833820e-02,  2.76090913e-02,
        -6.56824410e-02,  7.22736493e-02,  8.15652907e-02,  6.16131686e-02,
         1.46569237e-02, -4.28696796e-02, -2.10201032e-02, -2.84688156e-02,
        -2.85437256e-02,  4.25332561e-02, -4.87634726e-02, -3.78681906e-02,
        -6.03517853e-02, -5.73646314e-02,  2.95113921e-02, -1.04266353e-01,
        -9.63155180e-03, -8.36342424e-02,  3.69902328e-03, -6.32960489e-03,
         8.28375816e-02,  3.20356786e-02,  3.16617899e-02,  5.21520246e-03,
         1.30629074e-02, -2.21475605e-02,  2.71570906e-02, -1.21300546e-02,
         2.30636299e-02,  3.16173807e-02, -5.63575551e-02, -8.73859692e-03,
         2.47015562e-02, -2.18418427e-02,  3.24706808e-02,  8.60431325e-03,
        -1.11844903e-0

In [25]:
# Calculate cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity([query_embedding], embeddings_matrix)

print(f"> Similarities: {similarities}")
print(f"\n> Similarities shape: {similarities.shape}")

# Since we have only one query, take the first row
similarities = similarities[0]
print(f"\n> Similarity scores vector shape: {similarities.shape}")

# Add similarity scores to dataframe
df['similarity'] = similarities

# See similarity column
df[['job_title', 'company_name', 'similarity']].head()

# Get top 10 similar jobs
# Note: Similarity is based on embeddings generated from job_title + job_description,
# but we show other fields (company, classification, location) for context
top_jobs_all = df.nlargest(10, 'similarity')

top_jobs = top_jobs_all[['job_title', 'company_name', 'job_classification', 'job_location', 'similarity']]

print(f"\nüéØ Top 10 Similar Jobs:\n")
print(f"{'Score':<6}| {'Job Title':<50} | {'Company':<30} | {'Location':<30} | {'Category':<30}")
print("-" * 150)
for idx, row in top_jobs.iterrows():
    print(f"{row['similarity']:.3f} | {row['job_title'][:50]:50} | {row['company_name'][:30]:30} | {row['job_location'][:30]:30} | {row['job_classification'][:30]:30}")

> Similarities: [[0.14289598 0.12907358 0.26456749 0.25516181 0.22652122 0.13514232
  0.15129247 0.15765203 0.17922829 0.24934899 0.24627376 0.33266054
  0.22969097 0.12263706 0.22178008 0.31607992 0.23396333 0.16545512
  0.15321566 0.18213985 0.2177883  0.18104303 0.11164427 0.14597535
  0.16192417 0.21179878 0.27630021 0.16252996 0.23146529 0.22055401
  0.18871398 0.18871398 0.28930348 0.24525706 0.17235134 0.18291511
  0.15639377 0.21078868 0.29931567 0.15714591 0.27347454 0.28817914
  0.30622114 0.17392272 0.26955963 0.31568726 0.24329909 0.18274226
  0.19629595 0.1971802  0.29055571 0.18549929 0.18821278 0.34740999
  0.28817914 0.28999597 0.19683777 0.17938461 0.23936083 0.23028879
  0.22099174 0.30571012 0.24515056 0.26179318 0.25261338 0.21759889
  0.23936083 0.30031046 0.21341658 0.28325707 0.21953545 0.14898081
  0.24550145 0.22370703 0.29591555 0.17869746 0.26874839 0.31634489
  0.21222317 0.14826318 0.26051972 0.28563528 0.23199854 0.39469307
  0.11907139 0.13165333 0.195683

## üìà Embedding Quality Checks

In [None]:
# Check embedding statistics
embeddings_matrix = np.vstack(df['embedding'].values)

print("üìä Embedding Statistics:\n")
print(f"Shape: {embeddings_matrix.shape}")
print(f"Value range: [{embeddings_matrix.min():.3f}, {embeddings_matrix.max():.3f}]")
print(f"Mean: {embeddings_matrix.mean():.3f}")
print(f"Std: {embeddings_matrix.std():.3f}")

# Check normalization (SBERT embeddings should have norm ‚âà 1)
norms = np.linalg.norm(embeddings_matrix, axis=1)
print(f"\nüìè Vector Norms:")
print(f"Mean: {norms.mean():.3f}")
print(f"Std: {norms.std():.3f}")
print(f"Min: {norms.min():.3f}")
print(f"Max: {norms.max():.3f}")

if abs(norms.mean() - 1.0) < 0.01:
    print("\n‚úÖ Embeddings are properly normalized (unit vectors)")
else:
    print("\n‚ö†Ô∏è Warning: Embeddings may not be normalized")

# Plot norm distribution
plt.figure(figsize=(10, 4))
plt.hist(norms, bins=50, edgecolor='black')
plt.axvline(1.0, color='red', linestyle='--', label='Expected norm = 1.0')
plt.xlabel('Vector Norm')
plt.ylabel('Frequency')
plt.title('Distribution of Embedding Vector Norms')
plt.legend()
plt.show()

## üé® Visualize Embeddings with PCA

In [None]:
# Reduce to 2D using PCA
print("Reducing 384 dimensions to 2D with PCA...")
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings_matrix)

df['pca_1'] = embeddings_2d[:, 0]
df['pca_2'] = embeddings_2d[:, 1]

print(f"‚úÖ Explained variance: {pca.explained_variance_ratio_.sum():.2%}")

# Plot by job classification
plt.figure(figsize=(12, 8))
for classification in df['job_classification'].dropna().unique()[:10]:
    subset = df[df['job_classification'] == classification]
    plt.scatter(subset['pca_1'], subset['pca_2'], 
                label=classification, alpha=0.6, s=30)

plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('Job Embeddings Visualization (PCA)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

## üéØ Analyze Similarity by Category

In [None]:
# Group by job classification and check intra-cluster similarity
from sklearn.metrics.pairwise import cosine_similarity

def calculate_intra_cluster_similarity(df, classification):
    """Calculate average similarity within a job classification."""
    subset = df[df['job_classification'] == classification]
    if len(subset) < 2:
        return np.nan
    
    embeddings = np.vstack(subset['embedding'].values)
    sim_matrix = cosine_similarity(embeddings)
    
    # Get upper triangle (exclude diagonal)
    mask = np.triu(np.ones_like(sim_matrix, dtype=bool), k=1)
    return sim_matrix[mask].mean()

# Calculate for top categories
top_categories = df['job_classification'].value_counts().head(10).index
similarity_scores = []

for cat in top_categories:
    score = calculate_intra_cluster_similarity(df, cat)
    similarity_scores.append({
        'category': cat,
        'count': len(df[df['job_classification'] == cat]),
        'avg_similarity': score
    })

sim_df = pd.DataFrame(similarity_scores).sort_values('avg_similarity', ascending=False)

print("\nüìä Intra-Category Similarity (Higher = More Cohesive):\n")
print(sim_df.to_string(index=False))

# Plot
plt.figure(figsize=(10, 6))
plt.barh(sim_df['category'], sim_df['avg_similarity'])
plt.xlabel('Average Cosine Similarity')
plt.title('Embedding Cohesiveness by Job Category')
plt.tight_layout()
plt.show()

## üíæ Summary Statistics

In [None]:
print("\n" + "="*70)
print("üìã EMBEDDING PIPELINE SUMMARY")
print("="*70)

print(f"\n‚úÖ Total Jobs with Embeddings: {len(df):,}")
print(f"‚úÖ Embedding Dimension: {len(df['embedding'].iloc[0])}")
print(f"‚úÖ Model: {df['model_name'].iloc[0]}")
print(f"‚úÖ Sources: {df['source'].unique().tolist()}")
print(f"\nüìä Job Categories: {df['job_classification'].nunique()}")
print(f"üìä Companies: {df['company_name'].nunique()}")
print(f"üìä Locations: {df['job_location'].nunique()}")

print(f"\nüéØ Quality Metrics:")
print(f"  Vector norm: {norms.mean():.3f} ¬± {norms.std():.3f} (expected: 1.0)")
print(f"  Value range: [{embeddings_matrix.min():.3f}, {embeddings_matrix.max():.3f}]")
print(f"  PCA variance (2D): {pca.explained_variance_ratio_.sum():.2%}")

print("\n" + "="*70)
print("‚úÖ EMBEDDINGS ARE PRODUCTION-READY!")
print("="*70)
print("\nNext steps:")
print("  1. Create vector index: python -m nlp.create_vector_index")
print("  2. Train ML models: python -m ml.train")
print("  3. Build RAG pipeline: python -m genai.rag")