# Create Synthetic Dataset

In [1]:
!pip install faker

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [2]:
import json
import random
from faker import Faker
from datetime import datetime

# Initialize Faker for realistic data
fake = Faker()

# Define possible skills
SKILLS_POOL = [
    "React", "JavaScript", "Node.js", "Python", "Django", "Flask",
    "Graphic Design", "Illustration", "UI/UX Design", "Data Visualization",
    "TypeScript", "Tailwind CSS", "Animation", "Machine Learning", "SQL"
]

# Define possible tones for portfolio and reviews
TONES = ["creative", "professional", "technical"]

# Generate 5 client projects
def generate_clients(num=5):
    clients = []
    for i in range(num):
        num_skills = random.randint(2, 4)
        skills = random.sample(SKILLS_POOL, num_skills)
        tone = random.choice(TONES)
        project_description = (
            f"{fake.sentence(nb_words=6, variable_nb_words=True)} "
            f"requiring {', '.join(skills)} for a {tone} project"
        ).capitalize()
        clients.append({
            "client_id": i + 1,
            "project_description": project_description,
            "skills_required": skills,
            "budget": round(random.uniform(300, 2000), 2),
            "timeline": f"{random.randint(5, 30)} days",
            "created_at": datetime(2025, 4, 20, random.randint(8, 18), random.randint(0, 59)).isoformat()
        })
    return clients

# Generate 10 freelancers
def generate_freelancers(num=10):
    freelancers = []
    for i in range(num):
        num_skills = random.randint(3, 5)
        skills = random.sample(SKILLS_POOL, num_skills)
        tone = random.choice(TONES)
        num_portfolio_items = random.randint(2, 3)
        portfolio_text = [
            f"{fake.sentence(nb_words=5, variable_nb_words=True)} {skill.lower()} {tone} project".capitalize()
            for skill in random.sample(skills, num_portfolio_items)
        ]
        num_experiences = random.randint(1, 3)
        experience = [
            {
                "duration": f"{random.randint(1, 7)} years",
                "experience_description": f"{fake.sentence(nb_words=6, variable_nb_words=True)} {skill.lower()}".capitalize()
            }
            for skill in random.sample(skills, num_experiences)
        ]
        freelancers.append({
            "freelancer_id": i + 1,
            "skills": skills,
            "experience": experience,
            "portfolio_text": portfolio_text,
            "availability": random.choice([True, False]),
            "avg_rating": round(random.uniform(3.5, 5.0), 2) if random.random() > 0.2 else 0.0,
            "rate": round(random.uniform(20, 100), 2),
            "created_at": datetime(2025, 4, random.randint(1, 19), random.randint(8, 18)).isoformat(),
            "updated_at": datetime(2025, 4, 20, random.randint(8, 18)).isoformat()
        })
    return freelancers

# Generate 3 reviews
def generate_reviews(num=3, freelancer_ids=range(1, 11)):
    reviews = []
    selected_freelancers = random.sample(list(freelancer_ids), num)
    for i, freelancer_id in enumerate(selected_freelancers, 1):
        tone = random.choice(TONES)
        reviews.append({
            "review_id": i,
            "freelancer_id": freelancer_id,
            "review_text": f"{fake.sentence(nb_words=6, variable_nb_words=True)} {tone} work".capitalize(),
            "rating": round(random.uniform(4.0, 5.0), 2),
            "created_at": datetime(2025, 4, random.randint(10, 19), random.randint(8, 18)).isoformat()
        })
    return reviews

# Generate data
clients = generate_clients(5)
freelancers = generate_freelancers(10)
reviews = generate_reviews(3)

# Combine data
data = {
    "clients": clients,
    "freelancers": freelancers,
    "reviews": reviews
}

# Save to JSON file
with open("synthetic_data.json", "w") as f:
    json.dump(data, f, indent=2)

print("Synthetic data generated and saved to 'synthetic_data.json'.")

Synthetic data generated and saved to 'synthetic_data.json'.


# Load the Dataset

In [3]:
import json
from typing import Tuple, List, Dict, Any

def load_synthetic_data(file_path: str = "synthetic_data.json") -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Load synthetic data from a JSON file and return clients, freelancers, and reviews as separate lists.
    
    Args:
        file_path (str): Path to the JSON file (default: 'synthetic_data.json').
        
    Returns:
        Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
            - List of client dictionaries.
            - List of freelancer dictionaries.
            - List of review dictionaries.
            
    Raises:
        FileNotFoundError: If the JSON file does not exist.
        json.JSONDecodeError: If the JSON file is invalid.
        KeyError: If expected keys ('clients', 'freelancers', 'reviews') are missing.
    """
    try:
        # Read the JSON file
        with open(file_path, "r") as f:
            data = json.load(f)
        
        # Extract clients, freelancers, and reviews
        clients = data["clients"]
        freelancers = data["freelancers"]
        reviews = data["reviews"]
        
        # Validate data
        if not all([clients, freelancers, reviews]):
            raise KeyError("JSON file missing required keys: 'clients', 'freelancers', or 'reviews'")
        
        print(f"Loaded {len(clients)} clients, {len(freelancers)} freelancers, and {len(reviews)} reviews from {file_path}")
        return clients, freelancers, reviews
    
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found")
        raise
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format in '{file_path}': {str(e)}")
        raise
    except KeyError as e:
        print(f"Error: Missing required keys in JSON data: {str(e)}")
        raise

# Example usage
if __name__ == "__main__":
    try:
        # Load data into variables
        clients_data, freelancers_data, reviews_data = load_synthetic_data()
        
        # Print sample data for verification
        print("\nSample Client:")
        print(json.dumps(clients_data[0], indent=2))
        
        print("\nSample Freelancer:")
        print(json.dumps(freelancers_data[0], indent=2))
        
        print("\nSample Review:")
        print(json.dumps(reviews_data[0], indent=2))
        
    except Exception as e:
        print(f"Failed to load data: {str(e)}")

Loaded 5 clients, 10 freelancers, and 3 reviews from synthetic_data.json

Sample Client:
{
  "client_id": 1,
  "project_description": "My subject media without. requiring animation, django, python, data visualization for a professional project",
  "skills_required": [
    "Animation",
    "Django",
    "Python",
    "Data Visualization"
  ],
  "budget": 1865.14,
  "timeline": "5 days",
  "created_at": "2025-04-20T10:35:00"
}

Sample Freelancer:
{
  "freelancer_id": 1,
  "skills": [
    "Django",
    "Node.js",
    "Data Visualization",
    "Tailwind CSS",
    "TypeScript"
  ],
  "experience": [
    {
      "duration": "1 years",
      "experience_description": "Blood dream because seek several fish something. node.js"
    }
  ],
  "portfolio_text": [
    "Red drug modern. node.js technical project",
    "Order meet easy. tailwind css technical project",
    "Base smile role. data visualization technical project"
  ],
  "availability": true,
  "avg_rating": 4.84,
  "rate": 42.31,
  "cre

# Data Preprocessing

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from typing import List, Dict, Any, Union

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

def preprocess_text(text: str, remove_stopwords: bool = False, lemmatize: bool = True) -> str:
    """
    Preprocess a single text string by cleaning and normalizing it.
    
    Args:
        text (str): Input text to preprocess.
        remove_stopwords (bool): If True, remove stopwords (default: False to preserve tone words).
        lemmatize (bool): If True, lemmatize words (default: True).
        
    Returns:
        str: Cleaned and normalized text.
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and extra spaces
    text = re.sub(r'[^a-z\s]', ' ', text)  # Keep letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords (optional)
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize (optional)
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into a string
    return ' '.join(tokens)

def preprocess_synthetic_data(clients: List[Dict[str, Any]], 
                            freelancers: List[Dict[str, Any]], 
                            reviews: List[Dict[str, Any]], 
                            join_portfolio: bool = True,
                            include_experience: bool = True) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Preprocess text fields in synthetic data (clients, freelancers, reviews).
    
    Args:
        clients (List[Dict[str, Any]]): List of client dictionaries.
        freelancers (List[Dict[str, Any]]): List of freelancer dictionaries.
        reviews (List[Dict[str, Any]]): List of review dictionaries.
        join_portfolio (bool): If True, join portfolio_text into a single string; else keep as list.
        include_experience (bool): If True, preprocess experience_description and include in output.
        
    Returns:
        Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
            - Preprocessed clients with cleaned project_description.
            - Preprocessed freelancers with cleaned portfolio_text and experience.
            - Preprocessed reviews with cleaned review_text.
    """
    # Copy data to avoid modifying originals
    clients_processed = clients.copy()
    freelancers_processed = freelancers.copy()
    reviews_processed = reviews.copy()
    
    # Preprocess clients (project_description)
    for client in clients_processed:
        client['project_description'] = preprocess_text(client['project_description'])
    
    # Preprocess freelancers (portfolio_text, experience)
    for freelancer in freelancers_processed:
        # Process portfolio_text
        if join_portfolio:
            # Join portfolio_text into a single string
            joined_text = ' '.join(freelancer['portfolio_text'])
            freelancer['portfolio_text'] = preprocess_text(joined_text)
        else:
            # Process each portfolio_text entry separately
            freelancer['portfolio_text'] = [preprocess_text(text) for text in freelancer['portfolio_text']]
        
        # Process experience (if included)
        if include_experience:
            for exp in freelancer['experience']:
                exp['experience_description'] = preprocess_text(exp['experience_description'])
    
    # Preprocess reviews (review_text)
    for review in reviews_processed:
        review['review_text'] = preprocess_text(review['review_text'])
    
    return clients_processed, freelancers_processed, reviews_processed

# Example usage
if __name__ == "__main__":
    try:
        # Load synthetic data
        clients_data, freelancers_data, reviews_data = load_synthetic_data("synthetic_data.json")
        
        # Preprocess data
        clients_proc, freelancers_proc, reviews_proc = preprocess_synthetic_data(
            clients_data,
            freelancers_data,
            reviews_data,
            join_portfolio=True,  # Join portfolio_text for BERT embeddings
            include_experience=True  # Include experience_description
        )
        
        # Print samples for verification
        print("\nSample Preprocessed Client:")
        print(json.dumps(clients_proc[0], indent=2))
        
        print("\nSample Preprocessed Freelancer:")
        print(json.dumps(freelancers_proc[0], indent=2))
        
        print("\nSample Preprocessed Review:")
        print(json.dumps(reviews_proc[0], indent=2))
        
    except Exception as e:
        print(f"Failed to preprocess data: {str(e)}")

Loaded 5 clients, 10 freelancers, and 3 reviews from synthetic_data.json

Sample Preprocessed Client:
{
  "client_id": 1,
  "project_description": "my subject medium without requiring animation django python data visualization for a professional project",
  "skills_required": [
    "Animation",
    "Django",
    "Python",
    "Data Visualization"
  ],
  "budget": 1865.14,
  "timeline": "5 days",
  "created_at": "2025-04-20T10:35:00"
}

Sample Preprocessed Freelancer:
{
  "freelancer_id": 1,
  "skills": [
    "Django",
    "Node.js",
    "Data Visualization",
    "Tailwind CSS",
    "TypeScript"
  ],
  "experience": [
    {
      "duration": "1 years",
      "experience_description": "blood dream because seek several fish something node j"
    }
  ],
  "portfolio_text": "red drug modern node j technical project order meet easy tailwind cs technical project base smile role data visualization technical project",
  "availability": true,
  "avg_rating": 4.84,
  "rate": 42.31,
  "created_at"

# Text Embedding

## To Generate Client and Freelancer Embedding

In [5]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Any, Tuple

def get_bert_embeddings(texts: List[str], model_name: str = 'distilbert-base-uncased', batch_size: int = 8) -> np.ndarray:
    """
    Generate BERT embeddings for a list of texts using a pre-trained model.
    
    Args:
        texts (List[str]): List of text strings to embed.
        model_name (str): Hugging Face model name (default: 'distilbert-base-uncased').
        batch_size (int): Batch size for processing texts (default: 8).
        
    Returns:
        np.ndarray: Array of embeddings (shape: [len(texts), 768]).
    """
    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    embeddings = []
    
    # Process texts in batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        
        # Tokenize and encode
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            # Use [CLS] token embedding (first token)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        
        embeddings.append(batch_embeddings)
    
    # Concatenate embeddings
    return np.vstack(embeddings)

def get_tfidf_embeddings(texts: List[str]) -> np.ndarray:
    """
    Generate TF-IDF embeddings for a list of texts.
    
    Args:
        texts (List[str]): List of text strings to embed.
        
    Returns:
        np.ndarray: Array of TF-IDF vectors (sparse, converted to dense).
    """
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(texts)
    return tfidf_matrix.toarray()

def extract_text_features(clients: List[Dict[str, Any]], 
                         freelancers: List[Dict[str, Any]], 
                         use_bert: bool = True) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Extract text features (BERT or TF-IDF embeddings) for project_description and portfolio_text.
    
    Args:
        clients (List[Dict[str, Any]]): List of preprocessed client dictionaries.
        freelancers (List[Dict[str, Any]]): List of preprocessed freelancer dictionaries.
        use_bert (bool): If True, use BERT embeddings; else use TF-IDF (default: True).
        
    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray]:
            - Client embeddings (shape: [len(clients), 768] for BERT, or [len(clients), vocab_size] for TF-IDF).
            - Freelancer embeddings (shape: [len(freelancers), 768] or [len(freelancers), vocab_size]).
            - Cosine similarity matrix (shape: [len(clients), len(freelancers)]).
    """
    # Extract text fields
    client_texts = [client['project_description'] for client in clients]
    freelancer_texts = [freelancer['portfolio_text'] for freelancer in freelancers]
    
    # Generate embeddings
    if use_bert:
        # Combine texts for efficiency (single model load)
        all_texts = client_texts + freelancer_texts
        all_embeddings = get_bert_embeddings(all_texts)
        
        # Split embeddings
        client_embeddings = all_embeddings[:len(client_texts)]
        freelancer_embeddings = all_embeddings[len(client_texts):]
    else:
        # TF-IDF embeddings
        all_texts = client_texts + freelancer_texts
        all_embeddings = get_tfidf_embeddings(all_texts)
        
        # Split embeddings
        client_embeddings = all_embeddings[:len(client_texts)]
        freelancer_embeddings = all_embeddings[len(client_texts):]
    
    # Compute cosine similarity
    similarity_matrix = cosine_similarity(client_embeddings, freelancer_embeddings)
    
    return client_embeddings, freelancer_embeddings, similarity_matrix



# Example usage
# Notes:  This should run every day, so new freelancer can be included in recommendation
if __name__ == "__main__":
    try:
        # Load synthetic data
        clients_data, freelancers_data, reviews_data = load_synthetic_data("synthetic_data.json")
        
        # Preprocess data (join_portfolio=True for single string)
        clients_proc, freelancers_proc, reviews_proc = preprocess_synthetic_data(
            clients_data,
            freelancers_data,
            reviews_data,
            join_portfolio=True,
            include_experience=False  # Exclude experience for now
        )
        
        # Extract features
        client_emb, freelancer_emb, similarity_matrix = extract_text_features(
            clients_proc,
            freelancers_proc,
            use_bert=True  # Use BERT embeddings
        )
        
        # Print shapes and sample results
        print(f"Client embeddings shape: {client_emb.shape}")
        print(f"Freelancer embeddings shape: {freelancer_emb.shape}")
        print(f"Similarity matrix shape: {similarity_matrix.shape}")
        
        # Print sample similarity scores
        print("\nSample Cosine Similarity Scores (Client 1 vs. Freelancers):")
        for i, score in enumerate(similarity_matrix[0]):
            print(f"Freelancer {i+1}: {score:.4f}")
        
        # Print sample embedding (first 5 dimensions for brevity)
        print("\nSample Client 1 Embedding (first 5 dimensions):")
        print(client_emb[0][:5])
        
    except Exception as e:
        print(f"Failed to extract features: {str(e)}")

Loaded 5 clients, 10 freelancers, and 3 reviews from synthetic_data.json


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

2025-04-20 17:07:49.550843: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745168869.840658      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745168869.919440      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Client embeddings shape: (5, 768)
Freelancer embeddings shape: (10, 768)
Similarity matrix shape: (5, 10)

Sample Cosine Similarity Scores (Client 1 vs. Freelancers):
Freelancer 1: 0.9290
Freelancer 2: 0.8802
Freelancer 3: 0.8979
Freelancer 4: 0.9113
Freelancer 5: 0.9116
Freelancer 6: 0.9402
Freelancer 7: 0.9331
Freelancer 8: 0.9347
Freelancer 9: 0.9456
Freelancer 10: 0.9245

Sample Client 1 Embedding (first 5 dimensions):
[-0.08792603  0.0429019  -0.02656294 -0.20145981  0.00760244]


## To Find Similarity From a Client Data and All Freelancer

In [6]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Any

def get_bert_embedding(text: str, model_name: str = 'distilbert-base-uncased') -> np.ndarray:
    """
    Generate BERT embedding for a single text string.
    
    Args:
        text (str): Input text to embed.
        model_name (str): Hugging Face model name (default: 'distilbert-base-uncased').
        
    Returns:
        np.ndarray: Embedding vector (shape: [768]).
    """
    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    # Tokenize and encode
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Get embedding
    with torch.no_grad():
        outputs = model(**inputs)
        # Use [CLS] token embedding
        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    
    return embedding[0]  # Shape: [768]

def compute_client_similarity(client_data: Dict[str, Any], 
                             freelancer_embeddings_file: str = "freelancer_embeddings.npy",
                             freelancers_data: List[Dict[str, Any]] = None) -> List[float]:
    """
    Compute cosine similarity between a client's project_description and each freelancer's portfolio_text.
    
    Args:
        client_data (Dict[str, Any]): Client data with 'project_description' key.
        freelancer_embeddings_file (str): Path to precomputed freelancer embeddings (default: 'freelancer_embeddings.npy').
        freelancers_data (List[Dict[str, Any]]): Preprocessed freelancer data (optional, used if embeddings file is missing).
        
    Returns:
        List[float]: Cosine similarity scores for each freelancer (ordered by freelancer_id).
        
    Raises:
        KeyError: If 'project_description' is missing in client_data.
        FileNotFoundError: If freelancer_embeddings_file is missing and freelancers_data is not provided.
    """
    # Validate client data
    if 'project_description' not in client_data:
        raise KeyError("client_data must contain 'project_description'")
    
    # Preprocess client project_description
    client_text = preprocess_text(client_data['project_description'])
    
    # Generate client embedding
    client_embedding = get_bert_embedding(client_text).reshape(1, -1)  # Shape: [1, 768]
    
    # Load or compute freelancer embeddings
    try:
        # Try loading precomputed embeddings
        freelancer_embeddings = np.load(freelancer_embeddings_file)
    except FileNotFoundError:
        if freelancers_data is None:
            raise FileNotFoundError(
                f"Freelancer embeddings file '{freelancer_embeddings_file}' not found, "
                "and freelancers_data not provided"
            )
        # Compute embeddings on-the-fly
        freelancer_texts = [freelancer['portfolio_text'] for freelancer in freelancers_data]
        freelancer_embeddings = np.vstack([
            get_bert_embedding(text) for text in freelancer_texts
        ])
        # Save for future use
        np.save(freelancer_embeddings_file, freelancer_embeddings)
    
    # Compute cosine similarity
    similarity_scores = cosine_similarity(client_embedding, freelancer_embeddings)[0]
    
    return similarity_scores.tolist()

# Example usage
if __name__ == "__main__":
    try:
        # Load synthetic data
        clients_data, freelancers_data, reviews_data = load_synthetic_data("synthetic_data.json")
        
        # Preprocess data (join_portfolio=True for single string)
        _, freelancers_proc, _ = preprocess_synthetic_data(
            clients_data,
            freelancers_data,
            reviews_data,
            join_portfolio=True,
            include_experience=False
        )
        
        # Example: Assume precomputed freelancer embeddings exist
        # (Run extract_text_features.py first to generate freelancer_embeddings.npy)
        
        # New client data
        new_client = {
            "project_description": "REACT"
        }
        
        # Compute similarity scores
        similarity_scores = compute_client_similarity(
            client_data=new_client,
            freelancer_embeddings_file="freelancer_embeddings.npy",
            freelancers_data=freelancers_proc  # Fallback if file missing
        )
        
        # Print results
        print("\nCosine Similarity Scores for New Client vs. Freelancers:")
        for i, score in enumerate(similarity_scores, 1):
            print(f"Freelancer {i}: {score:.4f}")
        
    except Exception as e:
        print(f"Failed to compute similarities: {str(e)}")

Loaded 5 clients, 10 freelancers, and 3 reviews from synthetic_data.json

Cosine Similarity Scores for New Client vs. Freelancers:
Freelancer 1: 0.8763
Freelancer 2: 0.8018
Freelancer 3: 0.8278
Freelancer 4: 0.8404
Freelancer 5: 0.8293
Freelancer 6: 0.9007
Freelancer 7: 0.8556
Freelancer 8: 0.8756
Freelancer 9: 0.9041
Freelancer 10: 0.8813


## Tone Analysis

But i think for now we dont use tone analysis

In [7]:
# import numpy as np
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, f1_score
# from typing import List, Dict, Any, Tuple
# import torch
# from faker import Faker

# # Initialize Faker for synthetic data
# fake = Faker()

# def generate_synthetic_tone_data(num_samples_per_class: int = 100) -> Tuple[List[str], List[int]]:
#     """
#     Generate synthetic labeled data for tone classification (creative, professional, technical).
    
#     Args:
#         num_samples_per_class (int): Number of samples per tone class (default: 100).
        
#     Returns:
#         Tuple[List[str], List[int]]: Texts and corresponding labels (0: creative, 1: professional, 2: technical).
#     """
#     tones = ['creative', 'professional', 'technical']
#     texts = []
#     labels = []
    
#     for tone_idx, tone in enumerate(tones):
#         for _ in range(num_samples_per_class):
#             if tone == 'creative':
#                 text = f"{fake.sentence(nb_words=6)} vibrant {tone} design project"
#             elif tone == 'professional':
#                 text = f"{fake.sentence(nb_words=6)} reliable {tone} service delivery"
#             else:  # technical
#                 text = f"{fake.sentence(nb_words=6)} scalable {tone} system implementation"
#             texts.append(text)
#             labels.append(tone_idx)
    
#     return texts, labels

# def train_tone_classifier(texts: List[str], labels: List[int], use_bert: bool = True, model_name: str = 'distilbert-base-uncased') -> Any:
#     """
#     Train a tone classifier (BERT or Logistic Regression) on labeled texts.
    
#     Args:
#         texts (List[str]): Preprocessed texts for training.
#         labels (List[int]): Labels (0: creative, 1: professional, 2: technical).
#         use_bert (bool): If True, use BERT; else use Logistic Regression with TF-IDF (default: True).
#         model_name (str): Hugging Face model name for BERT (default: 'distilbert-base-uncased').
        
#     Returns:
#         Any: Trained model (BERT Trainer or LogisticRegression).
#     """
#     if use_bert:
#         # Tokenize texts
#         tokenizer = AutoTokenizer.from_pretrained(model_name)
#         encodings = tokenizer(
#             texts,
#             padding=True,
#             truncation=True,
#             max_length=128,
#             return_tensors="pt"
#         )
        
#         # Create dataset
#         class ToneDataset(torch.utils.data.Dataset):
#             def __init__(self, encodings, labels):
#                 self.encodings = encodings
#                 self.labels = labels
            
#             def __getitem__(self, idx):
#                 item = {key: val[idx] for key, val in self.encodings.items()}
#                 item['labels'] = torch.tensor(self.labels[idx])
#                 return item
            
#             def __len__(self):
#                 return len(self.labels)
        
#         dataset = ToneDataset(encodings, labels)
        
#         # Split into train and validation (80-20)
#         train_size = int(0.8 * len(dataset))
#         train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])
        
#         # Initialize model
#         model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
        
#         # Training arguments
#         training_args = TrainingArguments(
#             output_dir="./tone_classifier",
#             num_train_epochs=3,
#             per_device_train_batch_size=8,
#             per_device_eval_batch_size=8,
#             warmup_steps=10,
#             weight_decay=0.01,
#             logging_dir="./logs",
#             logging_steps=10,
#             eval_strategy="epoch",
#             save_strategy="epoch",
#             load_best_model_at_end=True
#         )
        
#         # Initialize trainer
#         trainer = Trainer(
#             model=model,
#             args=training_args,
#             train_dataset=train_dataset,
#             eval_dataset=val_dataset,
#             compute_metrics=lambda p: {
#                 "accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1)),
#                 "f1": f1_score(p.label_ids, np.argmax(p.predictions, axis=1), average='weighted')
#             }
#         )
        
#         # Train
#         trainer.train()
#         return trainer
    
#     else:
#         # TF-IDF with Logistic Regression
#         vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
#         X = vectorizer.fit_transform(texts)
#         model = LogisticRegression(multi_class='multinomial', max_iter=1000)
#         model.fit(X, labels)
#         return model, vectorizer

# def predict_tone(texts: List[str], model: Any, use_bert: bool = True, tokenizer: Any = None, vectorizer: Any = None) -> List[int]:
#     """
#     Predict tone labels for a list of texts.
    
#     Args:
#         texts (List[str]): Preprocessed texts to classify.
#         model: Trained model (BERT Trainer or LogisticRegression).
#         use_bert (bool): If True, use BERT model; else use Logistic Regression (default: True).
#         tokenizer: BERT tokenizer (required if use_bert=True).
#         vectorizer: TF-IDF vectorizer (required if use_bert=False).
        
#     Returns:
#         List[int]: Predicted labels (0: creative, 1: professional, 2: technical).
#     """
#     if use_bert:
#         encodings = tokenizer(
#             texts,
#             padding=True,
#             truncation=True,
#             max_length=128,
#             return_tensors="pt"
#         )
#         dataset = torch.utils.data.TensorDataset(
#             encodings['input_ids'],
#             encodings['attention_mask']
#         )
#         predictions = model.predict(dataset)
#         return np.argmax(predictions, axis=1).tolist()
#     else:
#         X = vectorizer.transform(texts)
#         return model.predict(X).tolist()

# def perform_tone_analysis(clients: List[Dict[str, Any]], 
#                          freelancers: List[Dict[str, Any]], 
#                          reviews: List[Dict[str, Any]], 
#                          use_bert: bool = True) -> Tuple[List[str], List[str], np.ndarray]:
#     """
#     Perform tone analysis on review_text, portfolio_text, and project_description.
    
#     Args:
#         clients (List[Dict[str, Any]]): Preprocessed client dictionaries.
#         freelancers (List[Dict[str, Any]]): Preprocessed freelancer dictionaries.
#         reviews (List[Dict[str, Any]]): Preprocessed review dictionaries.
#         use_bert (bool): If True, use BERT classifier; else use Logistic Regression (default: True).
        
#     Returns:
#         Tuple[List[str], List[str], np.ndarray]:
#             - Client tones (e.g., ['professional', 'creative', ...]).
#             - Freelancer tones (e.g., ['technical', 'professional', ...]).
#             - Tone match matrix (shape: [len(clients), len(freelancers)], 1 if tones match, 0 otherwise).
#     """
#     # Generate synthetic training data
#     synthetic_texts, synthetic_labels = generate_synthetic_tone_data(num_samples_per_class=100)
    
#     # Add real review_text (assume manually labeled for demo)
#     review_texts = [review['review_text'] for review in reviews]
#     # Example: Manually label the 3 reviews (in practice, label manually or use external data)
#     review_labels = [1, 0, 2]  # e.g., professional, creative, technical
#     if len(review_texts) != len(review_labels):
#         raise ValueError("Number of review texts and labels must match")
    
#     # Combine training data
#     train_texts = synthetic_texts + review_texts
#     train_labels = synthetic_labels + review_labels
    
#     # Train classifier
#     if use_bert:
#         trainer = train_tone_classifier(train_texts, train_labels, use_bert=True)
#         tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
#         model = trainer
#         vectorizer = None
#     else:
#         model, vectorizer = train_tone_classifier(train_texts, train_labels, use_bert=False)
#         tokenizer = None
    
#     # Predict tones
#     tone_map = {0: 'creative', 1: 'professional', 2: 'technical'}
    
#     # Client tones (project_description)
#     client_texts = [client['project_description'] for client in clients]
#     client_tone_ids = predict_tone(client_texts, model, use_bert, tokenizer, vectorizer)
#     client_tones = [tone_map[tid] for tid in client_tone_ids]
    
#     # Freelancer tones (portfolio_text, fallback to reviews if available)
#     freelancer_tones = []
#     for freelancer in freelancers:
#         # Use portfolio_text (all freelancers have it)
#         portfolio_text = freelancer['portfolio_text']
#         portfolio_tone_id = predict_tone([portfolio_text], model, use_bert, tokenizer, vectorizer)[0]
        
#         # Check for reviews (optional augmentation)
#         freelancer_reviews = [r['review_text'] for r in reviews if r['freelancer_id'] == freelancer['freelancer_id']]
#         if freelancer_reviews:
#             review_tone_ids = predict_tone(freelancer_reviews, model, use_bert, tokenizer, vectorizer)
#             # Aggregate: majority vote or use portfolio_text if tied
#             tone_counts = np.bincount(review_tone_ids + [portfolio_tone_id], minlength=3)
#             tone_id = np.argmax(tone_counts)
#         else:
#             tone_id = portfolio_tone_id
        
#         freelancer_tones.append(tone_map[tone_id])
    
#     # Compute tone match matrix
#     tone_match_matrix = np.zeros((len(clients), len(freelancers)))
#     for i, client_tone in enumerate(client_tones):
#         for j, freelancer_tone in enumerate(freelancer_tones):
#             tone_match_matrix[i, j] = 1 if client_tone == freelancer_tone else 0
    
#     return client_tones, freelancer_tones, tone_match_matrix

# # Example usage
# if __name__ == "__main__":
#     try:
#         # Load synthetic data
#         clients_data, freelancers_data, reviews_data = load_synthetic_data("synthetic_data.json")
        
#         # Preprocess data (join_portfolio=True for single string)
#         clients_proc, freelancers_proc, reviews_proc = preprocess_synthetic_data(
#             clients_data,
#             freelancers_data,
#             reviews_data,
#             join_portfolio=True,
#             include_experience=False
#         )
        
#         # Perform tone analysis
#         client_tones, freelancer_tones, tone_match_matrix = perform_tone_analysis(
#             clients_proc,
#             freelancers_proc,
#             reviews_proc,
#             use_bert=True
#         )
        
#         # Print results
#         print("\nClient Tones:")
#         for i, tone in enumerate(client_tones, 1):
#             print(f"Client {i}: {tone}")
        
#         print("\nFreelancer Tones:")
#         for i, tone in enumerate(freelancer_tones, 1):
#             print(f"Freelancer {i}: {tone}")
        
#         print("\nTone Match Matrix (Client vs. Freelancer):")
#         for i, row in enumerate(tone_match_matrix, 1):
#             print(f"Client {i}: {row.tolist()}")
        
#     except Exception as e:
#         print(f"Failed to perform tone analysis: {str(e)}")

# Skill Similarity

In [8]:
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from typing import List, Dict, Any, Tuple, Union
from fuzzywuzzy import fuzz
import itertools

# Define skill universe (from data generation script, April 20, 2025)
SKILLS_POOL = [
    "React", "JavaScript", "Node.js", "Python", "Django", "Flask",
    "Graphic Design", "Illustration", "UI/UX Design", "Data Visualization",
    "TypeScript", "Tailwind CSS", "Animation", "Machine Learning", "SQL"
]

def extract_skill_features(clients: List[Dict[str, Any]], 
                          freelancers: List[Dict[str, Any]], 
                          use_one_hot: bool = True) -> Tuple[Union[np.ndarray, List[set]], Union[np.ndarray, List[set]], List[str]]:
    """
    Extract skill features as one-hot encoded vectors or skill sets.
    
    Args:
        clients (List[Dict[str, Any]]): Client dictionaries with 'skills_required'.
        freelancers (List[Dict[str, Any]]): Freelancer dictionaries with 'skills'.
        use_one_hot (bool): If True, use one-hot encoding; else return skill sets (default: True).
        
    Returns:
        Tuple[Union[np.ndarray, List[set]], Union[np.ndarray, List[set]], List[str]]:
            - Client skill features (array of shape [num_clients, num_skills] or list of sets).
            - Freelancer skill features (array of shape [num_freelancers, num_skills] or list of sets).
            - Skill labels (list of skill names, only for one-hot encoding).
    """
    # Extract skill lists
    client_skills = [client['skills_required'] for client in clients]
    freelancer_skills = [freelancer['skills'] for freelancer in freelancers]
    
    if use_one_hot:
        # Initialize MultiLabelBinarizer with predefined skill universe
        mlb = MultiLabelBinarizer(classes=SKILLS_POOL)
        
        # Fit and transform skills
        client_features = mlb.fit_transform(client_skills)
        freelancer_features = mlb.transform(freelancer_skills)
        
        return client_features, freelancer_features, mlb.classes_.tolist()
    else:
        # Return skill sets for exact matching
        client_features = [set(skills) for skills in client_skills]
        freelancer_features = [set(skills) for skills in freelancer_skills]
        return client_features, freelancer_features, SKILLS_POOL

def compute_skill_similarity(clients: List[Dict[str, Any]], 
                            freelancers: List[Dict[str, Any]], 
                            use_one_hot: bool = True, 
                            use_fuzzy: bool = False) -> np.ndarray:
    """
    Compute skill similarity between clients and freelancers using Jaccard similarity.
    
    Args:
        clients (List[Dict[str, Any]]): Client dictionaries with 'skills_required'.
        freelancers (List[Dict[str, Any]]): Freelancer dictionaries with 'skills'.
        use_one_hot (bool): If True, use one-hot encoded vectors; else use set-based Jaccard (default: True).
        use_fuzzy (bool): If True, use fuzzy matching for skills (default: False).
        
    Returns:
        np.ndarray: Similarity matrix (shape: [num_clients, num_freelancers]).
    """
    # Extract skill features
    client_features, freelancer_features, skill_labels = extract_skill_features(clients, freelancers, use_one_hot)
    
    # Initialize similarity matrix
    similarity_matrix = np.zeros((len(clients), len(freelancers)))
    
    if use_one_hot and not use_fuzzy:
        # Jaccard similarity on one-hot vectors
        for i, client_vec in enumerate(client_features):
            for j, freelancer_vec in enumerate(freelancer_features):
                intersection = np.sum(client_vec & freelancer_vec)
                union = np.sum(client_vec | freelancer_vec)
                similarity_matrix[i, j] = intersection / union if union > 0 else 0.0
    else:
        # Set-based Jaccard similarity
        for i, client_skills in enumerate(client_features):
            for j, freelancer_skills in enumerate(freelancer_features):
                if use_fuzzy:
                    # Fuzzy matching: Compute max similarity for each skill pair
                    intersection = 0
                    for c_skill, f_skill in itertools.product(client_skills, freelancer_skills):
                        score = fuzz.ratio(c_skill.lower(), f_skill.lower()) / 100.0
                        if score > 0.9:  # Threshold for match
                            intersection += 1
                    union = len(client_skills) + len(freelancer_skills) - intersection
                    similarity_matrix[i, j] = intersection / union if union > 0 else 0.0
                else:
                    # Exact matching
                    intersection = len(client_skills & freelancer_skills)
                    union = len(client_skills | freelancer_skills)
                    similarity_matrix[i, j] = intersection / union if union > 0 else 0.0
    
    return similarity_matrix

# Example usage
if __name__ == "__main__":
    try:
        # Load synthetic data
        clients_data, freelancers_data, reviews_data = load_synthetic_data("synthetic_data.json")
        
        # Extract skill features
        client_features, freelancer_features, skill_labels = extract_skill_features(
            clients_data,
            freelancers_data,
            use_one_hot=True
        )
        
        # Compute skill similarity
        similarity_matrix = compute_skill_similarity(
            clients_data,
            freelancers_data,
            use_one_hot=True,
            use_fuzzy=False
        )
        
        # Print results
        print("\nSkill Labels:")
        print(skill_labels)
        
        print("\nSample Client Skill Vector (Client 1):")
        print(client_features[0])
        
        print("\nSample Freelancer Skill Vector (Freelancer 1):")
        print(freelancer_features[0])
        
        print("\nSkill Similarity Matrix (Client vs. Freelancer):")
        for i, row in enumerate(similarity_matrix, 1):
            print(f"Client {i}: {[f'{x:.4f}' for x in row]}")
        
    except Exception as e:
        print(f"Failed to process skills: {str(e)}")

Loaded 5 clients, 10 freelancers, and 3 reviews from synthetic_data.json

Skill Labels:
['React', 'JavaScript', 'Node.js', 'Python', 'Django', 'Flask', 'Graphic Design', 'Illustration', 'UI/UX Design', 'Data Visualization', 'TypeScript', 'Tailwind CSS', 'Animation', 'Machine Learning', 'SQL']

Sample Client Skill Vector (Client 1):
[0 0 0 1 1 0 0 0 0 1 0 0 1 0 0]

Sample Freelancer Skill Vector (Freelancer 1):
[0 0 1 0 1 0 0 0 0 1 1 1 0 0 0]

Skill Similarity Matrix (Client vs. Freelancer):
Client 1: ['0.2857', '0.5000', '0.3333', '0.1667', '0.4000', '0.1429', '0.4000', '0.2857', '0.0000', '0.0000']
Client 2: ['0.1250', '0.0000', '0.0000', '0.0000', '0.1667', '0.1429', '0.1667', '0.0000', '0.1250', '0.1429']
Client 3: ['0.1429', '0.1429', '0.7500', '0.0000', '0.0000', '0.1667', '0.2000', '0.1429', '0.3333', '0.0000']
Client 4: ['0.3333', '0.3333', '0.0000', '0.0000', '0.2000', '0.0000', '0.2000', '0.3333', '0.1429', '0.1667']
Client 5: ['0.2857', '0.1250', '0.6000', '0.1667', '0.1667',



## Get similarity for a client with each freelancer

In [9]:
import numpy as np
from typing import List, Dict, Any
from fuzzywuzzy import fuzz
import itertools

def compute_client_skill_similarity(client_skills: List[str], 
                                   freelancers_data: List[Dict[str, Any]] = None, 
                                   use_fuzzy: bool = False) -> List[float]:
    """
    Compute Jaccard similarity between a client's skills and each freelancer's skills.
    
    Args:
        client_skills (List[str]): List of client skills (e.g., ['React', 'JavaScript']).
        freelancers_data (List[Dict[str, Any]]): Preprocessed freelancer data (optional, loaded if not provided).
        use_fuzzy (bool): If True, use fuzzy matching for skills (default: False).
        
    Returns:
        List[float]: Jaccard similarity scores for each freelancer (ordered by freelancer_id).
        
    Raises:
        ValueError: If client_skills is empty.
        FileNotFoundError: If freelancers_data is not provided and synthetic_data.json is missing.
    """
    # Validate client skills
    if not client_skills:
        raise ValueError("client_skills cannot be empty")
    
    # Load freelancer data if not provided
    if freelancers_data is None:
        try:
            _, freelancers_data, _ = load_synthetic_data("synthetic_data.json")
        except FileNotFoundError:
            raise FileNotFoundError("synthetic_data.json not found and freelancers_data not provided")
    
    # Convert client skills to set
    client_skills_set = set(client_skills)
    
    # Initialize similarity scores
    similarity_scores = []
    
    # Compute similarity for each freelancer
    for freelancer in freelancers_data:
        freelancer_skills = set(freelancer['skills'])
        
        if use_fuzzy:
            # Fuzzy matching: Compute max similarity for each skill pair
            intersection = 0
            for c_skill, f_skill in itertools.product(client_skills, freelancer_skills):
                score = fuzz.ratio(c_skill.lower(), f_skill.lower()) / 100.0
                if score > 0.9:  # Threshold for match
                    intersection += 1
            union = len(client_skills) + len(freelancer_skills) - intersection
            similarity = intersection / union if union > 0 else 0.0
        else:
            # Exact matching
            intersection = len(client_skills_set & freelancer_skills)
            union = len(client_skills_set | freelancer_skills)
            similarity = intersection / union if union > 0 else 0.0
        
        similarity_scores.append(similarity)
    
    return similarity_scores

# Example usage
if __name__ == "__main__":
    try:
        # Load synthetic data
        _, freelancers_data, _ = load_synthetic_data("synthetic_data.json")
        
        # Example client skills
        new_client_skills = ["React", "JavaScript", "UI/UX Design"]
        
        # Compute skill similarity
        similarity_scores = compute_client_skill_similarity(
            client_skills=new_client_skills,
            freelancers_data=freelancers_data,
            use_fuzzy=False
        )
        
        # Print results
        print("\nSkill Similarity Scores for Client vs. Freelancers:")
        for i, score in enumerate(similarity_scores, 1):
            print(f"Freelancer {i}: {score:.4f}")
        
    except Exception as e:
        print(f"Failed to compute skill similarities: {str(e)}")

Loaded 5 clients, 10 freelancers, and 3 reviews from synthetic_data.json

Skill Similarity Scores for Client vs. Freelancers:
Freelancer 1: 0.0000
Freelancer 2: 0.1429
Freelancer 3: 0.0000
Freelancer 4: 0.2000
Freelancer 5: 0.2000
Freelancer 6: 0.0000
Freelancer 7: 0.0000
Freelancer 8: 0.1429
Freelancer 9: 0.1429
Freelancer 10: 0.1667


# Calculating final score from a client for each freelancer

In [10]:
import numpy as np
from typing import List, Dict, Any

def compute_final_score(client_data: Dict[str, Any], 
                       freelancers_data: List[Dict[str, Any]] = None,
                       freelancer_embeddings_file: str = "freelancer_embeddings.npy",
                       weights: Dict[str, float] = None) -> List[float]:
    """
    Compute final score for a client against each freelancer based on text similarity,
    skill similarity, and average rating.
    
    Args:
        client_data (Dict[str, Any]): Client data with 'project_description' and 'skills_required'.
        freelancers_data (List[Dict[str, Any]]): Preprocessed freelancer data (optional, loaded if not provided).
        freelancer_embeddings_file (str): Path to precomputed freelancer embeddings (default: 'freelancer_embeddings.npy').
        weights (Dict[str, float]): Weights for scoring components (default: {'skill': 0.4, 'text': 0.4, 'rating': 0.2}).
        
    Returns:
        List[float]: Final scores for each freelancer (ordered by freelancer_id).
        
    Raises:
        KeyError: If required client_data keys are missing.
        FileNotFoundError: If freelancers_data or embeddings file is missing.
        ValueError: If weights are invalid.
    """
    # Validate client data
    required_keys = ['project_description', 'skills_required']
    missing_keys = [key for key in required_keys if key not in client_data]
    if missing_keys:
        raise KeyError(f"client_data missing required keys: {missing_keys}")
    
    # Load freelancer data if not provided
    if freelancers_data is None:
        try:
            _, freelancers_data, _ = load_synthetic_data("synthetic_data.json")
        except FileNotFoundError:
            raise FileNotFoundError("synthetic_data.json not found and freelancers_data not provided")
    
    # Set default weights
    if weights is None:
        weights = {'skill': 0.4, 'text': 0.4, 'rating': 0.2}
    
    # Validate weights
    if not all(k in weights for k in ['skill', 'text', 'rating']):
        raise ValueError("weights must include 'skill', 'text', and 'rating'")
    if not abs(sum(weights.values()) - 1.0) < 1e-6:
        raise ValueError("weights must sum to 1.0")
    if any(w < 0 for w in weights.values()):
        raise ValueError("weights must be non-negative")
    
    # Compute text similarity
    text_similarities = compute_client_similarity(
        client_data=client_data,
        freelancer_embeddings_file=freelancer_embeddings_file,
        freelancers_data=freelancers_data
    )
    
    # Compute skill similarity
    skill_similarities = compute_client_skill_similarity(
        client_skills=client_data['skills_required'],
        freelancers_data=freelancers_data,
        use_fuzzy=False
    )
    
    # Extract and normalize average ratings (assume max rating is 5.0)
    ratings = [freelancer['avg_rating'] / 5.0 for freelancer in freelancers_data]
    
    # Compute final scores
    final_scores = [
        weights['skill'] * skill_sim + weights['text'] * text_sim + weights['rating'] * rating
        for skill_sim, text_sim, rating in zip(skill_similarities, text_similarities, ratings)
    ]
    
    return final_scores

# Example usage
if __name__ == "__main__":
    try:
        # Load synthetic data
        clients_data, freelancers_data, _ = load_synthetic_data("synthetic_data.json")
        
        # Example client data
        new_client = {
            "project_description": "Build a React e-commerce website with modern UI/UX",
            "skills_required": ["React", "JavaScript", "UI/UX Design"]
        }

        new_client = clients_data[0]
        
        # Compute final scores
        final_scores = compute_final_score(
            client_data=new_client,
            freelancers_data=freelancers_data,
            freelancer_embeddings_file="freelancer_embeddings.npy",
            weights={'skill': 0.4, 'text': 0.4, 'rating': 0.2}
        )
        
        # Print results
        top_5_indices = np.argsort(final_scores)[-5:][::-1]  # reverse for descending order
        print("\nTop 5 Matches:")
        for rank, idx in enumerate(top_5_indices, 1):
            print(f"{rank}. Freelancer {idx+1} with score {final_scores[idx]:.4f}")
        
    except Exception as e:
        print(f"Failed to compute final scores: {str(e)}")

Loaded 5 clients, 10 freelancers, and 3 reviews from synthetic_data.json

Top 5 Matches:
1. Freelancer 2 with score 0.7361
2. Freelancer 7 with score 0.7240
3. Freelancer 1 with score 0.6795
4. Freelancer 8 with score 0.6670
5. Freelancer 3 with score 0.6561
