# 00_dataset_generation.ipynb

This notebook generates the synthetic JibJobRecSys dataset (categories, professionals, clients, jobs, and interactions) from the settings in `../config.yaml` and writes the generated files to `../data/generated`.

In [None]:
import os
from utils.common import load_config
from dataset_generator.categories_generator import generate_categories
from dataset_generator.users_generator import generate_professionals, generate_clients
from dataset_generator.jobs_generator import generate_jobs
from dataset_generator.interactions_generator import generate_interactions

# Load the run configuration and make sure the output directory exists.
# Both paths are relative, so they resolve against the current working directory.
config = load_config('../config.yaml')
data_dir = '../data/generated'
os.makedirs(data_dir, exist_ok=True)

# Shorthands shared by every generation step below.
dataset_cfg = config['dataset']
RANDOM_SEED = 42  # the same fixed seed is passed to every generator

# Files exchanged between the steps; all of them live in data_dir.
categories_csv = os.path.join(data_dir, 'categories.csv')
clients_csv = os.path.join(data_dir, 'clients.csv')
professionals_csv = os.path.join(data_dir, 'professionals.csv')
jobs_csv = os.path.join(data_dir, 'jobs.csv')
professional_categories_csv = os.path.join(data_dir, 'professional_selected_categories.csv')
job_categories_csv = os.path.join(data_dir, 'job_required_categories.csv')
interactions_csv = os.path.join(data_dir, 'interactions.csv')

# Step 1: categories. Must run first; later steps are given categories_csv as input.
generate_categories(
    num_categories=dataset_cfg['num_categories'],
    output_path=categories_csv,
    base_category_definitions=dataset_cfg['base_category_definitions'],
    random_seed=RANDOM_SEED,
)

# Step 2: professionals, each assigned a number of categories bounded by
# the min/max settings from the configuration.
generate_professionals(
    num_professionals=dataset_cfg['num_professionals'],
    avg_categories=dataset_cfg['avg_categories_per_professional'],
    categories_path=categories_csv,
    output_dir=data_dir,
    random_seed=RANDOM_SEED,
    min_categories=dataset_cfg['min_categories_per_professional'],
    max_categories=dataset_cfg['max_categories_per_professional'],
)

# Step 3: clients.
generate_clients(
    num_clients=dataset_cfg['num_clients'],
    output_dir=data_dir,
    random_seed=RANDOM_SEED,
)

# Step 4: jobs. Reads clients_csv, which is presumably written by
# generate_clients into data_dir (confirm against the generator).
generate_jobs(
    num_jobs=dataset_cfg['num_jobs'],
    avg_categories_per_job=dataset_cfg['avg_categories_per_job'],
    clients_path=clients_csv,
    categories_path=categories_csv,
    output_dir=data_dir,
    random_seed=RANDOM_SEED,
    min_categories=dataset_cfg['min_categories_per_job'],
    max_categories=dataset_cfg['max_categories_per_job'],
)

# Step 5: interactions between professionals and jobs. The four input files
# are presumably produced by steps 2 and 4 (confirm against the generators).
generate_interactions(
    professionals_path=professionals_csv,
    jobs_path=jobs_csv,
    professional_categories_path=professional_categories_csv,
    job_categories_path=job_categories_csv,
    output_path=interactions_csv,
    num_interactions_per_professional=dataset_cfg['num_interactions_per_professional'],
    random_seed=RANDOM_SEED,
    jaccard_prob_offset=dataset_cfg['jaccard_prob_offset'],
    unrelated_prob=dataset_cfg['unrelated_prob'],
)