# DocMatchNet-JEPA: Data Generation
=================================
This notebook generates synthetic doctor-patient matching data.

Runtime: ~2-3 hours on Kaggle CPU  
Output: Saved as Kaggle Dataset for other notebooks

In [1]:
# ============================================================
# CELL 1: Setup and Imports
# ============================================================
!pip install sentence-transformers -q

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import json
import os

# Set seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Create output directory
os.makedirs('/kaggle/working/data', exist_ok=True)

print("Setup complete!")

2026-02-13 10:27:49.784388: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770978470.089267      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770978470.176802      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770978470.854816      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770978470.854868      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770978470.854875      55 computation_placer.cc:177] computation placer alr

Setup complete!


In [2]:
# ============================================================
# CELL 2: Define Constants
# ============================================================
SPECIALTIES = [
    "General Medicine", "Cardiology", "Neurology", "Orthopedics",
    "Dermatology", "Pediatrics", "Gynecology", "Ophthalmology",
    "ENT", "Psychiatry", "Urology", "Nephrology", "Pulmonology",
    "Gastroenterology", "Endocrinology", "Rheumatology",
    "Oncology", "Hematology", "Infectious Disease",
    "General Surgery", "Cardiac Surgery", "Neurosurgery",
    "Plastic Surgery", "Vascular Surgery", "Radiology",
    "Anesthesiology", "Emergency Medicine", "Family Medicine",
    "Sports Medicine", "Pain Medicine", "Allergy & Immunology",
    "Geriatrics", "Neonatology", "Hepatology", "Critical Care",
    "Palliative Care", "Physical Medicine", "Preventive Medicine",
    "Sleep Medicine", "Tropical Medicine"
]

# Specialty distribution (more common ones have higher probability)
SPECIALTY_PROBS = [0.12, 0.05, 0.04, 0.05, 0.06, 0.07, 0.06, 0.04,
                   0.04, 0.03, 0.02, 0.02, 0.03, 0.03, 0.02, 0.01,
                   0.02, 0.01, 0.02, 0.04, 0.01, 0.01, 0.01, 0.01,
                   0.02, 0.02, 0.03, 0.04, 0.01, 0.01, 0.01, 0.02,
                   0.01, 0.01, 0.02, 0.01, 0.01, 0.01, 0.005, 0.005]
SPECIALTY_PROBS = np.array(SPECIALTY_PROBS)
SPECIALTY_PROBS = SPECIALTY_PROBS / SPECIALTY_PROBS.sum()

CITIES = [
    ("Mumbai", "Maharashtra", 19.0760, 72.8777),
    ("Delhi", "Delhi", 28.6139, 77.2090),
    ("Bangalore", "Karnataka", 12.9716, 77.5946),
    ("Chennai", "Tamil Nadu", 13.0827, 80.2707),
    ("Kolkata", "West Bengal", 22.5726, 88.3639),
    ("Hyderabad", "Telangana", 17.3850, 78.4867),
    ("Pune", "Maharashtra", 18.5204, 73.8567),
    ("Ahmedabad", "Gujarat", 23.0225, 72.5714),
    ("Jaipur", "Rajasthan", 26.9124, 75.7873),
    ("Lucknow", "Uttar Pradesh", 26.8467, 80.9462),
    ("Chandigarh", "Punjab", 30.7333, 76.7794),
    ("Kochi", "Kerala", 9.9312, 76.2673),
    ("Bhopal", "Madhya Pradesh", 23.2599, 77.4126),
    ("Patna", "Bihar", 25.5941, 85.1376),
    ("Indore", "Madhya Pradesh", 22.7196, 75.8577),
    ("Nagpur", "Maharashtra", 21.1458, 79.0882),
    ("Coimbatore", "Tamil Nadu", 11.0168, 76.9558),
    ("Vizag", "Andhra Pradesh", 17.6868, 83.2185),
    ("Guwahati", "Assam", 26.1445, 91.7362),
    ("Thiruvananthapuram", "Kerala", 8.5241, 76.9366)
]

LANGUAGES = ["English", "Hindi", "Tamil", "Telugu", "Bengali", 
             "Marathi", "Kannada", "Malayalam", "Gujarati"]

FIRST_NAMES = ["Rajesh", "Priya", "Amit", "Sunita", "Vikram", "Anita",
               "Suresh", "Kavita", "Ramesh", "Meera", "Arun", "Deepa",
               "Sanjay", "Lakshmi", "Vijay", "Pooja", "Manoj", "Neha"]
               
LAST_NAMES = ["Sharma", "Patel", "Kumar", "Singh", "Gupta", "Reddy",
              "Iyer", "Nair", "Joshi", "Verma", "Rao", "Menon", "Das"]

N_DOCTORS = 500
N_CASES = 15000
DOCTORS_PER_CASE = 100  # Top 50 + 50 random

print(f"Will generate {N_DOCTORS} doctors and {N_CASES} cases")

Will generate 500 doctors and 15000 cases


In [3]:
# ============================================================
# CELL 3: Generate Doctors
# ============================================================
def generate_doctors(n_doctors=500):
    """Generate synthetic doctor profiles."""
    doctors = []
    
    for i in tqdm(range(n_doctors), desc="Generating doctors"):
        # Basic info
        specialty = np.random.choice(SPECIALTIES, p=SPECIALTY_PROBS)
        years_exp = np.random.randint(1, 35)
        city_idx = np.random.randint(len(CITIES))
        city, state, lat, lon = CITIES[city_idx]
        
        # Add some noise to location
        lat += np.random.normal(0, 0.1)
        lon += np.random.normal(0, 0.1)
        
        # Languages (English + regional)
        langs = ["English"]
        n_extra_langs = np.random.randint(1, 4)
        extra_langs = np.random.choice(LANGUAGES[1:], n_extra_langs, replace=False)
        langs.extend(extra_langs.tolist())
        
        # Publications (higher for experienced doctors)
        if years_exp > 10:
            pubs = np.random.poisson(years_exp * 2)
        else:
            pubs = np.random.poisson(years_exp * 0.5)
        
        # Create expertise description
        expertise_desc = f"Dr. {np.random.choice(FIRST_NAMES)} {np.random.choice(LAST_NAMES)} " \
                        f"is a {specialty} specialist with {years_exp} years of experience. " \
                        f"Specializes in diagnosis and treatment of conditions related to {specialty.lower()}. "
        
        if pubs > 10:
            expertise_desc += f"Published {pubs} research papers in the field. "
        
        doctor = {
            'doctor_id': f'DOC_{i:04d}',
            'name': f"Dr. {np.random.choice(FIRST_NAMES)} {np.random.choice(LAST_NAMES)}",
            'specialty': specialty,
            'specialty_idx': SPECIALTIES.index(specialty),
            'years_experience': years_exp,
            'languages': langs,
            'city': city,
            'state': state,
            'lat': lat,
            'lon': lon,
            'consultation_fee': np.random.randint(3, 51) * 100,  # 300-5000
            'available_modes': np.random.choice(['online', 'in_person', 'both'], 
                                                p=[0.2, 0.3, 0.5]),
            'availability_score': np.random.uniform(0.3, 1.0),
            'nmc_verified': np.random.random() < 0.9,
            'profile_completeness': np.random.uniform(0.4, 1.0),
            'review_score': np.clip(np.random.normal(4.0, 0.5), 2.5, 5.0),
            'num_reviews': np.random.randint(0, 500),
            'publications_count': pubs,
            'consultation_completion_rate': np.random.uniform(0.75, 1.0),
            'expertise_description': expertise_desc
        }
        doctors.append(doctor)
    
    return pd.DataFrame(doctors)

doctors_df = generate_doctors(N_DOCTORS)
print(f"\nGenerated {len(doctors_df)} doctors")
print(f"Specialty distribution:\n{doctors_df['specialty'].value_counts().head(10)}")

Generating doctors: 100%|██████████| 500/500 [00:00<00:00, 3039.75it/s]


Generated 500 doctors
Specialty distribution:
specialty
General Medicine    59
Pediatrics          39
Gynecology          33
ENT                 26
Dermatology         25
Orthopedics         23
General Surgery     23
Neurology           22
Ophthalmology       22
Cardiology          17
Name: count, dtype: int64





In [4]:
# ============================================================
# CELL 4: Generate Patient Cases
# ============================================================
# Symptom templates by specialty
SYMPTOM_TEMPLATES = {
    "General Medicine": [
        "fever and body aches for {duration}",
        "persistent fatigue and weakness",
        "recurring headaches and dizziness",
        "unexplained weight loss",
        "general weakness and loss of appetite"
    ],
    "Cardiology": [
        "chest pain radiating to left arm",
        "shortness of breath on exertion",
        "palpitations and irregular heartbeat",
        "swelling in legs and ankles",
        "high blood pressure symptoms"
    ],
    "Neurology": [
        "severe migraine headaches",
        "numbness in extremities",
        "difficulty with balance and coordination",
        "memory problems and confusion",
        "seizures and loss of consciousness"
    ],
    "Orthopedics": [
        "chronic back pain",
        "knee pain when climbing stairs",
        "shoulder pain limiting movement",
        "joint stiffness in the morning",
        "pain after sports injury"
    ],
    "Dermatology": [
        "persistent skin rash",
        "acne that won't clear",
        "unexplained hair loss",
        "skin discoloration patches",
        "itching and dry skin"
    ],
    "Pediatrics": [
        "child has high fever",
        "child not eating properly",
        "delayed developmental milestones",
        "frequent ear infections",
        "child has persistent cough"
    ],
    # Add more as needed - use General Medicine as default
}

def get_symptom_description(specialty, severity, duration):
    """Generate symptom description based on specialty."""
    templates = SYMPTOM_TEMPLATES.get(specialty, SYMPTOM_TEMPLATES["General Medicine"])
    base_symptom = np.random.choice(templates).format(duration=duration)
    
    if severity == "severe":
        prefix = "Experiencing severe "
        suffix = " This is significantly affecting daily activities."
    elif severity == "moderate":
        prefix = "Having "
        suffix = " Looking for treatment options."
    else:
        prefix = "Mild "
        suffix = " Would like to get it checked."
    
    return prefix + base_symptom + suffix

def generate_cases(n_cases=15000, doctors_df=None):
    """Generate synthetic patient cases."""
    cases = []
    
    # Context distribution
    context_probs = {
        'routine': 0.60,
        'complex': 0.15,
        'rare_disease': 0.10,
        'emergency': 0.10,
        'pediatric': 0.05
    }
    contexts = list(context_probs.keys())
    probs = list(context_probs.values())
    
    for i in tqdm(range(n_cases), desc="Generating cases"):
        # Determine context
        context = np.random.choice(contexts, p=probs)
        
        # Age based on context
        if context == 'pediatric':
            age = np.random.randint(0, 18)
        elif context == 'geriatric':
            age = np.random.randint(65, 90)
        else:
            age = np.random.randint(18, 80)
        
        # Urgency based on context
        if context == 'emergency':
            urgency = 'emergency'
        elif context == 'complex':
            urgency = np.random.choice(['semi_urgent', 'urgent'], p=[0.6, 0.4])
        else:
            urgency = np.random.choice(['routine', 'semi_urgent'], p=[0.7, 0.3])
        
        # Severity
        if context == 'emergency':
            severity = np.random.choice(['severe', 'critical'], p=[0.4, 0.6])
        elif context == 'complex':
            severity = np.random.choice(['moderate', 'severe'], p=[0.5, 0.5])
        else:
            severity = np.random.choice(['mild', 'moderate'], p=[0.6, 0.4])
        
        # Target specialty
        if context == 'pediatric':
            target_specialty = 'Pediatrics'
        elif context == 'rare_disease':
            # Rare specialties
            rare_specs = ['Rheumatology', 'Hematology', 'Oncology', 
                         'Neurosurgery', 'Tropical Medicine']
            target_specialty = np.random.choice(rare_specs)
        else:
            target_specialty = np.random.choice(SPECIALTIES, p=SPECIALTY_PROBS)
        
        # Location
        city_idx = np.random.randint(len(CITIES))
        city, state, lat, lon = CITIES[city_idx]
        lat += np.random.normal(0, 0.15)
        lon += np.random.normal(0, 0.15)
        
        # Duration
        duration = np.random.choice(['2 days', '3 days', '1 week', 
                                     '2 weeks', '1 month', '3 months'])
        
        # Symptom description
        symptom_desc = get_symptom_description(target_specialty, severity, duration)
        
        # Comorbidities (more for complex cases)
        if context == 'complex':
            comorbidity_count = np.random.randint(2, 5)
        elif age > 60:
            comorbidity_count = np.random.randint(0, 3)
        else:
            comorbidity_count = np.random.randint(0, 2)
        
        # Disease rarity score
        if context == 'rare_disease':
            rarity_score = np.random.uniform(0.7, 1.0)
        else:
            rarity_score = np.random.uniform(0.0, 0.3)
        
        # Red flag score
        if context == 'emergency':
            red_flag = np.random.uniform(0.7, 1.0)
        elif severity in ['severe', 'critical']:
            red_flag = np.random.uniform(0.4, 0.8)
        else:
            red_flag = np.random.uniform(0.0, 0.3)
        
        case = {
            'case_id': f'CASE_{i:05d}',
            'patient_age': age,
            'patient_gender': np.random.choice(['M', 'F', 'Other'], p=[0.48, 0.50, 0.02]),
            'symptom_description': symptom_desc,
            'duration': duration,
            'severity': severity,
            'city': city,
            'state': state,
            'lat': lat,
            'lon': lon,
            'preferred_language': np.random.choice(LANGUAGES, p=[0.4, 0.25, 0.08, 0.07, 
                                                                  0.06, 0.05, 0.04, 0.03, 0.02]),
            'preferred_mode': np.random.choice(['online', 'in_person', 'no_preference'],
                                               p=[0.35, 0.25, 0.40]),
            'budget_min': np.random.randint(2, 10) * 100,
            'budget_max': np.random.randint(20, 50) * 100,
            'urgency_level': urgency,
            'context_category': context,
            'target_specialty': target_specialty,
            'target_specialty_idx': SPECIALTIES.index(target_specialty),
            'red_flag_score': red_flag,
            'comorbidity_count': comorbidity_count,
            'disease_rarity_score': rarity_score,
            'symptom_count': np.random.randint(1, 8)
        }
        cases.append(case)
    
    return pd.DataFrame(cases)

cases_df = generate_cases(N_CASES, doctors_df)
print(f"\nGenerated {len(cases_df)} cases")
print(f"Context distribution:\n{cases_df['context_category'].value_counts()}")

Generating cases: 100%|██████████| 15000/15000 [00:04<00:00, 3289.57it/s]



Generated 15000 cases
Context distribution:
context_category
routine         8993
complex         2266
rare_disease    1521
emergency       1476
pediatric        744
Name: count, dtype: int64


In [5]:
# ============================================================
# CELL 5: Compute Embeddings
# ============================================================
print("Loading embedding model...")
encoder = SentenceTransformer('all-MiniLM-L6-v2')
print(f"Embedding dimension: {encoder.get_sentence_embedding_dimension()}")

# Doctor embeddings
print("\nComputing doctor embeddings...")
doctor_texts = doctors_df['expertise_description'].tolist()
doctor_embeddings = encoder.encode(doctor_texts, show_progress_bar=True, 
                                   convert_to_tensor=True)
print(f"Doctor embeddings shape: {doctor_embeddings.shape}")

# Case embeddings
print("\nComputing case embeddings...")
case_texts = cases_df['symptom_description'].tolist()
case_embeddings = encoder.encode(case_texts, show_progress_bar=True,
                                 convert_to_tensor=True, batch_size=64)
print(f"Case embeddings shape: {case_embeddings.shape}")

# Save embeddings
torch.save(doctor_embeddings, '/kaggle/working/data/doctor_embeddings.pt')
torch.save(case_embeddings, '/kaggle/working/data/case_embeddings.pt')
print("Embeddings saved!")

Loading embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding dimension: 384

Computing doctor embeddings...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Doctor embeddings shape: torch.Size([500, 384])

Computing case embeddings...


Batches:   0%|          | 0/235 [00:00<?, ?it/s]

Case embeddings shape: torch.Size([15000, 384])
Embeddings saved!


In [6]:
# ============================================================
# CELL 6: Compute Relevance Labels and Select Doctors per Case
# ============================================================
def haversine_distance(lat1, lon1, lat2, lon2):
    """Compute haversine distance in km."""
    R = 6371  # Earth radius in km
    
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    lon1, lon2 = np.radians(lon1), np.radians(lon2)
    
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    return R * c

def compute_relevance(case, doctor):
    """
    Compute relevance score (0-4) for a doctor-case pair.
    
    4 = Perfect match
    3 = Good match  
    2 = Fair match
    1 = Marginal match
    0 = Not relevant
    """
    score = 0
    
    # Specialty match (most important)
    if doctor['specialty'] == case['target_specialty']:
        score += 2.0
    elif doctor['specialty_idx'] // 5 == case['target_specialty_idx'] // 5:
        # Same specialty group
        score += 1.0
    
    # Availability
    if doctor['availability_score'] > 0.7:
        score += 0.5
    elif doctor['availability_score'] > 0.5:
        score += 0.25
    
    # Location (for non-online preference)
    if case['preferred_mode'] != 'online':
        dist = haversine_distance(case['lat'], case['lon'], 
                                  doctor['lat'], doctor['lon'])
        if dist < 20:
            score += 0.5
        elif dist < 50:
            score += 0.25
    
    # Language match
    if case['preferred_language'] in doctor['languages']:
        score += 0.3
    
    # Mode match
    if case['preferred_mode'] == 'no_preference':
        score += 0.2
    elif doctor['available_modes'] == 'both':
        score += 0.2
    elif doctor['available_modes'] == case['preferred_mode']:
        score += 0.3
    
    # Fee match
    if doctor['consultation_fee'] <= case['budget_max']:
        if doctor['consultation_fee'] >= case['budget_min']:
            score += 0.2
        else:
            score += 0.1
    
    # Trust factors
    if doctor['nmc_verified']:
        score += 0.2
    if doctor['review_score'] > 4.0:
        score += 0.2
    
    # Experience for complex/rare cases
    if case['context_category'] in ['complex', 'rare_disease']:
        if doctor['years_experience'] > 15:
            score += 0.3
        if doctor['publications_count'] > 20:
            score += 0.2
    
    # Convert to 0-4 scale
    if score >= 3.0:
        return 4
    elif score >= 2.0:
        return 3
    elif score >= 1.0:
        return 2
    elif score >= 0.5:
        return 1
    else:
        return 0

print("Computing relevance labels and selecting doctors per case...")

# For each case, compute relevance for all doctors, then select top-50 + 50 random
all_doctor_indices = []
all_relevance_labels = []

doctors_array = doctors_df.to_dict('records')
cases_array = cases_df.to_dict('records')

for case_idx in tqdm(range(N_CASES), desc="Processing cases"):
    case = cases_array[case_idx]
    
    # Compute relevance for all doctors
    relevances = []
    for doc_idx in range(N_DOCTORS):
        doctor = doctors_array[doc_idx]
        rel = compute_relevance(case, doctor)
        relevances.append(rel)
    
    relevances = np.array(relevances)
    
    # Get top-50 by relevance
    top_50_indices = np.argsort(-relevances)[:50]
    
    # Get 50 random from remaining
    remaining = np.setdiff1d(np.arange(N_DOCTORS), top_50_indices)
    random_50_indices = np.random.choice(remaining, 50, replace=False)
    
    # Combine
    selected_indices = np.concatenate([top_50_indices, random_50_indices])
    selected_relevances = relevances[selected_indices]
    
    all_doctor_indices.append(selected_indices)
    all_relevance_labels.append(selected_relevances)

doctor_indices = np.stack(all_doctor_indices)  # (15000, 100)
relevance_labels = np.stack(all_relevance_labels)  # (15000, 100)

print(f"Doctor indices shape: {doctor_indices.shape}")
print(f"Relevance labels shape: {relevance_labels.shape}")
print(f"Relevance distribution: {np.bincount(relevance_labels.flatten())}")

# Save
np.save('/kaggle/working/data/doctor_indices.npy', doctor_indices)
np.save('/kaggle/working/data/relevance_labels.npy', relevance_labels)

Computing relevance labels and selecting doctors per case...


Processing cases: 100%|██████████| 15000/15000 [01:17<00:00, 193.26it/s]


Doctor indices shape: (15000, 100)
Relevance labels shape: (15000, 100)
Relevance distribution: [ 20831 227505 578733 467602 205329]


In [7]:
# ============================================================
# CELL 7: Extract Features for Selected Doctor-Case Pairs
# ============================================================
def extract_features(case_idx, doc_local_idx, cases_df, doctors_df,
                     case_embeddings, doctor_embeddings, doctor_indices):
    """Extract all features for a doctor-case pair."""
    case = cases_df.iloc[case_idx]
    doc_global_idx = doctor_indices[case_idx, doc_local_idx]
    doctor = doctors_df.iloc[doc_global_idx]
    
    case_emb = case_embeddings[case_idx]
    doc_emb = doctor_embeddings[doc_global_idx]
    
    # Clinical features (4)
    cos_sim = torch.nn.functional.cosine_similarity(
        case_emb.unsqueeze(0), doc_emb.unsqueeze(0)
    ).item()
    
    specialty_match = 1.0 if doctor['specialty'] == case['target_specialty'] else \
                      0.5 if doctor['specialty_idx'] // 5 == case['target_specialty_idx'] // 5 else 0.0
    subspecialty_match = 0.5 if specialty_match > 0 else 0.0  # Simplified
    keyword_overlap = 0.3  # Placeholder - would compute properly with tokenization
    
    clinical = [cos_sim, specialty_match, subspecialty_match, keyword_overlap]
    
    # PastWork features (5)
    pub_impact = min(doctor['publications_count'] / 50, 1.0)
    topic_relevance = cos_sim * 0.8  # Simplified
    experience = min(doctor['years_experience'] / 25, 1.0)
    platform_perf = doctor['consultation_completion_rate']
    reputation = (doctor['review_score'] - 2.5) / 2.5 * 0.7 + 0.3
    
    pastwork = [pub_impact, topic_relevance, experience, platform_perf, reputation]
    
    # Logistics features (5)
    availability = doctor['availability_score']
    language_match = 1.0 if case['preferred_language'] in doctor['languages'] else \
                     0.5 if 'English' in doctor['languages'] else 0.0
    
    dist = haversine_distance(case['lat'], case['lon'], doctor['lat'], doctor['lon'])
    proximity = 1.0 - min(dist / 100, 1.0)
    
    fee = doctor['consultation_fee']
    if case['budget_min'] <= fee <= case['budget_max']:
        fee_match = 1.0
    elif fee < case['budget_min']:
        fee_match = 0.8
    else:
        fee_match = max(0, 1.0 - (fee - case['budget_max']) / case['budget_max'])
    
    if case['preferred_mode'] == 'no_preference' or doctor['available_modes'] == 'both':
        mode_match = 1.0
    elif case['preferred_mode'] == doctor['available_modes']:
        mode_match = 1.0
    else:
        mode_match = 0.3
    
    logistics = [availability, language_match, proximity, fee_match, mode_match]
    
    # Trust features (3)
    nmc = 1.0 if doctor['nmc_verified'] else 0.0
    completeness = doctor['profile_completeness']
    review = (doctor['review_score'] - 2.5) / 2.5
    
    trust = [nmc, completeness, review]
    
    return clinical, pastwork, logistics, trust

# Extract features for all selected pairs
print("Extracting features for all selected doctor-case pairs...")

clinical_features = np.zeros((N_CASES, DOCTORS_PER_CASE, 4), dtype=np.float32)
pastwork_features = np.zeros((N_CASES, DOCTORS_PER_CASE, 5), dtype=np.float32)
logistics_features = np.zeros((N_CASES, DOCTORS_PER_CASE, 5), dtype=np.float32)
trust_features = np.zeros((N_CASES, DOCTORS_PER_CASE, 3), dtype=np.float32)

for case_idx in tqdm(range(N_CASES), desc="Extracting features"):
    for doc_local_idx in range(DOCTORS_PER_CASE):
        clinical, pastwork, logistics, trust = extract_features(
            case_idx, doc_local_idx, cases_df, doctors_df,
            case_embeddings, doctor_embeddings, doctor_indices
        )
        clinical_features[case_idx, doc_local_idx] = clinical
        pastwork_features[case_idx, doc_local_idx] = pastwork
        logistics_features[case_idx, doc_local_idx] = logistics
        trust_features[case_idx, doc_local_idx] = trust

# Context features (8) - per case
context_features = np.zeros((N_CASES, 8), dtype=np.float32)
urgency_map = {'routine': 0, 'semi_urgent': 1, 'urgent': 2, 'emergency': 3}

for i, case in cases_df.iterrows():
    context_features[i] = [
        urgency_map[case['urgency_level']] / 3.0,
        case['symptom_count'] / 8.0,
        case['red_flag_score'],
        case['patient_age'] / 90.0,
        case['comorbidity_count'] / 5.0,
        case['disease_rarity_score'],
        1.0 if case['patient_age'] < 18 else 0.0,
        1.0 if case['urgency_level'] == 'emergency' else 0.0
    ]

print(f"Clinical features shape: {clinical_features.shape}")
print(f"Context features shape: {context_features.shape}")

# Save all features
torch.save(torch.tensor(clinical_features), '/kaggle/working/data/clinical_features.pt')
torch.save(torch.tensor(pastwork_features), '/kaggle/working/data/pastwork_features.pt')
torch.save(torch.tensor(logistics_features), '/kaggle/working/data/logistics_features.pt')
torch.save(torch.tensor(trust_features), '/kaggle/working/data/trust_features.pt')
torch.save(torch.tensor(context_features), '/kaggle/working/data/context_features.pt')
torch.save(torch.tensor(relevance_labels), '/kaggle/working/data/relevance_labels.pt')
torch.save(torch.tensor(doctor_indices), '/kaggle/working/data/doctor_indices.pt')

# Save metadata
case_metadata = {
    'context_category': cases_df['context_category'].tolist(),
    'urgency_level': cases_df['urgency_level'].tolist(),
    'target_specialty': cases_df['target_specialty'].tolist()
}
torch.save(case_metadata, '/kaggle/working/data/case_metadata.pt')

# Save dataframes
doctors_df.to_parquet('/kaggle/working/data/doctors.parquet')
cases_df.to_parquet('/kaggle/working/data/cases.parquet')

Extracting features for all selected doctor-case pairs...


Extracting features: 100%|██████████| 15000/15000 [09:04<00:00, 27.54it/s]


Clinical features shape: (15000, 100, 4)
Context features shape: (15000, 8)


In [8]:
# ============================================================
# CELL 8: Compute MCDA Teacher Scores
# ============================================================
print("Computing MCDA teacher scores...")

def compute_mcda_score(clinical, pastwork, logistics, trust):
    """Compute static MCDA score."""
    c_weights = [0.55, 0.20, 0.15, 0.10]
    p_weights = [0.30, 0.25, 0.20, 0.15, 0.10]
    l_weights = [0.30, 0.25, 0.20, 0.15, 0.10]
    t_weights = [0.50, 0.30, 0.20]
    
    c_score = sum(c * w for c, w in zip(clinical, c_weights))
    p_score = sum(p * w for p, w in zip(pastwork, p_weights))
    l_score = sum(l * w for l, w in zip(logistics, l_weights))
    t_score = sum(t * w for t, w in zip(trust, t_weights))
    
    return 0.40 * c_score + 0.25 * p_score + 0.25 * l_score + 0.10 * t_score

mcda_scores = np.zeros((N_CASES, DOCTORS_PER_CASE), dtype=np.float32)

for i in tqdm(range(N_CASES), desc="Computing MCDA scores"):
    for j in range(DOCTORS_PER_CASE):
        mcda_scores[i, j] = compute_mcda_score(
            clinical_features[i, j],
            pastwork_features[i, j],
            logistics_features[i, j],
            trust_features[i, j]
        )

torch.save(torch.tensor(mcda_scores), '/kaggle/working/data/mcda_scores.pt')
print(f"MCDA scores shape: {mcda_scores.shape}")
print(f"MCDA score range: [{mcda_scores.min():.3f}, {mcda_scores.max():.3f}]")

Computing MCDA teacher scores...


Computing MCDA scores: 100%|██████████| 15000/15000 [00:20<00:00, 745.96it/s]

MCDA scores shape: (15000, 100)
MCDA score range: [0.200, 0.730]





In [9]:
# ============================================================
# CELL 9: Create Train/Val/Test Splits
# ============================================================
from sklearn.model_selection import train_test_split

# Stratified split by context category
contexts = cases_df['context_category'].values
indices = np.arange(N_CASES)

# First split: train+val vs test
train_val_idx, test_idx = train_test_split(
    indices, test_size=0.15, stratify=contexts, random_state=42
)

# Second split: train vs val
train_idx, val_idx = train_test_split(
    train_val_idx, test_size=0.15/0.85, 
    stratify=contexts[train_val_idx], random_state=42
)

print(f"Train size: {len(train_idx)}")
print(f"Val size: {len(val_idx)}")
print(f"Test size: {len(test_idx)}")

# Verify stratification
for split_name, split_idx in [('Train', train_idx), ('Val', val_idx), ('Test', test_idx)]:
    split_contexts = contexts[split_idx]
    print(f"\n{split_name} context distribution:")
    unique, counts = np.unique(split_contexts, return_counts=True)
    for ctx, cnt in zip(unique, counts):
        print(f"  {ctx}: {cnt} ({cnt/len(split_idx)*100:.1f}%)")

# Save splits
splits = {
    'train': train_idx.tolist(),
    'val': val_idx.tolist(),
    'test': test_idx.tolist()
}
torch.save(splits, '/kaggle/working/data/splits.pt')

Train size: 10500
Val size: 2250
Test size: 2250

Train context distribution:
  complex: 1586 (15.1%)
  emergency: 1034 (9.8%)
  pediatric: 520 (5.0%)
  rare_disease: 1065 (10.1%)
  routine: 6295 (60.0%)

Val context distribution:
  complex: 340 (15.1%)
  emergency: 221 (9.8%)
  pediatric: 112 (5.0%)
  rare_disease: 228 (10.1%)
  routine: 1349 (60.0%)

Test context distribution:
  complex: 340 (15.1%)
  emergency: 221 (9.8%)
  pediatric: 112 (5.0%)
  rare_disease: 228 (10.1%)
  routine: 1349 (60.0%)


In [10]:
# ============================================================
# CELL 10: Summary and Verification
# ============================================================
print("\n" + "=" * 60)
print("DATA GENERATION COMPLETE")
print("=" * 60)

import os
data_dir = '/kaggle/working/data'
files = os.listdir(data_dir)
print(f"\nGenerated files:")
total_size = 0
for f in sorted(files):
    path = os.path.join(data_dir, f)
    size = os.path.getsize(path) / (1024 * 1024)  # MB
    total_size += size
    print(f"  {f}: {size:.2f} MB")
print(f"\nTotal size: {total_size:.2f} MB")

print(f"\nDataset Statistics:")
print(f"  - Doctors: {N_DOCTORS}")
print(f"  - Cases: {N_CASES}")
print(f"  - Doctors per case: {DOCTORS_PER_CASE}")
print(f"  - Total pairs: {N_CASES * DOCTORS_PER_CASE:,}")
print(f"  - Embedding dimension: 384")
print(f"  - Feature dimensions: clinical(4), pastwork(5), logistics(5), trust(3)")
print(f"  - Context dimension: 8")

print(f"\nSplits:")
print(f"  - Train: {len(train_idx)} cases ({len(train_idx)/N_CASES*100:.1f}%)")
print(f"  - Val: {len(val_idx)} cases ({len(val_idx)/N_CASES*100:.1f}%)")
print(f"  - Test: {len(test_idx)} cases ({len(test_idx)/N_CASES*100:.1f}%)")

print("\n✅ Data ready! Save this notebook output as a Kaggle Dataset.")


DATA GENERATION COMPLETE

Generated files:
  case_embeddings.pt: 21.97 MB
  case_metadata.pt: 5.20 MB
  cases.parquet: 0.79 MB
  clinical_features.pt: 22.89 MB
  context_features.pt: 0.46 MB
  doctor_embeddings.pt: 0.73 MB
  doctor_indices.npy: 11.44 MB
  doctor_indices.pt: 11.45 MB
  doctors.parquet: 0.06 MB
  logistics_features.pt: 28.61 MB
  mcda_scores.pt: 5.72 MB
  pastwork_features.pt: 28.61 MB
  relevance_labels.npy: 11.44 MB
  relevance_labels.pt: 11.45 MB
  splits.pt: 0.04 MB
  trust_features.pt: 17.17 MB

Total size: 178.05 MB

Dataset Statistics:
  - Doctors: 500
  - Cases: 15000
  - Doctors per case: 100
  - Total pairs: 1,500,000
  - Embedding dimension: 384
  - Feature dimensions: clinical(4), pastwork(5), logistics(5), trust(3)
  - Context dimension: 8

Splits:
  - Train: 10500 cases (70.0%)
  - Val: 2250 cases (15.0%)
  - Test: 2250 cases (15.0%)

✅ Data ready! Save this notebook output as a Kaggle Dataset.
