# In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_hr_data(num_records=5000, output_path='hr_recruitment_data.csv', seed=42):
    """Generate a synthetic HR dataset for recruitment ROI analysis.

    Each record describes one candidate: sourcing channel, university (only
    for campus hires), location, department, funnel dates, sourcing cost,
    tenure, performance rating, 90-day retention and offer acceptance.

    Args:
        num_records: Number of candidate rows to generate. Zero or a
            negative value produces an empty dataset that still has the
            full column schema.
        output_path: Where to write the CSV. Pass ``None`` to skip writing
            and only get the DataFrame back.
        seed: Seed for the private random generator; the same seed always
            produces the same dataset. ``None`` seeds from OS entropy.

    Returns:
        pandas.DataFrame with one row per generated candidate.
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by np.random.* calls, but leaves the global generator (and
    # therefore the caller's own random state) untouched.
    rng = np.random.RandomState(seed)

    # Configuration - Expanded University List
    tier_1_unis = [
        'Stanford University', 'MIT', 'UC Berkeley', 'Harvard University',
        'Princeton', 'Yale', 'Oxford', 'Cambridge', 'ETH Zurich', 'IIT Bombay'
    ]
    tier_2_unis = [
        'Georgia Tech', 'University of Michigan', 'UT Austin', 'University of Washington',
        'Purdue University', 'Texas A&M', 'University of Toronto', 'UCL', 'National University of Singapore'
    ]
    general_unis = [
        'State University', 'City College', 'Regional Institute', 'Community College of Tech',
        'Polytechnic University', 'Open University', 'International Business School'
    ]

    all_unis = tier_1_unis + tier_2_unis + general_unis
    # Sets give O(1) tier membership checks inside the loop.
    tier_1_set = set(tier_1_unis)
    tier_2_set = set(tier_2_unis)

    sources = ['LinkedIn', 'Employee Referral', 'Recruitment Agency', 'Indeed', 'Direct Application', 'University Campus']
    source_weights = [0.30, 0.20, 0.15, 0.15, 0.10, 0.10]
    departments = ['Engineering', 'Sales', 'Marketing', 'Finance', 'HR', 'Product', 'Data & AI', 'Customer Success']
    locations = ['New York', 'San Francisco', 'London', 'Remote', 'Austin', 'Singapore', 'Berlin']
    genders = ['Male', 'Female', 'Non-Binary']
    gender_weights = [0.48, 0.48, 0.04]

    # Explicit schema so an empty dataset still has every column.
    columns = [
        'Candidate_ID', 'Name', 'Gender', 'Source', 'University', 'Location',
        'Department', 'Application_Date', 'Hire_Date', 'Sourcing_Cost',
        'Tenure_Months', 'Performance_Rating', 'Retained_90_Days', 'Offer_Accepted',
    ]

    start_date = datetime(2023, 1, 1)
    data = []

    # NOTE: the order of the rng draws below is significant - changing it
    # changes the dataset produced for a given seed.
    for i in range(num_records):
        source = rng.choice(sources, p=source_weights)
        dept = rng.choice(departments)
        loc = rng.choice(locations)

        # Sourcing Cost Logic
        if source == 'Recruitment Agency':
            cost = rng.normal(15000, 3000)
        elif source == 'Employee Referral':
            cost = rng.choice([1000, 2500, 5000], p=[0.5, 0.4, 0.1])  # Tiered referral bonuses
        elif source == 'LinkedIn':
            cost = rng.normal(1800, 400)
        elif source == 'University Campus':
            cost = rng.normal(5000, 1000)  # Travel + Event costs
        else:
            cost = rng.normal(600, 150)
        # A normal draw can (rarely) fall below zero; a cost cannot.
        cost = max(0.0, cost)

        # Quality and Tenure Logic
        # University logic: Tier 1 has higher performance but sometimes shorter tenure (poaching risk)
        selected_uni = 'N/A'
        quality_multiplier = 1.0
        tenure_multiplier = 1.0

        if source == 'University Campus':
            selected_uni = rng.choice(all_unis)
            if selected_uni in tier_1_set:
                quality_multiplier = 1.25
                tenure_multiplier = 0.85  # High performers leave faster
            elif selected_uni in tier_2_set:
                quality_multiplier = 1.1
                tenure_multiplier = 1.1

        # Source-based multipliers
        if source == 'Employee Referral':
            quality_multiplier *= 1.1
            tenure_multiplier *= 1.3
            retention_90_rate = 0.96
        elif source == 'Recruitment Agency':
            quality_multiplier *= 0.9
            tenure_multiplier *= 0.7
            retention_90_rate = 0.68
        else:
            retention_90_rate = 0.84

        # Calculate Final Stats
        base_tenure = rng.normal(24, 10) * tenure_multiplier
        base_performance = rng.normal(3.4, 0.6) * quality_multiplier

        tenure_months = max(1, int(base_tenure))  # at least one month
        perf_rating = min(5.0, max(1.0, round(base_performance, 1)))  # clamp to the 1-5 scale
        retained_90_days = 1 if rng.random_sample() < retention_90_rate else 0

        # Funnel Dates
        # Spreading data over 2 years; int() keeps timedelta happy regardless
        # of the integer type the generator returns.
        apply_date = start_date + timedelta(days=int(rng.randint(0, 730)))
        hire_date = apply_date + timedelta(days=int(rng.randint(10, 90)))

        gender = rng.choice(genders, p=gender_weights)
        offer_accepted = 1 if rng.random_sample() < 0.78 else 0

        data.append({
            'Candidate_ID': f'C-{5000+i}',
            'Name': f'Employee_{i}',
            'Gender': gender,
            'Source': source,
            'University': selected_uni,
            'Location': loc,
            'Department': dept,
            'Application_Date': apply_date.strftime('%Y-%m-%d'),
            'Hire_Date': hire_date.strftime('%Y-%m-%d'),
            'Sourcing_Cost': round(cost, 2),
            'Tenure_Months': tenure_months,
            'Performance_Rating': perf_rating,
            'Retained_90_Days': retained_90_days,
            'Offer_Accepted': offer_accepted,
        })

    df = pd.DataFrame(data, columns=columns)

    # Save to CSV (skipped when the caller only wants the DataFrame)
    if output_path is not None:
        df.to_csv(output_path, index=False)
        print(f"Dataset '{output_path}' generated with {num_records} records.")
    print(f"Includes {len(all_unis)} unique universities across 3 tiers.")

    return df

# Script entry point: build a small 400-row sample when run directly.
if __name__ == "__main__":
    generate_hr_data(num_records=400)

# Output:
# Dataset 'hr_recruitment_data.csv' generated with 400 records.
# Includes 26 unique universities across 3 tiers.
