In [3]:
!pip install faker



In [9]:
import pandas as pd
from faker import Faker
import random

# Initialize the Faker module
# Single shared generator; generate_employees() uses it for names and IDs.
fake = Faker()

# Configuration for nationalities based on H-1B visa distributions
# Maps country name -> weight. choose_nationality() hands these values to
# random.choices() as relative weights, so they do not need to sum to 100.
nationality_weights = {
    'India': 74.5, 'China': 11.8, 'Canada': 1.0, 
    'South Korea': 0.9, 'Philippines': 0.6, 
    'Taiwan': 0.6, 'Mexico': 0.6
}

# Define departments and associated roles and salary ranges
# Maps department name -> (job title, minimum salary, maximum salary).
# generate_employees() picks a department from these keys and draws the
# salary as an integer between the two bounds.
department_info = {
    'Legal': ('Paralegal', 50000, 80000),
    'Marketing': ('Marketing Specialist', 60000, 90000),
    'Administrative': ('Administrative Assistant', 35000, 55000),
    'Operations': ('Operations Manager', 70000, 100000),
    'Sales': ('Sales Representative', 45000, 75000),
    'Finance': ('Financial Analyst', 65000, 95000),
    'I/T': ('IT Specialist', 65000, 95000),
    'Product': ('Product Manager', 80000, 120000),
    'Human Resource': ('HR Specialist', 50000, 75000)
}

# Function to randomly select a nationality based on defined weights
def choose_nationality():
    """Draw one nationality at random, weighted by ``nationality_weights``.

    Returns:
        One key of ``nationality_weights``; the chance of each key is
        proportional to its associated value.
    """
    country_names = tuple(nationality_weights.keys())
    relative_shares = tuple(nationality_weights.values())
    # random.choices returns a list even for k=1; unpack its only element.
    (picked,) = random.choices(country_names, weights=relative_shares, k=1)
    return picked

# Function to generate synthetic employee data
def generate_employees(num_employees=10000):
    """Build a DataFrame of randomly generated employee records.

    Each record gets a department (weighted draw), the job title and a
    salary within the range configured for that department in
    ``department_info``, a gender, a Faker-generated name matching that
    gender, a nationality and up to two spoken languages.

    Args:
        num_employees: Number of records to generate. Zero (or a negative
            value) yields an empty frame that still has all columns.

    Returns:
        pandas.DataFrame with the columns 'Employee ID', 'Name', 'Gender',
        'Department', 'Job Title', 'Salary', 'Nationality' and
        'Languages Spoken', one row per generated employee.

    Raises:
        KeyError: If ``department_info`` contains a department that has no
            entry in the weight table below.
    """
    # Relative sampling weight per department. Keyed by name (rather than a
    # bare positional list) so reordering ``department_info`` cannot silently
    # attach a weight to the wrong department.
    department_weights = {
        'Legal': 5,
        'Marketing': 10,
        'Administrative': 10,
        'Operations': 20,
        'Sales': 10,
        'Finance': 5,
        'I/T': 10,
        'Product': 20,
        'Human Resource': 10,
    }
    # Loop-invariant inputs are built once instead of on every iteration.
    departments = list(department_info)
    weights = [department_weights[name] for name in departments]
    language_pool = ['Spanish', 'French', 'German', 'Chinese', 'Hindi']
    columns = [
        'Employee ID', 'Name', 'Gender', 'Department',
        'Job Title', 'Salary', 'Nationality', 'Languages Spoken',
    ]

    employees = []
    for _ in range(num_employees):
        department = random.choices(departments, weights=weights)[0]
        title, min_salary, max_salary = department_info[department]
        # 40% of draws get a weighted non-USA nationality; the rest are 'USA'.
        nationality = choose_nationality() if random.random() < 0.4 else 'USA'
        gender = random.choice(['Male', 'Female'])
        # randint is inclusive on both ends.
        salary = random.randint(min_salary, max_salary)
        # Zero, one or two distinct languages; zero yields an empty string below.
        languages = random.sample(language_pool, random.randint(0, 2))

        employees.append({
            # fake.unique guarantees no ID is repeated within this Faker instance.
            'Employee ID': fake.unique.ssn(),
            'Name': fake.name_male() if gender == 'Male' else fake.name_female(),
            'Gender': gender,
            'Department': department,
            'Job Title': title,
            'Salary': salary,
            'Nationality': nationality,
            'Languages Spoken': ', '.join(languages)
        })

    # Passing ``columns`` keeps the header present even when no rows were
    # generated; for non-empty input the column order is unchanged.
    return pd.DataFrame(employees, columns=columns)

# Function to process CSV input data and generate synthetic employee data
def process_csv_input(csv_file_path):
    """Generate one synthetic employee record per row of an input CSV.

    Only the row count of the input file is used; none of its values are
    carried over into the result.

    Args:
        csv_file_path: Location of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``generate_employees`` with as many rows
        as the input CSV has data rows.
    """
    row_count = len(pd.read_csv(csv_file_path))

    # NOTE: if columns from the input should be kept next to the synthetic
    # ones, the input frame and the generated frame could be concatenated
    # column-wise (axis=1) here before returning.
    return generate_employees(row_count)

# Example usage
# NOTE: both paths are absolute, machine-specific locations; adjust them
# before running this cell on another system.
input_csv_path = 'C:/Users/krish/Music/employees.csv'
# Produces as many synthetic rows as the input CSV has rows.
processed_employee_df = process_csv_input(input_csv_path)
# index=False keeps the DataFrame's row index out of the exported file.
processed_employee_df.to_csv('F:/quarter_3/VA/video/synthetic_employee_data.csv', index=False)
# Quick visual check of the first few generated records.
print(processed_employee_df.head())


   Employee ID                   Name  Gender      Department  \
0  167-13-1869         Philip Wallace    Male           Sales   
1  114-59-2953           William Choi    Male           Sales   
2  680-55-7912         Benjamin Lopez    Male         Product   
3  012-31-5101  Christopher Rodriguez    Male  Human Resource   
4  645-46-7422       Jacqueline Smith  Female             I/T   

              Job Title  Salary Nationality  Languages Spoken  
0  Sales Representative   48306         USA   French, Spanish  
1  Sales Representative   59036         USA  Spanish, Chinese  
2       Product Manager   94455         USA             Hindi  
3         HR Specialist   73211         USA             Hindi  
4         IT Specialist   71789       India           Spanish  
