In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import random
from faker import Faker

# Create a Faker object
fake = Faker()

# Seed every random source for reproducibility.
# Faker draws from its own generator, which np.random.seed() and
# random.seed() do not touch, so it needs its own seed; without it the
# generated names and e-mail addresses differ on every run.
Faker.seed(0)
np.random.seed(0)
random.seed(0)

# Size of the synthetic dataset (one row per employee)
num_rows = 50

# Job titles that the 'job_role' column is sampled from
job_roles = [
    'Software Engineer',
    'Data Scientist',
    'Data Analyst',
    'IT Support',
    'Automation Tester',
    'Senior Data Consultant',
    'Junior Developer',
]

# Build the columns one at a time.
# NOTE: keep this order -- every call advances the seeded random state, so
# reordering the assignments would change the generated values.
data = {}
data['name'] = [fake.name() for _ in range(num_rows)]
data['email'] = [fake.email() for _ in range(num_rows)]
data['job_role'] = np.random.choice(job_roles, size=num_rows)
# Uniform draw from [0, 10), rounded to one decimal place
data['tenure'] = np.random.uniform(0, 10, size=num_rows).round(1)
# Integers from 50000 to 149999 (upper bound is exclusive)
data['salary'] = np.random.randint(50000, 150000, size=num_rows)
data['promotion_history'] = np.random.choice([0, 1], size=num_rows)
# Integers from 0 to 100
data['training_hours'] = np.random.randint(0, 101, size=num_rows)
# Scores in the range 1-5
data['work_life_balance_score'] = np.random.randint(1, 6, size=num_rows)
data['turnover'] = np.random.choice([0, 1], size=num_rows)

# Assemble the columns into a Pandas DataFrame
df = pd.DataFrame(data)

# Introduce NULL values in some rows for selected columns
num_nulls = 5  # Number of null values to introduce per column
null_columns = ['email', 'tenure', 'salary']  # Columns to add null values

for col in null_columns:
    null_indices = random.sample(range(num_rows), num_nulls)
    # An integer column cannot hold a missing value. Assigning None used to
    # upcast it to float64 silently; pandas >= 2.1 warns about that implicit
    # upcast and a later release is planned to raise. Casting explicitly
    # yields the same float64 column without relying on the deprecated path.
    if pd.api.types.is_integer_dtype(df[col]):
        df[col] = df[col].astype('float64')
    df.loc[null_indices, col] = None

# Add some unrealistic dummy data to simulate data quality issues.
# The rows are sampled independently of the null rows above, so a dirty
# value may overwrite a null that was just introduced in the same column.
df.loc[random.sample(range(num_rows), 3), 'salary'] = [-1000, 2000000, 0]  # Unrealistic salaries
df.loc[random.sample(range(num_rows), 2), 'job_role'] = ["Unknown", ""]  # Dummy/invalid job roles
df.loc[random.sample(range(num_rows), 2), 'work_life_balance_score'] = [6, 0]  # Out-of-range scores

# Show the title line followed by the full DataFrame
print("Employee Turnover Prediction for HR Analytics:", df, sep="\n")

# Write the dataset to a CSV file, leaving out the row index
df.to_csv(path_or_buf='employee_turnover_prediction_data.csv', index=False)


Employee Turnover Prediction for HR Analytics:
                   name                         email                job_role  \
0        Cheryl Jackson          howard86@example.com       Automation Tester   
1          Robert Smith           jmorrow@example.net  Senior Data Consultant   
2          Jeffrey Shaw                          None       Software Engineer   
3       James Mcpherson          rhonda21@example.net              IT Support   
4       Timothy Chapman              qcox@example.com              IT Support   
5          April Newton        lowerycody@example.com              IT Support   
6          Eric Beasley           imorris@example.net                 Unknown   
7       Jordan Martinez       darryljones@example.org              IT Support   
8         Austin Powell           brian44@example.org  Senior Data Consultant   
9          Michael Webb      edwardsjames@example.com            Data Analyst   
10       Phillip Kelley          vaguirre@example.com       Au