In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Reproducibility
# Every column below is drawn with the stdlib `random` module, so that module
# must be seeded too; seeding NumPy alone leaves the generated values different
# on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Constants
num_rows = 10000
# Each category list deliberately mixes canonical values with case variants,
# abbreviations and blanks so the output needs cleaning.
departments = ["HR", "Finance", "Engineering", "Marketing", "Sales", "Admin", "finance", "hr", "Engg"]
positions = ["Manager", "Analyst", "Engineer", "Executive", "Clerk", "Director", "Intern", "engineer", ""]
cities = ["New York", "Los Angeles", "Chicago", "Houston", "San Jose", "newyork", "LA", "chicago"]

first_names = ["John", "Alice", "Bob", "Charlie", "Diana", "Eve", ""]
last_names = [" Smith", " Johnson", " Brown", " ", "", None]
genders = ["Male", "Female", "M", "F", "male", "female", "", None]
bad_join_dates = ["2015-15-01", "30/02/2017", "", None]
emails = [
    "john@gmail.com", "alice@", "bob@yahoo", "evehotmail.com", "charlie@gmail.com", "",
    None, "diana@@gmail.com",
]
performance_scores = ["A", "B", "C", "D", "E", "F", "Excellent", "Good", "average", None, ""]

join_date_start = datetime(2010, 1, 1)
max_join_offset_days = 5000
bad_join_date_rate = 0.05

# Generate messy HR data
# NOTE: the dict entries are evaluated top to bottom, so the order of the
# columns determines the order in which random numbers are consumed.
data = {
    "Employee_ID": [f"EMP{1000 + i}" for i in range(num_rows)],
    # str(None) is the literal text "None", so some names end in "None".
    "Employee_Name": [
        str(random.choice(first_names)) + str(random.choice(last_names))
        for _ in range(num_rows)
    ],
    "Gender": [random.choice(genders) for _ in range(num_rows)],
    "Department": [random.choice(departments) for _ in range(num_rows)],
    "Position": [random.choice(positions) for _ in range(num_rows)],
    # One valid salary competes with four invalid placeholders, so a numeric
    # value is picked roughly one time in five.
    "Salary": [
        random.choice([round(random.uniform(25000, 200000), 2), "N/A", None, "", "twenty thousand"])
        for _ in range(num_rows)
    ],
    # Mostly valid ISO dates; otherwise one of the malformed/missing values.
    "Join_Date": [
        (join_date_start + timedelta(days=random.randint(0, max_join_offset_days))).strftime("%Y-%m-%d")
        if random.random() > bad_join_date_rate
        else random.choice(bad_join_dates)
        for _ in range(num_rows)
    ],
    "City": [random.choice(cities) for _ in range(num_rows)],
    "Email": [random.choice(emails) for _ in range(num_rows)],
    # One plausible age competes with blanks, text and out-of-range numbers.
    "Age": [
        random.choice([random.randint(20, 65), "", None, "twenty five", 120, -5])
        for _ in range(num_rows)
    ],
    "Performance_Score": [random.choice(performance_scores) for _ in range(num_rows)],
}

df = pd.DataFrame(data)

# Introduce duplicates (5%)
duplicates = df.sample(frac=0.05, random_state=SEED)
df_unclean = pd.concat([df, duplicates], ignore_index=True)

# Shuffle rows so the duplicated records are not grouped at the end
df_unclean = df_unclean.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Save to CSV
file_path = "Assignment2_HR_DT.csv"
df_unclean.to_csv(file_path, index=False)

print(f"✅ Unclean HR dataset generated successfully with {len(df_unclean)} rows!")
print(f"📂 Saved as: {file_path}")


✅ Unclean HR dataset generated successfully with 10500 rows!
📂 Saved as: Assignment2_HR_DT.csv


Dataset Description:
This is a synthetic unclean HR dataset with 10,000+ records, designed for data analytics, testing, and cleaning exercises. It includes realistic HR attributes like Employee ID, Name, Gender, Department, Position, Salary, Join Date, City, Email, Age, and Performance Score.

The dataset intentionally contains messy and inconsistent data, including:

Missing values and nulls

Typos and inconsistent formatting (e.g., “HR” vs “hr”, “Engineer” vs “engineer”)

Invalid or malformed entries (e.g., negative ages, invalid dates, wrong emails)

Mixed data types (strings in numeric fields, etc.)

Duplicates (~5% of the data)

This makes it perfect for practicing data cleaning, preprocessing, and validation techniques in Python or any data analytics workflow.