# 🎯 Intern Performance Dataset Generator
---------------------------------------
- This script generates a synthetic dataset of 10,000 intern records (configurable via `num_rows`) using Faker and Python's `random` module.
- It saves the data to a CSV file for use in dashboards, ML models, and reporting.

In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker and seed

In [4]:
# Shared Faker instance used for names, task numbers and assignment dates.
fake = Faker()

# Seed every random source the script draws from so reruns are repeatable.
SEED = 42
Faker.seed(SEED)      # Faker providers (fake.name, fake.random_int, fake.date_between)
random.seed(SEED)     # stdlib `random`: departments, projects, statuses, durations, scores
np.random.seed(SEED)  # NumPy global RNG, kept seeded for any NumPy-based sampling

# NOTE: assignment dates are drawn relative to 'today', so the date columns
# still shift with the day the script is run even though the RNGs are seeded.

# Parameters

In [5]:
# Number of intern records to generate.
num_rows = 10_000

# Categorical value pools sampled for each record.
departments = [
    'Tech',
    'Marketing',
    'HR',
    'Finance',
    'Design',
]
projects = [
    'Website Revamp',
    'SEO Campaign',
    'Recruitment Drive',
    'Budget Planning',
    'Brand Redesign',
]
# Order matters: the generation loop samples statuses with positional weights.
statuses = [
    'Completed',
    'Ongoing',
    'Dropped',
]
interaction_levels = [
    'High',
    'Medium',
    'Low',
]

# Generate dataset

In [6]:
# Inclusive (min, max) duration in days of a completed task, keyed by department.
# Departments not listed here fall back to the default range.
_COMPLETED_DAYS = {'Tech': (3, 7), 'HR': (10, 15)}
_DEFAULT_COMPLETED_DAYS = (6, 10)

# Inclusive ((quality min, max), (feedback min, max)) score ranges of a
# completed task, keyed by department, with a default for the rest.
_COMPLETED_SCORES = {
    'Tech': ((8, 10), (4, 5)),
    'HR': ((5, 7), (2, 3)),
}
_DEFAULT_COMPLETED_SCORES = ((6, 9), (3, 5))


def _build_record(offset):
    """Return one synthetic intern record as a list of twelve field values.

    ``offset`` is the zero-based row index; the intern ID is ``1000 + offset``.
    The fields are ordered: ID, name, department, task name, project,
    assignment date, end date, assignment month name, quality score,
    feedback score, status, interaction level.
    """
    name = fake.name()
    dept = random.choice(departments)
    task = f"Task {fake.random_int(min=1, max=100)}"
    assigned_project = random.choice(projects)
    assigned_on = fake.date_between(start_date='-6M', end_date='today')
    # Weights are positional against `statuses`.
    state = random.choices(statuses, weights=(70, 20, 10), k=1)[0]
    is_completed = state == 'Completed'

    if state == 'Ongoing':
        # Ongoing tasks have no end date yet.
        finished_on = pd.NaT
    else:
        if is_completed:
            low, high = _COMPLETED_DAYS.get(dept, _DEFAULT_COMPLETED_DAYS)
        else:
            # Any other status ends within 1-5 days of assignment.
            low, high = 1, 5
        finished_on = assigned_on + pd.Timedelta(days=random.randint(low, high))

    if is_completed:
        quality_range, feedback_range = _COMPLETED_SCORES.get(
            dept, _DEFAULT_COMPLETED_SCORES
        )
        quality = random.randint(*quality_range)
        feedback = random.randint(*feedback_range)
    else:
        # Unfinished work gets no quality score, only a 2 or 3 feedback score.
        quality = np.nan
        feedback = random.choice([2, 3])

    interaction = random.choice(interaction_levels)

    return [
        1000 + offset, name, dept, task, assigned_project, assigned_on,
        finished_on, assigned_on.strftime('%B'), quality, feedback, state,
        interaction,
    ]


# One row per intern, in ascending intern-ID order.
data = [_build_record(i) for i in range(num_rows)]

# Create DataFrame

In [7]:
# Column headers, listed in the same order as the values in each row of `data`.
columns = [
    # Identity and assignment
    'Intern ID',
    'Intern Name',
    'Department',
    'Task Name',
    'Project Assigned',
    # Timeline
    'Date of Assignment',
    'Date of Completion',
    'Month',
    # Evaluation and outcome
    'Project_Quality_Score',
    'Mentor_Feedback_Score',
    'Completion_Status',
    'Interaction_Level',
]

# Assemble the generated rows into a labelled table.
df = pd.DataFrame(data, columns=columns)

# Save dataset

In [None]:
# Destination file, relative to the current working directory.
output_path = "intern_performance_dataset.csv"

# Write the table without the DataFrame index column, then confirm on stdout.
df.to_csv(path_or_buf=output_path, index=False)
print(f"✅ Dataset generated and saved as: {output_path}")