In [None]:
pip install pandas numpy faker

In [None]:
#import necessary libraries
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import os

#create a Faker instance
fake = Faker()


In [3]:
# Number of users and tasks
NUM_USERS = 10
TASKS_PER_USER = 20

# Priority levels
PRIORITY_LEVELS = ['Low', 'Medium', 'High']


In [38]:

# --- REALISTIC TEAMS, ROLES & TASKS ---
teams = {
    "Engineering": {
        "roles": ["Backend Engineer", "Frontend Engineer", "Data Engineer", "QA Engineer"],
        "tasks": [
            "Fix backend API bug", "Implement new endpoint", "Database schema update",
            "Write frontend feature", "Refactor UI component", "Data pipeline monitoring",
            "ETL batch debugging", "Write unit tests", "Integration testing"
        ]
    },
    "Marketing": {
        "roles": ["Content Writer", "SEO Specialist", "Social Media Manager"],
        "tasks": [
            "Write blog post", "Keyword research", "Schedule social posts",
            "Email campaign setup", "Ad campaign optimization", "Competitor research"
        ]
    },
    "Finance": {
        "roles": ["Accountant", "Financial Analyst"],
        "tasks": [
            "Prepare monthly report", "Reconcile transactions", 
            "Create budget forecast", "Variance analysis", "Invoice processing"
        ]
    },
    "Operations": {
        "roles": ["Operations Associate", "Logistics Coordinator"],
        "tasks": [
            "Inventory check", "Vendor follow-up", "Shipment scheduling",
            "Process documentation", "Resource planning"
        ]
    },
    "Customer Support": {
        "roles": ["Support Agent", "Technical Support"],
        "tasks": [
            "Respond to ticket", "Resolve technical issue", "Customer follow-up",
            "Escalate request", "Live chat assistance"
        ]
    }
}

# --- Assign weights to teams (more realistic distribution) ---
team_weights = {
    "Engineering": 0.5,
    "Marketing": 0.15,
    "Finance": 0.1,
    "Operations": 0.15,
    "Customer Support": 0.1
}

def assign_priority(task):
    task_lower = task.lower()
    if any(word in task_lower for word in ["bug", "error", "issue", "urgent", "fix", "debug"]):
        return "High"
    if any(word in task_lower for word in ["report", "analysis", "campaign", "update", "planning"]):
        return "Medium"
    return "Low"

num_rows = 5000
data = {
    "task_id": [],
    "team": [],
    "role": [],
    "user_id": [],
    "task_name": [],
    "description": [],
    "priority": [],
    "expected_duration": [],
    "completion_time": []
}

task_counter = 1
team_list = list(team_weights.keys())
team_probs = list(team_weights.values())

for _ in range(num_rows):
    team = random.choices(team_list, weights=team_probs, k=1)[0]
    role = random.choice(teams[team]["roles"])
    task_name = random.choice(teams[team]["tasks"])
    description = f"{task_name} for {team.lower()} team by {role.lower()}."
    priority = assign_priority(task_name)
    user_id = random.randint(1, 200)  # Users belong to only one team
    expected = np.random.randint(1, 9)
    completion = round(np.random.normal(expected * 1.2, 3), 2)
    completion = max(0.5, completion)

    data["task_id"].append(task_counter)
    data["team"].append(team)
    data["role"].append(role)
    data["user_id"].append(user_id)
    data["task_name"].append(task_name)
    data["description"].append(description)
    data["priority"].append(priority)
    data["expected_duration"].append(expected)
    data["completion_time"].append(completion)

    task_counter += 1

df = pd.DataFrame(data)

# Create folder if not exists
import os
os.makedirs('data/raw', exist_ok=True)

df.to_csv('data/raw/tasks_dataset.csv', index=False)
df.head()


Unnamed: 0,task_id,team,role,user_id,task_name,description,priority,expected_duration,completion_time
0,1,Engineering,Backend Engineer,64,ETL batch debugging,ETL batch debugging for engineering team by ba...,High,7,7.37
1,2,Operations,Operations Associate,69,Vendor follow-up,Vendor follow-up for operations team by operat...,Low,4,8.36
2,3,Marketing,SEO Specialist,143,Write blog post,Write blog post for marketing team by seo spec...,Low,8,12.64
3,4,Engineering,QA Engineer,198,ETL batch debugging,ETL batch debugging for engineering team by qa...,High,7,8.78
4,5,Engineering,QA Engineer,119,Write unit tests,Write unit tests for engineering team by qa en...,Low,4,2.84


In [39]:
df.head(10)


Unnamed: 0,task_id,team,role,user_id,task_name,description,priority,expected_duration,completion_time
0,1,Engineering,Backend Engineer,64,ETL batch debugging,ETL batch debugging for engineering team by ba...,High,7,7.37
1,2,Operations,Operations Associate,69,Vendor follow-up,Vendor follow-up for operations team by operat...,Low,4,8.36
2,3,Marketing,SEO Specialist,143,Write blog post,Write blog post for marketing team by seo spec...,Low,8,12.64
3,4,Engineering,QA Engineer,198,ETL batch debugging,ETL batch debugging for engineering team by qa...,High,7,8.78
4,5,Engineering,QA Engineer,119,Write unit tests,Write unit tests for engineering team by qa en...,Low,4,2.84
5,6,Engineering,Frontend Engineer,57,Write frontend feature,Write frontend feature for engineering team by...,Low,1,0.5
6,7,Engineering,Data Engineer,88,Fix backend API bug,Fix backend API bug for engineering team by da...,High,7,7.1
7,8,Engineering,Data Engineer,27,Fix backend API bug,Fix backend API bug for engineering team by da...,High,1,2.26
8,9,Customer Support,Support Agent,14,Customer follow-up,Customer follow-up for customer support team b...,Low,4,4.77
9,10,Marketing,Content Writer,152,Write blog post,Write blog post for marketing team by content ...,Low,6,7.59


In [40]:
df.shape

(5000, 9)