In [38]:
import pandas as pd
import numpy as np
import uuid
import random
from faker import Faker
import bcrypt
from datetime import datetime, timedelta
import os

In [40]:
# %pip install bcrypt


In [41]:
fake = Faker()

num_users = 10000
num_projects = 500

num_projects_to_users = 5000
num_projects_to_feedback_question = 2000

num_timesheets = 10000
num_feedback_answers = 8000

In [42]:
def generateUsers() :
    roles = ['admin', 'user', 'manager', 'employee']
    weights = [10, 50, 20, 20]

    email_ids = []
    passwords = []
    roles_list = []
    names = []
    empIds= []

    password = bcrypt.hashpw('org_pass_1234'.encode('utf-8'), bcrypt.gensalt()).decode('utf-8')

    # Generate and store random data for each row
    for _ in range(num_users):
        while True:
            email = fake.email()
            if email not in email_ids:
                break
        role = np.random.choice(roles, p=np.array(weights)/sum(weights))
        name = fake.name()
        empId = f"jman{random.randint(1000, 9999)}"

        email_ids.append(email)
        passwords.append(password)
        roles_list.append(role)
        names.append(name)
        empIds.append(empId)
    # Create a DataFrame
    data = {
        'email': email_ids,
        'pass': passwords,
        'role': roles_list,
        'name': names,
        'empId': empIds
    }

    return data

users_df = pd.DataFrame(generateUsers())
users_df

Unnamed: 0,email,pass,role,name,empId
0,matthew96@example.com,$2b$12$sXcXlu4UVvPqy/VIkF.g3upDt1U/HHzdNeh9XAZ...,user,Terry Thomas,jman9922
1,gregoryberry@example.org,$2b$12$sXcXlu4UVvPqy/VIkF.g3upDt1U/HHzdNeh9XAZ...,admin,Tonya Thomas,jman1123
2,davidhansen@example.com,$2b$12$sXcXlu4UVvPqy/VIkF.g3upDt1U/HHzdNeh9XAZ...,user,Tyler Oneill,jman8504
3,frosario@example.org,$2b$12$sXcXlu4UVvPqy/VIkF.g3upDt1U/HHzdNeh9XAZ...,user,Connie Lynch,jman4520
4,ngonzalez@example.org,$2b$12$sXcXlu4UVvPqy/VIkF.g3upDt1U/HHzdNeh9XAZ...,employee,Jason Buckley,jman6726
...,...,...,...,...,...
9995,justinlarsen@example.com,$2b$12$sXcXlu4UVvPqy/VIkF.g3upDt1U/HHzdNeh9XAZ...,employee,Melanie Luna,jman7557
9996,paul81@example.net,$2b$12$sXcXlu4UVvPqy/VIkF.g3upDt1U/HHzdNeh9XAZ...,user,Helen Barker,jman5958
9997,lucaslewis@example.com,$2b$12$sXcXlu4UVvPqy/VIkF.g3upDt1U/HHzdNeh9XAZ...,manager,Tammy Green,jman3992
9998,dhiggins@example.org,$2b$12$sXcXlu4UVvPqy/VIkF.g3upDt1U/HHzdNeh9XAZ...,manager,Adrian Hopkins,jman8390


In [43]:
def generateProjects():
    project_domain_choices = ['Financial Services', 'Healthcare', 'Education Technology', 'E-commerce', 'Transportation and Logistics', 'Renewable Energy', 'Artificial Intelligence and Machine Learning', 'Cybersecurity', 'Media and Entertainment', 'Telecommunications']
    ids = []
    names = []
    domains = []

    for _ in range(num_projects):

        while True:
            project_id = str(uuid.uuid4())
            if project_id not in ids:
                break

        while True:
            project_name = fake.company() + " " + fake.word(ext_word_list=['Project', 'Solution', 'Initiative'])
            if project_name not in names:
                break
        
        project_domain = random.choice(project_domain_choices)

        ids.append(project_id)
        names.append(project_name)
        domains.append(project_domain)
    
    data = {
        'proj_id': ids, 
        'proj_name': names,
        'proj_domain': domains
    }
    return data

project_df = pd.DataFrame(generateProjects())
project_df.shape

(500, 3)

In [44]:
def generateFeedbackQuestions():
    ids = []
    feedback_questions = ['How satisfied are you with the overall progress of the project?', 'Rate the communication between team members on a scale of 1 to 5', 'Did you find the project timeline realistic? Please rate from 1 to 5', 'How would you rate the quality of work delivered by the team?', 'Rate the effectiveness of the project management on a scale of 1 to 5', 'Were the project goals clearly defined? Please rate from 1 to 5', 'How satisfied are you with the level of collaboration within the team?', 'Rate the accuracy of the project budget estimation from 1 to 5', 'Did the project meet your expectations? Please rate from 1 to 5', 'How would you rate the problem-solving skills of the team?', 'Rate the level of stakeholder engagement on a scale of 1 to 5', 'Were the project risks effectively managed? Please rate from 1 to 5', 'How satisfied are you with the project deliverables?', 'Rate the level of innovation demonstrated in the project from 1 to 5', 'Did the project meet the specified deadlines? Please rate from 1 to 5', 'How would you rate the level of client satisfaction with the project outcomes?', 'Rate the effectiveness of the project feedback mechanisms on a scale of 1 to 5', 'Were the project resources allocated efficiently? Please rate from 1 to 5', 'How satisfied are you with the level of transparency in project communication?', 'Rate the level of adaptability shown by the team members on a scale of 1 to 5']

    for _ in range(len(feedback_questions)):

        while True:
            question_id = str(uuid.uuid4())
            if question_id not in ids:
                break

        ids.append(question_id)
    
    data = {
        'id': ids, 
        'question': feedback_questions
    }
    return data

feedbackQuestion_df = pd.DataFrame(generateFeedbackQuestions())
print(feedbackQuestion_df)

                                      id   
0   6ada07ba-4dbf-40f9-8ece-67a731338d54  \
1   8532db01-673d-40e1-8f44-43ad536cae4c   
2   fe4e85e6-7689-4d68-9bdc-0a72e21ef924   
3   b0cfe4c8-72ef-42d7-8b14-5874a3341c9c   
4   3c4f9e4b-e9ab-4ce1-aa9e-bb39d78e10c5   
5   6a133a22-6238-4597-9ebf-6e88c6e3b9be   
6   eb1c455c-48c4-4e87-bff0-a8c8ca99fd3f   
7   0183e3dc-4aa1-4650-b78e-3dc6ef51aa5d   
8   974a1b62-092d-49b7-9787-f5a30ab32d20   
9   3b6933f0-8b12-46d2-a57f-0b655910ff8d   
10  66f20022-2d29-418b-875a-bdbbe500b174   
11  84d4b0e6-db61-43cd-9a33-c00436590197   
12  f9836aa6-94fe-4982-bbe0-53049fb51167   
13  caf3a62f-8c01-49a2-96fc-a001fa90d98f   
14  04a4e3bc-4236-4fe8-9bc2-ea8e772eef02   
15  ee3741c4-c8b3-450d-ab19-624a8f5702ed   
16  8cc2f6a8-6e1b-485e-a3aa-e4479a489462   
17  ad15bb82-d353-40b0-a418-b78127f09a36   
18  1ec642e9-bb71-424a-807e-a2b028dff232   
19  c314c9d1-615a-43c3-bb90-93625ee3f4be   

                                             question  
0   How satisfied a

In [45]:
# def generateProjectsToUsers(): 
#     project_names = random.choices(project_df['proj_name'], k=num_projects_to_users)
#     empIds = random.choices(users_df['empId'], k=num_projects_to_users)
    

#     data = {
#         'projectName': project_names,
#         'employeeID': empIds
#     }
#     return data

# projectsToUsers_df = pd.DataFrame(generateProjectsToUsers())
# projectsToUsers_df

def generateProjectsToUsers( num_users):
    projects = project_df['proj_name'].tolist()
    users = users_df['empId'].tolist()
    
    project_user_mapping = {}
    for project in projects:
        project_user_mapping[project] = random.sample(users, min(random.randint(1, 10), num_users))

    projects_assigned = []
    users_assigned = []
    for project, project_users in project_user_mapping.items():
        projects_assigned.extend([project] * len(project_users))
        users_assigned.extend(project_users)

    # Shuffle to randomize the assignments
    combined = list(zip(projects_assigned, users_assigned))
    random.shuffle(combined)
    projects_assigned, users_assigned = zip(*combined)

    data = {
        'projectName': projects_assigned[:num_projects_to_users],
        'employeeID': users_assigned[:num_projects_to_users]
    }
    return data

# num_projects_to_users = 50  # Number of projects to assign users to
num_users = 1000  # Number of users
projectsToUsers_df = pd.DataFrame(generateProjectsToUsers( num_users))
projectsToUsers_df.head()


Unnamed: 0,projectName,employeeID
0,Richards-Clark Initiative,jman7274
1,Baker Ltd Project,jman2038
2,Mays-Moon Project,jman9194
3,Peterson-Lopez Project,jman4626
4,Brown-Abbott Solution,jman9029


In [46]:
print(projectsToUsers_df.shape)

(2846, 2)


In [47]:

def generate_random_sunday():
    today = datetime.today()
    random_days = random.randint(0, 365 * 2)
    random_date = today - timedelta(days=random_days)
    while random_date.weekday() != 6:
        random_date -= timedelta(days=1)
    return {
        'formatted': random_date.strftime('%d-%m-%Y'),
        'original': random_date
    }






def generateTimesheets(num_projects, num_users, projects_df, users_df):
    comments_choices = ["Excited to get started!", "Looking forward to the challenge", "Ready to dive in",
                        "Feeling motivated", "Can't wait to see the end result",
                        "Feeling optimistic about this project", "Hopeful for a successful outcome",
                        "Eager to collaborate with the team"]
    hours_choices = np.arange(0, 13)
    weekend_weights = [6, 6, 5, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    normal_weights = [1, 1, 1, 1, 6, 5, 4, 1, 1, 1, 1, 1, 1]
    activity_names_choices = ['BAU Activity', 'Sales Activity']
    task_choices = ['Task A', 'Task B', 'Task C', 'Task D', 'Task E']

    empIds = []
    date_starts = []
    date_ends = []
    projects_selected = []
    tasks_selected = []
    comments_added = []
    d0s = []
    d1s = []
    d2s = []
    d3s = []
    d4s = []
    d5s = []
    d6s = []
    activity_names = []

    projects = projects_df['projectName'].tolist()
    users = users_df['empId'].tolist()

    # Generate and store random data for each row
    for project in projects:
        project_duration = random.randint(6, 10) * 30  # Random duration between 6 to 10 months
        start_date = generate_random_sunday()['original']
        end_date = start_date + timedelta(days=project_duration)

        for user in random.sample(users, random.randint(1, min(3, num_users))):
            for week_start in pd.date_range(start=start_date, end=end_date, freq='W-SUN'):
                week_end = week_start + timedelta(days=6)

                # Randomly select a task and comment
                task_selected = random.choice(task_choices)
                comment_added = random.choice(comments_choices) if random.random() > 0.5 else None

                # Generate random hours for each day of the week
                hours = [0] * 7
                for i, weights in enumerate([weekend_weights, normal_weights, normal_weights, normal_weights, normal_weights, normal_weights, weekend_weights]):
                    if random.random() > 0.3:
                        hours[i] = np.random.choice(hours_choices, p=np.array(weights)/sum(weights))
                
                # Check if total hours don't exceed 80
                total_hours = sum(hours)
                if total_hours > 80:
                    scale_factor = 80 / total_hours
                    hours = [int(h * scale_factor) for h in hours]

                # Append data to lists
                empIds.append(user)
                projects_selected.append(project)
                date_starts.append(week_start.strftime('%d-%m-%Y'))
                date_ends.append(week_end.strftime('%d-%m-%Y'))
                tasks_selected.append(task_selected)
                comments_added.append(comment_added)
                d0s.append(hours[0])
                d1s.append(hours[1])
                d2s.append(hours[2])
                d3s.append(hours[3])
                d4s.append(hours[4])
                d5s.append(hours[5])
                d6s.append(hours[6])
                activity_names.append(random.choice(activity_names_choices))

    data = {
        'empId': empIds,
        'projectSelected': projects_selected,
        'dateStart': date_starts,
        'dateEnd': date_ends,
        'taskSelected': tasks_selected,
        'commentAdded': comments_added,
        'd0': d0s,
        'd1': d1s,
        'd2': d2s,
        'd3': d3s,
        'd4': d4s,
        'd5': d5s,
        'd6': d6s,
        'activityName': activity_names
    }

    return data

num_projects = 2500  # Number of projects
num_users = 100000  # Number of users
timesheets_df = pd.DataFrame(generateTimesheets(num_projects, num_users, projectsToUsers_df, users_df))
timesheets_df.shape
print(timesheets_df)

           empId                    projectSelected   dateStart     dateEnd   
0       jman6188          Richards-Clark Initiative  25-12-2022  31-12-2022  \
1       jman6188          Richards-Clark Initiative  01-01-2023  07-01-2023   
2       jman6188          Richards-Clark Initiative  08-01-2023  14-01-2023   
3       jman6188          Richards-Clark Initiative  15-01-2023  21-01-2023   
4       jman6188          Richards-Clark Initiative  22-01-2023  28-01-2023   
...          ...                                ...         ...         ...   
195929  jman6680  Wolfe, Wilson and Shields Project  19-11-2023  25-11-2023   
195930  jman6680  Wolfe, Wilson and Shields Project  26-11-2023  02-12-2023   
195931  jman6680  Wolfe, Wilson and Shields Project  03-12-2023  09-12-2023   
195932  jman6680  Wolfe, Wilson and Shields Project  10-12-2023  16-12-2023   
195933  jman6680  Wolfe, Wilson and Shields Project  17-12-2023  23-12-2023   

       taskSelected                      commentAdd

In [48]:
def generateProjectsToFeedbackQuestions(num_projects_to_feedback_question ): 
    project_names = random.choices(project_df['proj_name'], k=num_projects_to_feedback_question)
    question_names = random.choices(feedbackQuestion_df['question'], k=num_projects_to_feedback_question)
   
    data = {
        'projectName': project_names,
        # 'questionName': question_names
    }
    for i in range(1, 6):
        data[f'Question{i}'] = question_names
    return data
num_projects_to_feedback_question = 5
projectsToFeedbackQuestions_df = pd.DataFrame(generateProjectsToFeedbackQuestions(num_projects_to_feedback_question))
print(projectsToFeedbackQuestions_df)

                               projectName   
0                     Marsh-Henry Solution  \
1       Johnson, Powell and Farmer Project   
2        King, Lopez and Ashley Initiative   
3  Williams, Carpenter and Garner Solution   
4  Farmer, Williams and Ramirez Initiative   

                                           Question1   
0  How satisfied are you with the project deliver...  \
1  Were the project risks effectively managed? Pl...   
2  How satisfied are you with the level of transp...   
3  How satisfied are you with the level of transp...   
4  Did the project meet the specified deadlines? ...   

                                           Question2   
0  How satisfied are you with the project deliver...  \
1  Were the project risks effectively managed? Pl...   
2  How satisfied are you with the level of transp...   
3  How satisfied are you with the level of transp...   
4  Did the project meet the specified deadlines? ...   

                                           Questi

In [51]:
if not os.path.exists('csvs1'):
    os.makedirs('csvs1')

users_df.to_csv('csvs1/users.csv', index=False)
project_df.to_csv('csvs1/projects.csv', index=False)
feedbackQuestion_df.to_csv('csvs1/feedbackQuestions.csv', index=False)
projectsToUsers_df.to_csv('csvs1/projectsToUsers.csv', index=False)
projectsToFeedbackQuestions_df.to_csv('csvs1/projectsToFeedbackQuestions.csv', index=False)
timesheets_df.to_csv('csvs1/timesheets.csv', index=False)
# feedbackAnswers_df.to_csv('csvs/feedbackAnswers.csv', index=False)