In [2]:
pip install pandas faker

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from faker import Faker
import random
from datetime import timedelta
import uuid

# Shared Faker instance; the generator functions below call it for names,
# emails, passwords, session start dates and remark sentences.
fake = Faker()

# User designations. Order matters: generate_fake_users selects an entry by
# index, drawing indices 0-2 for role 'user' and indices 3-5 for role 'trainer'.
designations = [
    'Software Engineer',
    'Senior Software Engineer',
    'Solutions Enabler',
    'Solutions Consultant',
    'Architect',
    'Principal Technology Architect',
]

# Maps each training field name to the short prefix used when building a
# session's training code in generate_fake_training_sessions: the prefix is
# joined to a zero-padded session number, e.g. 'FS' -> "FS-001".
training_field_codes = {
    'Full Stack Development': 'FS',
    'Data Science': 'DS',
    'Data Engineering': 'DE',
    'UI/UX': 'UI',
    'Software Testing': 'ST'
}

def generate_fake_object_id():
    """Return a 24-character lowercase hex string shaped like a MongoDB ObjectId.

    The value is the first 24 hex digits of a freshly generated random UUID
    (uuid4); it is not a real ObjectId, only a stand-in with the same length
    and alphabet.
    """
    uuid_hex = uuid.uuid4().hex  # 32 hex digits, already a str
    return uuid_hex[:24]

def generate_fake_users(num_users):
    """Create `num_users` fake user records.

    Each record is a dict with the keys '_id', 'username', 'email',
    'password', 'role' and 'designation'. Roughly one user in five is given
    the role 'trainer'; all others get the role 'user'.

    Args:
        num_users: Number of user dicts to generate.

    Returns:
        A list of `num_users` user dicts.
    """
    users = []
    for _ in range(num_users):
        # randint(1, 5) hits 1 with probability 1/5, so about 1 in 5 users is a trainer.
        is_trainer = random.randint(1, 5) == 1

        # Trainers draw from the last three entries of `designations`,
        # regular users from the first three.
        low, high = (3, 5) if is_trainer else (0, 2)
        designation_index = random.randint(low, high)

        users.append({
            '_id': generate_fake_object_id(),
            'username': fake.name(),
            'email': fake.email(),
            'password': fake.password(),
            'role': 'trainer' if is_trainer else 'user',
            'designation': designations[designation_index],
        })

    print(f"{num_users} fake users created.")
    return users

def generate_fake_training_sessions(num_sessions, users):
    """Create `num_sessions` fake training-session records from `users`.

    Every session gets a training code (field prefix plus a zero-padded
    running number, e.g. "DS-007"), a random status, a randomly chosen
    trainer, a start date within the current year, an end date 1-30 days
    later, and up to 30 distinct participants drawn from the non-trainer
    users.

    Args:
        num_sessions: Number of session dicts to generate.
        users: List of user dicts. Each must provide '_id' and 'role'; users
            with role 'trainer' are eligible as trainers, users with role
            'user' are eligible as participants.

    Returns:
        A list of session dicts with the keys 'training_code', 'status',
        'trainer', 'startDate', 'endDate' and 'participants'. Each
        participant is a dict with the keys 'user', 'hackerRankScore',
        'assessmentScore', 'performance', 'communication' and 'remarks'.

    Raises:
        ValueError: If sessions are requested but `users` contains no trainer.
    """
    # Both pools are loop-invariant: build them once instead of re-filtering
    # `users` for every session and again for every single participant.
    trainers = [user for user in users if user['role'] == 'trainer']
    # dict.fromkeys de-duplicates by '_id' while keeping the original order,
    # so a sampled participant list can never contain the same id twice.
    trainee_ids = list(dict.fromkeys(
        user['_id'] for user in users if user['role'] == 'user'
    ))

    if num_sessions > 0 and not trainers:
        # random.choice([]) would otherwise fail with an opaque IndexError.
        raise ValueError(
            "Cannot generate training sessions: no user has the role 'trainer'."
        )

    field_names = list(training_field_codes.keys())
    training_sessions = []

    for i in range(num_sessions):
        trainer = random.choice(trainers)

        # Randomly choose a training field and derive the training code from it
        field = random.choice(field_names)
        training_code = f"{training_field_codes[field]}-{str(i + 1).zfill(3)}"

        start_date = fake.date_time_this_year()
        end_date = start_date + timedelta(days=random.randint(1, 30))

        # Never ask for more participants than there are eligible users;
        # random.sample then returns that many distinct ids in one pass.
        num_participants = min(random.randint(1, 30), len(trainee_ids))
        participants = [
            {
                'user': trainee_id,
                'hackerRankScore': round(random.uniform(0, 10)),
                'assessmentScore': round(random.uniform(0, 10)),
                'performance': round(random.uniform(0, 10)),
                'communication': round(random.uniform(0, 10)),
                'remarks': fake.sentence()
            }
            for trainee_id in random.sample(trainee_ids, num_participants)
        ]

        training_sessions.append({
            'training_code': training_code,
            'status': random.choice(['completed', 'ongoing', 'pending']),
            'trainer': trainer['_id'],
            'startDate': start_date,
            'endDate': end_date,
            'participants': participants,
        })

    print(f"{num_sessions} fake training sessions created.")
    return training_sessions

# Seed both random sources so that re-running the notebook draws the same
# random sequence for roles, designations, scores and Faker-generated text.
# NOTE: the '_id' values come from uuid.uuid4(), which is not seedable, so
# they still differ from run to run.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Generate fake users and then training sessions
num_users = 1000
users = generate_fake_users(num_users)

num_sessions = 100
training_sessions = generate_fake_training_sessions(num_sessions, users)

# Convert lists to DataFrames
users_df = pd.DataFrame(users)
training_sessions_df = pd.DataFrame(training_sessions)

# Save DataFrames to CSV files (paths are relative to the working directory;
# the 'participants' column is written as the string form of a list of dicts)
users_df.to_csv('raw_users.csv', index=False)
training_sessions_df.to_csv('raw_training_sessions.csv', index=False)

print("Raw data has been saved to 'raw_users.csv' and 'raw_training_sessions.csv'.")


1000 fake users created.
100 fake training sessions created.
Raw data has been saved to 'raw_users.csv' and 'raw_training_sessions.csv'.
