In [12]:
%pip install faker pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import random
from datetime import datetime
from pathlib import Path

import pandas as pd
from faker import Faker

# Initialize Faker (the generator functions in this cell all draw from this instance)
faker = Faker()

# Seed both random sources so that re-running this cell (or Restart & Run All)
# draws the same sequence of fake values. Change SEED to get a different dataset.
# NOTE: date/time fields are drawn relative to the current time, so those values
# can still differ between runs made at different times.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Enums for Difficulty Level and Designation
# Values a course's `difficulty_level` is drawn from.
difficulty_levels = ['BASIC', 'BEGINNER', 'INTERMEDIATE', 'EXPERT']
# Values an employee's `designation` is drawn from.
designations = [
    'SOFTWARE_ENGINEER', 'SR_SOFTWARE_ENGINEER', 'SOLUTION_ENABLER',
    'SOLUTION_CONSULTANT', 'TECHNOLOGY_SOLUTION_ARCHITECT', 'PRINCIPAL_SOLUTION_ARCHITECT'
]

# Generate Employees Data
def generate_employees(num_employees):
    """Build a list of fake employee records.

    Each record is a dict with the keys ``emp_id`` (value of ``faker.uuid4()``),
    ``email`` (always on the ``jmangroup.com`` domain), ``emp_name`` and
    ``designation`` (picked at random from the module-level ``designations``).

    Args:
        num_employees: Number of records to create.

    Returns:
        list[dict]: ``num_employees`` employee records, in generation order.
    """
    employees = []
    for _ in range(num_employees):
        emp_id = faker.uuid4()
        # Faker can emit several placeholder domains (not only example.com), so a
        # literal replace of "@example.com" leaves some addresses untouched.
        # Keep the generated local part and always attach the company domain.
        local_part = faker.email().rsplit("@", 1)[0]
        email = f"{local_part}@jmangroup.com"
        emp_name = faker.name()
        designation = random.choice(designations)
        employees.append({
            'emp_id': emp_id,
            'email': email,
            'emp_name': emp_name,
            'designation': designation
        })
    return employees

# Generate Learning Paths Data
def generate_learning_paths(num_learning_paths):
    """Create fake learning-path records.

    Args:
        num_learning_paths: How many learning paths to produce.

    Returns:
        list[dict]: One dict per path with the keys ``learning_path_id``
        (drawn through ``faker.unique`` from 1..1000), ``path_name``
        ("Path <1-based position> - <Capitalized word>") and ``description``.
    """
    return [
        {
            'learning_path_id': faker.unique.random_int(min=1, max=1000),
            'path_name': f"Path {position} - {faker.word().capitalize()}",
            'description': faker.sentence(nb_words=10),
        }
        for position in range(1, num_learning_paths + 1)
    ]

# Generate Courses Data with Learning Path Mapping
def generate_courses(num_courses, learning_paths):
    """Create fake course records, each linked to a few learning paths.

    Args:
        num_courses: Number of course records to create.
        learning_paths: List of dicts that each carry a ``learning_path_id``
            key. May be empty, in which case courses get no links.

    Returns:
        list[dict]: One dict per course with the keys ``course_id``,
        ``course_name``, ``description``, ``duration`` ("<1-6> months"),
        ``difficulty_level``, ``createdAt`` and ``learning_paths`` (a list of
        1 to 3 distinct learning-path ids, or ``[]`` when none are available).
    """
    # Loop-invariant: the pool of ids a course can be linked to.
    learning_path_ids = [lp['learning_path_id'] for lp in learning_paths]
    # Upper bound for links per course; 0 means there is nothing to link to.
    max_links = min(3, len(learning_path_ids))

    courses = []
    for index in range(num_courses):
        course_id = faker.unique.random_int(min=1, max=1000)
        # NOTE(review): original note here read "KNOWN NAME" - presumably a TODO
        # to use real course names instead of a random word; confirm.
        course_name = f"Course {index + 1} - {faker.word().capitalize()}"
        description = faker.sentence(nb_words=10)
        # NOTE(review): original note read "DURATION IN DAYS", but the value is
        # expressed in months; confirm the intended unit before changing it.
        duration = f"{random.randint(1, 6)} months"
        # NOTE(review): original note asked for difficulty to grow with duration;
        # currently the two are drawn independently.
        difficulty_level = random.choice(difficulty_levels)
        createdAt = faker.date_time_this_decade()

        # Randomly assign some learning paths to this course (stored as array).
        # Guard the empty case: random.randint(1, 0) would raise ValueError.
        if max_links:
            assigned_learning_paths = random.sample(
                learning_path_ids,
                random.randint(1, max_links)  # 1 to 3 learning paths
            )
        else:
            assigned_learning_paths = []

        courses.append({
            'course_id': course_id,
            'course_name': course_name,
            'description': description,
            'duration': duration,
            'difficulty_level': difficulty_level,
            'createdAt': createdAt,
            'learning_paths': assigned_learning_paths  # Array of learning_path_ids
        })
    return courses

# Generate CourseEnrollments Data
def generate_course_enrollments(employees, courses, num_enrollments):
    """Create fake enrollment records pairing random employees with random courses.

    Args:
        employees: List of dicts that each carry an ``emp_id`` key.
        courses: List of dicts that each carry ``course_id`` and ``createdAt`` keys.
        num_enrollments: Number of enrollment records to create.

    Returns:
        list[dict]: One dict per enrollment with the keys ``enroll_id``,
        ``emp_id``, ``course_id``, ``current_page`` (0-50), ``total_pages``
        (always 100), ``test_score`` (0-100, two decimals),
        ``course_certificate_generate`` (random bool) and ``createdAt``
        (drawn with the chosen course's ``createdAt`` as the start date).
    """
    def build_record():
        # Pick the participants first so the draw order of the random sources is unchanged.
        employee = random.choice(employees)
        chosen_course = random.choice(courses)
        return {
            'enroll_id': faker.unique.random_int(min=1, max=1000),
            'emp_id': employee['emp_id'],
            'course_id': chosen_course['course_id'],
            'current_page': random.randint(0, 50),
            'total_pages': 100,
            'test_score': round(random.uniform(0, 100), 2),
            'course_certificate_generate': random.choice([True, False]),
            'createdAt': faker.date_time_between(start_date=chosen_course['createdAt']),
        }

    return [build_record() for _ in range(num_enrollments)]

# Generate CourseEngageLogs Data
def generate_course_engage_logs(enrollments, num_logs_per_enroll):
    """Create fake engagement-log records for every enrollment.

    Args:
        enrollments: List of dicts that each carry ``enroll_id`` and ``createdAt`` keys.
        num_logs_per_enroll: Number of log entries to create per enrollment.

    Returns:
        list[dict]: ``num_logs_per_enroll`` dicts per enrollment, grouped in
        enrollment order, with the keys ``enroll_id``, ``start_time`` (drawn
        with the enrollment's ``createdAt`` as the start date) and
        ``time_spent_in_sec`` (600-3600).
    """
    return [
        {
            'enroll_id': enrollment['enroll_id'],
            'start_time': faker.date_time_between(start_date=enrollment['createdAt']),
            'time_spent_in_sec': random.randint(600, 3600),
        }
        for enrollment in enrollments
        for _ in range(num_logs_per_enroll)
    ]

# Generate Data
# Dataset sizes - tune these to change how many rows each table gets.
num_employees = 20
num_courses = 10
num_learning_paths = 5
num_enrollments = 50
num_logs_per_enroll = 3

# Order matters: courses reference learning paths, enrollments reference
# employees and courses, and logs reference enrollments.
employees = generate_employees(num_employees)
learning_paths = generate_learning_paths(num_learning_paths)
courses = generate_courses(num_courses, learning_paths)
enrollments = generate_course_enrollments(employees, courses, num_enrollments)
logs = generate_course_engage_logs(enrollments, num_logs_per_enroll)

# Convert to DataFrame
employees_df = pd.DataFrame(employees)
courses_df = pd.DataFrame(courses)
enrollments_df = pd.DataFrame(enrollments)
logs_df = pd.DataFrame(logs)
learning_paths_df = pd.DataFrame(learning_paths)

# Save to CSV
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists first; otherwise a fresh checkout fails with OSError.
OUTPUT_DIR = Path('./DataStore')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

employees_df.to_csv(OUTPUT_DIR / 'employees.csv', index=False)
courses_df.to_csv(OUTPUT_DIR / 'courses_with_learning_paths.csv', index=False)
enrollments_df.to_csv(OUTPUT_DIR / 'enrollments.csv', index=False)
logs_df.to_csv(OUTPUT_DIR / 'course_engage_logs.csv', index=False)
learning_paths_df.to_csv(OUTPUT_DIR / 'learning_paths.csv', index=False)

print("Data generation complete! CSV files have been saved.")


Data generation complete! CSV files have been saved.
