# BTC Fake - Training Completion Simulator

This notebook creates a set of files that mimic the files sent by the BTC vendor. The files convey
the training Course Catalog, the transactions of managers assigning training to employees, and the transactions of employees completing training. Employees can complete those manager-assigned trainings, plus training recommended by the ML Recommendation API.

## How it works:
1. **Preprocessing**: Downloads two files that BTC sent to the Prod SFTP server - the files that represent Courses and Contents. 
2. **Manager Assigns Training**: 
   - Queries content_assignments AND content_completion tables from Databricks in order to calculate open assignments (assignments - completions) for each employee
   - Selects up to 3 Daily Dose contents for the current week and assigns them to all eligible employees (employees with a Daily Dose conflict are skipped)
3. **Employee Completes Training**: 
   - Possibly completes manager assignments and AI recommendations for each employee
   - employee type (A, B, or F) determines if they complete all, one, or zero trainings.
4. **Output File Generation**:
   - NonCompletedAssignments CSV (open assignments from Databricks + new manager assignments)
   - ContentUserCompletion CSV (completed training)
   - UserCompletion CSV (a dummy file that is required but whose contents are not important here)
5. **Update NonCompletedAssignments**: Removes completed assignments, regenerates the file
6. **Summary**: Prints run details showing training from manager vs AI and completions


In [None]:
import os
import random
import string
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import pandas as pd
import pytz
import requests
import urllib3
from dotenv import load_dotenv

# Import shared business logic
import simulation_core as core

# Load environment variables
load_dotenv()

# Certificate verification is ignored, so silence the matching SSL warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Timezones shared by every timestamp calculation in the notebook:
# timestamps are computed in Pacific Time for local-time logic and then
# converted to UTC (offset +00:00) for the CSV output files.
PT, UTC = pytz.timezone('America/Los_Angeles'), pytz.UTC

# Configuration is read from the environment by simulation_core
config = core.load_config()

# The module-level names below mirror individual config entries and are kept
# for backward compatibility.

# ML recommendation API
API_BASE_URL, API_ENDPOINT, API_TIMEOUT = (
    config['api_base_url'],
    config['api_endpoint'],
    config['api_timeout'],
)

# Local files and directories
EMPLOYEES_FILE, OUTPUT_DIR, SFTP_LOCAL_DIR, USER_COMPLETION_TEMPLATE_FILE = (
    config['employees_file'],
    config['output_dir'],
    config['sftp_local_dir'],
    config['user_completion_template_file'],
)

# Databricks connection
(
    DATABRICKS_HOST,
    DATABRICKS_HTTP_PATH,
    DATABRICKS_TOKEN,
    DATABRICKS_CATALOG,
    DATABRICKS_SCHEMA,
) = (
    config['databricks_host'],
    config['databricks_http_path'],
    config['databricks_token'],
    config['databricks_catalog'],
    config['databricks_schema'],
)

# SFTP outbound server
(
    SFTP_OUTBOUND_HOST,
    SFTP_OUTBOUND_USER,
    SFTP_OUTBOUND_PASSWORD,
    SFTP_OUTBOUND_REMOTE_PATH,
    SFTP_PUBLISH_ENABLED,
) = (
    config['sftp_outbound_host'],
    config['sftp_outbound_user'],
    config['sftp_outbound_password'],
    config['sftp_outbound_remote_path'],
    config['sftp_publish_enabled'],
)

# Preprocessing - Download Files from SFTP

This section prepares for a fresh run:

## Cleanup
1. Removes prior files from `generated_files/` directory to ensure each run starts with a clean slate

## Generate a Dummy (not important) UserCompletion File
1. Relies on the UserCompletion template in `docs/sample_files/` and places a new one in the `generated_files/` directory

## Download Files (from SFTP Server) that BTC Vendor sent 
1. **CourseCatalog** - Training curriculum elements like Courses and components
2. **StandAloneContent** - All training content (videos, PDFs, documents)

## Requirements:
1. Copy `.env.example` to `.env` and add your SFTP password
2. Files will be downloaded to `downloaded_files/` directory
3. The system finds the most recent file based on the date in the filename

## File Formats:
- CourseCatalog: `CourseCatalog_V2_YYYY_M_DD_1_random.csv`
- StandAloneContent: `StandAloneContent_v2_YYYY_M_DD_1_random.csv`

In [None]:
# Cleanup: clear out files left behind by earlier runs (delegated to simulation_core)
def _print_indented(msg):
    """Print a cleanup progress message indented by two spaces."""
    print(f"  {msg}")


print("=" * 80, "PREPROCESSING - Cleanup", "=" * 80, "", sep="\n")

print("Cleaning up directories from previous runs...", "", sep="\n")

# simulation_core performs the cleanup and reports progress through the callback
print(f"Cleaning {OUTPUT_DIR}/...")
core.cleanup_output_directory(config, _print_indented)
print()

print("=" * 80, "", sep="\n")

In [None]:
# Build the dummy UserCompletion file from the template (delegated to simulation_core)
print("=" * 80, "PREPROCESSING - Generate UserCompletion File", "=" * 80, "", sep="\n")

print("Generating a dummy UserCompletion file from sample file...")
user_completion_path = core.generate_user_completion_file_from_template(config, print)

# A falsy path signals that generation failed
if not user_completion_path:
    print("✗ Failed to generate UserCompletion file")
else:
    print(f"✓ UserCompletion file generated successfully")
    print(f"  File: {user_completion_path}")

print("", "=" * 80, "", sep="\n")

In [None]:
# Import SFTP libraries and load environment
import os
import re
from dotenv import load_dotenv
import paramiko
from datetime import datetime

# Load environment variables
load_dotenv()

# SFTP inbound server settings, mirrored from config
(
    SFTP_INBOUND_HOST,
    SFTP_INBOUND_USER,
    SFTP_INBOUND_PASSWORD,
    SFTP_INBOUND_REMOTE_PATH,
) = (
    config['sftp_inbound_host'],
    config['sftp_inbound_user'],
    config['sftp_inbound_password'],
    config['sftp_inbound_remote_path'],
)

# Filename parsers are provided by simulation_core; expose them under notebook-level names
parse_course_catalog_filename, parse_standalone_content_filename = (
    core.parse_course_catalog_filename,
    core.parse_standalone_content_filename,
)

def download_most_recent_course_catalog() -> Optional[str]:
    """
    Download the most recent CourseCatalog file from the SFTP inbound server.

    Delegates to core.download_most_recent_file_from_sftp, passing the
    notebook's config and the 'course_catalog' file kind.

    Returns:
        Path to the downloaded file, or None if download fails
    """
    return core.download_most_recent_file_from_sftp(config, 'course_catalog')

def download_most_recent_standalone_content() -> Optional[str]:
    """
    Download the most recent StandAloneContent file from the SFTP inbound server.

    Delegates to core.download_most_recent_file_from_sftp, passing the
    notebook's config and the 'standalone_content' file kind.

    Returns:
        Path to the downloaded file, or None if download fails
    """
    return core.download_most_recent_file_from_sftp(config, 'standalone_content')

In [None]:
def publish_files_to_sftp_outbound(files_to_publish: List[str]) -> bool:
    """
    Upload generated files to the SFTP outbound server.

    Thin notebook wrapper: the upload itself is performed by
    core.publish_files_to_sftp_outbound using the notebook's config.

    Args:
        files_to_publish: List of local file paths to upload

    Returns:
        True if all files published successfully, False otherwise
    """
    all_published = core.publish_files_to_sftp_outbound(config, files_to_publish)
    return all_published

In [None]:
# Execute: fetch the latest Course Catalog and Standalone Content files from SFTP
print(
    "=" * 80,
    "PREPROCESSING - Download Files from SFTP Inbound Server",
    "=" * 80,
    "",
    sep="\n",
)

# --- Course Catalog ---
print("Downloading Course Catalog...", "-" * 80, sep="\n")
course_catalog_path = download_most_recent_course_catalog()
print()

if not course_catalog_path:
    # A falsy path means the download failed; list the usual causes
    for hint in (
        "✗ Failed to download course catalog",
        "  Please check:",
        "    1. .env file contains valid SFTP_INBOUND_PASSWORD",
        "    2. SFTP inbound server is accessible",
        "    3. Remote path exists and contains CourseCatalog files",
    ):
        print(hint)
else:
    print(f"✓ Course catalog downloaded successfully")
    print(f"  File: {course_catalog_path}")

    # Best-effort preview: a read failure is reported but does not stop the run
    try:
        catalog_df = pd.read_csv(course_catalog_path)
        print(f"  Rows: {len(catalog_df)}")
        print(f"  Columns: {list(catalog_df.columns)}")
    except Exception as preview_error:
        print(f"  Note: Could not preview file: {preview_error}")

print("", "-" * 80, sep="\n")

# --- Standalone Content ---
print("Downloading Standalone Content...", "-" * 80, sep="\n")
standalone_content_path = download_most_recent_standalone_content()
print()

if not standalone_content_path:
    # A falsy path means the download failed; list the usual causes
    for hint in (
        "✗ Failed to download standalone content",
        "  Please check:",
        "    1. .env file contains valid SFTP_INBOUND_PASSWORD",
        "    2. SFTP inbound server is accessible",
        "    3. Remote path exists and contains StandAloneContent files",
    ):
        print(hint)
else:
    print(f"✓ Standalone content downloaded successfully")
    print(f"  File: {standalone_content_path}")

    # Best-effort preview: a read failure is reported but does not stop the run
    try:
        content_df = pd.read_csv(standalone_content_path)
        print(f"  Rows: {len(content_df)}")
        print(f"  Columns: {list(content_df.columns)}")
    except Exception as preview_error:
        print(f"  Note: Could not preview file: {preview_error}")

print("", "=" * 80, sep="\n")

In [None]:
# Read the employee input file through simulation_core (progress is reported via print)
print(f"Loading employees from {EMPLOYEES_FILE}...")
_employee_load_result = core.load_and_filter_employees(EMPLOYEES_FILE, print)
employees_df, filtered_count = _employee_load_result
print()

# Notebook-level alias for the simulation_core content-ID formatter
format_content_id = core.format_content_id

# Manager - Assign Training to Employees

This section implements the manager functionality:
1. Queries Databricks for employee state using BOTH content_assignments and content_completion tables
2. Calculates open assignments (assignments - completions) for each employee
3. Loads the standalone content file from preprocessing
4. Filters for content where Daily_Dose_BA is TRUE
5. Sorts by CreateDate (most recent first)
6. Selects up to 3 Daily Dose contents to assign
7. Checks for Daily Dose conflicts (employees to skip):
   - **Conflict Check 1**: Queries content_completion table for employees who completed ANY Daily Dose this week
   - **Conflict Check 2**: Checks open assignments for Daily Dose with due dates this week
8. Skips employees who have Daily Dose conflicts
9. Assigns the selected Daily Dose contents (up to 3) to all eligible employees
10. Assigns 1 random non-Daily Dose content to ALL employees (no skipping)
11. Generates a NonCompletedAssignments CSV file with:
    - Open assignments from Databricks (written FIRST)
    - New manager assignments (written SECOND)

**Note**: The manager will NOT assign Daily Dose to an employee if they already completed ANY Daily Dose in the current week (Sunday to Sunday).


In [None]:
import pytz

# Notebook-level aliases for the date/time, request-ID and filename helpers
# implemented in simulation_core
(
    get_sunday_of_current_week,
    get_next_future_sunday,
    generate_request_id,
    generate_non_completed_assignments_filename,
) = (
    core.get_sunday_of_current_week,
    core.get_next_future_sunday,
    core.generate_request_id,
    core.generate_non_completed_assignments_filename,
)

In [None]:
# Wrapper function for Databricks query that automatically uses config
def get_open_assignments_from_databricks(employee_ids):
    """
    Fetch open assignments for the given employees via simulation_core.

    Forwards to core.get_open_assignments_from_databricks, supplying the
    notebook's config so callers only pass the employee IDs.

    Args:
        employee_ids: List of employee IDs to query

    Returns:
        DataFrame with open assignments
    """
    open_assignments = core.get_open_assignments_from_databricks(config, employee_ids)
    return open_assignments

In [None]:
# Manager - Query Databricks and select training content to assign to employees
print("=" * 80, "MANAGER - Assigning Training to Employees", "=" * 80, "", sep="\n")

# Step 1: Collect the employee IDs from the input file and ask Databricks for
# their open assignments
employee_ids_list = employees_df['employee_id'].tolist()
print(f"Querying Databricks for assignments for {len(employee_ids_list)} employees from input file...")
print("-" * 80)
open_assignments_df = get_open_assignments_from_databricks(employee_ids_list)
print()

# Reshape the Databricks rows into NonCompletedAssignments records (simulation_core)
databricks_assignments = core.convert_databricks_assignments_to_output_format(open_assignments_df)
if databricks_assignments:
    print(f"Converted {len(databricks_assignments)} Databricks assignment(s) to output format", "", sep="\n")

# Step 2: Containers filled in while the standalone content file is processed
new_manager_assignments = list()
employee_assigned_daily_dose = dict()  # employee_id -> Daily Dose contents assigned
employee_assigned_random = dict()  # employee_id -> random non-Daily Dose content assigned

def _content_id_to_int(raw_content_id):
    """Return a ContentId cell as an int.

    Accepts a string that may contain thousands separators (e.g. "1,234")
    or any value that int() already understands (e.g. an integer cell).
    """
    if isinstance(raw_content_id, str):
        return int(raw_content_id.replace(',', ''))
    return int(raw_content_id)


# Build the new manager assignments from the standalone content file:
#   PART A assigns up to 3 Daily Dose contents to employees without a conflict,
#   PART B assigns one random non-Daily Dose content to every employee.
if standalone_content_path and os.path.exists(standalone_content_path):
    print(f"Loading standalone content from: {standalone_content_path}")
    standalone_df = pd.read_csv(standalone_content_path)
    print(f"Loaded {len(standalone_df)} content items")
    print()

    # Calculate dates for NEW assignments - functions now return UTC directly
    created_date = datetime.now(PT).astimezone(UTC).isoformat()
    start_date = get_sunday_of_current_week().isoformat()  # Already returns UTC
    due_date = get_next_future_sunday().isoformat()  # Already returns UTC

    # Daily Dose flag, computed once and reused by PART A and PART B.
    # The column is matched against both the string 'TRUE' and boolean True.
    is_daily_dose = (
        (standalone_df['Daily_Dose_BA'] == 'TRUE') |
        (standalone_df['Daily_Dose_BA'] == True)
    )

    # PART A: DAILY DOSE ASSIGNMENTS
    print("=" * 80)
    print("PART A: DAILY DOSE ASSIGNMENTS")
    print("=" * 80)
    print()

    # Filter for content where Daily_Dose_BA is TRUE
    print("Filtering for Daily Dose training (Daily_Dose_BA = TRUE)...")
    daily_dose_content = standalone_df[is_daily_dose].copy()

    print(f"Found {len(daily_dose_content)} Daily Dose content items")
    print()

    if len(daily_dose_content) > 0:
        # Convert CreateDate to datetime for sorting
        daily_dose_content['CreateDate_dt'] = pd.to_datetime(daily_dose_content['CreateDate'])

        # Sort by CreateDate (most recent first)
        daily_dose_content = daily_dose_content.sort_values('CreateDate_dt', ascending=False)

        # Select up to 3 most recent contents
        contents_to_assign = daily_dose_content.head(3)

        print(f"Selected {len(contents_to_assign)} Daily Dose content(s) to assign:")
        for _, content in contents_to_assign.iterrows():
            content_id = content['ContentId']
            content_name = content['ContentName']
            create_date = content['CreateDate']
            # ContentId may be read as an int or as a string like "1,234"
            print(f"  {format_content_id(_content_id_to_int(content_id))} - {content_name} (Created: {create_date})")
        print()

        # Check which employees to skip for Daily Dose
        print("-" * 80)
        print("Checking for Daily Dose conflicts...")
        print()

        # Calculate current week boundaries (Sunday to Sunday)
        now_pt = datetime.now(PT)
        sunday_of_current_week = get_sunday_of_current_week()
        next_sunday = get_next_future_sunday()

        # For completion date comparison, use date objects
        week_start_date = sunday_of_current_week.date()
        week_end_date = next_sunday.date()

        print(f"Current week (for completion check): {week_start_date} to {week_end_date}")
        print()

        # Numeric IDs of ALL Daily Dose contents (not only the 3 selected), for quick lookup
        daily_dose_content_ids = {
            _content_id_to_int(raw_id) for raw_id in daily_dose_content['ContentId']
        }

        # Check employees for Daily Dose conflicts
        employees_to_skip_dd = {}  # Map employee_id -> reason for skipping Daily Dose

        # CONFLICT CHECK 1: Query content_completion for Daily Dose completions this week.
        # Skipped when there are no employees, because an empty IN () list is invalid SQL.
        if all([DATABRICKS_HOST, DATABRICKS_HTTP_PATH, DATABRICKS_TOKEN]) and employee_ids_list:
            try:
                from databricks import sql

                # Connect to Databricks
                connection = sql.connect(
                    server_hostname=DATABRICKS_HOST,
                    http_path=DATABRICKS_HTTP_PATH,
                    access_token=DATABRICKS_TOKEN
                )

                # try/finally guarantees the cursor and connection are released
                # even when the query itself fails
                try:
                    cursor = connection.cursor()
                    try:
                        # Table name
                        completion_table = f"{DATABRICKS_CATALOG}.{DATABRICKS_SCHEMA}.content_completion"

                        print(f"Querying {completion_table} for Daily Dose completions this week...")

                        # Build IN clauses. Both ID lists are forced through int() so only
                        # numeric literals can reach the SQL text.
                        employee_ids_str = ", ".join(str(int(emp_id)) for emp_id in employee_ids_list)
                        dd_content_ids_str = ", ".join(str(cid) for cid in daily_dose_content_ids)

                        # Query: Find employees who completed ANY Daily Dose content this week
                        query = f"""
                        SELECT 
                            ba_id,
                            content_id,
                            completion_date
                        FROM {completion_table}
                        WHERE ba_id IN ({employee_ids_str})
                            AND content_id IN ({dd_content_ids_str})
                            AND completion_date >= '{week_start_date}'
                            AND completion_date <= '{week_end_date}'
                        ORDER BY ba_id, completion_date DESC
                        """

                        cursor.execute(query)

                        # Fetch results
                        completion_rows = cursor.fetchall()
                    finally:
                        cursor.close()
                finally:
                    connection.close()

                if completion_rows:
                    print(f"Found {len(completion_rows)} Daily Dose completion(s) this week:")
                    for row in completion_rows:
                        # Normalised to int so keys match the ones CONFLICT CHECK 2 creates
                        emp_id = int(row[0])
                        content_id = row[1]
                        comp_date = row[2]

                        # Mark this employee to skip (first row per employee wins;
                        # rows are ordered most recent completion first)
                        if emp_id not in employees_to_skip_dd:
                            employees_to_skip_dd[emp_id] = {
                                'reason': 'completed_this_week',
                                'content_id': content_id,
                                'completion_date': comp_date
                            }

                        print(f"  Employee {emp_id}: Completed Daily Dose {content_id} on {comp_date}")
                    print()
                else:
                    print("✓ No Daily Dose completions found this week")
                    print()

            except Exception as e:
                # Deliberate best-effort: a failed completion check must not stop the run
                print(f"⚠ Could not query content_completion: {e}")
                print("  Continuing without completion check...")
                print()

        # CONFLICT CHECK 2: Check open assignments for Daily Dose with due dates this week
        today_date = now_pt.date()
        next_sunday_date = next_sunday.date()

        if not open_assignments_df.empty:
            from dateutil import parser

            for _, assignment in open_assignments_df.iterrows():
                employee_id = int(assignment['ba_id'])
                content_id = int(assignment['content_id'])
                assignment_due_date = assignment['assignment_due_date']

                # Only Daily Dose assignments can cause a conflict
                if content_id not in daily_dose_content_ids:
                    continue

                # Convert due date to date for comparison
                if hasattr(assignment_due_date, 'date'):
                    due_date_check = assignment_due_date.date()
                else:
                    due_date_check = parser.parse(str(assignment_due_date)).date()

                # Conflict when the due date is today or the next future Sunday
                if due_date_check == today_date or due_date_check == next_sunday_date:
                    if employee_id not in employees_to_skip_dd:
                        employees_to_skip_dd[employee_id] = {
                            'reason': 'open_assignment_this_week',
                            'content_id': content_id,
                            'due_date': due_date_check
                        }

        # Log employees who will be skipped for Daily Dose
        if employees_to_skip_dd:
            print(f"❌ SKIPPING {len(employees_to_skip_dd)} employee(s) - Daily Dose conflict:")
            print()
            for emp_id in sorted(employees_to_skip_dd.keys()):
                skip_info = employees_to_skip_dd[emp_id]
                print(f"  Employee {emp_id}:")

                if skip_info['reason'] == 'completed_this_week':
                    print(f"    Reason: Already completed Daily Dose this week")
                    print(f"    Completed Content: {skip_info['content_id']}")
                    print(f"    Completion Date: {skip_info['completion_date']}")
                else:
                    print(f"    Reason: Has open Daily Dose assignment for this week")
                    print(f"    Assigned Content: {skip_info['content_id']}")
                    print(f"    Due Date: {skip_info['due_date']}")
                print()
        else:
            print(f"✓ No Daily Dose conflicts found")
            print()

        print("-" * 80)
        print()

        # Create Daily Dose assignments for eligible employees
        eligible_employees_dd = [emp for _, emp in employees_df.iterrows()
                                 if emp['employee_id'] not in employees_to_skip_dd]

        if len(eligible_employees_dd) > 0:
            print(f"✓ ASSIGNING DAILY DOSE TO {len(eligible_employees_dd)} ELIGIBLE EMPLOYEE(S)")
            print()

            for employee in eligible_employees_dd:
                employee_id = employee['employee_id']

                print(f"Employee {employee_id}:")

                # Assign each selected Daily Dose content to this employee
                for _, content in contents_to_assign.iterrows():
                    content_id = content['ContentId']
                    content_name = content['ContentName']

                    print(f"  ✓ Daily Dose: {content_id} - {content_name}")

                    # Store what was assigned
                    if employee_id not in employee_assigned_daily_dose:
                        employee_assigned_daily_dose[employee_id] = []
                    employee_assigned_daily_dose[employee_id].append({
                        'content_id': content_id,
                        'content_name': content_name
                    })

                    assignment = {
                        "UserID": employee_id,
                        "CreateDate_text": created_date,
                        "RequestId": generate_request_id(),
                        "TrainingElementId": content_id,
                        "Start_Date_text": start_date,
                        "DueDate_text": due_date,
                        "ContentType": "Media"
                    }

                    new_manager_assignments.append(assignment)

                print()

            # Counts every new assignment created so far in this run
            dd_count = len(new_manager_assignments)
            print(f"Created {dd_count} Daily Dose assignments")
            print()
        else:
            print("⚠ NO ELIGIBLE EMPLOYEES for Daily Dose")
            print("All employees have Daily Dose conflicts (completed or assigned this week).")
            print()
    else:
        print("No Daily Dose content found.")
        print()

    # PART B: RANDOM NON-DAILY DOSE ASSIGNMENTS
    print("=" * 80)
    print("PART B: RANDOM NON-DAILY DOSE ASSIGNMENTS")
    print("=" * 80)
    print()

    # Filter for content where Daily_Dose_BA is NOT TRUE
    print("Filtering for NON-Daily Dose training (Daily_Dose_BA != TRUE)...")
    non_daily_dose_content = standalone_df[~is_daily_dose].copy()

    print(f"Found {len(non_daily_dose_content)} non-Daily Dose content items")
    print()

    if len(non_daily_dose_content) > 0:
        # Collect assignment data for table display
        random_assignment_rows = []

        # Every employee gets one random content; Daily Dose conflicts do not apply here
        for _, employee in employees_df.iterrows():
            employee_id = employee['employee_id']

            # Randomly select one content from non-Daily Dose content
            selected_content = non_daily_dose_content.sample(n=1).iloc[0]
            content_id = selected_content['ContentId']
            content_name = selected_content['ContentName']

            # Store for table display
            random_assignment_rows.append((employee_id, content_id, content_name))

            # Store what was assigned
            employee_assigned_random[employee_id] = {
                'content_id': content_id,
                'content_name': content_name
            }

            assignment = {
                "UserID": employee_id,
                "CreateDate_text": created_date,
                "RequestId": generate_request_id(),
                "TrainingElementId": content_id,
                "Start_Date_text": start_date,
                "DueDate_text": due_date,
                "ContentType": "Media"
            }

            new_manager_assignments.append(assignment)

        # Print header
        print(f"Random non-Daily Dose training assigned to {len(employees_df)} employee(s):")
        print()
        print(f"{'Employee ID':<15} | {'Content ID':<15} | {'Content Name'}")
        print(f"{'-' * 15} | {'-' * 15} | {'-' * 50}")

        # Print each assignment
        for emp_id, content_id, content_name in sorted(random_assignment_rows):
            print(f"{emp_id:<15} | {content_id:<15} | {content_name}")

        print()
        random_count = len(employee_assigned_random)
        print(f"Created {random_count} random non-Daily Dose assignments")
        print()
    else:
        print("No non-Daily Dose content found.")
        print()
else:
    print("Standalone content file not found.")
    print("Please run the preprocessing section first.")

# Step 3: Merge the Databricks assignments with the newly created manager assignments
print("=" * 80, "COMBINING ASSIGNMENTS", "=" * 80, "", sep="\n")

print(f"  Assignments from Databricks Table: {len(databricks_assignments)}")

# Daily Dose total = employees that received Daily Dose x contents selected
if employee_assigned_daily_dose:
    daily_dose_total = len(employee_assigned_daily_dose) * len(contents_to_assign)
else:
    daily_dose_total = 0
print(f"  New Daily Dose assignments: {daily_dose_total}")
print(f"  New random Non-Daily Dose assignments: {len(employee_assigned_random)}")
print(f"  Total new assignments: {len(new_manager_assignments)}")

# Databricks assignments go FIRST (as per manager.md)
all_assignments = databricks_assignments + new_manager_assignments
print(f"  Total assignments for output: {len(all_assignments)}")
print()

# Step 4: Write the NonCompletedAssignments CSV (only when there is something to write)
if not all_assignments:
    print("No assignments created (neither from Databricks nor new manager assignments).")
else:
    assignments_filename = generate_non_completed_assignments_filename()
    assignments_path = f"{OUTPUT_DIR}/{assignments_filename}"

    # One row per assignment; quoting=1 means QUOTE_ALL
    assignments_df = pd.DataFrame(all_assignments)
    assignments_df.to_csv(assignments_path, index=False, quoting=1)

    print(f"Generated NonCompletedAssignments file: {assignments_filename}")
    print(f"  Databricks Table assignments: {len(databricks_assignments)}")
    print(f"  New manager assignments: {len(new_manager_assignments)}")
    print(f"  Total assignments in file: {len(all_assignments)}")
    print()

    # Breakdown of the NEW assignments created in this run
    if new_manager_assignments:
        print("New Assignment Summary:")
        if employee_assigned_daily_dose:
            dd_employee_count = len(employee_assigned_daily_dose)
            print(f"  Daily Dose assignments: {dd_employee_count} employee(s) × {len(contents_to_assign)} contents = {daily_dose_total}")
            if employees_to_skip_dd:
                print(f"  Skipped (Daily Dose conflicts): {len(employees_to_skip_dd)} employee(s)")
        if employee_assigned_random:
            random_employee_count = len(employee_assigned_random)
            print(f"  Random assignments: {random_employee_count} employee(s) × 1 content = {random_employee_count}")
        print(f"  Total new assignments: {len(new_manager_assignments)}")
        print(f"  CreateDate: {created_date}")
        print(f"  Start Date: {start_date}")
        print(f"  Due Date: {due_date}")

print("", "=" * 80, sep="\n")

# Employee Training Simulation

This section simulates employees completing training based on manager assignments and AI recommendations:

## Workflow:
1. **Get Recommendations** (next cell): Calls ML Training Recommender API for each employee
2. **Get Manager Assignments**: Loads assignments from NonCompletedAssignments file created by manager
3. **Combine Training**: Merges manager assignments with AI recommendations
4. **Check Recent Completions**: Queries content_completion table for training completed in last 13 days
   - **ONLY applies to AI recommendations** - manager assignments are ALWAYS included
   - Employees skip AI-recommended training they completed recently (today + prior 12 days)
   - Filtered AI training is removed from available training list
   - Logs which AI training was skipped and why
5. **Helper Functions** (following cell): Generates training timestamps
6. **Process Employee**: Determines completions based on employee type:
   - Type A: Completes all training (manager + filtered AI)
   - Type B: Completes one training (from combined list of manager + filtered AI)
   - Type F: Completes no training
7. **Filename Generator**: Creates unique output filename with timestamp
8. **Main Loop**: Processes all employees and collects completion records
9. **Generate Output**: Writes ContentUserCompletion CSV file
10. **Update NonCompletedAssignments**: Removes completed training from NonCompletedAssignments file
11. **Print Summary**: Displays completion summary with source (manager or AI) for each employee

**Important Notes**: 
- Employees will skip AI-recommended training they already completed in the last 13 days (current day + prior 12 days)
- Manager-assigned training is NEVER skipped - employees always see and complete manager assignments regardless of recent completion history
- This prevents duplicate AI recommendations while ensuring manager assignments are always honored

In [None]:
# Wrapper function for ML API calls that automatically uses config
def get_training_recommendations(employee_id):
    """
    Fetch training recommendations for one employee via simulation_core.

    Forwards to core.get_training_recommendations, supplying the notebook's
    config so callers only pass the employee ID.

    Args:
        employee_id: The employee's ID

    Returns:
        List of training recommendations
    """
    recommendations = core.get_training_recommendations(config, employee_id)
    return recommendations

In [None]:
# Notebook-level alias for simulation_core's training-time generator, so cells
# can call generate_training_times(...) without the core. prefix.
# NOTE(review): signature and return shape are defined in simulation_core - confirm there.
generate_training_times = core.generate_training_times

In [None]:
# Notebook-level alias for simulation_core's output filename generator, so cells
# can call generate_output_filename(...) without the core. prefix.
# NOTE(review): presumably names the ContentUserCompletion CSV - confirm in simulation_core.
generate_output_filename = core.generate_output_filename

# Notebook-facing adapter around simulation_core.process_employee
def process_employee(employee_id: int, employee_type: str, manager_assignments_path: str, standalone_df, ai_recommendations = None):
    """
    Simulate one employee's training completions using simulation_core.

    Resolves the two inputs that core.process_employee needs (the employee's
    manager assignments and AI recommendations) and then delegates to it with
    the notebook's module-level ``config``.

    Args:
        employee_id: The employee's ID
        employee_type: The employee's type (a, b, or f)
        manager_assignments_path: Path to the NonCompletedAssignments CSV file
        standalone_df: DataFrame containing standalone content for lookups
        ai_recommendations: Optional pre-fetched AI recommendations; when None,
            they are fetched here via core.get_training_recommendations

    Returns:
        List of completed training records
    """
    # Manager assignments are always looked up first, before any ML API call
    assignments_for_employee = core.get_manager_assignments_for_employee(
        employee_id,
        manager_assignments_path,
        standalone_df,
    )

    # Only None triggers a fetch; an empty (but provided) value is used as-is
    recommendations = (
        core.get_training_recommendations(config, employee_id)
        if ai_recommendations is None
        else ai_recommendations
    )

    return core.process_employee(
        config,
        employee_id,
        employee_type,
        assignments_for_employee,
        recommendations,
        standalone_df,
    )

In [None]:
# Main execution - Process employees and simulate training completions
print("=" * 80)
print("EMPLOYEE TRAINING SIMULATION")
print("=" * 80)
print()

# If no assignments file exists on disk, warn and fall back to an empty path;
# that empty path is what gets passed to process_employee below.
if not ('assignments_path' in locals() and os.path.exists(assignments_path)):
    print("WARNING: Manager assignments file not found. Employees will only complete AI recommendations.")
    print()
    assignments_path = ""

# Accumulators read by the later output/summary cells
all_completions = []              # every completion record, across employees
employee_summaries = []           # (employee_id, [(ContentId, CourseName, Source), ...])
employee_ml_recommendations = []  # Store ML recommendations for summary

for _, employee in employees_df.iterrows():
    employee_id = employee['employee_id']
    employee_type = employee['employee_edu_type']

    print(f"Processing Employee {employee_id} (Type {employee_type.upper()})...")

    # Fetch once here so the same recommendations feed both the summary
    # and the completion simulation
    ai_recommendations = get_training_recommendations(employee_id)

    # Keep only the ID/name pair of each recommendation for the summary table
    if ai_recommendations:
        employee_ml_recommendations.append((
            employee_id,
            [
                {
                    "content_id": rec.get("recommended_content_id"),
                    "content_name": rec.get("recommended_content", "Unknown"),
                }
                for rec in ai_recommendations
            ],
        ))

    # Process employee with pre-fetched AI recommendations
    completions = process_employee(employee_id, employee_type, assignments_path, standalone_df, ai_recommendations)

    if not completions:
        print("  No training completed")
    else:
        all_completions.extend(completions)
        # Store ContentId, CourseName, and Source for summary
        employee_summaries.append((
            employee_id,
            [(done['ContentId'], done['CourseName'], done['Source']) for done in completions],
        ))
        print(f"  Completed {len(completions)} training(s)")
    print()

print("=" * 80)

In [None]:
# Generate output file
if not all_completions:
    print("No training completions to write.")
    print()
else:
    # simulation_core writes the ContentUserCompletion file and returns its path;
    # output_path is kept at notebook level for the later publishing cell
    output_path = core.write_content_user_completion_file(all_completions, OUTPUT_DIR)
    output_filename = os.path.basename(output_path)

    print(f"Generated output file: {output_filename}")
    print(f"Total completions: {len(all_completions)}")
    print()


In [None]:
# Update NonCompletedAssignments file to remove completed training
if not all_completions:
    # Nothing was completed, so there is nothing to remove from the file
    print("⚠ No completions to process - skipping NonCompletedAssignments update")
    print()
elif not ('assignments_path' in locals() and os.path.exists(assignments_path)):
    # Completions exist, but there is no assignments file on disk to update
    print("⚠ NonCompletedAssignments file not found - skipping update")
    print()
else:
    print("=" * 80)
    print("UPDATING NON-COMPLETED ASSIGNMENTS FILE")
    print("=" * 80)
    print()

    # simulation_core rewrites the file and reports (rows before, rows removed)
    initial_count, removed_count = core.update_non_completed_assignments_file(
        assignments_path,
        all_completions,
    )
    remaining_count = initial_count - removed_count

    print("Updated NonCompletedAssignments file:")
    print(f"  File: {assignments_path}")
    print(f"  Initial assignments: {initial_count}")
    print(f"  Completed assignments (removed): {removed_count}")
    print(f"  Remaining assignments: {remaining_count}")

    if remaining_count == 0:
        print()
        print("✓ All assignments completed!")
        print("  File contains headers only (no remaining assignments)")

    print()
    print("=" * 80)
    print()


In [None]:
# Print summary
print("-" * 80)
print("EXECUTION SUMMARY")
print("-" * 80)
print()

print("=" * 80)
print("MANAGER-ASSIGNMENTS GIVEN NEW")
print("=" * 80)
print()

# PART A: Daily Dose Assignments
# Both outcomes share the same section heading, so it is emitted once up front.
print("PART A: DAILY DOSE ASSIGNMENTS")
print("-" * 80)
print()

# These variables come from earlier cells that may not have run, hence the
# existence checks before touching them.
daily_dose_was_assigned = (
    'employee_assigned_daily_dose' in locals()
    and len(employee_assigned_daily_dose) > 0
)
some_employees_skipped = 'employees_to_skip_dd' in locals() and bool(employees_to_skip_dd)

if not daily_dose_was_assigned:
    print("No new Daily Dose assignments were created in this run.")
    if some_employees_skipped:
        print(f"All {len(employees_to_skip_dd)} employee(s) already have Daily Dose for current week.")
    print()
else:
    print(f"Daily Dose training assigned to {len(employee_assigned_daily_dose)} employee(s):")
    print()

    # Every employee receives the same Daily Dose set, so the first entry is representative
    shared_contents = next(iter(employee_assigned_daily_dose.values()))

    print("Daily Dose Contents:")
    for dose in shared_contents:
        print(f"  {dose['content_id']} - {dose['content_name']}")

    print()

    # One comma-separated line listing every employee who received the assignments
    assigned_ids = ", ".join(map(str, sorted(employee_assigned_daily_dose.keys())))
    print("Employees: " + assigned_ids)
    print()

    if some_employees_skipped:
        skipped_ids = ", ".join(map(str, sorted(employees_to_skip_dd.keys())))
        print("Skipped (already have Daily Dose): " + skipped_ids)
        print()

# PART B: Random Non-Daily Dose Assignments
# employee_assigned_random comes from an earlier cell that may not have run,
# hence the existence check before using it.
if 'employee_assigned_random' in locals() and employee_assigned_random:
    print("PART B: RANDOM NON-DAILY DOSE ASSIGNMENTS")
    print("-" * 80)
    print()
    print(f"Random non-Daily Dose training assigned to {len(employee_assigned_random)} employee(s):")
    print()

    # Print header
    print(f"{'Employee ID':<15} | {'Content ID':<15} | {'Course Name'}")
    print(f"{'-' * 15} | {'-' * 15} | {'-' * 50}")

    # One row per employee, ordered by employee ID
    for emp_id in sorted(employee_assigned_random):
        content_info = employee_assigned_random[emp_id]
        content_id = content_info['content_id']
        content_name = content_info['content_name']
        # !s converts the ID to text before padding: a bare "<15" spec raises
        # TypeError for values such as None that do not support format specs.
        print(f"{emp_id:<15} | {content_id!s:<15} | {content_name}")

    print()
else:
    # Heading typo fixed (was "RAMDOMLY")
    print("PART B: NON-DAILY DOSE ASSIGNMENTS RANDOMLY CHOSEN")
    print("-" * 80)
    print()
    print("No random non-Daily Dose assignments were created in this run.")
    print()

print("=" * 80)
print("RECOMMENDATIONS GIVEN BY ML API")
print("=" * 80)
print()

# Display all ML recommendations given to employees in pipe-separated format
if employee_ml_recommendations:
    # Flatten (employee_id, [recs]) into one row per recommendation, ordered by
    # employee ID and then by content ID. The content ID is compared as text so
    # that mixed or missing IDs still sort without error.
    recommendation_rows = sorted(
        (
            (employee_id, rec["content_id"], rec["content_name"])
            for employee_id, ml_recs in employee_ml_recommendations
            for rec in ml_recs
        ),
        key=lambda row: (row[0], str(row[1])),
    )

    # Print header
    print(f"{'Employee ID':<15} | {'Content ID':<15} | {'Content Name'}")
    print(f"{'-' * 15} | {'-' * 15} | {'-' * 50}")

    # Print each recommendation as a separate row.
    # content_id was captured with rec.get(...) and may therefore be None;
    # !s converts it to text first, because a bare "<15" spec on None raises
    # TypeError and would abort the whole summary.
    for employee_id, content_id, content_name in recommendation_rows:
        print(f"{employee_id:<15} | {content_id!s:<15} | {content_name}")

    print()
else:
    print("No ML recommendations were given to any employee.")
    print()

print("=" * 80)
print("EMPLOYEE TRAINING COMPLETIONS OF MANAGER-ASSIGNED")
print("=" * 80)
print()

# Flatten the per-employee summaries into one (employee, content, course) row
# for every completion whose source is "manager"
manager_completion_rows = []
for employee_id, course_details in employee_summaries:
    manager_completion_rows.extend(
        (employee_id, content_id, course_name)
        for content_id, course_name, source in course_details
        if source == "manager"
    )

# True exactly when at least one manager-assigned training was completed
manager_completions_found = bool(manager_completion_rows)

if not manager_completions_found:
    print("No manager-assigned training was completed by any employee.")
else:
    # Order by employee ID, then by content ID compared as text
    manager_completion_rows.sort(key=lambda row: (row[0], str(row[1])))

    # Print header
    print(f"{'Employee ID':<15} | {'Content ID':<15} | {'Course Name'}")
    print(f"{'-' * 15} | {'-' * 15} | {'-' * 50}")

    # One table row per completion
    for employee_id, content_id, course_name in manager_completion_rows:
        print(f"{employee_id:<15} | {content_id:<15} | {course_name}")

print()
print("=" * 80)
print(" EMPLOYEE TRAINING COMPLETIONS OF ML-RECOMMENDED")
print("=" * 80)
print()

# Flatten the per-employee summaries into one (employee, content, course) row
# for every completion whose source is "ai"
ml_completion_rows = []
for employee_id, course_details in employee_summaries:
    ml_completion_rows.extend(
        (employee_id, content_id, course_name)
        for content_id, course_name, source in course_details
        if source == "ai"
    )

# True exactly when at least one ML-recommended training was completed
ml_completions_found = bool(ml_completion_rows)

if not ml_completions_found:
    print("No ML-recommended training was completed by any employee.")
else:
    # Order by employee ID, then by content ID compared as text
    ml_completion_rows.sort(key=lambda row: (row[0], str(row[1])))

    # Print header
    print(f"{'Employee ID':<15} | {'Content ID':<15} | {'Course Name'}")
    print(f"{'-' * 15} | {'-' * 15} | {'-' * 50}")

    # One table row per completion
    for employee_id, content_id, course_name in ml_completion_rows:
        print(f"{employee_id:<15} | {content_id:<15} | {course_name}")

print()
print("=" * 80)
print("execution complete")
print("=" * 80)

In [None]:
# Postprocessing - Publish generated files to SFTP outbound server
print()
print("=" * 80)
print("POSTPROCESSING - Publish Files to SFTP Outbound Server")
print("=" * 80)
print()

if SFTP_PUBLISH_ENABLED:
    print("✓ SFTP publishing is ENABLED")
    print()

    # Path variables are created by earlier cells that may not have run, so each
    # one is looked up by name and only used if it exists and points at a real file.
    notebook_vars = locals()
    generated_var_names = ('output_path', 'assignments_path', 'user_completion_path')
    downloaded_var_names = ('course_catalog_path', 'standalone_content_path')

    # Generated outputs first, then the files downloaded during preprocessing
    files_to_publish = []
    for var_name in generated_var_names + downloaded_var_names:
        if var_name in notebook_vars and os.path.exists(notebook_vars[var_name]):
            files_to_publish.append(notebook_vars[var_name])

    if not files_to_publish:
        print("⚠ No files found to publish")
        print("  Generated files may not exist. Please run the notebook cells in order.")
    else:
        print(f"Files to publish ({len(files_to_publish)}):")
        print()

        # Categorize files for better display: a file counts as "downloaded"
        # when its path equals one of the preprocessing path variables
        downloaded_paths = [
            notebook_vars[var_name]
            for var_name in downloaded_var_names
            if var_name in notebook_vars
        ]
        generated_files = []
        downloaded_files = []
        for file_path in files_to_publish:
            bucket = downloaded_files if file_path in downloaded_paths else generated_files
            bucket.append(os.path.basename(file_path))

        if generated_files:
            print("Generated files:")
            for filename in generated_files:
                print(f"  - {filename}")

        if downloaded_files:
            print()
            print("Downloaded files (from preprocessing):")
            for filename in downloaded_files:
                print(f"  - {filename}")

        print()

        # Publish files
        print("Publishing files...")
        print("-" * 80)
        success = publish_files_to_sftp_outbound(files_to_publish)
        print("-" * 80)
        print()

        if success:
            print("✓ All files published successfully")
        else:
            print("⚠ Some files failed to publish")
else:
    print("⊘ SFTP publishing is DISABLED")
    print("  To enable publishing, set SFTP_PUBLISH_ENABLED=true in .env file")
    print()

print()
print("=" * 80)