# BTC Fake - Training Completion Simulator

This notebook simulates employees completing training courses based on recommendations from the training recommender API.

## How it works:
1. Reads employee population from `actors/employees.csv`
2. For each employee, calls the recommendation API
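3. Completes training according to the employee type: type A completes every recommended course, type B completes only the first recommendation, and type F (or any other value) completes none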
4. Generates a ContentUserCompletion CSV file
5. Prints summary for each employee

In [1]:
# Environment sanity check: report which interpreter this kernel is running
# and confirm that python-dotenv can be imported from it.
import sys

print(f"Python executable: {sys.executable}")
print(f"Python version: {sys.version}")

# Raises ImportError here if python-dotenv is not installed in this environment.
from dotenv import load_dotenv

print("Success!")

Python executable: /Users/khansen/craft/stores/python/python-projects-rdi/btc_fake/.venv/bin/python
Python version: 3.13.2 (main, Feb  4 2025, 14:51:09) [Clang 16.0.0 (clang-1600.0.26.6)]
Success!


In [2]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import random
import string
from typing import List, Dict
import urllib3

# Disable SSL warnings when ignoring certificate verification.
# get_training_recommendations() posts with verify=False; this silences the
# InsecureRequestWarning that urllib3 would otherwise report for those calls.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Configuration
# API_BASE_URL + API_ENDPOINT form the URL of the training recommender API.
API_BASE_URL = "https://dataiku-api-devqa.lower.internal.sephora.com"
API_ENDPOINT = "/public/api/v1/mltr/v3/run"
# Input CSV of employees; the main loop reads its `employee_id` and
# `employee_edu_type` columns.
EMPLOYEES_FILE = "actors/employees.csv"
# Directory the generated ContentUserCompletion CSV is written into.
OUTPUT_DIR = "generated_files"

# Preprocessing - Download Files from SFTP

This section downloads the most recent files from the SFTP server:
1. **CourseCatalog** - Training curriculum elements like Courses and components
2. **StandAloneContent** - All training content (videos, PDFs, documents)

## Requirements:
1. Copy `.env.example` to `.env` and add your SFTP password
2. Files will be downloaded to `downloaded_files/` directory
3. The system finds the most recent file based on the date in the filename

## File Formats:
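- CourseCatalog: `CourseCatalog_V2_YYYY_M_D_N_random.csv`
- StandAloneContent: `StandAloneContent_v2_YYYY_M_D_N_random.csv`

Month (`M`) and day (`D`) are written without zero-padding (for example `CourseCatalog_V2_2026_1_6_1_c0996d.csv`), `N` is a numeric sequence number, and `random` is an alphanumeric suffix.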

In [3]:
# Import SFTP libraries and load environment
import os
import re
from dotenv import load_dotenv
import paramiko
from datetime import datetime

# Load environment variables
# (reads key/value pairs from a local .env file, if one exists, into os.environ)
load_dotenv()

# SFTP Configuration
SFTP_HOST = "sftp.sephora.com"
SFTP_USER = "SephoraMSL"
# NOTE(review): when SFTP_PASSWORD is not set this silently falls back to a
# placeholder string, so the problem only surfaces later as an SFTP
# authentication error inside the download functions — consider failing fast.
SFTP_PASSWORD = os.getenv("SFTP_PASSWORD", "your_sftp_password_placeholder")
# Remote directory that is listed when looking for the most recent export files.
SFTP_REMOTE_PATH = "/inbound/BTC/retailData/prod/vendor/mySephoraLearning-archive"
# Local directory the selected files are downloaded into.
SFTP_LOCAL_DIR = "downloaded_files"

def _parse_dated_filename(filename: str, prefix: str) -> tuple:
    """
    Shared parser for export filenames shaped like
    <prefix>_YYYY_M_D_<sequence>_<random>.csv (case-insensitive).

    The whole filename must match: names with anything after ".csv"
    (for example a ".part" or ".bak" suffix) are rejected.

    Args:
        filename: The filename to parse
        prefix: The literal leading part of the name, e.g. "CourseCatalog_V2"

    Returns:
        Tuple of (year, month, day, datetime_obj), or None if the name does
        not match the format or the digits do not form a real calendar date
    """
    pattern = rf'{re.escape(prefix)}_(\d{{4}})_(\d{{1,2}})_(\d{{1,2}})_\d+_[a-z0-9]+\.csv'
    # fullmatch (not match) so trailing text after ".csv" cannot slip through.
    match = re.fullmatch(pattern, filename, re.IGNORECASE)
    if match is None:
        return None

    year, month, day = (int(part) for part in match.groups())
    try:
        # datetime() validates the calendar date (rejects month 13, Feb 30, ...).
        return (year, month, day, datetime(year, month, day))
    except ValueError:
        return None

def parse_course_catalog_filename(filename: str) -> tuple:
    """
    Parse course catalog filename to extract date components.
    Format: CourseCatalog_V2_YYYY_M_D_N_random.csv
    (month and day are 1-2 digits, N is a numeric sequence, random is alphanumeric)

    Args:
        filename: The course catalog filename

    Returns:
        Tuple of (year, month, day, datetime_obj) or None if parsing fails
    """
    return _parse_dated_filename(filename, "CourseCatalog_V2")

def parse_standalone_content_filename(filename: str) -> tuple:
    """
    Parse standalone content filename to extract date components.
    Format: StandAloneContent_v2_YYYY_M_D_N_random.csv
    (month and day are 1-2 digits, N is a numeric sequence, random is alphanumeric)

    Args:
        filename: The standalone content filename

    Returns:
        Tuple of (year, month, day, datetime_obj) or None if parsing fails
    """
    return _parse_dated_filename(filename, "StandAloneContent_v2")

def _download_most_recent(parse_filename, file_kind: str, error_label: str) -> str:
    """
    Connect to the SFTP server, pick the newest file accepted by
    `parse_filename`, and download it into SFTP_LOCAL_DIR.

    Args:
        parse_filename: Callable taking a filename and returning a tuple whose
            fourth element is the file's datetime, or None for non-matching names
        file_kind: Name used in the "No valid ... files found" message
        error_label: Name used in the "Error downloading ..." message

    Returns:
        Path to the downloaded file, or None if no matching file exists or
        any step fails (the error is printed, not raised)
    """
    transport = None
    sftp = None
    try:
        # Create SFTP connection
        transport = paramiko.Transport((SFTP_HOST, 22))
        transport.connect(username=SFTP_USER, password=SFTP_PASSWORD)
        sftp = paramiko.SFTPClient.from_transport(transport)

        print(f"Connected to SFTP server: {SFTP_HOST}")

        # Change to remote directory
        sftp.chdir(SFTP_REMOTE_PATH)
        print(f"Changed to directory: {SFTP_REMOTE_PATH}")

        # List all files in the directory
        files = sftp.listdir()
        print(f"Found {len(files)} files in directory")

        # Keep only files whose name parses, paired with their date
        candidates = []
        for filename in files:
            parsed = parse_filename(filename)
            if parsed:
                candidates.append((filename, parsed[3]))  # (filename, datetime_obj)

        if not candidates:
            print(f"No valid {file_kind} files found")
            return None

        # Newest date wins; on a tie max() keeps the first one listed, which is
        # what a stable descending sort followed by [0] would also select.
        most_recent_file, most_recent_date = max(candidates, key=lambda item: item[1])

        print(f"Most recent file: {most_recent_file} (date: {most_recent_date.strftime('%Y-%m-%d')})")

        # Download the file (sftp.get does not create the local directory)
        os.makedirs(SFTP_LOCAL_DIR, exist_ok=True)
        local_path = os.path.join(SFTP_LOCAL_DIR, most_recent_file)
        sftp.get(most_recent_file, local_path)
        print(f"Downloaded to: {local_path}")

        return local_path

    except Exception as e:
        print(f"Error downloading {error_label}: {e}")
        print(f"  SFTP Host: {SFTP_HOST}")
        print(f"  SFTP Path: {SFTP_REMOTE_PATH}")
        print(f"  SFTP User: {SFTP_USER}")
        return None

    finally:
        # Always release the connection, including when listing or download failed.
        for connection in (sftp, transport):
            if connection is not None:
                try:
                    connection.close()
                except Exception as close_error:
                    print(f"  Warning: error closing SFTP connection: {close_error}")

def download_most_recent_course_catalog() -> str:
    """
    Connect to SFTP server and download the most recent CourseCatalog file.

    Returns:
        Path to the downloaded file, or None if download fails
    """
    return _download_most_recent(
        parse_course_catalog_filename, "CourseCatalog", "course catalog"
    )

def download_most_recent_standalone_content() -> str:
    """
    Connect to SFTP server and download the most recent StandAloneContent file.

    Returns:
        Path to the downloaded file, or None if download fails
    """
    return _download_most_recent(
        parse_standalone_content_filename, "StandAloneContent", "standalone content"
    )

In [4]:
# Execute: Download Course Catalog and Standalone Content from SFTP
print("=" * 80, "PREPROCESSING - Download Files from SFTP", "=" * 80, "", sep="\n")

# --- Course Catalog ---------------------------------------------------------
print("Downloading Course Catalog...", "-" * 80, sep="\n")
course_catalog_path = download_most_recent_course_catalog()

if not course_catalog_path:
    # The download helper already printed the underlying error; add hints.
    print(
        "",
        "✗ Failed to download course catalog",
        "  Please check:",
        "    1. .env file contains valid SFTP_PASSWORD",
        "    2. SFTP server is accessible",
        "    3. Remote path exists and contains CourseCatalog files",
        sep="\n",
    )
else:
    print("", "✓ Course catalog downloaded successfully", f"  File: {course_catalog_path}", sep="\n")

    # Best-effort preview: a read failure is reported but does not stop the cell.
    try:
        catalog_df = pd.read_csv(course_catalog_path)
        print(f"  Rows: {len(catalog_df)}")
        print(f"  Columns: {list(catalog_df.columns)}")
    except Exception as e:
        print(f"  Note: Could not preview file: {e}")

print("", "-" * 80, sep="\n")

# --- Standalone Content -----------------------------------------------------
print("Downloading Standalone Content...", "-" * 80, sep="\n")
standalone_content_path = download_most_recent_standalone_content()

if not standalone_content_path:
    print(
        "",
        "✗ Failed to download standalone content",
        "  Please check:",
        "    1. .env file contains valid SFTP_PASSWORD",
        "    2. SFTP server is accessible",
        "    3. Remote path exists and contains StandAloneContent files",
        sep="\n",
    )
else:
    print("", "✓ Standalone content downloaded successfully", f"  File: {standalone_content_path}", sep="\n")

    # Best-effort preview, same as for the course catalog.
    try:
        content_df = pd.read_csv(standalone_content_path)
        print(f"  Rows: {len(content_df)}")
        print(f"  Columns: {list(content_df.columns)}")
    except Exception as e:
        print(f"  Note: Could not preview file: {e}")

print("", "=" * 80, sep="\n")

PREPROCESSING - Download Files from SFTP

Downloading Course Catalog...
--------------------------------------------------------------------------------
Connected to SFTP server: sftp.sephora.com
Changed to directory: /inbound/BTC/retailData/prod/vendor/mySephoraLearning-archive
Found 670 files in directory
Most recent file: CourseCatalog_V2_2026_1_6_1_c0996d.csv (date: 2026-01-06)
Downloaded to: downloaded_files/CourseCatalog_V2_2026_1_6_1_c0996d.csv

✓ Course catalog downloaded successfully
  File: downloaded_files/CourseCatalog_V2_2026_1_6_1_c0996d.csv
  Rows: 7773
  Columns: ['CourseId', 'CourseName', 'CourseAdminName', 'CourseTags', 'ModuleName', 'ModuleId', 'ModuleDescription', 'ContentName', 'ContentAdminName', 'ContentId', 'ContentDescriptions', 'ContentType', 'ContentId_List', 'ContentTags', 'CreateDate', 'Link_Content', 'Course_Link', 'DailyDose', 'DailyDoseBAs', 'CourseDailyDose', 'CourseDailyDoseBAs', 'CourseNameFrench', 'CourseDescriptionFrench', 'ContentNameFrench', 'Cont

In [5]:
def get_training_recommendations(employee_id: int) -> List[Dict]:
    """
    Call the training recommender API for a given employee.

    Expected response structure:
    {"response": {"ml_recommendations": [...], "coaching_note": {...}}, "timing": {...}, "apiContext": {...}}
    A "response" value that is itself a list is also accepted and used as the
    recommendation list.

    Args:
        employee_id: The employee's ID (ba_id)

    Returns:
        List of recommended training courses. An empty list is returned when
        the request fails, the payload has an unexpected shape, or there are
        no recommendations; errors are printed rather than raised.
    """
    url = f"{API_BASE_URL}{API_ENDPOINT}"
    payload = {"data": {"ba_id": employee_id}}

    try:
        # NOTE(security): verify=False turns off TLS certificate validation.
        # It is kept because this targets an internal API, but it should not be
        # copied to code that talks to untrusted hosts.
        response = requests.post(url, json=payload, timeout=30, verify=False)
        response.raise_for_status()
        data = response.json()

        if not isinstance(data, dict):
            print(f"  Unexpected response type: {type(data)}")
            return []

        response_data = data.get("response", {})
        if isinstance(response_data, dict):
            # Get ml_recommendations from nested response
            recommendations = response_data.get("ml_recommendations", [])
        elif isinstance(response_data, list):
            # Response is directly a list
            recommendations = response_data
        else:
            recommendations = []

        # Ensure we have a list
        if not isinstance(recommendations, list):
            print(f"  Recommendations is not a list: {type(recommendations)}")
            return []

        # Print selected fields from API response
        if recommendations:
            print(f"  API Response for employee {employee_id}:")
            for rec in recommendations:
                if not isinstance(rec, dict):
                    # Only dicts support .get(); without this guard a single
                    # malformed entry would raise, hit the except below, and
                    # throw away every valid recommendation as well.
                    print(f"  (skipping preview of non-dict entry: {rec!r})")
                    continue
                ba_id = rec.get("ba_id", "N/A")
                content_id = rec.get("recommended_content_id", "N/A")
                recommended_content = rec.get("recommended_content", "N/A")
                print(f"  {ba_id} | {content_id} | {recommended_content}")
            print()

        return recommendations

    except Exception as e:
        print(f"  Error fetching recommendations for employee {employee_id}: {e}")
        return []

In [6]:
def format_content_id(content_id: int) -> str:
    """
    Render a content ID with comma thousands separators.
    Example: 1915085 -> "1,915,085"

    Args:
        content_id: The numeric content ID

    Returns:
        The ID as a string with a comma between each group of three digits
    """
    # The "," format spec is the same one the f-string form f"{x:,}" applies.
    return format(content_id, ",")

def generate_training_times(num_courses: int) -> List[tuple]:
    """
    Generate start and completion times for training courses.
    Start time: 2 minutes ago
    Completion time: 1 minute ago

    Every course gets the same pair of timestamps, both relative to the moment
    this function is called. The timestamps are taken in UTC so that they
    agree with the trailing "Z" (UTC designator) in the output format.

    Args:
        num_courses: Number of courses to generate times for

    Returns:
        List of (start_time, end_time) tuples in ISO-8601 format
        (YYYY-MM-DDTHH:MM:SSZ); empty when num_courses is 0 or negative
    """
    # Imported here because the notebook's import cell only brings in
    # datetime and timedelta.
    from datetime import timezone

    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    # Previously local time was formatted with a literal "Z", which mislabels
    # the timestamps as UTC on any machine not running in UTC.
    current_time = datetime.now(timezone.utc)

    # The pair does not vary per course, so compute it once.
    start_time = (current_time - timedelta(minutes=2)).strftime(timestamp_format)
    end_time = (current_time - timedelta(minutes=1)).strftime(timestamp_format)

    return [(start_time, end_time) for _ in range(num_courses)]

In [7]:
def process_employee(employee_id: int, employee_type: str) -> List[Dict]:
    """
    Fetch one employee's recommendations and simulate their completions.

    How many recommendations are completed depends on the employee type
    (compared case-insensitively, surrounding whitespace ignored):
    'a' completes all of them, 'b' completes only the first, and any other
    value (type F) completes none.

    Args:
        employee_id: The employee's ID
        employee_type: The employee's type (a, b, or f)

    Returns:
        List of completed training records, each a dict with the keys
        UserId, ContentId, DateStarted, DateCompleted and CourseName
    """
    normalized_type = employee_type.lower().strip()
    recommendations = get_training_recommendations(employee_id)

    if not recommendations:
        print(f"  No recommendations for employee {employee_id}")
        return []

    # Type A: complete all assignments; Type B: complete one assignment;
    # anything else (Type F): complete no assignments.
    selection_by_type = {
        'a': recommendations,
        'b': recommendations[:1],
    }
    courses_to_complete = selection_by_type.get(normalized_type, [])

    schedule = generate_training_times(len(courses_to_complete))
    completions = []

    for index, course in enumerate(courses_to_complete):
        try:
            # Entries that are not dicts cannot be turned into a record.
            if not isinstance(course, dict):
                print(f"  WARNING: Course is not a dict, it's {type(course)}: {course}")
                continue

            started_at, finished_at = schedule[index]
            record = {
                "UserId": employee_id,
                "ContentId": format_content_id(course["recommended_content_id"]),
                "DateStarted": started_at,
                "DateCompleted": finished_at,
                "CourseName": course.get("recommended_content", "Unknown"),
            }
        except KeyError as e:
            print(f"  WARNING: Missing key {e} in course data: {course}")
        except Exception as e:
            print(f"  WARNING: Error processing course: {e}")
        else:
            # Reached only when the record was built without an error.
            completions.append(record)

    return completions

In [8]:
def generate_output_filename() -> str:
    """
    Build the output CSV filename from today's date plus a random suffix.
    Format: ContentUserCompletion_V2_YYYY_MM_DD_1_RAND.csv
    (4-digit year, zero-padded month and day, RAND = 6 lowercase
    letters/digits)

    Returns:
        Generated filename
    """
    date_part = datetime.now().strftime("%Y_%m_%d")

    # 6-character random alphanumeric suffix
    alphabet = string.ascii_lowercase + string.digits
    random_suffix = ''.join(random.choices(alphabet, k=6))

    return f"ContentUserCompletion_V2_{date_part}_1_{random_suffix}.csv"

In [9]:
# Main execution: run the simulation for every employee in EMPLOYEES_FILE and
# collect the results for the output-file and summary cells below.
print("=" * 80, "BTC Fake - Training Completion Simulator", "=" * 80, "", sep="\n")

# Load employees
print(f"Loading employees from {EMPLOYEES_FILE}...")
employees_df = pd.read_csv(EMPLOYEES_FILE)
print(f"Loaded {len(employees_df)} employees\n")

# Accumulators filled while processing each employee
all_completions = []      # every completion record, across all employees
employee_summaries = []   # (employee_id, [(ContentId, CourseName), ...])

for _, employee in employees_df.iterrows():
    employee_id = employee['employee_id']
    employee_type = employee['employee_edu_type']

    print(f"Processing Employee {employee_id} (Type {employee_type.upper()})...")
    completions = process_employee(employee_id, employee_type)

    if not completions:
        print("  No training completed")
    else:
        all_completions.extend(completions)
        # Store both ContentId and CourseName for summary
        course_details = [(record['ContentId'], record['CourseName']) for record in completions]
        employee_summaries.append((employee_id, course_details))
        print(f"  Completed {len(completions)} training(s)")
    print()

print("=" * 80)

BTC Fake - Training Completion Simulator

Loading employees from actors/employees.csv...
Loaded 29 employees

Processing Employee 63419 (Type F)...
  API Response for employee 63419:
  63419 | 657907 | Sell. How to Add on
  63419 | 657908 | Sell. How to Reassure Your Client

  No training completed

Processing Employee 63492 (Type A)...
  No recommendations for employee 63492
  No training completed

Processing Employee 75412 (Type A)...
  No recommendations for employee 75412
  No training completed

Processing Employee 85038 (Type A)...
  API Response for employee 85038:
  85038 | 913731 | Sell. How to Multiworld Sell

  Completed 1 training(s)

Processing Employee 86994 (Type A)...
  API Response for employee 86994:
  86994 | 892298 | Fragrance - Get. Give. Teach. Sell.

  Completed 1 training(s)

Processing Employee 88563 (Type A)...
  API Response for employee 88563:
  88563 | 892298 | Fragrance - Get. Give. Teach. Sell.

  Completed 1 training(s)

Processing Employee 104829 (Type

In [10]:
# Generate output file: write all simulated completions to a CSV in OUTPUT_DIR.
if all_completions:
    output_filename = generate_output_filename()

    # to_csv does not create missing directories, so make sure OUTPUT_DIR
    # exists; otherwise the write fails on a fresh checkout.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    # Create DataFrame with only the required columns for CSV
    # (CourseName is kept in the records for the summary only).
    output_df = pd.DataFrame(all_completions)
    output_df = output_df[['UserId', 'ContentId', 'DateStarted', 'DateCompleted']]

    # Write to CSV with proper quoting
    output_df.to_csv(output_path, index=False, quoting=1)  # quoting=1 means QUOTE_ALL

    print(f"Generated output file: {output_filename}")
    print(f"Total completions: {len(all_completions)}")
    print()
else:
    print("No training completions to write.")
    print()

Generated output file: ContentUserCompletion_V2_2026_01_06_1_q7atrh.csv
Total completions: 20



In [11]:
# Print summary: one line per employee who completed at least one training.
print("=" * 80, "EMPLOYEE TRAINING SUMMARY", "=" * 80, "", sep="\n")

for employee_id, course_details in employee_summaries:
    # Each course is rendered as "ContentId: CourseName", comma-separated.
    course_list = ", ".join(
        f"{content_id}: {course_name}" for content_id, course_name in course_details
    )
    print(f"Employee {employee_id}: {course_list}")

print("", "=" * 80, "Simulation complete!", "=" * 80, sep="\n")

EMPLOYEE TRAINING SUMMARY

Employee 85038: 913,731: Sell. How to Multiworld Sell
Employee 86994: 892,298: Fragrance - Get. Give. Teach. Sell.
Employee 88563: 892,298: Fragrance - Get. Give. Teach. Sell.
Employee 104829: 574,327: Servicing Multiple Clients
Employee 109828: 1,549,460: Sell.
Employee 151557: 594,097: Sell. Three Ways to Sell
Employee 155810: 594,096: Sell. When Clients Say No
Employee 173789: 863,648: Sell. How to Add on
Employee 175342: 892,298: Fragrance - Get. Give. Teach. Sell.
Employee 221603: 892,298: Fragrance - Get. Give. Teach. Sell.
Employee 263634: 863,643: Give. Determine the Client Journey, 885,653: Give. Setting Expectations
Employee 266064: 913,731: Sell. How to Multiworld Sell
Employee 266837: 863,648: Sell. How to Add on
Employee 298211: 1,717,886: Get - Client Cues
Employee 304617: 892,298: Fragrance - Get. Give. Teach. Sell.
Employee 312675: 604,266: Give. Setting Expectations
Employee 359755: 892,298: Fragrance - Get. Give. Teach. Sell.
Employee 359779