# Fake Apprenticeships

This is an attempt to produce some dummy data to train our system on.

In [7]:
pip install faker

Collecting faker
  Downloading Faker-30.3.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-30.3.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faker
Successfully installed faker-30.3.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
# Initial imports

from faker import Faker
import random
from datetime import datetime, timedelta
import pandas as pd

# Shared Faker instance, created without a locale argument; the generator
# functions below call it for company names, emails, cities, postcodes and dates.
fake = Faker()

## First attempt

The code below was the first attempt, though the output consisted of gibberish place names and "postcodes" that looked more like US-style ZIP codes.

In [11]:


# Apprenticeship role titles, grouped by the job sector they belong to.
ROLES = {
    "Information Technology": [
        "Software Developer",
        "Network Technician",
        "IT Support",
        "Cloud Engineer",
        "Cybersecurity Analyst",
    ],
    "Healthcare": [
        "Healthcare Assistant",
        "Dental Nurse",
        "Pharmacy Technician",
        "Medical Administrator",
    ],
    "Manufacturing": [
        "Production Operative",
        "Quality Control",
        "Manufacturing Technician",
        "Process Operator",
    ],
    "Construction": [
        "Carpenter",
        "Electrician",
        "Plumber",
        "Bricklayer",
        "Construction Technician",
    ],
    "Business Administration": [
        "Business Administrator",
        "HR Assistant",
        "Office Manager",
        "Project Coordinator",
    ],
    "Hospitality": [
        "Chef",
        "Hospitality Team Member",
        "Hotel Operations",
        "Events Coordinator",
    ],
    "Automotive": [
        "Motor Vehicle Technician",
        "Auto Body Repair",
        "Vehicle Paint Technician",
    ],
    "Engineering": [
        "Mechanical Engineer",
        "Electrical Engineer",
        "Maintenance Engineer",
    ],
    "Digital Marketing": [
        "Digital Marketing Assistant",
        "Social Media Coordinator",
        "Content Creator",
    ],
    "Financial Services": [
        "Accounting Technician",
        "Insurance Practitioner",
        "Financial Administrator",
    ],
}

# Sector names, in the same order as the keys of ROLES.
SECTORS = list(ROLES)

# Apprenticeship level labels and the possible programme lengths in months.
LEVELS = ["Intermediate", "Advanced", "Higher", "Degree"]
DURATIONS = [12, 18, 24, 36, 48]

def generate_salary():
    """Return a random salary figure rounded to the nearest thousand.

    A base amount is drawn uniformly from 15,000-35,000 and then scaled by a
    second uniform factor between 0.9 and 1.3.
    """
    base_amount = random.uniform(15000, 35000)
    scale = random.uniform(0.9, 1.3)
    scaled = base_amount * scale
    return round(scaled, -3)

def generate_apprenticeship():
    """Build one fake apprenticeship vacancy as a flat dictionary.

    Sector, role, level and duration are picked at random from the module
    constants; company details, city and postcode come from the shared Faker
    instance. The start date falls within the next 180 days and the posted
    date is 1-30 days before it. Dates are formatted as YYYY-MM-DD strings.
    """
    chosen_sector = random.choice(SECTORS)
    chosen_role = random.choice(ROLES[chosen_sector])
    chosen_level = random.choice(LEVELS)
    months = random.choice(DURATIONS)

    # Start date: some day between now and roughly six months from now.
    window_open = datetime.now()
    window_close = datetime.now() + timedelta(days=180)
    starts_on = fake.date_between(start_date=window_open, end_date=window_close)

    # Fill the record field by field, in the column order of the output.
    vacancy = {}
    vacancy["position_id"] = fake.unique.uuid4()
    vacancy["company_name"] = fake.company()
    vacancy["role_title"] = chosen_role
    vacancy["sector"] = chosen_sector
    vacancy["level"] = chosen_level
    vacancy["duration_months"] = months
    vacancy["salary_gbp"] = generate_salary()
    vacancy["location"] = fake.city()
    vacancy["postal_code"] = fake.postcode()
    vacancy["start_date"] = starts_on.strftime("%Y-%m-%d")
    minimum_qualification = random.choice(['GCSE', 'A-Level'])
    vacancy["requirements"] = f"Minimum {minimum_qualification} qualifications required"
    vacancy["contact_email"] = fake.company_email()
    lead_days = random.randint(1, 30)
    vacancy["posted_date"] = (starts_on - timedelta(days=lead_days)).strftime("%Y-%m-%d")
    return vacancy

def generate_apprenticeships(num_records=4500):
    """Create `num_records` fake apprenticeships, write them to CSV, return them.

    The CSV is written to the current working directory with today's date
    (YYYYMMDD) in its name, and a one-line summary is printed.

    Returns:
        pandas.DataFrame with one row per generated apprenticeship.
    """
    records = []
    for _ in range(num_records):
        records.append(generate_apprenticeship())
    df = pd.DataFrame(records)

    # Date-stamped output file, without the DataFrame index column.
    date_stamp = datetime.now().strftime('%Y%m%d')
    filename = f"apprenticeships_data_{date_stamp}.csv"
    df.to_csv(filename, index=False)

    print(f"Generated {num_records} apprenticeship records and saved to {filename}")
    return df

# Generate the default number of records when this cell/script is run directly.
if __name__ == "__main__":
    generate_apprenticeships()

Generated 4500 apprenticeship records and saved to apprenticeships_data_20241008.csv


## Second attempt

Asked to amend the code to use real English places and their corresponding postcodes.

In [17]:
from faker import Faker
import random
from datetime import datetime, timedelta
import pandas as pd

# Replace the shared Faker instance with one configured for the en_GB locale.
fake = Faker(['en_GB'])  # Use British locale

# Major English cities and towns used as apprenticeship locations.
# Each name should also be a key of POSTCODE_AREAS so a matching postcode can
# be generated. Every entry carries its own trailing comma: adjacent string
# literals are silently concatenated by Python, so a missing comma merges two
# place names into one bogus location.
ENGLISH_LOCATIONS = [
    "Bath",
    "Birmingham",
    "Blackpool",
    "Bournemouth",
    "Brighton",
    "Bristol",
    "Cambridge",
    "Carlisle",
    "Coventry",
    "Derby",
    "Exeter",
    "Hull",
    "Ipswich",
    "Leeds",
    "Leicester",
    "Liverpool",
    "London",
    "Luton",
    "Manchester",
    "Middlesbrough",
    "Milton Keynes",
    "Newcastle",
    "Northampton",
    "Norwich",
    "Nottingham",
    "Oxford",
    "Peterborough",
    "Plymouth",
    "Portsmouth",
    "Preston",
    "Reading",
    "Sheffield",
    "Southampton",
    "St Ives",
    "Stoke-on-Trent",
    "Sunderland",
    "Swindon",
    "Watford",
    "Wolverhampton",
    "York",
]

# Maps each location to the list of postcode-area prefixes it may use.
# Written compactly as space-separated prefixes and split into lists below.
POSTCODE_AREAS = {
    city: prefixes.split()
    for city, prefixes in {
        "Watford": "WD",
        "Bournemouth": "BH",
        "St Ives": "TR",
        "Sunderland": "SR",
        "Carlisle": "CA",
        "Bath": "BA",
        "London": "E EC N NW SE SW W WC",
        "Manchester": "M",
        "Birmingham": "B",
        "Liverpool": "L",
        "Leeds": "LS",
        "Newcastle": "NE",
        "Sheffield": "S",
        "Nottingham": "NG",
        "Bristol": "BS",
        "Leicester": "LE",
        "Coventry": "CV",
        "York": "YO",
        "Portsmouth": "PO",
        "Cambridge": "CB",
        "Oxford": "OX",
        "Brighton": "BN",
        "Reading": "RG",
        "Milton Keynes": "MK",
        "Southampton": "SO",
        "Plymouth": "PL",
        "Norwich": "NR",
        "Swindon": "SN",
        "Exeter": "EX",
        "Ipswich": "IP",
        "Derby": "DE",
        "Northampton": "NN",
        "Luton": "LU",
        "Blackpool": "FY",
        "Hull": "HU",
        "Preston": "PR",
        "Middlesbrough": "TS",
        "Stoke-on-Trent": "ST",
        "Wolverhampton": "WV",
        "Peterborough": "PE",
    }.items()
}

# Letters permitted in the last two positions of a UK postcode; C, I, K, M, O
# and V are never used there.
_INWARD_LETTERS = "ABDEFGHJLNPQRSTUWXYZ"

def generate_uk_postcode(city):
    """Generate a plausible (not necessarily real) UK postcode for a city.

    Args:
        city: Location name, looked up in POSTCODE_AREAS.

    Returns:
        A string of the form "<outward> <inward>", e.g. "LS4 7QT". Cities
        that are missing from POSTCODE_AREAS get the placeholder area "XX".
    """
    area = random.choice(POSTCODE_AREAS.get(city, ["XX"]))

    # Outward code: area letters plus a district number. Single-letter areas
    # draw the district from 1-11, multi-letter areas from 1-9.
    max_district = 11 if len(area) == 1 else 9
    outward = f"{area}{random.randint(1, max_district)}"

    # Inward code: one digit followed by two letters from the permitted set.
    sector_digit = random.randint(0, 9)
    unit = "".join(random.choice(_INWARD_LETTERS) for _ in range(2))

    return f"{outward} {sector_digit}{unit}"

# Apprenticeship role titles, grouped by the job sector they belong to.
ROLES = {
    "Information Technology": [
        "Software Developer",
        "Network Technician",
        "IT Support",
        "Cloud Engineer",
        "Cybersecurity Analyst",
    ],
    "Healthcare": [
        "Healthcare Assistant",
        "Dental Nurse",
        "Pharmacy Technician",
        "Medical Administrator",
    ],
    "Manufacturing": [
        "Production Operative",
        "Quality Control",
        "Manufacturing Technician",
        "Process Operator",
    ],
    "Construction": [
        "Carpenter",
        "Electrician",
        "Plumber",
        "Bricklayer",
        "Construction Technician",
    ],
    "Business Administration": [
        "Business Administrator",
        "HR Assistant",
        "Office Manager",
        "Project Coordinator",
    ],
    "Hospitality": [
        "Chef",
        "Hospitality Team Member",
        "Hotel Operations",
        "Events Coordinator",
    ],
    "Automotive": [
        "Motor Vehicle Technician",
        "Auto Body Repair",
        "Vehicle Paint Technician",
    ],
    "Engineering": [
        "Mechanical Engineer",
        "Electrical Engineer",
        "Maintenance Engineer",
    ],
    "Digital Marketing": [
        "Digital Marketing Assistant",
        "Social Media Coordinator",
        "Content Creator",
    ],
    "Financial Services": [
        "Accounting Technician",
        "Insurance Practitioner",
        "Financial Administrator",
    ],
}

# Sector names, in the same order as the keys of ROLES.
SECTORS = list(ROLES)

# Numeric apprenticeship levels and the possible programme lengths in months.
LEVELS = [2, 3, 4, 6]
DURATIONS = [12, 18, 24, 36, 48]

def generate_salary():
    """Return a random salary figure rounded to the nearest thousand.

    Multiplies a uniform draw from 15,000-35,000 by a uniform factor in
    the range 0.9-1.3 before rounding.
    """
    draw = random.uniform(15000, 35000)
    factor = random.uniform(0.9, 1.3)
    return round(draw * factor, -3)

def generate_apprenticeship():
    """Build one fake apprenticeship vacancy as a flat dictionary.

    Sector, role, level, duration and location are picked at random from the
    module constants, and the postcode is generated to match the location.
    The start date falls within the next 180 days and the posted date is
    1-30 days before it. Dates are formatted as YYYY-MM-DD strings.
    """
    chosen_sector = random.choice(SECTORS)
    chosen_role = random.choice(ROLES[chosen_sector])
    chosen_level = random.choice(LEVELS)
    months = random.choice(DURATIONS)
    place = random.choice(ENGLISH_LOCATIONS)

    # Start date: some day between now and roughly six months from now.
    window_open = datetime.now()
    window_close = datetime.now() + timedelta(days=180)
    starts_on = fake.date_between(start_date=window_open, end_date=window_close)

    # Fill the record field by field, in the column order of the output.
    vacancy = {}
    vacancy["position_id"] = fake.unique.uuid4()
    vacancy["company_name"] = fake.company()
    vacancy["role_title"] = chosen_role
    vacancy["sector"] = chosen_sector
    vacancy["level"] = chosen_level
    vacancy["duration_months"] = months
    vacancy["salary_gbp"] = generate_salary()
    vacancy["location"] = place
    vacancy["postal_code"] = generate_uk_postcode(place)
    vacancy["start_date"] = starts_on.strftime("%Y-%m-%d")
    minimum_qualification = random.choice(['GCSE', 'A-Level'])
    vacancy["requirements"] = f"Minimum {minimum_qualification} qualifications required"
    vacancy["contact_email"] = fake.company_email()
    lead_days = random.randint(1, 30)
    vacancy["posted_date"] = (starts_on - timedelta(days=lead_days)).strftime("%Y-%m-%d")
    return vacancy

def generate_apprenticeships(num_records=4500):
    """Create `num_records` fake apprenticeships, write them to CSV, return them.

    The CSV is written to the current working directory with a "v2" tag and
    today's date (YYYYMMDD) in its name, and a one-line summary is printed.

    Returns:
        pandas.DataFrame with one row per generated apprenticeship.
    """
    records = []
    for _ in range(num_records):
        records.append(generate_apprenticeship())
    df = pd.DataFrame(records)

    # Date-stamped output file, without the DataFrame index column.
    date_stamp = datetime.now().strftime('%Y%m%d')
    filename = f"apprenticeships_data_v2_{date_stamp}.csv"
    df.to_csv(filename, index=False)

    print(f"Generated {num_records} apprenticeship records and saved to {filename}")
    return df

# Generate the default number of records when this cell/script is run directly.
if __name__ == "__main__":
    generate_apprenticeships()

Generated 4500 apprenticeship records and saved to apprenticeships_data_v2_20241008.csv


# This attempt

Added some course description functionality.

In [25]:
fake = Faker(['en_GB'])  # Use British locale

# This cell does not redefine ENGLISH_LOCATIONS, POSTCODE_AREAS, SECTORS, ROLES,
# DURATIONS or generate_uk_postcode(), and it has no imports of its own: it relies
# on the "Second attempt" cell having been run first in the same kernel.

# Sector-specific building blocks for course descriptions. Each sector maps
# to a pool of "core_skills" and a pool of "learning_outcomes" phrases.
COURSE_COMPONENTS = {
    "Information Technology": dict(
        core_skills=[
            "programming fundamentals", "database management",
            "network infrastructure", "cybersecurity principles",
            "cloud computing", "agile methodologies",
            "system architecture", "software testing",
        ],
        learning_outcomes=[
            "develop robust software applications",
            "implement secure network solutions",
            "manage IT infrastructure",
            "troubleshoot technical issues",
            "design system architectures",
            "maintain cloud-based systems",
        ],
    ),
    "Healthcare": dict(
        core_skills=[
            "patient care", "medical terminology",
            "health and safety protocols", "clinical procedures",
            "medical record keeping", "infection control",
            "healthcare regulations",
        ],
        learning_outcomes=[
            "deliver patient-centered care",
            "maintain medical records",
            "perform clinical procedures",
            "ensure regulatory compliance",
            "implement care plans",
        ],
    ),
    "Manufacturing": dict(
        core_skills=[
            "production processes", "quality control",
            "machinery operation", "workplace safety",
            "lean manufacturing", "inventory management",
        ],
        learning_outcomes=[
            "operate industrial equipment",
            "implement quality control measures",
            "optimize production processes",
            "maintain safety standards",
            "manage inventory systems",
        ],
    ),
    "Construction": dict(
        core_skills=[
            "building techniques", "safety regulations",
            "blueprint reading", "tool proficiency",
            "site management", "construction materials",
        ],
        learning_outcomes=[
            "execute construction projects",
            "interpret technical drawings",
            "ensure site safety",
            "coordinate construction activities",
            "implement building regulations",
        ],
    ),
    "Business Administration": dict(
        core_skills=[
            "office management", "communication",
            "document handling", "administrative procedures",
            "project coordination", "business software",
        ],
        learning_outcomes=[
            "manage office operations",
            "coordinate business activities",
            "implement administrative systems",
            "support project delivery",
            "maintain business records",
        ],
    ),
}

# Generic fallback pools, used for any sector without its own entry in
# COURSE_COMPONENTS.
DEFAULT_COMPONENTS = dict(
    core_skills=[
        "industry-specific knowledge",
        "professional communication",
        "technical expertise",
        "safety awareness",
        "quality standards",
    ],
    learning_outcomes=[
        "demonstrate professional competence",
        "apply industry standards",
        "contribute to team objectives",
        "maintain quality requirements",
        "support business operations",
    ],
)

def _bullet_list(items):
    """Render items as a plain-text bullet list, one "- item" per line."""
    return "\n".join(f"- {item}" for item in items)

def generate_course_description(sector, role, level):
    """Generate a multi-paragraph course description for an apprenticeship.

    Args:
        sector: Sector name; selects the phrase pools from COURSE_COMPONENTS,
            falling back to DEFAULT_COMPONENTS for sectors without an entry.
        role: Role title inserted into the text.
        level: Apprenticeship level; must be one of 2, 3, 4 or 6.

    Returns:
        The description as a single string with embedded newlines.

    Raises:
        KeyError: If `level` is not one of the supported levels.
    """
    components = COURSE_COMPONENTS.get(sector, DEFAULT_COMPONENTS)

    # Pick up to three skills and outcomes. A pool may hold fewer than three
    # entries, so the bullets below are built from whatever was sampled rather
    # than from fixed indices.
    skills = random.sample(components["core_skills"], min(3, len(components["core_skills"])))
    outcomes = random.sample(components["learning_outcomes"], min(3, len(components["learning_outcomes"])))

    # Level-specific terminology
    level_terms = {
        2: "foundation knowledge and skills",
        3: "advanced techniques and comprehensive understanding",
        4: "higher-level expertise and strategic competencies",
        6: "degree-level theoretical and practical knowledge"
    }

    # The trailing space after "sector." is kept so the generated text matches
    # earlier output byte-for-byte.
    return (
        f"This Level {level} apprenticeship in {role} provides "
        f"{level_terms[level]} within the {sector} sector. \n"
        "\n"
        "Key areas of study include:\n"
        f"{_bullet_list(skills)}\n"
        "\n"
        "Upon completion, apprentices will be able to:\n"
        f"{_bullet_list(outcomes)}\n"
        "\n"
        "This programme combines workplace learning with structured training, "
        "enabling apprentices to gain practical experience while developing "
        "theoretical understanding. Assessment includes a combination of "
        "workplace observation, portfolio development, and final assessment.\n"
        "\n"
        "Successful completion leads to a recognized qualification in "
        f"{role}, with opportunities for career progression in {sector}."
    )

def generate_salary():
    """Return a random salary figure rounded to the nearest thousand.

    Multiplies a uniform draw from 10,000-35,000 by a uniform factor in
    the range 0.9-1.3 before rounding.
    """
    lower, upper = 10000, 35000
    draw = random.uniform(lower, upper)
    factor = random.uniform(0.9, 1.3)
    return round(draw * factor, -3)

# Numeric apprenticeship levels; each value must be a key of the level_terms
# table inside generate_course_description().
LEVELS = [2, 3, 4, 6]

def generate_apprenticeship():
    """Build one fake apprenticeship vacancy as a flat dictionary.

    Sector, role, level, duration and location are picked at random from the
    module constants; the postcode is generated to match the location and a
    course description is generated from the sector, role and level. The
    start date falls within the next 180 days and the posted date is 1-30
    days before it. Dates are formatted as YYYY-MM-DD strings.
    """
    chosen_sector = random.choice(SECTORS)
    chosen_role = random.choice(ROLES[chosen_sector])
    chosen_level = random.choice(LEVELS)
    months = random.choice(DURATIONS)
    place = random.choice(ENGLISH_LOCATIONS)

    # Start date: some day between now and roughly six months from now.
    window_open = datetime.now()
    window_close = datetime.now() + timedelta(days=180)
    starts_on = fake.date_between(start_date=window_open, end_date=window_close)

    # Fill the record field by field, in the column order of the output.
    vacancy = {}
    vacancy["position_id"] = fake.unique.uuid4()
    vacancy["company_name"] = fake.company()
    vacancy["role_title"] = chosen_role
    vacancy["sector"] = chosen_sector
    vacancy["level"] = chosen_level
    vacancy["duration_months"] = months
    vacancy["salary_gbp"] = generate_salary()
    vacancy["location"] = place
    vacancy["postal_code"] = generate_uk_postcode(place)
    vacancy["start_date"] = starts_on.strftime("%Y-%m-%d")
    minimum_qualification = random.choice(['GCSE', 'A-Level'])
    vacancy["requirements"] = f"Minimum {minimum_qualification} qualifications required"
    vacancy["contact_email"] = fake.company_email()
    lead_days = random.randint(1, 30)
    vacancy["posted_date"] = (starts_on - timedelta(days=lead_days)).strftime("%Y-%m-%d")
    vacancy["course_description"] = generate_course_description(chosen_sector, chosen_role, chosen_level)
    return vacancy

def generate_apprenticeships(num_records=5000):
    """Create `num_records` fake apprenticeships, write them to CSV, return them.

    The CSV is written to the current working directory with a "v2" tag and
    today's date (YYYYMMDD) in its name, and a one-line summary is printed.

    Returns:
        pandas.DataFrame with one row per generated apprenticeship.
    """
    records = []
    for _ in range(num_records):
        records.append(generate_apprenticeship())
    df = pd.DataFrame(records)

    # Date-stamped output file, without the DataFrame index column.
    date_stamp = datetime.now().strftime('%Y%m%d')
    filename = f"apprenticeships_data_v2_{date_stamp}.csv"
    df.to_csv(filename, index=False)

    print(f"Generated {num_records} apprenticeship records and saved to {filename}")
    return df

# Generate the default number of records when this cell/script is run directly.
if __name__ == "__main__":
    generate_apprenticeships()

Generated 5000 apprenticeship records and saved to apprenticeships_data_v2_20241010.csv


In [35]:
# Scratch arithmetic; `n` is not referenced by the data-generation code above.
# NOTE(review): looks like a monthly figure - 12570 kept in full plus 80% of the
# remainder of 46000, divided by 12 - TODO confirm what the constants represent.
n = (12570 + 0.8*(46000-12570))/12
n

3276.1666666666665

In [47]:
# Half of `n` minus two fixed amounts (1200 and 119). Depends on `n` from the
# cell above; this cell's execution count (47) is out of sequence.
# NOTE(review): the meaning of 1200 and 119 is not stated - TODO confirm.
n*0.5-1200-119

319.08333333333326

In [39]:
# 30% of `n` (defined in an earlier scratch cell).
n*0.3

982.8499999999999

In [41]:
# 20% of `n` (defined in an earlier scratch cell).
n*0.2

655.2333333333333

In [43]:
# 30% of `n` divided by 30.5.
# NOTE(review): presumably a per-day amount (30.5 ~ average days per month) - TODO confirm.
n*0.3/30.5

32.22459016393442