# In [1]:
import pandas as pd
import numpy as np
import random

# Seed both RNGs so the generated dataset is reproducible: NumPy's global
# generator drives the template choices, the stdlib generator drives the
# in-place shuffle below.
np.random.seed(42)
random.seed(42)

N = 2500  # Target number of samples per class

# --- Electricity Descriptions ---
elec_keywords = ['Electricity Supply', 'Power Consumption', 'Kwh Usage', 'Electrical Service', 'Grid Charge']
elec_locations = ['Main Plant', 'R&D Lab', 'Corporate Office', 'North Warehouse', 'Production Line 3', 'Data Center', 'Packaging Facility']
elec_periods = ['Monthly', 'Quarterly', 'Semi-Annual', 'Year-End', 'Q1', 'Q3', 'January', 'December']

# Primary template: "<keyword> - <location> (<period>)", N strings.
elec_descriptions = [
    f"{np.random.choice(elec_keywords)} - {np.random.choice(elec_locations)} ({np.random.choice(elec_periods)})"
    for _ in range(N)
]
# Variant template: "<keyword> for <equipment> at <location>", N // 4 strings.
elec_descriptions.extend([
    f"{np.random.choice(elec_keywords)} for {np.random.choice(['HVAC System', 'Lighting', 'Machinery'])} at {np.random.choice(elec_locations)}"
    for _ in range(N//4)
])
# Variant template: "<keyword> - <site> <period> Billing", N // 4 strings.
elec_descriptions.extend([
    f"{np.random.choice(elec_keywords)} - {np.random.choice(['Site A', 'Site B', 'Site C'])} {np.random.choice(elec_periods)} Billing"
    for _ in range(N//4)
])

# The pool now holds N + 2 * (N // 4) strings with both variants at the tail.
# Shuffle before truncating so the first N entries mix all three templates;
# slicing the unshuffled list would keep only the primary template.
random.shuffle(elec_descriptions)
elec_df = pd.DataFrame({'Description': elec_descriptions[:N], 'Class': 'Electricity'})


# --- Heating Descriptions ---
heat_keywords = ['Natural Gas Supply', 'Propane Purchase', 'Fuel Oil Delivery', 'Heating Service', 'Gas Consumption']
heat_locations = ['Boiler Room', 'Manufacturing Area', 'Warehouse', 'Office Building 2', 'Chemical Storage', 'Drying Ovens']
heat_periods = ['Winter Season', 'Peak Demand', 'Monthly', 'Q4', 'October', 'February', 'Gas Meter Reading']

# Primary template: "<keyword> - <location> <period>", N strings.
heat_descriptions = [
    f"{np.random.choice(heat_keywords)} - {np.random.choice(heat_locations)} {np.random.choice(heat_periods)}"
    for _ in range(N)
]
# Variant template: "<fuel> for <equipment> at <location>", N // 4 strings.
heat_descriptions.extend([
    f"{np.random.choice(['LPG', 'Diesel'])} for {np.random.choice(['Furnace', 'Generator'])} at {np.random.choice(heat_locations)}"
    for _ in range(N//4)
])
# Variant template: "<charge name> - <plant>", N // 4 strings.
heat_descriptions.extend([
    f"{np.random.choice(['Gas Charge', 'Heating Bill'])} - {np.random.choice(['Plant 1', 'Plant 2', 'North Facility'])}"
    for _ in range(N//4)
])

# The pool holds N + 2 * (N // 4) strings with both variants at the tail.
# Shuffle (stdlib `random`, seeded at the top of the script) before truncating
# so the first N entries mix all three templates; slicing the unshuffled list
# would keep only the primary template.
random.shuffle(heat_descriptions)
heat_df = pd.DataFrame({'Description': heat_descriptions[:N], 'Class': 'Heating'})


# --- Water Descriptions ---
water_keywords = ['Water Supply', 'Water Charges', 'Utility Water Bill', 'H2O Billing', 'Municipal Water']
water_locations = ['Production Site', 'Admin Block', 'Laboratory', 'Irrigation System', 'Cooling Tower', 'Fire Sprinklers']
water_periods = ['Monthly Invoice', 'Q2', 'Q3', 'Annual Fee', 'Meter Reading', 'April', 'July']

# Primary template: "<keyword> - <location> <period>", N strings.
water_descriptions = [
    f"{np.random.choice(water_keywords)} - {np.random.choice(water_locations)} {np.random.choice(water_periods)}"
    for _ in range(N)
]
# Variant template: "<water type> Purchase for <use>", N // 4 strings.
water_descriptions.extend([
    f"{np.random.choice(['Raw Water', 'Treated Water'])} Purchase for {np.random.choice(['Industrial Use', 'General Use'])}"
    for _ in range(N//4)
])
# Variant template: "<keyword> - <site> Consumption", N // 4 strings.
water_descriptions.extend([
    f"{np.random.choice(water_keywords)} - {np.random.choice(['Site X', 'Site Y', 'Central Facility'])} Consumption"
    for _ in range(N//4)
])

# The pool holds N + 2 * (N // 4) strings with both variants at the tail.
# Shuffle (stdlib `random`, seeded at the top of the script) before truncating
# so the first N entries mix all three templates; slicing the unshuffled list
# would keep only the primary template.
random.shuffle(water_descriptions)
water_df = pd.DataFrame({'Description': water_descriptions[:N], 'Class': 'Water'})


# --- Waste Disposal Descriptions (Excluding 'Treatment' or similar) ---
waste_keywords = ['Waste Disposal', 'Refuse Collection', 'Chemical Waste Removal', 'Hazardous Waste Pickup', 'Scrap Metal Hauling']
waste_types = ['General Refuse', 'Industrial Debris', 'Solvent Waste', 'Recyclable Materials', 'Pallet Disposal']
waste_periods = ['Weekly Service', 'Bi-Weekly Pickup', 'Monthly Fee', 'Container Emptying', 'Q1']

# Primary template: "<keyword> - <waste type> <period>", N strings.
waste_descriptions = [
    f"{np.random.choice(waste_keywords)} - {np.random.choice(waste_types)} {np.random.choice(waste_periods)}"
    for _ in range(N)
]
# Variant template: "<fee name> for <waste type> at <pickup point>", N // 4 strings.
waste_descriptions.extend([
    f"{np.random.choice(['Disposal Fee', 'Collection Charge'])} for {np.random.choice(waste_types)} at {np.random.choice(['Dock 5', 'Rear Yard'])}"
    for _ in range(N//4)
])
# Variant template: "<waste category> Disposal Services - <site>", N // 4 strings.
waste_descriptions.extend([
    f"{np.random.choice(['Medical Waste', 'Biohazard'])} Disposal Services - {np.random.choice(['Lab 1', 'Clinic'])}"
    for _ in range(N//4)
])

# The pool holds N + 2 * (N // 4) strings with both variants at the tail.
# Shuffle (stdlib `random`, seeded at the top of the script) before truncating
# so the first N entries mix all three templates; slicing the unshuffled list
# would keep only the primary template.
random.shuffle(waste_descriptions)
waste_df = pd.DataFrame({'Description': waste_descriptions[:N], 'Class': 'Waste Disposal'})


# Stack the four per-class frames, randomise the row order, and renumber the
# index from zero. `sample` is called without `random_state`, so the order
# comes from NumPy's global generator.
training_df = (
    pd.concat([elec_df, heat_df, water_df, waste_df], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

# Write the shuffled dataset to disk without the index column.
output_filename = 'training_dataset.csv'
training_df.to_csv(output_filename, index=False)