In [2]:
import random
import numpy as np
import pandas as pd
from pathlib import Path

# --- Run configuration --------------------------------------------------------
# One seed feeds both the stdlib `random` module and NumPy's global generator.
SEED = 42
N = 10_000  # number of synthetic client rows to generate

random.seed(SEED)
np.random.seed(SEED)

# --- Controlled vocabularies --------------------------------------------------
# Each list enumerates the allowed values of one categorical field.
business_type = [
    "restaurant",
    "cafe",
    "fashion_store",
    "electronics",
    "clinic",
    "gaming_arcade",
    "gym",
    "supermarket",
    "beauty_salon",
    "education_center",
]
campaign_goal = [
    "sales",
    "footfall",
    "awareness",
    "product_launch",
]
budget_level = [
    "low",
    "medium",
    "high",
]
campaign_duration = [
    "short_week",
    "mid_month",
    "long_2months",
]
area_type = [
    "inside_mall",
    "main_street",
    "near_entertainment",
]
active_hours = [
    "morning",
    "noon",
    "evening",
]
ad_style = [
    "short_video",
    "static_image",
    "coupon_offer",
]
interaction_goal = [
    "store_visit",
    "website_visit",
    "purchase",
    "social_follow",
]
offer_type = [
    "product",
    "service",
    "experience",
]
business_stage = [
    "new",
    "established",
    "famous",
]

# Domain of the target label.
best_place_domain = [
    "malls",
    "streets",
    "entertainment_restaurants",
]

def pick(options, w=None):
    """Draw a single element from ``options``.

    ``w`` is an optional sequence of relative weights, one per option; when it
    is ``None`` every option is equally likely. The draw uses the module-level
    ``random`` generator, so it follows whatever seed was set on it.
    """
    # random.choices returns a list; with its default k=1 that list has one item.
    (chosen,) = random.choices(options, weights=w)
    return chosen

# Column order of every generated record (and therefore of the DataFrame).
record_columns = (
    "ID",
    "business_type",
    "campaign_goal",
    "budget_level",
    "campaign_duration",
    "area_type",
    "active_hours",
    "ad_style",
    "interaction_goal",
    "offer_type",
    "business_stage",
    "best_place",
)

rows = []
for i in range(1, N + 1):
    # Draw the ten categorical attributes. The order of these calls is part of
    # the seeded random stream, so it must not be rearranged.
    bt = pick(business_type, w=[0.18, 0.10, 0.16, 0.10, 0.12, 0.06, 0.06, 0.09, 0.07, 0.06])
    cg = pick(campaign_goal, w=[0.42, 0.25, 0.23, 0.10])
    bl = pick(budget_level, w=[0.35, 0.45, 0.20])
    cd = pick(campaign_duration, w=[0.35, 0.45, 0.20])
    at = pick(area_type, w=[0.40, 0.35, 0.25])
    ah = pick(active_hours, w=[0.25, 0.30, 0.45])
    ads = pick(ad_style, w=[0.45, 0.30, 0.25])
    ig = pick(interaction_goal, w=[0.40, 0.20, 0.30, 0.10])
    ot = pick(offer_type, w=[0.55, 0.30, 0.15])
    bs = pick(business_stage, w=[0.25, 0.55, 0.20])

    # Strong mall signals (highest priority).
    mall_signal = (
        (bt in ("fashion_store", "electronics", "supermarket", "beauty_salon") and at == "inside_mall")
        or (cg == "sales" and ads == "coupon_offer")
        or (bs == "famous" and ig in ("store_visit", "purchase"))
    )

    # Strong street signals: every street rule requires a main-street location.
    street_signal = at == "main_street" and (
        cg in ("awareness", "footfall")
        or ads == "static_image"
        or bt in ("clinic", "education_center", "gym")
    )

    # Strong entertainment signals, followed by the first tie-breaker, which
    # yields the same label and sits directly after them in priority.
    entertainment_signal = (
        (bt in ("restaurant", "cafe", "gaming_arcade") and ah == "evening")
        or (at == "near_entertainment" and (ads == "short_video" or ig in ("social_follow", "website_visit")))
        or (bt in ("restaurant", "cafe") and cg in ("sales", "footfall"))
    )

    # First matching rule wins; a random label is drawn only when nothing matches.
    if mall_signal:
        label = "malls"
    elif street_signal:
        label = "streets"
    elif entertainment_signal:
        label = "entertainment_restaurants"
    elif bt in ("fashion_store", "electronics"):
        label = "malls"
    elif cg == "awareness":
        label = "streets"
    else:
        label = random.choice(best_place_domain)

    # 8% noise: replace the label with one of the two other labels.
    if random.random() < 0.08:
        label = random.choice([place for place in best_place_domain if place != label])

    rows.append(dict(zip(
        record_columns,
        (i, bt, cg, bl, cd, at, ah, ads, ig, ot, bs, label),
    )))

# Materialise the generated records as a DataFrame.
df_clients = pd.DataFrame(rows)

csv_path  = Path("ad_intelligence.csv")
xlsx_path = Path("ad_intelligence.xlsx")

# The "utf-8-sig" codec prepends a byte-order mark to the CSV file.
df_clients.to_csv(csv_path, index=False, encoding="utf-8-sig")

# The Excel export is best-effort: the CSV is already on disk, so a failure
# here must not abort the cell. The reason is kept and reported below instead
# of being silently discarded.
xlsx_error = None
try:
    df_clients.to_excel(xlsx_path, index=False)
except Exception as exc:
    xlsx_ok = False
    xlsx_error = f"{type(exc).__name__}: {exc}"
else:
    xlsx_ok = True

print("Saved CSV to:", csv_path.resolve())
print("Saved XLSX:", xlsx_ok)
if not xlsx_ok:
    print("XLSX export failed:", xlsx_error)
print(df_clients.head())

Saved CSV to: C:\Users\hlaa7\OneDrive\سطح المكتب\project\ad_intelligence.csv
Saved XLSX: True
   ID     business_type campaign_goal budget_level campaign_duration  \
0   1            clinic         sales          low        short_week   
1   2       electronics         sales          low         mid_month   
2   3     fashion_store         sales         high        short_week   
3   4  education_center         sales       medium      long_2months   
4   5        restaurant         sales          low        short_week   

     area_type active_hours      ad_style interaction_goal offer_type  \
0  main_street      evening  coupon_offer      store_visit    product   
1  main_street      morning  static_image         purchase    product   
2  inside_mall      morning  coupon_offer         purchase    service   
3  main_street      evening  static_image         purchase    product   
4  main_street         noon   short_video      store_visit    product   

  business_stage best_place  
0   

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

# --- Reproducibility ----------------------------------------------------------
# Single seeded generator shared by every draw in this cell.
rng = np.random.default_rng(42)

# --- Parameters ---------------------------------------------------------------
# Rows generated per status label (4000 per label -> 12000 total).
N_PER_CLASS = 4000

# --- Categorical value pools --------------------------------------------------
MODELS = [
    "LED-Outdoor", "LED-Indoor", "LCD-Signage",
    "VideoWall", "Interactive-Kiosk", "Flexible-LED",
]
RESOLUTIONS = [
    "FHD",
    "QHD",
    "4K",
]
CONTENT_TYPES = [
    "Static",
    "Animated",
    "Video",
    "Mixed",
]
MEDIA_SRC = [
    "Player",
    "USB",
]
IP_RATINGS = [
    "IP20",
    "IP43",
    "IP54",
    "IP65",
    "IP66",
    "IP67",
]
INSTALL_TYPES = [
    "Ground",
    "Hanging",
    "Kiosk",
    "Wall",
]


def make_block(n: int, label: str) -> pd.DataFrame:
    """
    Generate ``n`` synthetic screen records for one status label.

    Usage and environment readings (daily hours, temperature, reboots,
    humidity, install year) are drawn from label-specific distributions.
    Physical size, rated power and the remaining categorical fields use the
    same logic for every label. All draws come from the module-level ``rng``,
    so the result depends on that generator's current state.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    label : str
        Status of the block: 'OK', 'WARN', or 'FAIL'. It is written to the
        ``Status`` column and its first letter prefixes every ``Screen ID``.

    Returns
    -------
    pd.DataFrame
        ``n`` rows and 16 columns, from ``Screen ID`` to ``rated Power w``.

    Raises
    ------
    ValueError
        If ``label`` is not one of the three supported statuses. The check
        runs before any random draw, so a rejected call leaves ``rng``
        untouched.
    """
    # Without this guard an unknown label would silently receive the FAIL
    # distributions while carrying its own text in the Status column.
    if label not in ("OK", "WARN", "FAIL"):
        raise ValueError(
            f"label must be one of 'OK', 'WARN', 'FAIL'; got {label!r}"
        )

    #  1) Usage & environment (different per status)
    # Note: rng.integers excludes its upper bound, e.g. (2019, 2025) -> 2019..2024.
    if label == "OK":
        daily_hours = rng.normal(10, 2, n).clip(1, 24)
        temp = rng.normal(35, 4, n).clip(20, 55)
        reboots = rng.poisson(0.5, n).clip(0, 5)
        humidity = rng.normal(40, 10, n).clip(10, 80)
        install_year = rng.integers(2019, 2025, size=n)

    elif label == "WARN":
        daily_hours = rng.normal(14, 3, n).clip(1, 24)
        temp = rng.normal(45, 5, n).clip(25, 65)
        reboots = rng.poisson(2, n).clip(0, 10)
        humidity = rng.normal(55, 10, n).clip(10, 90)
        install_year = rng.integers(2017, 2024, size=n)

    else:  # FAIL
        daily_hours = rng.normal(18, 3, n).clip(1, 24)
        temp = rng.normal(55, 6, n).clip(30, 75)
        reboots = rng.poisson(5, n).clip(0, 30)
        humidity = rng.normal(70, 15, n).clip(20, 100)
        install_year = rng.integers(2015, 2023, size=n)

    # 2) Physical size (shared logic)
    width_m = rng.uniform(0.5, 3.0, size=n)   # meters
    height_m = rng.uniform(0.5, 3.0, size=n)  # meters
    area_m2 = width_m * height_m

    #  3) Rated power based on area (logical rule)
    # The model is chosen first because it decides the power-density range.
    model_choices = rng.choice(MODELS, size=n)

    # Classify the model names once, then build per-row bounds. np.select takes
    # the first matching condition, so "Outdoor" has priority over the
    # VideoWall / Interactive group, and everything else gets the default.
    outdoor_models = [m for m in MODELS if "Outdoor" in m]
    heavy_models = [m for m in MODELS if "VideoWall" in m or "Interactive" in m]
    is_outdoor = np.isin(model_choices, outdoor_models)
    is_heavy = np.isin(model_choices, heavy_models)

    density_low = np.select([is_outdoor, is_heavy], [700.0, 500.0], default=300.0)
    density_high = np.select([is_outdoor, is_heavy], [1500.0, 1100.0], default=800.0)

    # One vectorized draw with per-row bounds replaces a Python loop that
    # called rng.uniform once per row (W/m²: outdoor 700-1500, heavier
    # setups 500-1100, typical indoor / LCD 300-800).
    power_density = rng.uniform(density_low, density_high)

    rated_power = area_m2 * power_density

    # Add some random noise
    rated_power *= rng.normal(1.0, 0.12, size=n)

    # Clip to [10, 5000]
    rated_power = np.clip(rated_power, 10, 5000)

    #  4) Other categorical fields
    resolution = rng.choice(RESOLUTIONS, size=n)
    content_type = rng.choice(CONTENT_TYPES, size=n)
    media_source = rng.choice(MEDIA_SRC, size=n)
    ip_rating = rng.choice(IP_RATINGS, size=n)
    install_type = rng.choice(INSTALL_TYPES, size=n)

    # 5) Build the DataFrame; column names and order are part of the output.
    df = pd.DataFrame({
        # First letter of the label + zero-padded row number, e.g. "O-00000".
        "Screen ID": [f"{label[:1]}-{i:05d}" for i in range(n)],
        "Model": model_choices,
        "Resolution": resolution,
        "Content Type": content_type,
        "Media Source": media_source,
        "IP Rating": ip_rating,
        "Daily Hours": np.round(daily_hours, 1),
        "Temperature": np.round(temp, 1),
        "Reboots per week": reboots.astype(int),
        "Humidity_%": np.round(humidity, 1),
        "Status": label,
        "install Type": install_type,
        "Install year": install_year.astype(int),
        "Width_m": np.round(width_m, 2),
        "Height_m": np.round(height_m, 2),
        "rated Power w": np.round(rated_power, 0).astype(int)
    })

    return df


#  Build dataset (OK/WARN/FAIL)
# One equally sized block per status, generated in OK -> WARN -> FAIL order.
df_ok, df_warn, df_fail = (
    make_block(N_PER_CLASS, status) for status in ("OK", "WARN", "FAIL")
)

# Stack the blocks, shuffle the rows with a fixed seed, then renumber the index.
df_full = (
    pd.concat([df_ok, df_warn, df_fail], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Save to Excel
file_path = "predictive_maintenance.xlsx"
df_full.to_excel(file_path, index=False)

# Short report: output location, frame shape and per-status row counts.
print("Synthetic dataset saved as:", file_path)
print("Shape:", df_full.shape)
print("Status distribution:", df_full["Status"].value_counts(), sep="\n")

Synthetic dataset saved as: predictive_maintenance.xlsx
Shape: (12000, 16)
Status distribution:
Status
OK      4000
WARN    4000
FAIL    4000
Name: count, dtype: int64
