In [1]:
from google.colab import files
import pandas as pd

# Step 1: Upload the dataset file
expected_csv = "synthetic_campaigns_updated_usd.csv"
uploaded = files.upload()  # choose synthetic_campaigns_updated_usd.csv

# files.upload() returns a dict keyed by the names the uploaded files were stored
# under. If the expected name is not among them, the read below would either fail
# or silently pick up an older copy already sitting in the working directory, so
# say so explicitly instead of letting a stale file go unnoticed.
if expected_csv not in uploaded:
    print(
        f"Warning: expected an upload named {expected_csv!r} but received "
        f"{sorted(uploaded)}; reading whatever {expected_csv!r} is already on disk."
    )

# Step 2: Load into pandas
df = pd.read_csv(expected_csv)

# Step 3: Preview
print(df.head())
print("\nColumns:", df.columns.tolist())


Saving synthetic_campaigns_updated_usd.csv to synthetic_campaigns_updated_usd.csv
   total_budget  budget_instagram  pct_instagram  budget_google  pct_google  \
0         74.63             17.25         0.2311           6.48      0.0868   
1          6.73              1.03         0.1530           0.68      0.1010   
2          4.77              0.01         0.0021           0.25      0.0524   
3        134.44             43.29         0.3220           1.52      0.0113   
4         38.59              0.81         0.0210           2.27      0.0588   

   budget_tiktok  pct_tiktok  budget_facebook  pct_facebook  budget_youtube  \
0          11.63      0.1558            20.51        0.2748           15.69   
1           0.55      0.0817             0.85        0.1263            2.74   
2           3.72      0.7799             0.03        0.0063            0.73   
3           8.52      0.0634            38.01        0.2827           17.30   
4          13.78      0.3571             7.60   

In [2]:
# ================================
# Step 2: Train ML Model on USD dataset (with new features + 6 channels)
# ================================

# 1. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from lightgbm import LGBMRegressor
import joblib
import json

# 2. Load dataset (make sure you uploaded synthetic_campaigns_updated_usd.csv)
# The path is relative to the notebook's working directory.
csv_path = "synthetic_campaigns_updated_usd.csv"
df = pd.read_csv(csv_path)
print(f"Loaded dataset with {len(df)} rows and {len(df.columns)} columns")

# 3. Define feature columns and target
# Budgets + percentages (for 6 channels)
# These channel keys are combined with the "budget_" / "pct_" prefixes below, so
# they must match the CSV column suffixes exactly.
channel_names = ["instagram","google","tiktok","facebook","youtube","linkedin"]

budget_cols = [f"budget_{ch}" for ch in channel_names]
pct_cols = [f"pct_{ch}" for ch in channel_names]

# Campaign-level and audience inputs. "gender" and "income_level" are categorical
# and get one-hot encoded in a later step; the rest are used as numeric columns.
extra_features = [
    "total_budget",
    "aov",
    "creative_quality",
    "campaign_days",
    "target_margin",
    "age",
    "gender",
    "income_level"
]

# Final raw feature list (before one-hot encoding) and the regression target.
feature_cols = budget_cols + pct_cols + extra_features
target_col = "revenue"

# Sanity check
# Fail fast with the full list of absent columns rather than a KeyError on the first one.
missing = [c for c in feature_cols+[target_col] if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in dataset: {missing}")

# 4-5. Build the model inputs: select the feature columns and one-hot encode the
# categorical ones (drop_first=False keeps an indicator column for every level).
categorical_cols = ["gender", "income_level"]
X = pd.get_dummies(df[feature_cols], columns=categorical_cols, drop_first=False)
y = df[target_col].copy()

# 6. Train-test split: hold out 20% of the rows, with a fixed seed so the split
# is the same on every run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Train rows: {len(X_train)}, Test rows: {len(X_test)}")

# 7. Train LightGBM regressor
lgbm_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "random_state": 42,
}
model = LGBMRegressor(**lgbm_params)

print("Training LightGBM model...")
model.fit(X_train, y_train)
print("Training complete.")

# 8. Evaluate model
preds = model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))

# MAPE is undefined where the true value is 0. Padding the denominator with a tiny
# epsilon turns every such row into a ~1e11 % error that swamps the mean, so the
# percentage error is averaged over the rows with a non-zero target instead.
y_true = np.asarray(y_test, dtype=float)
nonzero_target = y_true != 0
if nonzero_target.any():
    mape = np.mean(
        np.abs((y_true[nonzero_target] - preds[nonzero_target]) / y_true[nonzero_target])
    ) * 100
else:
    mape = float("nan")  # no row has a usable denominator

print("\nEvaluation metrics on test set:")
print(f"  MAE  : ${mae:.2f}")
print(f"  RMSE : ${rmse:.2f}")
print(f"  MAPE : {mape:.2f}%")
n_zero_target = int((~nonzero_target).sum())
if n_zero_target:
    print(f"  (MAPE excludes {n_zero_target} test rows whose true revenue is 0)")

# 9. Save model
# joblib serialises the fitted estimator to disk; the optimizer cell loads it back
# from this same relative path.
model_path = "campaign_optimizer_usd.pkl"
joblib.dump(model, model_path)
print(f"\nSaved trained model to: {model_path}")

# 10. Save feature column order (so optimizer can use same order)
# X.columns is the post-one-hot column list, i.e. exactly what the model was fitted on.
feature_map_path = "model_feature_columns_usd.json"
with open(feature_map_path, "w") as f:
    json.dump(list(X.columns), f)
print(f"Saved feature column order to: {feature_map_path}")

# 11. Quick inference example
# Per-channel spend for one hypothetical campaign; the budget_* and pct_* features
# are both derived from this table so they cannot drift apart.
example_budgets = {
    "instagram": 300,
    "google": 400,
    "tiktok": 200,
    "facebook": 150,
    "youtube": 100,
    "linkedin": 50,
}
example_split = {
    **{f"budget_{ch}": amount for ch, amount in example_budgets.items()},
    **{f"pct_{ch}": amount / 1200 for ch, amount in example_budgets.items()},
    "total_budget": 1200,
    "aov": 50,
    "creative_quality": 0.7,
    "campaign_days": 10,
    "target_margin": 0.25,
    "age": 28,
    "gender": "man",
    "income_level": "high",
}

# Convert to DataFrame with proper one-hot encoding. A single row only produces the
# indicator columns for its own categories, so reindex against the training columns:
# absent indicators are filled with 0 and the column order matches the fitted model.
example_df = pd.get_dummies(pd.DataFrame([example_split]), columns=categorical_cols)
example_df = example_df.reindex(columns=X.columns, fill_value=0)

example_pred = model.predict(example_df)[0]
print(f"\nExample predicted revenue for sample campaign: ${example_pred:.2f}")


Loaded dataset with 5000 rows and 46 columns
Train rows: 4000, Test rows: 1000
Training LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4186
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 24
[LightGBM] [Info] Start training from score 213.597425
Training complete.

Evaluation metrics on test set:
  MAE  : $23.85
  RMSE : $58.97
  MAPE : 18.53%

Saved trained model to: campaign_optimizer_usd.pkl
Saved feature column order to: model_feature_columns_usd.json

Example predicted revenue for sample campaign: $2539.60


In [8]:
# ======================================
# Step 3: Final Hackathon-Ready Optimizer (UPDATED: supports age range slider)
# ======================================
import json, joblib, numpy as np, pandas as pd
import matplotlib.pyplot as plt

# 1) Load trained model + feature columns
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load
# model files you produced yourself or otherwise trust.
model = joblib.load("campaign_optimizer_usd.pkl")
# feature_cols is the column list written by the training cell (post one-hot
# encoding). build_features_row uses it to add missing columns and fix the column
# order before each model.predict call.
with open("model_feature_columns_usd.json","r") as f:
    feature_cols = json.load(f)

# Channel keys; combined with "budget_" / "pct_" prefixes to form feature names and
# used as the keys of every budget-split dict in this cell.
channels = ["instagram","google","tiktok","facebook","youtube","linkedin"]

# --- RULE-BASED SIMULATION (simple baseline) ---
# channel benchmarks (CPM in USD, CTR, CVR)
# Each CPM is written as <amount> / 83; the divisor looks like a currency
# conversion into USD -- TODO confirm the source currency and rate.
BASE_CHANNELS = {
    name: {"cpm": raw_cpm / 83, "ctr": ctr, "cvr": cvr}
    for name, raw_cpm, ctr, cvr in (
        # channel,    CPM numerator, CTR,   CVR
        ("instagram", 150,           0.02,  0.04),
        ("google",    200,           0.03,  0.06),
        ("tiktok",    120,           0.025, 0.035),
        ("facebook",  170,           0.018, 0.03),
        ("youtube",   220,           0.015, 0.02),
        ("linkedin",  260,           0.01,  0.015),
    )
}

def simulate_split(total_budget, split_budgets, aov, creative_quality, campaign_days):
    """Estimate funnel totals for a budget split using the fixed channel benchmarks.

    For every channel the spend is converted to impressions via its CPM, then to
    clicks via its CTR (scaled by creative quality) and to conversions via its CVR;
    revenue is conversions times the average order value.

    Args:
        total_budget: Total spend used as the ROI denominator.
        split_budgets: Mapping of channel name -> spend. Every key must exist in
            BASE_CHANNELS, otherwise a KeyError is raised.
        aov: Average order value (revenue per conversion).
        creative_quality: Score that scales CTR; 0.5 is neutral.
        campaign_days: Accepted for signature compatibility; not used by the
            calculation.

    Returns:
        dict with "impressions", "clicks", "conversions", "revenue" rounded to
        2 decimals, and "roi" (a fraction, 0 when total_budget <= 0) rounded to
        4 decimals.
    """
    # CTR moves linearly with creative quality: factor 1.0 at 0.5, +/-0.15 at 1.0 / 0.0.
    ctr_factor = 1 + (creative_quality - 0.5) * 0.3

    totals = {"impressions": 0, "clicks": 0, "conversions": 0, "revenue": 0}
    for ch, budget in split_budgets.items():
        bench = BASE_CHANNELS[ch]
        impressions = (budget / bench["cpm"]) * 1000  # CPM is the cost per 1000 impressions
        clicks = impressions * (bench["ctr"] * ctr_factor)
        conversions = clicks * bench["cvr"]
        totals["impressions"] += impressions
        totals["clicks"] += clicks
        totals["conversions"] += conversions
        totals["revenue"] += conversions * aov

    roi = (totals["revenue"] - total_budget) / total_budget if total_budget > 0 else 0

    result = {key: round(value, 2) for key, value in totals.items()}
    # ROI is a fraction that callers display as a percentage with one decimal place;
    # rounding it to 2 decimals would only leave whole percents, so keep 4 decimals
    # (the same precision ml_optimize uses for its predicted ROI).
    result["roi"] = round(roi, 4)
    return result

# --- ML OPTIMIZER CORE ---
def build_features_row(total_budget, split_budgets, aov, creative_quality, campaign_days,
                       target_margin, age, gender, income_level):
    """Build a one-row DataFrame laid out exactly like the model's training columns.

    The row carries budget_<channel> and pct_<channel> for every entry of the
    module-level ``channels`` list plus the campaign/audience inputs. ``gender`` and
    ``income_level`` are one-hot encoded; indicator columns from ``feature_cols``
    that this row does not produce are filled with 0, and columns not listed in
    ``feature_cols`` are dropped.

    Note that ``age`` and ``campaign_days`` go through int(), which truncates any
    fractional part (e.g. 27.5 becomes 27).
    """
    spend = {f"budget_{ch}": split_budgets[ch] for ch in channels}

    # Shares are only defined for a positive total; otherwise every share is 0.0.
    if total_budget > 0:
        shares = {f"pct_{ch}": split_budgets[ch] / total_budget for ch in channels}
    else:
        shares = {f"pct_{ch}": 0.0 for ch in channels}

    context = {
        "total_budget": float(total_budget),
        "aov": float(aov),
        "creative_quality": float(creative_quality),
        "campaign_days": int(campaign_days),
        "target_margin": float(target_margin),
        "age": int(age),
        "gender": gender,
        "income_level": income_level,
    }

    row = pd.DataFrame([{**spend, **shares, **context}])
    encoded = pd.get_dummies(row, columns=["gender", "income_level"])
    # Align to the training layout: missing indicator columns become 0 and the
    # column order follows feature_cols.
    return encoded.reindex(columns=feature_cols, fill_value=0)

def generate_candidates(total_budget, channels, K=500, seed=42):
    """Sample K random budget splits that each add up to ``total_budget``.

    Channel shares are drawn from a flat Dirichlet distribution (all concentration
    parameters equal to 1), scaled by the total and rounded to cents. Any rounding
    remainder is added to the channel with the largest allocation so the split
    still sums to the total.

    Args:
        total_budget: Amount to distribute.
        channels: Sequence of channel names; one budget is produced per name.
        K: Number of candidate splits to generate.
        seed: Seed for the sampler; the same seed yields the same candidates.

    Returns:
        list of K dicts mapping channel name -> budget rounded to 2 decimals.
    """
    # A private RandomState draws the same stream that np.random.seed(seed) followed
    # by np.random.dirichlet would, but without reseeding the process-wide NumPy
    # generator that other code in the notebook may be relying on.
    rng = np.random.RandomState(seed)
    samples = rng.dirichlet(np.ones(len(channels)), size=K)

    candidates = []
    for samp in samples:
        budgets = {ch: round(float(total_budget * frac), 2) for ch, frac in zip(channels, samp)}
        # Per-channel rounding can leave the sum a few cents off the total.
        diff = round(total_budget - sum(budgets.values()), 2)
        if abs(diff) >= 0.01:
            largest = max(budgets, key=budgets.get)
            budgets[largest] = round(budgets[largest] + diff, 2)
        candidates.append(budgets)
    return candidates

def score_candidates(candidates, total_budget, aov, creative_quality, campaign_days, target_margin, age, gender, income_level):
    """Predict revenue and ROI for every candidate split and rank them by ROI.

    Args:
        candidates: List of dicts mapping channel name -> budget.
        total_budget, aov, creative_quality, campaign_days, target_margin, age,
            gender, income_level: Campaign inputs forwarded unchanged to
            build_features_row for every candidate.

    Returns:
        list of {"split", "pred_revenue", "pred_roi"} dicts sorted by "pred_roi",
        highest first. "pred_roi" is (revenue - budget) / budget, or the sentinel
        -9999 when total_budget is not positive. An empty input gives an empty list.
    """
    if len(candidates) == 0:
        return []

    # Stack all candidate rows and call the model once: a single batched predict
    # avoids paying the per-call overhead of model.predict for every candidate.
    rows = [
        build_features_row(total_budget, cand, aov, creative_quality, campaign_days,
                           target_margin, age, gender, income_level)
        for cand in candidates
    ]
    batch = pd.concat(rows, ignore_index=True)
    revenues = model.predict(batch)

    scored = []
    for cand, revenue in zip(candidates, revenues):
        pred_rev = float(revenue)
        pred_roi = (pred_rev - total_budget) / total_budget if total_budget > 0 else -9999
        scored.append({"split": cand, "pred_revenue": pred_rev, "pred_roi": pred_roi})
    return sorted(scored, key=lambda x: x["pred_roi"], reverse=True)

def ml_optimize(total_budget, aov, age, gender, income_level, creative_quality, campaign_days, target_margin,
                K=500, seed=42, min_pct=0.02, max_pct=0.8):
    """Pick the randomly sampled budget split with the highest model-predicted ROI.

    K candidate splits are generated, those with any channel share outside
    [min_pct, max_pct] are discarded, and the rest are scored with the model. If
    fewer than max(10, K // 5) candidates survive the filter, the share limits are
    ignored and all candidates are scored instead.

    Returns:
        dict with
          "recommended_split": channel -> budget of the best candidate,
          "predicted_revenue": its predicted revenue rounded to 2 decimals,
          "predicted_roi": its ROI rounded to 4 decimals, shown as 0.0 when negative,
          "raw_predicted_roi": the rounded ROI without that clipping,
          "warning": a message when the ROI is negative, otherwise None.
    """
    def share_of(amount):
        # Fraction of the total; defined as 0 for a non-positive total.
        return amount / total_budget if total_budget > 0 else 0

    def violates_bounds(split):
        return any(
            share_of(amount) < min_pct or share_of(amount) > max_pct
            for amount in split.values()
        )

    pool = generate_candidates(total_budget, channels, K=K, seed=seed)
    admissible = [split for split in pool if not violates_bounds(split)]
    # Too few survivors would make the search meaningless; fall back to the full pool.
    if len(admissible) < max(10, K // 5):
        admissible = pool

    ranking = score_candidates(admissible, total_budget, aov, creative_quality, campaign_days,
                               target_margin, age, gender, income_level)
    best = ranking[0]
    revenue = round(best["pred_revenue"], 2)
    roi = round(best["pred_roi"], 4)
    unprofitable = roi < 0

    return {
        "recommended_split": best["split"],
        "predicted_revenue": revenue,
        "predicted_roi": 0.0 if unprofitable else roi,
        "raw_predicted_roi": roi,
        "warning": (
            "⚠️ Model predicts this campaign may be unprofitable under given inputs."
            if unprofitable else None
        ),
    }

# --- Safe input helpers (robust prompts) ---
def prompt_float(prompt_text, default=None, min_val=None, max_val=None):
    """Ask on stdin until the user supplies an acceptable floating-point number.

    Args:
        prompt_text: Text shown to the user.
        default: Value returned (as float) when the user just presses Enter;
            None means an answer is required.
        min_val: Smallest accepted value (inclusive), or None for no lower bound.
        max_val: Largest accepted value (inclusive), or None for no upper bound.

    Returns:
        The accepted value as a float.
    """
    import math  # function-scope import keeps this helper self-contained

    suffix = f" [default={default}]: " if default is not None else ": "
    while True:
        raw = input(f"{prompt_text}" + suffix)
        if raw.strip() == "" and default is not None:
            val = float(default)
        else:
            try:
                val = float(raw)
            except ValueError:
                print("Enter a valid number.")
                continue
            # float() also parses "nan" and "inf". NaN compares False against any
            # bound, so it would slip through the range checks below.
            if not math.isfinite(val):
                print("Enter a valid number.")
                continue
        if min_val is not None and val < min_val:
            print(f"Value must be >= {min_val}")
            continue
        if max_val is not None and val > max_val:
            print(f"Value must be <= {max_val}")
            continue
        return val

def prompt_int(prompt_text, default=None, min_val=None, max_val=None):
    """Ask on stdin until the user supplies an acceptable integer.

    Args:
        prompt_text: Text shown to the user.
        default: Value returned (as int) when the user just presses Enter;
            None means an answer is required.
        min_val: Smallest accepted value (inclusive), or None for no lower bound.
        max_val: Largest accepted value (inclusive), or None for no upper bound.

    Returns:
        The accepted value as an int.
    """
    suffix = f" [default={default}]: " if default is not None else ": "
    while True:
        raw = input(f"{prompt_text}" + suffix)
        if raw.strip() == "" and default is not None:
            val = int(default)
        else:
            try:
                val = int(raw)
            except ValueError:
                # Only a parse failure means "bad input"; anything else should surface.
                print("Enter a valid integer.")
                continue
        if min_val is not None and val < min_val:
            print(f"Value must be >= {min_val}")
            continue
        if max_val is not None and val > max_val:
            print(f"Value must be <= {max_val}")
            continue
        return val

def prompt_choice(prompt_text, choices, default=None):
    """Ask on stdin until the user picks one of ``choices``.

    Matching ignores surrounding whitespace and letter case, and the choice is
    always returned exactly as spelled in ``choices`` (so "Man" yields "man").

    Args:
        prompt_text: Text shown to the user.
        choices: Sequence of accepted answers (strings).
        default: Value returned as-is when the user just presses Enter; None means
            an answer is required.

    Returns:
        The selected entry of ``choices``, or ``default``.
    """
    choices_str = "/".join(choices)
    suffix = f" [default={default}]: " if default is not None else ": "

    # Case-folded lookup; on a collision the earliest listed choice wins.
    canonical_by_folded = {}
    for choice in choices:
        canonical_by_folded.setdefault(choice.casefold(), choice)

    while True:
        answer = input(f"{prompt_text} ({choices_str})" + suffix).strip()
        if answer == "" and default is not None:
            return default
        if answer in choices:  # exact match takes precedence
            return answer
        canonical = canonical_by_folded.get(answer.casefold())
        if canonical is not None:
            return canonical
        print(f"Please choose one of: {choices_str}")

# --- INTERACTIVE TEST + VISUALS (UPDATED for age range slider) ---
def _parse_age_range(raw_age, lo=13, hi=80):
    """Parse "N" or "A-B" into (age_min, age_max); raise ValueError with a user-facing message."""
    text = raw_age.strip()
    if "-" in text:
        parts = text.split("-")
        # Exactly two parts: inputs such as "20-35-40" are rejected rather than
        # silently read as 20-35.
        if len(parts) != 2:
            raise ValueError("Enter ages like 20-35 (integers).")
        try:
            age_min, age_max = int(parts[0]), int(parts[1])
        except ValueError:
            raise ValueError("Enter ages like 20-35 (integers).") from None
        if age_min < lo or age_max > hi or age_min > age_max:
            raise ValueError(f"Ages must be between {lo} and {hi} and min <= max.")
        return age_min, age_max

    try:
        single_age = int(text)
    except ValueError:
        raise ValueError("Enter a valid integer age or a range like 20-35.") from None
    if single_age < lo or single_age > hi:
        raise ValueError(f"Age must be between {lo} and {hi}.")
    return single_age, single_age


def run_interactive_optimizer():
    """Prompt for campaign inputs, run the ML optimizer, and print and plot the result.

    Steps:
      1. Collect budget, order value, target age (single value or range), audience
         and campaign settings from stdin.
      2. Call ml_optimize (600 candidates, seed 123) and print the recommended split
         with the predicted revenue and ROI.
      3. Run the rule-based simulate_split on the recommended split as a cross-check.
      4. Show four charts: budget split, revenue comparison, ROI comparison and the
         model's top-10 feature importances.

    Returns nothing; all output goes to stdout and matplotlib figures.
    """
    print("=== Enter campaign details ===")
    total_budget = prompt_float("Total campaign budget (USD)", default=1200, min_val=1)
    aov = prompt_float("Average Order Value (USD)", default=50, min_val=0.01)

    print("\nEnter target age range (slider). You can enter an exact single age (e.g., 28) or a range (e.g., 20-35).")
    while True:
        raw_age = input("Age or Age range (13-80) [default=28]: ").strip()
        if raw_age == "":
            age_min, age_max = 28, 28
            break
        try:
            age_min, age_max = _parse_age_range(raw_age)
        except ValueError as err:
            print(err)
            continue
        break

    # Use average age for model input
    # (build_features_row converts this with int(), so a midpoint such as 27.5 is truncated to 27).
    age = (age_min + age_max) / 2.0

    gender = prompt_choice("Gender", ["man","woman","all"], default="man")
    income_level = prompt_choice("Income level", ["low","high","all"], default="high")
    creative_quality = prompt_float("Creative quality (0–1)", default=0.7, min_val=0.0, max_val=1.0)
    campaign_days = prompt_int("Campaign days", default=10, min_val=1)
    target_margin = prompt_float("Target margin (0.05 = 5%)", default=0.25, min_val=0.0, max_val=1.0)

    print("\nRunning optimizer...\n")
    result = ml_optimize(total_budget, aov, age, gender, income_level, creative_quality, campaign_days, target_margin, K=600, seed=123)

    # Print results (with warning if any)
    print("===== Optimizer Recommendation =====")
    print("Recommended split ($):")
    for ch, amt in result["recommended_split"].items():
        print(f"  {ch}: ${amt:.2f}")
    print(f"\nML predicted revenue: ${result['predicted_revenue']:.2f}")
    print(f"ML predicted ROI (displayed/clipped): {result['predicted_roi']*100:.1f}%")
    if result.get("warning"):
        print(result["warning"])
    print(f"(Raw predicted ROI: {result['raw_predicted_roi']*100:.1f}%)\n")

    # Rule-based simulation validation
    sim = simulate_split(total_budget, result["recommended_split"], aov, creative_quality, campaign_days)
    print("--- Rule-based Simulation (for trust) ---")
    for k, v in sim.items():
        if k == "roi":
            # ROI is stored as a fraction; show it as a percentage.
            print(f"{k}: {v*100:.1f}%")
        else:
            print(f"{k}: {v}")

    # Visualization: Pie chart of split
    plt.figure(figsize=(6,6))
    labels = list(result["recommended_split"].keys())
    sizes = list(result["recommended_split"].values())
    plt.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=140)
    plt.title("Recommended Budget Split")
    plt.show()

    # Visualization: Bar chart comparing ML vs Rule revenue
    plt.figure(figsize=(6,4))
    plt.bar(["ML Predicted Revenue","Rule-sim Revenue"], [result["predicted_revenue"], sim["revenue"]], color=["#4CAF50","#2196F3"])
    plt.ylabel("Revenue (USD)")
    plt.title("Revenue Comparison")
    plt.show()

    # ROI comparison (the ML bar uses the clipped ROI, i.e. 0 when the raw prediction is negative)
    plt.figure(figsize=(6,4))
    plt.bar(["ML Predicted ROI","Rule-sim ROI"], [result["predicted_roi"]*100, sim["roi"]*100], color=["#FF9800","#9C27B0"])
    plt.ylabel("ROI (%)")
    plt.title("ROI Comparison")
    plt.show()

    # Feature importance (top 10)
    importances = model.feature_importances_
    fi = pd.Series(importances,index=feature_cols).sort_values(ascending=False).head(10)
    plt.figure(figsize=(8,4))
    fi.plot(kind="barh")
    plt.title("Top 10 Feature Importances (ML model)")
    plt.xlabel("Importance")
    plt.gca().invert_yaxis()  # largest importance at the top
    plt.show()

# Run the interactive optimizer (age-range capable)
# The guard keeps the prompts from starting if this code is imported as a module
# rather than executed directly.
if __name__ == "__main__":
    run_interactive_optimizer()


=== Enter campaign details ===


KeyboardInterrupt: Interrupted by user