In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

In [17]:
# Load campaigns_clean.csv (path is relative to the working directory) into clean_df.
clean_df = pd.read_csv("campaigns_clean.csv")

In [18]:
# The four columns the models will predict. CTR (clicks per impression) is not in
# the CSV, so it is derived here before being summarised with the other targets.
targets = ["ROI", "Conversion_Rate", "CTR", "Engagement_Score"]
clean_df["CTR"] = clean_df["Clicks"].div(clean_df["Impressions"])

# Summary statistics for the four target columns.
target_summary = clean_df[targets].describe()
display(target_summary)

Unnamed: 0,ROI,Conversion_Rate,CTR,Engagement_Score
count,300000.0,300000.0,300000.0,300000.0
mean,3.177691,0.080009,0.314156,4.369217
std,2.4612,0.040563,0.024657,3.156492
min,0.0,0.01,0.151187,1.0
25%,0.93,0.05,0.301698,1.0
50%,2.67,0.08,0.325373,4.0
75%,5.33,0.11,0.331266,7.0
max,8.0,0.15,0.333333,10.0


In [19]:
# Model inputs: seven categorical campaign attributes followed by two numeric ones.
feature_cols = [
    # categorical
    "Campaign_Goal", "Channel_Used", "Target_Audience", "Language",
    "Location", "Customer_Segment", "Season",
    # numeric
    "Duration_Days", "Acquisition_Cost_Num",
]

# Feature frame and multi-target frame, both selected from clean_df.
X, Y = clean_df[feature_cols], clean_df[targets]

In [20]:
# Report (rows, columns) for the feature frame and the target frame.
for label, frame in (("X", X), ("Y", Y)):
    print(f"{label} shape:", frame.shape)

X shape: (300000, 9)
Y shape: (300000, 4)


In [21]:
# One-hot encode the categorical features. drop_first=True omits the first level
# of each feature, so that level is represented by all of the feature's dummy
# columns being 0. The numeric columns pass through unchanged.
one_hot_cols = [
    "Campaign_Goal", "Channel_Used", "Target_Audience", "Language",
    "Location", "Customer_Segment", "Season",
]
X_encoded = pd.get_dummies(X, columns=one_hot_cols, drop_first=True)

In [22]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_encoded, Y, test_size=0.2, random_state=42
)

In [23]:
# Fit one independent gradient-boosting regressor per target, all sharing the same
# hyper-parameters. `models` maps target name -> fitted estimator.
gbr_settings = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 3,
    "random_state": 42,
}

models = {}
for t in targets:
    model = GradientBoostingRegressor(**gbr_settings)
    models[t] = model.fit(X_train, Y_train[t])  # fit() returns the estimator itself

SANITY CHECK: MAE OF EACH MODEL ON THE HELD-OUT TEST SPLIT

In [24]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of each target's model on the held-out test split,
# rounded to four decimals for display.
for t in targets:
    test_predictions = models[t].predict(X_test)
    print(f"{t} MAE:", round(mean_absolute_error(Y_test[t], test_predictions), 4))

ROI MAE: 1.588
Conversion_Rate MAE: 0.0349
CTR MAE: 0.001
Engagement_Score MAE: 1.8965


In [25]:
# Per-target mean and standard deviation of the training targets; score_candidate
# uses them to put predictions for different targets on a common (z-score) scale.
target_means = Y_train.mean()

# A zero standard deviation would cause a division by zero later, so swap it for a
# tiny positive value.
raw_stds = Y_train.std()
target_stds = raw_stds.mask(raw_stds == 0, 1e-9)

print("Target means:\n", target_means)
print("\nTarget stds:\n", target_stds)

Target means:
 ROI                 3.177120
Conversion_Rate     0.080038
CTR                 0.314139
Engagement_Score    4.366375
dtype: float64

Target stds:
 ROI                 2.461830
Conversion_Rate     0.040577
CTR                 0.024642
Engagement_Score    3.153150
dtype: float64


In [26]:
# For every categorical feature, collect the sorted list of distinct values that
# occur in clean_df. Candidate generation samples from these lists.
categorical_fields = (
    "Campaign_Goal",
    "Channel_Used",
    "Target_Audience",
    "Language",
    "Location",
    "Customer_Segment",
    "Season",
)
domain = {field: sorted(clean_df[field].unique()) for field in categorical_fields}

# Show each feature's option count and at most its first five options.
for name, options in domain.items():
    ellipsis = "..." if len(options) > 5 else ""
    print(f"{name}: {len(options)} options -> {options[:5]}{ellipsis}")

Campaign_Goal: 4 options -> ['Brand Awareness', 'Increase Sales', 'Market Expansion', 'Product Launch']
Channel_Used: 4 options -> ['Facebook', 'Instagram', 'Pinterest', 'Twitter']
Target_Audience: 9 options -> ['All Ages', 'Men 18-24', 'Men 25-34', 'Men 35-44', 'Men 45-60']...
Language: 3 options -> ['English', 'French', 'Spanish']
Location: 5 options -> ['Austin', 'Las Vegas', 'Los Angeles', 'Miami', 'New York']
Customer_Segment: 5 options -> ['Fashion', 'Food', 'Health', 'Home', 'Technology']
Season: 4 options -> ['Fall', 'Spring', 'Summer', 'Winter']


In [27]:
# Width of the one-hot encoded feature matrix (number of columns in X_encoded).
print("\nEncoded feature columns:", X_encoded.shape[1])


Encoded feature columns: 29


In [28]:
def encode_candidate(candidate: dict, X_encoded_columns: list[str]) -> pd.DataFrame:
    """Turn one candidate dict into a single-row frame laid out like the training matrix.

    Parameters
    ----------
    candidate : dict
        Maps feature name -> value for one campaign configuration.
    X_encoded_columns : list[str]
        Column names, in order, that the returned frame must have.

    Returns
    -------
    pd.DataFrame
        One row with exactly ``X_encoded_columns`` as columns. Dummy columns
        produced from the candidate that are not in ``X_encoded_columns`` are
        discarded; requested columns the candidate did not produce are filled
        with 0.
    """
    single_row = pd.DataFrame([candidate])
    # get_dummies expands the string-valued entries into "<feature>_<value>" columns;
    # reindex then aligns the result to the requested column layout.
    return pd.get_dummies(single_row).reindex(columns=X_encoded_columns, fill_value=0)

EXAMPLE: ENCODING A SINGLE CANDIDATE

In [29]:
# A hand-written campaign configuration used to exercise encode_candidate.
test_candidate = dict(
    Campaign_Goal="Product Launch",
    Channel_Used="Instagram",
    Target_Audience="All Ages",
    Language="Spanish",
    Location="Miami",
    Customer_Segment="Technology",
    Season="Spring",
    Duration_Days=30,
    Acquisition_Cost_Num=7000,
)

X_test_row = encode_candidate(test_candidate, list(X_encoded.columns))

# Keep only the columns holding a non-zero value so the printed row stays readable.
nonzero_cols = X_test_row.ne(0).any(axis=0)

print("Encoded row shape:", X_test_row.shape)
print("\nNon-zero columns:")
print(X_test_row.loc[:, nonzero_cols])

Encoded row shape: (1, 29)

Non-zero columns:
   Duration_Days  Acquisition_Cost_Num  Campaign_Goal_Product Launch  \
0             30                  7000                          True   

   Channel_Used_Instagram  Language_Spanish  Location_Miami  \
0                    True              True            True   

   Customer_Segment_Technology  Season_Spring  
0                         True           True  


In [30]:
def score_candidate(candidate: dict,
                    models: dict,
                    X_encoded_columns: list[str],
                    target_means: pd.Series,
                    target_stds: pd.Series,
                    weights: dict) -> dict:
    """Predict every target for one candidate and combine them into a single score.

    Parameters
    ----------
    candidate : dict
        Feature name -> value for one campaign configuration.
    models : dict
        Target name -> fitted estimator exposing ``predict``.
    X_encoded_columns : list[str]
        Column layout passed to ``encode_candidate``.
    target_means, target_stds : pd.Series
        Indexed by target name; used to z-score each prediction.
    weights : dict
        Target name -> weight applied to that target's z-score.

    Returns
    -------
    dict
        A copy of ``candidate`` extended with one ``pred_<target>`` entry per
        model and a ``score`` entry holding the weighted sum of z-scores.
    """
    X_row = encode_candidate(candidate, X_encoded_columns)

    raw_preds = {}
    z_scores = {}
    for target, estimator in models.items():
        # predict() returns an array even for one row; take its only element.
        raw_preds[target] = float(estimator.predict(X_row)[0])
        centred = raw_preds[target] - float(target_means[target])
        z_scores[target] = centred / float(target_stds[target])

    # Weighted sum over the targets named in `weights`.
    total_score = 0
    for target, weight in weights.items():
        total_score += weight * z_scores[target]

    return {
        **candidate,
        **{f"pred_{target}": value for target, value in raw_preds.items()},
        "score": float(total_score),
    }


TESTING: SCORING A SINGLE CANDIDATE

In [31]:
# Relative importance of each predicted target in the combined score (sums to 1).
weights = dict(ROI=0.45, Conversion_Rate=0.25, CTR=0.15, Engagement_Score=0.15)

# Score the hand-written example candidate with the fitted models.
result = score_candidate(
    test_candidate,
    models,
    X_encoded.columns.tolist(),
    target_means,
    target_stds,
    weights,
)

result

{'Campaign_Goal': 'Product Launch',
 'Channel_Used': 'Instagram',
 'Target_Audience': 'All Ages',
 'Language': 'Spanish',
 'Location': 'Miami',
 'Customer_Segment': 'Technology',
 'Season': 'Spring',
 'Duration_Days': 30,
 'Acquisition_Cost_Num': 7000,
 'pred_ROI': 4.046626882109401,
 'pred_Conversion_Rate': 0.08003866454570004,
 'pred_CTR': 0.32832477525523046,
 'pred_Engagement_Score': 5.512245508331145,
 'score': 0.2998080925030761}

TESTING WITH SEVERAL EXAMPLES

In [33]:
import random

def generate_candidates(n: int,
                        goal: str,
                        domain: dict,
                        constraints: dict | None = None,
                        seed: int = 42) -> list[dict]:

    rng = random.Random(seed)
    constraints = constraints or {}

    def pick(cat_name):
        options = constraints.get(cat_name, domain[cat_name])
        return rng.choice(list(options))

    dur_min, dur_max = constraints.get(
        "Duration_Days",
        (int(clean_df["Duration_Days"].min()), int(clean_df["Duration_Days"].max()))
    )
    cost_min, cost_max = constraints.get(
        "Acquisition_Cost_Num",
        (float(clean_df["Acquisition_Cost_Num"].min()), float(clean_df["Acquisition_Cost_Num"].max()))
    )

    candidates = []
    for _ in range(n):
        candidates.append({
            "Campaign_Goal": goal,
            "Channel_Used": pick("Channel_Used"),
            "Target_Audience": pick("Target_Audience"),
            "Language": pick("Language"),
            "Location": pick("Location"),
            "Customer_Segment": pick("Customer_Segment"),
            "Season": pick("Season"),
            "Duration_Days": rng.randint(dur_min, dur_max),
            "Acquisition_Cost_Num": rng.uniform(cost_min, cost_max),
        })

    return candidates


In [34]:
# Draw five unconstrained "Product Launch" candidates to eyeball the generator's output.
test_cands = generate_candidates(
    n=5,
    goal="Product Launch",
    domain=domain,
    constraints=None,
    seed=1,
)
test_cands

[{'Campaign_Goal': 'Product Launch',
  'Channel_Used': 'Instagram',
  'Target_Audience': 'Men 18-24',
  'Language': 'French',
  'Location': 'Austin',
  'Customer_Segment': 'Home',
  'Season': 'Winter',
  'Duration_Days': 45,
  'Acquisition_Cost_Num': 9948.098104480063},
 {'Campaign_Goal': 'Product Launch',
  'Channel_Used': 'Instagram',
  'Target_Audience': 'Men 18-24',
  'Language': 'French',
  'Location': 'Austin',
  'Customer_Segment': 'Home',
  'Season': 'Winter',
  'Duration_Days': 53,
  'Acquisition_Cost_Num': 11553.061195640159},
 {'Campaign_Goal': 'Product Launch',
  'Channel_Used': 'Facebook',
  'Target_Audience': 'Women 35-44',
  'Language': 'French',
  'Location': 'Las Vegas',
  'Customer_Segment': 'Technology',
  'Season': 'Fall',
  'Duration_Days': 35,
  'Acquisition_Cost_Num': 943.5547539865263},
 {'Campaign_Goal': 'Product Launch',
  'Channel_Used': 'Facebook',
  'Target_Audience': 'Women 45-60',
  'Language': 'English',
  'Location': 'Miami',
  'Customer_Segment': 'Food

In [35]:
def optimize_campaign(goal: str,
                      n_candidates: int = 5000,
                      top_k: int = 10,
                      constraints: dict | None = None,
                      weights: dict | None = None,
                      seed: int = 42) -> pd.DataFrame:
    """Random-search the campaign space and return the best-scoring configurations.

    Generates ``n_candidates`` random candidates for ``goal``, scores each one
    with ``score_candidate`` and keeps the ``top_k`` highest scores. Relies on
    the notebook-level ``domain``, ``models``, ``X_encoded``, ``target_means``
    and ``target_stds``.

    Parameters
    ----------
    goal : str
        Campaign goal shared by every generated candidate.
    n_candidates : int
        How many random candidates to generate and score.
    top_k : int
        How many of the best-scoring candidates to return.
    constraints : dict | None
        Passed through to ``generate_candidates``.
    weights : dict | None
        Target name -> weight for the combined score. ``None`` or an empty
        dict selects the defaults below.
    seed : int
        Seed passed through to ``generate_candidates``.

    Returns
    -------
    pd.DataFrame
        Up to ``top_k`` rows sorted by descending ``score``; each row holds the
        candidate's features, its ``pred_<target>`` values and ``score``.
        An empty DataFrame when no candidates were generated.
    """
    weights = weights or {
        "ROI": 0.45,
        "Conversion_Rate": 0.25,
        "CTR": 0.15,
        "Engagement_Score": 0.15
    }

    cands = generate_candidates(
        n=n_candidates,
        goal=goal,
        domain=domain,
        constraints=constraints,
        seed=seed
    )

    # With nothing to score the frame would have no "score" column and
    # sort_values would raise KeyError; return an empty result instead.
    if not cands:
        return pd.DataFrame()

    # The column layout is the same for every candidate, so build the list once
    # rather than once per scored candidate.
    encoded_columns = X_encoded.columns.tolist()

    scored = [
        score_candidate(
            c,
            models=models,
            X_encoded_columns=encoded_columns,
            target_means=target_means,
            target_stds=target_stds,
            weights=weights
        )
        for c in cands
    ]

    df_scored = pd.DataFrame(scored).sort_values("score", ascending=False).head(top_k)
    return df_scored

In [36]:
# Unconstrained search: top 10 of 5000 random "Product Launch" candidates.
recs = optimize_campaign(
    "Product Launch",
    n_candidates=5000,
    top_k=10,
    constraints=None,
    seed=42,
)

# Inputs first, then per-target predictions, then the combined score.
report_cols = [
    "Campaign_Goal", "Channel_Used", "Target_Audience", "Language", "Location",
    "Customer_Segment", "Season", "Duration_Days", "Acquisition_Cost_Num",
    "pred_ROI", "pred_Conversion_Rate", "pred_CTR", "pred_Engagement_Score", "score",
]
recs[report_cols]

Unnamed: 0,Campaign_Goal,Channel_Used,Target_Audience,Language,Location,Customer_Segment,Season,Duration_Days,Acquisition_Cost_Num,pred_ROI,pred_Conversion_Rate,pred_CTR,pred_Engagement_Score,score
2536,Product Launch,Twitter,Women 25-34,Spanish,Los Angeles,Health,Spring,38,14522.287916,4.007497,0.097101,0.333065,5.483544,0.425261
4437,Product Launch,Facebook,Men 35-44,Spanish,Austin,Health,Summer,48,14521.751937,3.989249,0.093751,0.333061,5.387082,0.396675
2024,Product Launch,Twitter,Women 35-44,French,Los Angeles,Technology,Summer,46,14605.036598,4.049457,0.088904,0.333083,5.508672,0.383738
3642,Product Launch,Twitter,Women 18-24,Spanish,Los Angeles,Health,Spring,33,12670.006691,4.251825,0.081765,0.332517,5.577852,0.376591
2800,Product Launch,Instagram,Men 25-34,English,Austin,Health,Summer,60,11715.372712,3.918664,0.092226,0.332116,5.520838,0.37499
1615,Product Launch,Facebook,Women 35-44,Spanish,Los Angeles,Home,Summer,49,14590.668546,4.016132,0.087875,0.333076,5.535407,0.372535
4765,Product Launch,Facebook,All Ages,French,Austin,Food,Spring,49,10917.648582,4.349229,0.079623,0.33166,5.445047,0.369669
162,Product Launch,Instagram,Men 25-34,Spanish,Miami,Fashion,Spring,55,14591.249622,4.094749,0.084838,0.333069,5.541019,0.368424
1622,Product Launch,Instagram,Men 25-34,Spanish,Miami,Home,Summer,37,14591.865243,4.096263,0.084826,0.333069,5.531928,0.368188
4241,Product Launch,Twitter,All Ages,French,Los Angeles,Health,Summer,39,14584.539701,4.034696,0.085869,0.333074,5.509849,0.362346


TESTING: OPTIMIZATION WITH CONSTRAINTS

In [51]:
# Restrict the search to two channels, one season and narrower numeric ranges;
# the remaining features are sampled from their full domains.
constraints = dict(
    Channel_Used=["Instagram", "Twitter"],
    Acquisition_Cost_Num=(2000, 9000),
    Duration_Days=(15, 45),
    Season=["Winter"],
)

recs_constrained = optimize_campaign(
    "Increase Sales",
    n_candidates=8000,
    top_k=10,
    constraints=constraints,
    seed=1,
)

# Campaign_Goal is left out of the view because it is the same for every row.
constrained_report_cols = [
    "Channel_Used", "Target_Audience", "Language", "Location", "Customer_Segment", "Season",
    "Duration_Days", "Acquisition_Cost_Num",
    "pred_ROI", "pred_Conversion_Rate", "pred_CTR", "pred_Engagement_Score", "score",
]
recs_constrained[constrained_report_cols]

Unnamed: 0,Channel_Used,Target_Audience,Language,Location,Customer_Segment,Season,Duration_Days,Acquisition_Cost_Num,pred_ROI,pred_Conversion_Rate,pred_CTR,pred_Engagement_Score,score
5583,Instagram,Women 35-44,English,Los Angeles,Technology,Winter,27,6647.733631,4.867843,0.079608,0.327743,5.49551,0.442933
6954,Instagram,Women 25-34,English,New York,Food,Winter,38,8624.947599,4.53128,0.079939,0.330167,5.537074,0.400174
415,Instagram,Men 25-34,Spanish,Austin,Technology,Winter,30,8626.659966,4.362349,0.07989,0.330166,5.505956,0.367514
1169,Twitter,Men 18-24,Spanish,Miami,Fashion,Winter,44,8625.605593,4.321741,0.080745,0.330164,5.448265,0.362601
6629,Twitter,Women 18-24,English,Austin,Technology,Winter,45,8627.060854,4.325566,0.080392,0.330164,5.462305,0.361791
5766,Twitter,Women 18-24,English,Austin,Home,Winter,27,8627.143184,4.321556,0.080276,0.330164,5.480156,0.361193
7540,Instagram,Men 18-24,French,Los Angeles,Health,Winter,33,8679.742696,4.306104,0.079848,0.330167,5.466995,0.35512
7689,Twitter,All Ages,French,Los Angeles,Food,Winter,26,8632.079265,4.287051,0.080342,0.330164,5.422599,0.352552
7226,Instagram,Men 18-24,Spanish,Austin,Food,Winter,26,8621.560927,4.265101,0.080145,0.330166,5.494849,0.350778
4269,Twitter,Women 45-60,Spanish,Las Vegas,Home,Winter,33,4766.119879,3.98205,0.092767,0.323856,5.458989,0.336692
