In [37]:
import numpy as np
import pandas as pd

In [38]:
# Set random seed for reproducibility
np.random.seed(42)

In [39]:
# Generate synthetic dataset
n_days = 730  # Two years of daily data
dates = pd.date_range(start="2023-01-01", periods=n_days, freq="D")

In [40]:
# Function to create a trend (increasing or decreasing)
def generate_trend(n_days, start=0.8, end=1.2):
    return np.linspace(start, end, n_days)

In [41]:
# Function to get holiday dates from a DatetimeIndex object
def get_holiday_year_from_dates(dates, holidays_base, easter_dates):
    start_year = dates.year.min()  # Get the starting year from the DatetimeIndex
    end_year = dates.year.max()  # Get the ending year from the DatetimeIndex
    num_years = end_year - start_year + 1  # Calculate the number of years

    holiday_dates = {}
    for holiday, base_date in holidays_base.items():
        holiday_dates[holiday] = [f"{year}-{base_date}" for year in range(start_year, end_year + 1)]

    # Add the Easter dates to the holiday list, assuming it's a list of yyyy-mm-dd strings
    holiday_dates["Easter"] = easter_dates

    return holiday_dates

# Define holidays with just the base date (no year)
holidays_base = {
    "New Year's Day": "01-01",
    "Independence Day": "07-04",
    "Christmas": "12-25",
    "Black Friday": "11-24",
    "Cyber Monday": "11-27",
    "Singles' Day": "11-11",
    "Valentine's Day": "02-14",
    "Back to School": "08-15",
    "Brand Anniversary": "06-01"
}

# Provide your list of Easter dates here
easter_dates = ["2023-04-09", "2024-03-31"]

# Get the holiday dates from the DatetimeIndex
holidays = get_holiday_year_from_dates(dates, holidays_base, easter_dates)

# Convert holidays to datetime format
holidays_dates = pd.to_datetime([date for holiday_dates in holidays.values() for date in holiday_dates])

# Function to generate holiday spikes
def generate_holiday_spikes(n_days, dates, holidays, ramp_up_days=5, min_spike=1.5, max_spike=2):
    spikes = np.ones(n_days)
    all_holidays = sum(holidays.values(), [])  # Flatten the holiday dates into one list

    for h_date in all_holidays:
        idx = (dates == h_date).argmax()
        for i in range(ramp_up_days):
            if idx - i >= 0:
                spikes[idx - i] += np.linspace(1.2, 2, ramp_up_days)[i]
        spikes[idx] += np.random.uniform(min_spike, max_spike)

    return spikes


In [42]:
# Function to randomly set zero-investment days
def apply_zero_investment(spend_array, probability=0.1):
    zero_days = np.random.rand(len(spend_array)) < probability
    spend_array[zero_days] = 0
    return spend_array

In [43]:
def generate_seasonality(n_days, dates, holidays, trend_factor=1.0,
                          annual_amplitude=0.3, weekly_amplitude=0.1, holiday_boost=0.5, ramp_up_days=7):
    days = np.arange(n_days)

    # Annual and weekly seasonality
    annual_effect = 1 + annual_amplitude * np.sin(2 * np.pi * days / 365)
    weekly_effect = 1 + weekly_amplitude * np.sin(2 * np.pi * days / 7)

    # Holiday interaction: Increase seasonality near holidays
    holiday_multiplier = np.ones(n_days)
    for holiday in holidays:
        holiday_date = pd.to_datetime(holiday)
        distances = np.abs((dates - holiday_date).days)
        holiday_multiplier += holiday_boost * np.exp(-distances / ramp_up_days)  # Exponential decay

    # Trend interaction: Seasonality scales with the trend
    trend_effect = 1 + trend_factor * (days / n_days)  # Linearly increasing/decreasing trend influence

    # Final seasonality effect (multiplicative combination)
    return annual_effect * weekly_effect * holiday_multiplier * trend_effect


In [44]:
def generate_media_seasonality(media_type, n_days, dates, holidays):
    """Generates seasonality for different media investments based on media type."""

    # Default parameters (can be tuned further)
    seasonality_params = {
        "tv": {"trend_factor": 0.5, "annual_amplitude": 0.3, "weekly_amplitude": 0.1, "holiday_boost": 0.5, "ramp_up_days": 10},
        "radio": {"trend_factor": 0.3, "annual_amplitude": 0.2, "weekly_amplitude": 0.05, "holiday_boost": 0.3, "ramp_up_days": 7},
        "ooh": {"trend_factor": 0.2, "annual_amplitude": 0.25, "weekly_amplitude": 0.2, "holiday_boost": 0.2, "ramp_up_days": 5},
        "meta": {"trend_factor": 0.6, "annual_amplitude": 0.15, "weekly_amplitude": 0.05, "holiday_boost": 0.4, "ramp_up_days": 8},
        "google": {"trend_factor": 0.7, "annual_amplitude": 0.2, "weekly_amplitude": 0.05, "holiday_boost": 0.5, "ramp_up_days": 9},
        "tiktok": {"trend_factor": 0.8, "annual_amplitude": 0.3, "weekly_amplitude": 0.1, "holiday_boost": 0.6, "ramp_up_days": 10},
        "digital_others": {"trend_factor": 0.4, "annual_amplitude": 0.2, "weekly_amplitude": 0.05, "holiday_boost": 0.3, "ramp_up_days": 6},
    }

    # Get the parameters for the selected media type (fallback to default if not listed)
    params = seasonality_params.get(media_type, seasonality_params["digital_others"])

    return generate_seasonality(
        n_days=n_days,
        dates=dates,
        holidays=holidays,
        trend_factor=params["trend_factor"],
        annual_amplitude=params["annual_amplitude"],
        weekly_amplitude=params["weekly_amplitude"],
        holiday_boost=params["holiday_boost"],
        ramp_up_days=params["ramp_up_days"]
    )


In [45]:
# Define different parameters for each media vehicle
tv_base = np.random.uniform(4000, 5000, n_days)
tv_seasonality = generate_media_seasonality("tv", n_days, dates, holidays_dates)
tv_spend = tv_base * tv_seasonality

# Apply zero-investment conditions
tv_spend = apply_zero_investment(tv_spend, probability=0.15)

In [46]:
radio_base = np.random.uniform(500, 1500, n_days)
radio_seasonality = generate_media_seasonality("radio", n_days, dates, holidays_dates)
radio_spend = radio_base * radio_seasonality

# Apply zero-investment conditions
radio_spend = apply_zero_investment(radio_spend, probability=0.20)

In [47]:
ooh_base = np.random.uniform(500, 1500, n_days)
ooh_seasonality = generate_media_seasonality("ooh", n_days, dates, holidays_dates)
ooh_spend = ooh_base * ooh_seasonality

# Apply zero-investment conditions
ooh_spend = apply_zero_investment(ooh_spend, probability=0.20)

In [48]:
meta_base = np.random.uniform(2000, 4000, n_days)
meta_seasonality = generate_media_seasonality("meta", n_days, dates, holidays_dates)
meta_spend = meta_base * meta_seasonality

In [49]:
google_base = np.random.uniform(4000, 6000, n_days)
google_seasonality = generate_media_seasonality("google", n_days, dates, holidays_dates)
google_spend = google_base * google_seasonality

In [50]:
tiktok_base = np.random.uniform(2000, 4000, n_days)
tiktok_seasonality = generate_media_seasonality("tiktok", n_days, dates, holidays_dates)
tiktok_spend = tiktok_base * tiktok_seasonality

In [51]:
digital_others_base = np.random.uniform(1500, 2000, n_days)
digital_others_seasonality = generate_media_seasonality("digital_others", n_days, dates, holidays_dates)
digital_others_spend = digital_others_base * digital_others_seasonality

In [52]:
# Generate sales data
sales_baseline = np.random.uniform(1500, 3000, n_days)
sales_trend = generate_trend(n_days, start=0.9, end=1.7)

# Use the updated seasonality function with holidays and trends
sales_seasonality = generate_seasonality(
    n_days=n_days,
    dates=dates,
    holidays=holidays_dates,
    trend_factor=0.5,  # Strength of seasonality-trend interaction
    annual_amplitude=0.2,  # Stronger annual cycle
    weekly_amplitude=0.1,  # Weekly cycle impact
    holiday_boost=0.5,  # Boost around holidays
    ramp_up_days=10  # Gradual increase before holidays
)

sales_holiday_spikes = generate_holiday_spikes(n_days, dates, holidays, ramp_up_days=10, min_spike=1.5, max_spike=2)

# Add noise
sales_noise = np.random.normal(0, 500, n_days)

# Add media contribution
sales_from_media = (
    0.1 * tv_spend +
    0.05 * radio_spend +
    0.05 * ooh_spend +
    0.2 * meta_spend +
    0.3 * google_spend +
    0.2 * tiktok_spend +
    0.1 * digital_others_spend
)

# Sales = Baseline + Media Contribution + Noise, all scaled by trend, seasonality, and holiday spikes
sales_total = 1000 * (sales_baseline + sales_from_media + sales_noise) * sales_trend * sales_seasonality * sales_holiday_spikes


In [53]:
# Create DataFrame
df = pd.DataFrame({
    "date": dates,
    "tv": tv_spend,
    "radio": radio_spend,
    "ooh": ooh_spend,
    "meta": meta_spend,
    "google": google_spend,
    "tiktok": tiktok_spend,
    "digital": digital_others_spend,
    "sales": sales_total
})


# Round all numerical columns to 2 decimal places
df = df.round(2)

# Add a 'holiday' column, where 1 means holiday and 0 means not
df['holiday'] = df['date'].isin(holidays_dates).astype(int)

In [54]:
# Visualize dataframe
df.head()

Unnamed: 0,date,tv,radio,ooh,meta,google,tiktok,digital,sales,holiday
0,2023-01-01,6588.79,1162.52,990.38,2977.19,8932.64,4556.6,2315.72,35768254.89,1
1,2023-01-02,7834.65,1513.92,1639.37,3026.25,8681.33,3851.36,1966.45,10222134.66,0
2,2023-01-03,7444.58,786.07,1800.9,4366.05,7732.49,3646.35,1977.41,9791019.64,0
3,2023-01-04,6731.67,1431.54,0.0,4129.13,7624.79,3143.86,2372.24,9989567.91,0
4,2023-01-05,5470.03,0.0,0.0,4638.42,6035.7,5192.94,1765.8,6910931.76,0


In [55]:
def introduce_nulls(df):
    """ Introduces missing values to mimic real-world missing data patterns. """

    np.random.seed(42)  # For reproducibility

    # Sales: Less than 1% of missing values (randomly chosen)
    sales_mask = np.random.rand(len(df)) < 0.01
    df.loc[sales_mask, "sales"] = np.nan

    # Media investments: 5% of zero values become NaN
    investment_columns = ["tv", "radio", "ooh", "digital"]
    for col in investment_columns:
        zero_mask = (df[col] == 0) & (np.random.rand(len(df)) < 0.05)
        df.loc[zero_mask, col] = np.nan

    # Holiday Effects: 5% of zero values become NaN
    holiday_columns = ["holiday"]
    for col in holiday_columns:
        zero_mask = (df[col] == 0) & (np.random.rand(len(df)) < 0.05)
        df.loc[zero_mask, col] = np.nan

    return df

# Apply the function to your dataset
df = introduce_nulls(df)


In [56]:
def introduce_full_duplicates(df, duplicate_fraction=0.05):
    """ Introduces full row duplications into the dataframe. """

    np.random.seed(42)  # For reproducibility

    # Number of rows to duplicate
    num_duplicates = int(len(df) * duplicate_fraction)
    duplicate_rows = df.sample(n=num_duplicates, replace=True)

    # Append duplicates to the dataframe
    df = pd.concat([df, duplicate_rows], ignore_index=True)

    return df

df = introduce_full_duplicates(df, duplicate_fraction=0.05)

In [57]:
def introduce_partial_duplicates_and_nulls(df, partial_fraction=0.05):
    """ Introduces partial row duplications where half the data is replaced with NaN. """

    np.random.seed(42)  # For reproducibility

    # Number of rows to duplicate partially
    num_partial = int(len(df) * partial_fraction)
    partial_rows = df.sample(n=num_partial, replace=True).copy()  # Create a copy to modify

    # Select **random half** of the columns for NaN replacement in partial rows
    for index in partial_rows.index:
        mask = np.random.rand(len(df.columns)) < 0.5  # 50% chance per column
        partial_rows.loc[index, mask] = np.nan  # Replace with NaN

    # Append partial duplicates to the dataframe
    df = pd.concat([df, partial_rows], ignore_index=True)

    return df

df = introduce_partial_duplicates_and_nulls(df, partial_fraction=0.05)

In [58]:
# Save the DataFrame to a CSV file
df.to_csv('/content/mmm_sample_data.csv', index=False)

# Download the CSV file
from google.colab import files
files.download('/content/mmm_sample_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>