In [2]:
import math
from pathlib import Path
from scipy import stats
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_squared_log_error,
    median_absolute_error,
    r2_score
)
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import mlflow


In [18]:

def generate_apple_sales_data_with_promo_adjustment(
        base_demand: int = 1000,
        n_rows: int = 5000,
        competitor_price_effect: float = -50.0
):
    '''
    Generates a synthetic dataset for predicting apple sales demand with
    multiple influencing factors.
    This function creates a pandas DataFrame with features relevant to apple sales.
    The features include date, average_temperature, rainfall, weekend_flag,
    holiday_flag, promotional_flag, price_per_kg, competitor's price, marketing intensity,
    stock availability and the previous day's demand. The target variable, 'demand',
    is generated based on a combination of these features with some added noise

    Args:
        base_demand (int, optional): Base deamnd for apples. Defaults to 1000
        n_rows (int, optional) : Number of rows of data to generate. Defaults to 5000
        competitor_price_effect (float, optional): Effect of competitor's price being lower on our sales. Defaults to -50

    Returns:
        pd.DataFrame: DataFrame with features and target variable for apple sales prediction.

    Example:
        >>> df = generate_apple_sales_with_promo_adjustment(base_demand=1200,n_rows=6000)
        >>> df.head()
    '''
    # set seed for reproducibility
    np.random.seed(1)

    # create date range
    dates = [datetime.now() - timedelta(days=i) for i in range(n_rows)]
    dates.reverse()

    # generate features
    df = pd.DataFrame(
        {
            'date' : dates,
            'average_temperature' : np.random.uniform(10, 35, n_rows),
            'rainfall' : np.random.exponential(5, n_rows),
            'weekend' : [(date.weekday()>= 5) * 1 for date in dates],
            'holiday' : np.random.choice([0,1], n_rows, p=[0.97,0.03]),
            'price_per_kg' : np.random.uniform(0.5, 3, n_rows),
            'month' : [date.month for date in dates]
        }
    )

    # introduce inflation over time (years)
    df['inflation_multiplier'] = 1 + (df['date'].dt.year - df['date'].dt.year.min()) * 0.03

    # incorporate seasonality due to apple harvests
    df['harvest_effect'] = np.sin(2 * np.pi * (df['month'] - 3) /12 ) + np.sin(2 * np.pi * (df['month'] - 9) / 12)

    # modify the price_per_kg base on harvest effect
    df['price_per_kg'] = df['price_per_kg'] - df['harvest_effect'] * 0.5

    # adjust promo periods to coincide with periods logging peak harvest by 1 month
    peak_months = [4, 10]
    df['promo'] = np.where(df['month'].isin(peak_months), 1, np.random.choice([0,1], n_rows, p=[0.85,0.15]))
    # Generate target variable based on features
    base_price_effect = -df["price_per_kg"] * 50
    seasonality_effect = df["harvest_effect"] * 50
    promo_effect = df["promo"] * 200

    df["demand"] = (
        base_demand
        + base_price_effect
        + seasonality_effect
        + promo_effect
        + df["weekend"] * 300
        + np.random.normal(0, 50, n_rows)
    ) * df["inflation_multiplier"]  # adding random noise

    # Add previous day's demand
    df["previous_days_demand"] = df["demand"].shift(1)
    df["previous_days_demand"].fillna(method="bfill", inplace=True)  # fill the first row

    # Introduce competitor pricing
    df["competitor_price_per_kg"] = np.random.uniform(0.5, 3, n_rows)
    df["competitor_price_effect"] = (
        df["competitor_price_per_kg"] < df["price_per_kg"]
    ) * competitor_price_effect

    # Stock availability based on past sales price (3 days lag with logarithmic decay)
    log_decay = -np.log(df["price_per_kg"].shift(3) + 1) + 2
    df["stock_available"] = np.clip(log_decay, 0.7, 1)

    # Marketing intensity based on stock availability
    # Identify where stock is above threshold
    high_stock_indices = df[df["stock_available"] > 0.95].index

    # For each high stock day, increase marketing intensity for the next week
    for idx in high_stock_indices:
        df.loc[idx : min(idx + 7, n_rows - 1), "marketing_intensity"] = np.random.uniform(0.7, 1)

    # If the marketing_intensity column already has values, this will preserve them;
    #  if not, it sets default values
    fill_values = pd.Series(np.random.uniform(0, 0.5, n_rows), index=df.index)
    df["marketing_intensity"].fillna(fill_values, inplace=True)

    # Adjust demand with new factors
    df["demand"] = df["demand"] + df["competitor_price_effect"] + df["marketing_intensity"]

    # Drop temporary columns
    df.drop(
        columns=[
            "inflation_multiplier",
            "harvest_effect",
            "month",
            "competitor_price_effect",
            "stock_available",
        ],
        inplace=True,
    )

    return df

In [19]:
df = generate_apple_sales_data_with_promo_adjustment(
    base_demand=1000, n_rows=10000, competitor_price_effect=-25.0
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["previous_days_demand"].fillna(method="bfill", inplace=True)  # fill the first row
  df["previous_days_demand"].fillna(method="bfill", inplace=True)  # fill the first row
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["marketing_intensity"].fillna(fill_values, inplace=True

In [20]:
df.head()

Unnamed: 0,date,average_temperature,rainfall,weekend,holiday,price_per_kg,promo,demand,previous_days_demand,competitor_price_per_kg,marketing_intensity
0,1998-08-17 17:27:24.528906,20.42555,7.251434,0,0,1.398351,1,1010.229753,1035.07916,0.71194,0.150593
1,1998-08-18 17:27:24.528906,28.008112,1.842095,0,0,2.862576,0,802.735333,1035.07916,2.012319,0.452606
2,1998-08-19 17:27:24.528906,10.002859,6.647262,0,0,2.588351,0,797.14324,827.282726,0.557784,0.417891
3,1998-08-20 17:27:24.528906,17.558314,8.361846,0,0,0.802561,0,847.328267,821.725349,0.954631,0.791614
4,1998-08-21 17:27:24.528906,13.668897,1.350484,0,0,1.439733,0,892.839815,846.536653,2.074827,0.791614
