# Price Optimization

This example is adapted from the example in Gurobi's modeling examples [How Much
Is Too Much? Avocado Pricing and Supply Using Mathematical
Optimization](https://github.com/Gurobi/modeling-examples/tree/master/price_optimization).

We develop the same example as in the documentation, but we try out and compare different
regression models to estimate demand.

In [None]:
import pandas as pd
import warnings

import matplotlib.pyplot as plt
from sklearn import tree
import seaborn as sns

import numpy as np

import gurobipy as gp
from gurobipy import GRB


from gurobi_ml import add_predictor_constr
import gurobipy_pandas as gppd

## Load the Packages and the Datasets

In [None]:
# Fetch the raw sales data

data_url = "https://raw.githubusercontent.com/Gurobi/modeling-examples/master/price_optimization/"

# Newer records were downloaded directly from HAB, the older ones from Kaggle;
# both files are stacked into a single frame.
avocado = pd.read_csv(data_url + "HABdata_2019_2022.csv")
avocado_old = pd.read_csv(data_url + "kaggledata_till2018.csv")
avocado = pd.concat([avocado, avocado_old])

# Parse the dates and put the rows in chronological order
avocado["date"] = pd.to_datetime(avocado["date"])
avocado = avocado.sort_values(by="date")

# Calendar features: the year, the number of years elapsed since 2015, and the
# month number (the month is what the peak season is defined on).
timestamps = pd.DatetimeIndex(avocado["date"])
avocado["year"] = timestamps.year
avocado["year_index"] = avocado["year"] - 2015
avocado["month"] = timestamps.month
peak_months = range(2, 8)  # <--------- Set the months for the "peak season"


def peak_season(row):
    """Return 1 if the month stored in ``row["month"]`` is a peak month, else 0.

    ``row`` is anything indexable by the key ``"month"``; the value is converted
    with ``int`` before it is looked up in ``peak_months``.
    """
    month = int(row["month"])
    if month in peak_months:
        return 1
    return 0


# Flag peak-season rows with 1 and every other row with 0. A vectorised
# membership test on the month column yields the same 0/1 values as calling
# ``peak_season`` on each row, without a Python-level ``apply`` over the frame.
avocado["peak"] = avocado["month"].isin(list(peak_months)).astype(int)

# Scale the number of avocados to millions
avocado["units_sold"] = avocado["units_sold"] / 1000000

# Select only conventional avocados
avocado = avocado[avocado["type"] == "Conventional"]

# Keep only the columns used afterwards and renumber the rows from 0
avocado = avocado[
    ["date", "units_sold", "price", "region", "year", "month", "year_index", "peak"]
].reset_index(drop=True)

avocado

## Train regressions

We prepare the data using `OneHotEncoder` and `make_column_transformer`. We want
to transform the region feature using the encoder, apply scaling to the price and
year-index features, and pass the peak flag through unchanged.

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# Column-wise preprocessing placed in front of every regressor:
#   - "region" is one-hot encoded (with drop="first"),
#   - "price" and "year_index" are standardised,
#   - "peak" is forwarded unchanged,
#   - every other column is dropped.
feat_transform = make_column_transformer(
    (OneHotEncoder(drop="first"), ["region"]),
    (StandardScaler(), ["price", "year_index"]),
    ("passthrough", ["peak"]),
    remainder="drop",
    verbose_feature_names_out=False,
)

# Regions kept for the study
regions = [
    "Great_Lakes", "Midsouth", "Northeast", "Northern_New_England",
    "SouthCentral", "Southeast", "West", "Plains",
]

# Restrict the data to those regions, then separate the features from the target
df = avocado.loc[avocado["region"].isin(regions)]
X = df[["region", "price", "year_index", "peak"]]
y = df["units_sold"]

To validate the regression model, we will randomly split the dataset into $80\%$
training and $20\%$ testing data and learn the weights using `Scikit-learn`.

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=0.8)

Create a dictionary with the various regression models that we want to use.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import clone
from time import time
# Candidate demand models, keyed by the name used when reporting results
estimators = {
    "Linear Regression": LinearRegression(),
    "MLP Regression": MLPRegressor([8, 8], max_iter=1000),
    "Decision Tree": DecisionTreeRegressor(max_leaf_nodes=50),
    "Random Forest": RandomForestRegressor(n_estimators=10, max_leaf_nodes=100),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100),
}

# Wrap each estimator in its own dict so further entries can be stored next to it
regressions = {
    name: {"regressor": estimator} for name, estimator in estimators.items()
}

Train the regressions

In [None]:
for regression, entry in regressions.items():
    # Preprocessing followed by this entry's regressor
    pipeline = make_pipeline(feat_transform, entry["regressor"])

    # Time the fit and keep the fitted pipeline alongside the regressor
    fit_started = time()
    pipeline.fit(X_train, y_train)
    entry["train time"] = time() - fit_started
    entry["pipeline"] = pipeline

    # R^2 on the held-out data first, then on the training data
    r2_test = r2_score(y_test, pipeline.predict(X_test))
    r2_train = r2_score(y_train, pipeline.predict(X_train))
    entry["R2 test"] = r2_test
    entry["R2 train"] = r2_train
    print(f"{regression:<18} R^2 value in the test set is {r2_test:.3f} training {r2_train:.3f}")

Train the regressions adding polynomial features to the mix

In [None]:
regressions_poly = {}
for regression, base in regressions.items():
    # Start from a clone of the regressor so this run gets its own estimator object
    entry = {"regressor": clone(base["regressor"])}

    # Same preprocessing, with a PolynomialFeatures step (default settings)
    # inserted before the regressor
    pipeline = make_pipeline(feat_transform, PolynomialFeatures(), entry["regressor"])

    # Time the fit and keep the fitted pipeline
    fit_started = time()
    pipeline.fit(X_train, y_train)
    entry["train time"] = time() - fit_started
    entry["pipeline"] = pipeline

    # R^2 on the held-out data first, then on the training data
    r2_test = r2_score(y_test, pipeline.predict(X_test))
    r2_train = r2_score(y_train, pipeline.predict(X_train))
    entry["R2 test"] = r2_test
    entry["R2 train"] = r2_train
    print(f"{regression} R^2 value in the test set is {r2_test:.3f} training {r2_train:.3f}")

    regressions_poly[f"{regression} polynomial feats"] = entry

In [None]:
# Add the polynomial-feature entries to the main dictionary (in place)
regressions.update(regressions_poly)

## Prepare the data for the optimization model

In [None]:
# Sets and parameters
B = 30  # total amount of avocado supply (presumably in millions, like units_sold; confirm)

peak_or_not = 1  # 1 if it is the peak season; 0 if it isn't
year = 2020

c_waste = 0.1  # the cost ($) of wasting an avocado

# the cost of transporting an avocado to each region, ordered like `regions`
c_transport = pd.Series(
    {
        "Great_Lakes": 0.3,
        "Midsouth": 0.1,
        "Northeast": 0.4,
        "Northern_New_England": 0.5,
        "SouthCentral": 0.3,
        "Southeast": 0.2,
        "West": 0.2,
        "Plains": 0.2,
    },
    name="transport_cost",
).loc[regions]

# Price bounds are fixed by hand and shared by all regions
a_min = 0  # minimum avocado price in each region
a_max = 2  # maximum avocado price in each region

# One row per region: the transport cost together with the smallest and largest
# units_sold values found in the dataset, used as bounds on the delivered quantity
units_by_region = df.groupby("region")["units_sold"]
data = pd.concat(
    [
        c_transport,
        units_by_region.min().rename("min_delivery"),
        units_by_region.max().rename("max_delivery"),
    ],
    axis=1,
)

In [None]:
# Build the optimization model. Each add_vars call below is indexed by `data`,
# so every variable family has one entry per row of that frame.
m = gp.Model("Avocado_Price_Allocation")

# x: bounded below and above by the min_delivery / max_delivery columns of `data`
x = gppd.add_vars(m, data, name="x", lb='min_delivery', ub='max_delivery')
s = gppd.add_vars(m, data, name="s") # predicted amount of sales in each region for the given price
w = gppd.add_vars(m, data, name="w") # excess wastage in each region
d = gppd.add_vars(m, data, lb=-gp.GRB.INFINITY, name="demand") # unbounded-below variables that receive the regression output
# p: price, restricted to [a_min, a_max]
p = gppd.add_vars(m, data, name="price", lb=a_min, ub=a_max)
m.update()

# Maximize revenue (price * sales) minus the waste cost and the transport cost
m.setObjective((p * s).sum() - c_waste * w.sum() - (c_transport * x).sum())
m.ModelSense = GRB.MAXIMIZE

# The x variables must add up to exactly B
m.addConstr(x.sum() == B)
m.update()

# Sales can exceed neither x nor the demand variable
gppd.add_constrs(m, s, gp.GRB.LESS_EQUAL, x)
gppd.add_constrs(m, s, gp.GRB.LESS_EQUAL, d)
m.update()

# Waste is defined as x minus sales
gppd.add_constrs(m, w, gp.GRB.EQUAL, x - s)
m.update()

In [None]:
# Fixed inputs of the regression, one row per region
feats = pd.DataFrame(
    {"region": regions, "year_index": year - 2015, "peak": peak_or_not},
    index=regions,
)

# Append the price variables as an extra column, then put the columns in the
# order region, price, year_index, peak
feats = pd.concat([feats, p], axis=1)
feats = feats[["region", "price", "year_index", "peak"]]

In [None]:
# The objective contains products of two variables (price * sales), so the
# NonConvex parameter is set to 2. It does not depend on the regressor, hence it
# is set once before the loop instead of on every iteration.
m.Params.NonConvex = 2

# For every trained pipeline: link the price features to the demand variables
# through the regressor, write the model to "<name>.rlp", solve it and record
# the solve time. The loop variable is called `entry` so that it does not
# overwrite the `data` frame used to build the model.
for regression, entry in regressions.items():
    pred_constr = add_predictor_constr(m, entry["pipeline"], feats, d, verbose=True)
    try:
        pred_constr.print_stats()

        m.write(f"{regression}.rlp")

        start = time()
        try:
            m.optimize()
        except gp.GurobiError as err:
            # Record the failure and stop, as before, but only for solver
            # errors: KeyboardInterrupt and programming errors now propagate.
            entry["optimization time"] = float("nan")
            print(f"{regression}: optimization failed ({err}); stopping.")
            break
        entry["optimization time"] = time() - start
    finally:
        # Always take the predictor constraints out again, including when the
        # loop stops early, so the model is left without them.
        pred_constr.remove()

In [None]:
# Record the size in bytes of every "*.rlp" file in the working directory, keyed
# by the file name without its extension, then delete those files.
# pathlib replaces the IPython shell escapes (`!ls -l`, `!rm`) so the cell also
# runs outside IPython and on systems without a POSIX shell, and no longer
# depends on the column layout of `ls -l`.
from pathlib import Path

files = sorted(Path(".").glob("*.rlp"))

sizes = {path.stem: path.stat().st_size for path in files}
for path in files:
    path.unlink()

In [None]:
# Store each recorded size next to the matching regression entry
for model_name in sizes:
    regressions[model_name]["file size"] = sizes[model_name]

In [None]:
# One row per regression; the estimator objects are left out of the table
res = pd.DataFrame.from_dict(regressions, orient="index")
res = res.drop(columns=["regressor", "pipeline"])

In [None]:
# Express every file size as a rounded multiple of the "Linear Regression" size
raw_sizes = res["file size"].astype(int)
baseline = raw_sizes.loc["Linear Regression"]
res["file size"] = (raw_sizes / baseline).round(0).astype(int)

In [None]:
# Show the results table with the values rounded to two decimals
res.round(2)

We have shown how to model the price and supply optimization problem with Gurobi
Machine Learning. In the [Gurobi modeling examples
notebook](https://github.com/Gurobi/modeling-examples/tree/master/price_optimization)
more analysis of the solutions this model can give is done interactively. Be
sure to take a look at it.

Copyright © 2022 Gurobi Optimization, LLC