# Integrate a linear regression in a Gurobi model
*Note: The resulting model in this example will be too large for a size-limited license; in order to solve it, please visit https://www.gurobi.com/free-trial for a full license*

We take the model from the Janos example:

$$
\begin{aligned}
&\max \sum_i y_i \\
&\text{subject to:}\\
&\sum_i x_i \le 100,\\
&y_i = g(x_i, \psi),\\
&0 \le x_i \le 2.5.
\end{aligned}
$$

Here, $\psi$ is a vector of fixed features, and $g$ is an affine function computed using the linear regression of scikit-learn.

Note that, unlike Janos, we scale the feature corresponding to $x$ for the linear regression.

In [None]:
import gurobipy as gp
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from gurobi_ml.sklearn import PipelineConstr

# Root of the raw GitHub directory hosting the Janos example data sets;
# the CSV file names used below are appended to it.
janos_data_url = (
    "https://raw.githubusercontent.com/INFORMSJoC/2020.1023/master/data/"
)


In [None]:
# Load the historical records the regression is fitted on
# (the first CSV column is used as the row index)
historical_data = pd.read_csv(
    f"{janos_data_url}college_student_enroll-s1-1.csv",
    index_col=0,
)

# Split the feature names into two groups: the ones whose values are given
# by the data, and the ones that become decisions of the optimization problem
known_features = ["SAT", "GPA"]
dec_features = ["scholarship"]
target = "enroll"
features = [*known_features, *dec_features]

# Keep only the columns used below: all features first, then the target
historical_data = historical_data[[*features, target]]

### Do our linear regression

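Note that we scale all the features, both the ones that are fixed and the one that will be controlled
by variables in the optimization problem, with a single `StandardScaler` chained to the regression in a scikit-learn pipeline.
This is to make the formulation simpler afterwards: the whole pipeline is added to the optimization model in one step.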

In [None]:
# Fit the regression on the historical data.
# X keeps the column order of `features`; the response column is selected
# through `target` so that its name is defined in a single place.
X = historical_data.loc[:, features]
Y = historical_data.loc[:, target]

# Standardize every feature, then fit a linear regression. Both steps are
# chained in one pipeline so that they are fitted (and later reused) together.
scaler = StandardScaler()
regression = LinearRegression()
pipe = make_pipeline(scaler, regression)
pipe.fit(X=X, y=Y)

### Now start with the optimization model

- Read in our data
- add the x and y variables and the regular matrix constraints

In [None]:
# Retrieve the new data used to build the optimization problem; only the
# known (fixed) features are taken from it
studentsdata = pd.read_csv(janos_data_url + 'college_applications6000.csv', index_col=0)
studentsdata = studentsdata[known_features]

nstudents = 500

# Select nstudents rows at random (no random_state is passed, so the
# selection differs from run to run)
studentsdata = studentsdata.sample(nstudents)

m = gp.Model()

# Column positions inside x. They are looked up in `features`, the column
# order the regression was fitted on, rather than in historical_data, whose
# columns also contain the target.
feature_index = pd.Index(features)
knownidx = feature_index.get_indexer(known_features)
scholarshipidx = feature_index.get_indexer(dec_features)

# Bounds of x: one row per student, one column per feature
lb = np.zeros((nstudents, len(features)))
ub = np.full(lb.shape, gp.GRB.INFINITY)

# Known features are fixed to the student's data by setting lb == ub
known_values = studentsdata.loc[:, known_features].to_numpy()
lb[:, knownidx] = known_values
ub[:, knownidx] = known_values

# The decision feature may vary between 0 and 2.5; setting this in the bound
# arrays avoids patching variable attributes after creation
lb[:, scholarshipidx] = 0.0
ub[:, scholarshipidx] = 2.5

x = m.addMVar(lb.shape, lb=lb, ub=ub, name="x")
# One-dimensional view on the decision column of x (one entry per student)
scholarship = x[:, scholarshipidx][:, 0]
# Free variables, one per student, that will hold the regression output
y = m.addMVar(nstudents, lb=-gp.GRB.INFINITY, name="y")

In [None]:
# Objective: maximize the sum of the y variables over all students
total_y = y.sum()
m.setObjective(total_y, sense=gp.GRB.MAXIMIZE)

# Budget: the scholarships summed over all students may not exceed
# 0.2 per student on average
scholarship_budget = 0.2 * nstudents
m.addConstr(scholarship.sum() <= scholarship_budget)

### Add the constraint corresponding to the linear regression

In [None]:
# Insert the fitted scikit-learn pipeline (scaler + linear regression) into
# model m, with x passed as its input variables and y as its output variables.
# The returned object is kept in pipe2gurobi for later reference.
pipe2gurobi = PipelineConstr(m, pipe, x, y)

### Finally optimize it

In [None]:
# Solve the model built above. With nstudents = 500 it is too large for a
# size-limited Gurobi license (see the note at the top of this notebook).
m.optimize()

Copyright © 2022 Gurobi Optimization, LLC