# Integrate a logistic regression in a Gurobi model

We take the model from the JANOS example:

$
\begin{align}
&\max \sum y_i \\
&\text{subject to:}\\
&\sum x_i \le 100,\\
&y_i = g(x_i, \psi),\\
& 0 \le x \le 2.5.
\end{align}
$

Here, $\psi$ is a matrix of known features and $g$ is a logistic function fitted with the logistic regression of scikit-learn.

Note that, unlike JANOS, we scale the feature corresponding to $x$ for the regression.

In [None]:
import sys

import gurobipy as gp
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Notebook setup: enable IPython's autoreload extension (mode 2) so edited
# modules are re-imported automatically, then add ../../src/ to the import
# path (presumably where the gurobi.machinelearning package lives) before
# importing the helper that embeds a fitted predictor in a Gurobi model.
%load_ext autoreload
%autoreload 2
sys.path.append("../../src/")
from gurobi.machinelearning import add_predictor_constr

In [None]:
# Column roles: the "known" features are fixed for each student, while the
# "decision" feature is the one the optimization problem gets to choose.
known_features = ["SAT", "GPA"]
dec_features = ["scholarship"]
features = known_features + dec_features
target = "enroll"

# Load the historical records used for the regression and keep only the
# feature columns followed by the target column.
historical_data = pd.read_csv(
    "data/college_student_enroll-s1-1.csv", index_col=0
)[features + [target]]

### Do our logistic regression

In [None]:
# Fit the enrollment model on the historical data.
# The pipeline standardizes every feature and then fits an L1-penalized
# logistic regression ("saga" is a scikit-learn solver that supports L1).
# The response column is taken from `target` rather than a hard-coded name so
# that it always matches the column kept when the data was loaded.
X = historical_data.loc[:, features]
Y = historical_data.loc[:, target]
scaler = StandardScaler()
regression = LogisticRegression(random_state=1, penalty="l1", C=10, solver="saga")
pipe = make_pipeline(scaler, regression)
pipe.fit(X=X, y=Y)

### Now start with the optimization model

- Read in our data
- Add the x and y variables and the regular matrix constraints

In [None]:
# Retrieve new data used to build the optimization problem
studentsdata = pd.read_csv("data/admissions500.csv", index_col=0)
studentsdata = studentsdata[known_features]
nstudents = studentsdata.shape[0]

# Scholarship policy parameters
max_scholarship = 2.5  # upper bound on the scholarship of a single student
min_scholarship = 0.5  # smallest scholarship allowed when one is offered
budget_per_student = 0.2  # total budget = this value times the number of students

# Start with classical part of the model
m = gp.Model()

# Column positions inside x. They are looked up in `features` because that is
# the column order the pipeline was fitted on and the order x must follow;
# the historical frame also carries the target column and is not the
# authoritative ordering.
feature_index = pd.Index(features)
knownidx = feature_index.get_indexer(known_features)
scholarshipidx = feature_index.get_indexer(dec_features)

# Known features are fixed to the student's data by giving them equal lower
# and upper bounds; the scholarship column is bounded by [0, max_scholarship].
lb = np.zeros((nstudents, len(features)))
ub = np.full((nstudents, len(features)), gp.GRB.INFINITY)
known_values = studentsdata.to_numpy()
lb[:, knownidx] = known_values
ub[:, knownidx] = known_values
ub[:, scholarshipidx] = max_scholarship

x = m.addMVar(lb.shape, lb=lb, ub=ub, name="x")
scholarship = x[:, scholarshipidx][:, 0]
y = m.addMVar(nstudents, ub=1, name="y")
# z[i] = 1 when student i is offered a scholarship
z = m.addMVar(nstudents, vtype=gp.GRB.BINARY)

m.setObjective(y.sum(), gp.GRB.MAXIMIZE)
# Overall scholarship budget
m.addConstr(scholarship.sum() <= budget_per_student * nstudents)
# A scholarship is either zero (z = 0) or within [min, max] (z = 1)
m.addConstr(scholarship <= max_scholarship * z)
m.addConstr(scholarship >= min_scholarship * z)

# Link y to x through the fitted scaler + logistic regression pipeline
add_predictor_constr(m, pipe, x, y)

### Finally optimize it

In [None]:
# Run the Gurobi solver on model m
m.optimize()

### Look at the solution

In [None]:
# List every student whose rounded y value or rounded scholarship is
# strictly positive in the solution.
report_line = "Student {} y {} x {}"
rounded_y = y.X.round(2)
rounded_scholarship = scholarship.X.round(2)
for student, (y_val, x_val) in enumerate(zip(rounded_y, rounded_scholarship)):
    if not (y_val > 0.0 or x_val > 0.0):
        continue
    print(report_line.format(student, y_val, x_val))

In [None]:
# Scatter plot of the solution, one point per student: scholarship value on
# the horizontal axis against the value of y on the vertical axis
plt.scatter(scholarship.X, y.X)

In [None]:
# Proportion of students offered a scholarship
# Compare how often a scholarship is offered in the historical data and in
# the optimized solution (share of strictly positive scholarships).
hist_scholarship = historical_data.loc[:, "scholarship"]
hist_share = (hist_scholarship > 0).sum() / len(hist_scholarship)
print("In historical data {:.4}% students offered a scholarship".format(100 * hist_share))

n_offered = (scholarship.X > 0).sum()
print("In our solution {:.4}% students offered a scholarship".format(100 * n_offered / nstudents))

Copyright © 2022 Gurobi Optimization, LLC