# Integrate a logistic regression in a Gurobi model

We take the model from the JANOS example:

$$
\begin{aligned}
&\max \sum_i y_i \\
&\text{subject to:}\\
&\sum_i x_i \le 100,\\
&y_i = g(x_i, X) \quad \forall i,\\
&0.5 z_i \le x_i \le 2.5 z_i \quad \forall i.
\end{aligned}
$$

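Here $x_i$ is the scholarship offered to student $i$, $z_i$ is a binary variable indicating whether student $i$ receives an offer, $X$ is a vector of known features, and $g$ is a logistic function fitted by scikit-learn to predict the probability that a student will join.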

In [None]:
import sys

import gurobipy as gp
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Import the project's own code.
# autoreload mode 2 re-imports modules before each cell runs, so edits to the
# package source are picked up without restarting the kernel.
# "../../src" is resolved against the kernel's working directory; presumably
# that is where the ml2gurobi package lives (verify if the notebook is moved).
%load_ext autoreload
%autoreload 2
sys.path.append("../../src")
from ml2gurobi.sklearn import PipelinePredictor

### Do the logistic regression

In [None]:
# Column roles: the "known" features are fixed inputs, the "decision" features
# are the ones that become part of the optimization problem.
known_features = ["SAT", "GPA"]
dec_features = ["scholarship"]
features = known_features + dec_features
target = "enroll"

# Load the historical records used for the regression, keeping only the feature
# columns followed by the target column (in that order).
historical_data = pd.read_csv(
    "data/college_student_enroll-s1-1.csv", index_col=0
).loc[:, features + [target]]

In [None]:
# Run our regression: standardize the features, then fit a logistic regression
# on top of them. `pipe` is the fitted object handed to the optimization model.
X = historical_data.loc[:, features]
# Select the label through `target` (defined alongside the feature lists)
# instead of repeating the literal column name, so the two cannot drift apart.
Y = historical_data.loc[:, target]
scaler = StandardScaler()
regression = LogisticRegression(random_state=0, solver="lbfgs")
pipe = make_pipeline(scaler, regression)
pipe.fit(X=X, y=Y)

### Do the optimization model

In [None]:
# Scholarship policy parameters (same units as the "scholarship" column).
MIN_SCHOLARSHIP = 0.5  # smallest amount an offered scholarship can have
MAX_SCHOLARSHIP = 2.5  # largest amount an offered scholarship can have
BUDGET_PER_STUDENT = 0.2  # total budget = this amount times the number of students

# Retrieve new data used to build the optimization problem
studentsdata = pd.read_csv("data/admissions500.csv", index_col=0)
studentsdata = studentsdata[known_features]
nstudents = studentsdata.shape[0]

# Start with classical part of the model
m = gp.Model()

# Column positions inside x. They are derived from `features` (the exact column
# order the pipeline was fitted on) rather than from historical_data.columns,
# which also holds the target and only matches by virtue of an earlier reorder.
feature_index = pd.Index(features)
knownidx = feature_index.get_indexer(known_features)
scholarshipidx = feature_index.get_indexer(dec_features)

# Known features are fixed by giving them equal lower and upper bounds; the
# scholarship column gets its [0, MAX_SCHOLARSHIP] range up front instead of
# being patched after the variables have been created.
known_values = studentsdata.loc[:, known_features].to_numpy()
lb = np.zeros((nstudents, len(features)))
ub = np.full((nstudents, len(features)), gp.GRB.INFINITY)
lb[:, knownidx] = known_values
ub[:, knownidx] = known_values
lb[:, scholarshipidx] = 0.0
ub[:, scholarshipidx] = MAX_SCHOLARSHIP

# x: one row of regression inputs per student; y: regression output per student;
# z: 1 if the student is offered a scholarship, 0 otherwise.
x = m.addMVar(lb.shape, lb=lb, ub=ub, name="x")
scholarship = x[:, scholarshipidx][:, 0]
y = m.addMVar(nstudents, ub=1, name="y")
z = m.addMVar(nstudents, vtype=gp.GRB.BINARY, name="z")

m.setObjective(y.sum(), gp.GRB.MAXIMIZE)
# Overall budget.
m.addConstr(scholarship.sum() <= BUDGET_PER_STUDENT * nstudents)
# A scholarship is either zero (z = 0) or within [MIN, MAX] (z = 1).
m.addConstr(scholarship <= MAX_SCHOLARSHIP * z)
m.addConstr(scholarship >= MIN_SCHOLARSHIP * z)

# Link y to the fitted pipeline evaluated at x.
PipelinePredictor(m, pipe, x, y)

### Finally optimize it

In [None]:
# Solve the model; the cells below read the solution through the variables'
# X attribute.
m.optimize()

### Look at the solution

In [None]:
# This is what we predicted: the model's enrollment probability against the
# scholarship offered, one point per student in the solution.
fig, ax = plt.subplots()
ax.scatter(scholarship.X, y.X)
# Label the figure so it can be read on its own; the trailing semicolon keeps
# the return value out of the cell output.
ax.set(
    xlabel="scholarship offered",
    ylabel="predicted probability of enrolling",
    title="Optimized scholarship offers",
);

In [None]:
# This is the historical data: the fitted pipeline's probability for the
# positive class against the scholarship each historical student was offered.
fig, ax = plt.subplots()
ax.scatter(X.loc[:, "scholarship"], pipe.predict_proba(X)[:, 1])
# Label the figure so it can be read on its own; the trailing semicolon keeps
# the return value out of the cell output.
ax.set(
    xlabel="scholarship offered",
    ylabel="predicted probability of enrolling",
    title="Historical data",
);

In [None]:
# Proportion of students offered a scholarship

# Solver values are only exact up to Gurobi's feasibility/integrality
# tolerances, so a student with z = 0 can still carry a tiny positive
# scholarship. Genuine offers are at least 0.5, so anything at or below this
# threshold is counted as "no offer". Historical data is exact, hence "> 0".
OFFER_TOL = 1e-4

historical_offered = X.loc[:, "scholarship"] > 0
solution_offered = scholarship.X > OFFER_TOL

print(
    "In historical data {:.4}% students offered a scholarship".format(
        100 * historical_offered.mean()
    )
)
print(
    "In our solution {:.4}% students offered a scholarship".format(
        100 * solution_offered.sum() / nstudents
    )
)

In [None]:
def _gpa_extremes(frame):
    """Return the (bottom, top) tenth of `frame`'s rows when ordered by GPA.

    The group size is derived from the number of rows (at least one row), so
    the "10%" in the report stays correct if the data set changes size.
    """
    ordered = frame.sort_values(by="GPA")
    size = max(1, len(ordered) // 10)
    return ordered[:size], ordered[-size:]


def _offer_and_enroll_pct(group, offer_threshold=0.0):
    """Return (% offered a scholarship, % enrolled) for the rows of `group`.

    `group` must have `scholarship` and `enroll` columns. A row counts as
    offered when its scholarship is strictly above `offer_threshold`.
    """
    size = group.shape[0]
    offered = 100 * (group.scholarship > offer_threshold).sum() / size
    joined = 100 * (group.enroll.sum() / size)
    return offered, joined


# Solver output is only exact up to Gurobi's tolerances; genuine offers are at
# least 0.5, so values at or below this threshold are treated as "no offer".
OFFER_TOL = 1e-4

bottom10, top10 = _gpa_extremes(historical_data)

print("In historical data")
print(
    "Among top 10% students: {}% were offered a scholarship and {}% joined".format(
        *_offer_and_enroll_pct(top10)
    )
)
print(
    "Among bottom 10% students: {}% were offered a scholarship and {}% joined".format(
        *_offer_and_enroll_pct(bottom10)
    )
)

# Same report for the optimized solution: known features of the new students
# next to the scholarship chosen by the model and the resulting prediction.
Xpredicted = pd.concat(
    [
        studentsdata,
        pd.DataFrame(scholarship.X, columns=["scholarship"], index=studentsdata.index),
        pd.DataFrame(y.X, columns=["enroll"], index=studentsdata.index),
    ],
    axis=1,
)
bottom10, top10 = _gpa_extremes(Xpredicted)

print("In predicted data")
print(
    "Among top 10% students: {:.4}% were offered a scholarship and {:.4}% joined".format(
        *_offer_and_enroll_pct(top10, offer_threshold=OFFER_TOL)
    )
)
print(
    "Among bottom 10% students: {:.4}% were offered a scholarship and {:.4}% joined".format(
        *_offer_and_enroll_pct(bottom10, offer_threshold=OFFER_TOL)
    )
)

Copyright © 2022 Gurobi Optimization, LLC