# Integrate a random forest in a Gurobi model
*Note: The resulting model in this example will be too large for a size-limited license; in order to solve it, please visit https://www.gurobi.com/free-trial for a full license*

We take the model from the JANOS example:

$
\begin{align}
&\max \sum y_i \\
&\text{subject to:}\\
&\sum x_i \le 0.2 \cdot \text{nstudents},\\
&y_i = g(x_i, Xfix),\\
& 0 \le x \le 2.5.
\end{align}
$

Where $Xfix$ is a vector of fixed features, and $g$ is a random forest trained using scikit-learn that predicts the probability that a student will join.

In [None]:
import gurobipy as gp
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from gurobi_ml import add_predictor_constr

# Base URL for retrieving data; the CSV file names used in the cells below
# are appended to it
janos_data_url = 'https://raw.githubusercontent.com/INFORMSJoC/2020.1023/master/data/'

### Do our Tree regression

In [None]:
# Features come in two groups: the ones whose values are given for each
# student, and the ones the optimization problem gets to decide
known_features = ["SAT", "GPA"]
dec_features = ["merit"]
features = known_features + dec_features

# Column the regression is trained to predict
target = "enroll"

# Download the historical records used for the regression and keep only
# the feature columns followed by the target column
historical_data = pd.read_csv(
    janos_data_url + 'college_student_enroll-s1-1.csv',
    index_col=0,
).loc[:, features + [target]]

In [None]:
# Train the random forest that predicts the target from the features

X = historical_data.loc[:, features]
# Use the `target` name defined with the features instead of repeating
# the column label here
Y = historical_data.loc[:, target]

# 20 trees of depth at most 5; the fixed random_state makes the fit repeatable
regression = RandomForestRegressor(n_estimators=20, max_depth=5, random_state=10)
# The forest is wrapped in a single-step pipeline; `pipe` is the object
# passed to add_predictor_constr when the optimization model is built
pipe = make_pipeline(regression)
pipe.fit(X=X, y=Y)

## Do the optimization model

In [None]:
# Number of students that enter the optimization model
nstudents = 100

# Download the new applications and keep only the fixed features
studentsdata = pd.read_csv(janos_data_url + 'college_applications6000.csv', index_col=0)
studentsdata = studentsdata.loc[:, known_features]

# The fixed features must sit at the same column positions as in the
# historical data
assert (
    studentsdata.columns.get_indexer(known_features)
    == historical_data.columns.get_indexer(known_features)
).all()

# Draw nstudents rows at random; no seed is passed, so the selection
# changes from run to run
studentsdata = studentsdata.sample(nstudents)

In [None]:
# Build the optimization model, beginning with its classical part
m = gp.Model()

# Column positions of the fixed features and of the decision features
knownidx = historical_data.columns.get_indexer(known_features)
decidx = historical_data.columns.get_indexer(dec_features)

# Bounds on the regression inputs: every entry starts in [0, 2.5] ...
lb = np.zeros((nstudents, len(features)))
ub = 2.5 * np.ones((nstudents, len(features)))
# ... and the fixed features get equal lower and upper bounds taken from
# the students' data, which pins them to those values
lb[:, knownidx] = studentsdata[known_features]
ub[:, knownidx] = studentsdata[known_features]

# One row of regression inputs per student, and one free output per student
x = m.addMVar(lb.shape, lb=lb, ub=ub, name="x")
y = m.addMVar(nstudents, lb=-gp.GRB.INFINITY, name="y")

# Budget: the decision columns summed over all students are limited
m.addConstr(x[:, decidx].sum() <= 0.2 * nstudents)
# Maximize the sum of the outputs
m.setObjective(y.sum(), gp.GRB.MAXIMIZE)

# Turn the scikit-learn pipeline into Gurobi constraints linking x to y
pipelineconstr = add_predictor_constr(m, pipe, x, y)

### Finally optimize it

In [None]:
# Solver limits: a time limit of 60 and a MIP gap of 0.01
m.setParam("TimeLimit", 60)
m.setParam("MIPGap", 0.01)

In [None]:
# Run the solver on the model built above
m.optimize()

### Look at the solution

In [None]:
# This is what we predicted: the decision column(s) of the solution x
# plotted against the solution value of y
plt.scatter(x.X[:, decidx], y.X)

In [None]:
# This is the historical data: the decision column(s) of the training
# features plotted against the pipeline's prediction on those same features
plt.scatter(X.iloc[:, decidx], pipe.predict(X))

In [None]:
# Proportion of students offered a scholarship, i.e. with a positive value
# in the decision column, in the historical data and in the solution.
# The historical count is read with .iloc[0] rather than [0]: the summed
# Series is labelled by column name, and using an integer key as a position
# on such a Series is deprecated in pandas.
print(
    "In historical data {:.4}% students offered a scholarship".format(
        100 * ((X.iloc[:, decidx] > 0).sum().iloc[0] / len(X))
    )
)
# Column-wise count over the solution array; [0] picks the first decision column
print(
    "In our solution {:.4}% students offered a scholarship".format(
        100 * (x.X[:, decidx] > 0).sum(axis=0)[0] / nstudents
    )
)

In [None]:
# Compare scholarship offers and enrollment among the students with the
# highest and the lowest GPA, first in the historical data and then in the
# solution of the optimization model.

# Historical data ranked by GPA (sorted once). The size of a 10% group is
# derived from the number of rows instead of being hard-coded; at least one
# row is kept so the ratios below never divide by zero.
historical_by_gpa = historical_data.sort_values(by=["GPA"])
n_group = max(1, len(historical_by_gpa) // 10)
bottom10 = historical_by_gpa[:n_group]
top10 = historical_by_gpa[-n_group:]

print("In historical data")
print(
    "Among top 10% students: {}% were offered a scholarship and {}% joined".format(
        100 * (top10.merit > 0).sum() / top10.shape[0],
        100 * (top10.enroll.sum() / top10.shape[0]),
    )
)
print(
    "Among bottom 10% students: {}% were offered a scholarship and {}% joined".format(
        100 * (bottom10.merit > 0).sum() / bottom10.shape[0],
        100 * (bottom10.enroll.sum() / bottom10.shape[0]),
    )
)

# Solution of the optimization: the fixed features, the decision column of x
# (as "merit") and the value of y (as "enroll"), one row per selected student
Xpredicted = pd.concat(
    [
        studentsdata,
        pd.DataFrame(x.X[:, decidx], columns=["merit"], index=studentsdata.index),
        pd.DataFrame(y.X, columns=["enroll"], index=studentsdata.index),
    ],
    axis=1,
)

# Same 10% groups on the solution. The group size follows the number of
# students, so the groups really are the "10%" announced in the messages
# (a fixed 50 rows would be half of the 100 selected students).
predicted_by_gpa = Xpredicted.sort_values(by="GPA")
n_group = max(1, len(predicted_by_gpa) // 10)
top10 = predicted_by_gpa[-n_group:]
bottom10 = predicted_by_gpa[:n_group]

print("In predicted data")
print(
    "Among top 10% students: {:.4}% were offered a scholarship and {:.4}% joined".format(
        100 * (top10.merit > 0).sum() / top10.shape[0],
        100 * (top10.enroll.sum() / top10.shape[0]),
    )
)
print(
    "Among bottom 10% students: {:.4}% were offered a scholarship and {:.4}% joined".format(
        100 * (bottom10.merit > 0).sum() / bottom10.shape[0],
        100 * (bottom10.enroll.sum() / bottom10.shape[0]),
    )
)

Copyright © 2022 Gurobi Optimization, LLC