In [None]:
import numpy as np
import pandas as pd
from data_utils import load_data, select_and_clean, encode_and_split, build_rl_dataset

# Path to the raw CSV, relative to the notebook's working directory.
DATA_PATH = './data/accepted_2007_to_2018.csv'

# Load the raw rows and run the project's cleaning step on them.
# NOTE(review): nrows=200000 is presumably a row cap applied by load_data;
# confirm against data_utils.
df = load_data(DATA_PATH, nrows=200000)
proc = select_and_clean(df)

# State features: every cleaned column except 'target'.
features = [col for col in proc.columns if col != 'target']

# Build the (states, actions, rewards) triple; states and actions feed the
# behavior-cloning cell below.
states, actions, rewards = build_rl_dataset(proc, features=features)

# Report the shape of each returned object as a quick sanity check.
for _caption, _array in (
    ('States shape:', states),
    ('Actions shape:', actions),
    ('Rewards shape:', rewards),
):
    print(_caption, _array.shape)

## Behavior Cloning (policy-learning via supervised learning)
Train a classifier to predict the action recorded in the dataset (here `action == 1` means the loan was approved). The fitted classifier defines a policy $\pi(a \mid s)$.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the (state, action) pairs for evaluation. The split is
# stratified on the action label and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    states,
    actions,
    test_size=0.2,
    random_state=0,
    stratify=actions,
)

# Behavior-cloning policy: logistic regression fit on the training split.
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Accuracy of the cloned policy at reproducing the logged actions on the
# held-out split.
print(
    'BC accuracy:',
    clf.score(X_test, y_test),
)

### Policy Evaluation (Estimated Policy Value)
Simple importance-sampling off-policy evaluation (OPE) is not directly possible here because we do not have the behavior policy's action probabilities. As a crude estimate, we can compute the expected reward of a deterministic policy on the dataset: where the policy's action matches the logged action, use the observed reward; where it does not, a counterfactual reward estimate would be needed (here, outcomes are only observed when `action == 1`).
This notebook provides a basic, conservative estimate: the average reward over states where the policy approves and the dataset also records an approval; approvals by the policy that have no matching logged approval are treated as unknown. For a production-quality evaluation, use a dedicated off-policy evaluation library.