In [198]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import scipy.stats as stats
# import tensorflow_probability as tfp
# import gpytorch
# from gpytorch.distributions import MultivariateNormal
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy.stats import gamma, norm, beta
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder,  MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import warnings

In [199]:
# !pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

## 1. Data preprocess

In [200]:
# fetch dataset 
statlog_german_credit_data = fetch_ucirepo(id=144) 
  
# data (as pandas dataframes) 
data = statlog_german_credit_data.data.features 
y = statlog_german_credit_data.data.targets.values - 1 ## 1 subtracted for (0 = Good,  1 = Bad) labelling
  
# metadata 
#print(statlog_german_credit_data.metadata) 
  
# variable information 
#print(statlog_german_credit_data.variables) 

In [201]:
df=data[statlog_german_credit_data.variables.name[:-1]]
df_full=df.copy()
df_full.columns=statlog_german_credit_data.variables.description[:-1].to_list()

In [202]:
# Define preprocessing steps
numeric_features = ['Attribute2', 'Attribute5', 'Attribute8', 'Attribute11', 'Attribute13', 'Attribute16', 'Attribute18']
binary_features = ['Attribute19', 'Attribute20']
categorical_features = ['Attribute1', 'Attribute3', 'Attribute4', 'Attribute6', 'Attribute7', 'Attribute9', 'Attribute10', 'Attribute12', 'Attribute14', 'Attribute15', 'Attribute17']

# Apply LabelEncoder to binary features
label_encoders = {}
for feature in binary_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le  # Store the encoder for future use (e.g., inverse transform)

# Pipeline for numeric features: Imputation and Min-Max Scaling between -1 and 1
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler(feature_range=(-1, 1)))
])

# Pipeline for categorical features: Imputation, OneHotEncoding, and Min-Max Scaling between -1 and 1
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),  # Set sparse=False for easy concatenation
    ('scaler', MinMaxScaler(feature_range=(-1, 1)))  # Scaling the one-hot encoded features
])

# For binary features, use Min-Max Scaling as well
binary_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler(feature_range=(-1, 1)))
])

# Combine the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', binary_transformer, binary_features)
    ]
)

# Apply the preprocessing steps to the DataFrame
df2 = preprocessor.fit_transform(df)

# If you want to convert it back to a DataFrame for ease of use
# Create column names for the one-hot encoded features
onehot_feature_names = list(preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out())

# Combine all feature names
all_feature_names = numeric_features + onehot_feature_names + binary_features

# Create the processed DataFrame
df2 = pd.DataFrame(df2, columns=all_feature_names)

# Show the processed DataFrame
df2

Unnamed: 0,Attribute2,Attribute5,Attribute8,Attribute11,Attribute13,Attribute16,Attribute18,x0_A11,x0_A12,x0_A13,...,x8_A143,x9_A151,x9_A152,x9_A153,x10_A171,x10_A172,x10_A173,x10_A174,Attribute19,Attribute20
0,-0.941176,-0.898867,1.000000,1.000000,0.714286,-0.333333,-1.0,1.0,-1.0,-1.0,...,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0
1,0.294118,-0.372620,-0.333333,-0.333333,-0.892857,-1.000000,-1.0,-1.0,1.0,-1.0,...,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0
2,-0.764706,-0.796853,-0.333333,0.333333,0.071429,-1.000000,1.0,-1.0,-1.0,-1.0,...,1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0
3,0.117647,-0.160119,-0.333333,1.000000,-0.071429,-1.000000,1.0,1.0,-1.0,-1.0,...,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0
4,-0.411765,-0.491581,0.333333,1.000000,0.214286,-0.333333,1.0,1.0,-1.0,-1.0,...,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.764706,-0.836470,0.333333,1.000000,-0.571429,-1.000000,-1.0,-1.0,-1.0,-1.0,...,1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0
996,-0.235294,-0.603059,1.000000,1.000000,-0.250000,-1.000000,-1.0,1.0,-1.0,-1.0,...,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0
997,-0.764706,-0.939034,1.000000,1.000000,-0.321429,-1.000000,-1.0,-1.0,-1.0,-1.0,...,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0
998,0.205882,-0.824475,1.000000,1.000000,-0.857143,-1.000000,-1.0,1.0,-1.0,-1.0,...,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0


## 2. Define linear reward functions

In [203]:
# Suppress warnings
warnings.filterwarnings('ignore')

# Set seed for reproducibility
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Call the seed setting function
set_seed(42)

In [204]:
# RewardGenerator and generate_data functions (from your code)
class RewardGenerator:
    def __init__(self, drift_rate=0.005, seasonal_period=400):
        self.drift_rate = drift_rate
        self.seasonal_period = seasonal_period
        self.t = 0  # Initialize time step
    
    def generate_rewards1(self, context, action, optimal_action, n = 3):
        '''
        Generates rewards using a simple linear function with noise.

        Parameters:
        - context: np.array, the feature context
        - action: int, the action taken
        - optimal_action: int, the optimal action for this context

        Returns:
        - reward: float, the generated reward
        '''

        # Sample `n` context features
        if len(context) >= n:
            sampled_context = np.random.choice(context, size=n, replace=False)
        else:
            sampled_context = context  # Use all features if fewer than n
        
        # Linear reward based on context sum
        context_factor = np.sum(sampled_context)
        
        # Add a base reward depending on whether the action is optimal
        base_reward = 1.5 if action == optimal_action else 0.8

        # Linear reward calculation with noise
        reward = base_reward + 0.1 * context_factor + np.random.normal(0, 0.1)

        return reward

    def generate_rewards2(self, context, action, optimal_action, n=3):
        '''
        Generates rewards with added complexity from context sampling and non-stationarity.

        Parameters:
        - context: np.array, the feature context
        - action: int, the action taken
        - optimal_action: int, the optimal action for this context
        - n: int, number of context features to sample for reward generation

        Returns:
        - reward: float, the generated reward
        '''
        # Sample `n` context features
        if len(context) >= n:
            sampled_context = np.random.choice(context, size=n, replace=False)
        else:
            sampled_context = context  # Use all features if fewer than n
        
        # Compute a context factor based on the sampled features
        context_factor = np.sum(sampled_context)
        
        # Non-stationarity (drift and seasonality)
        time_drift = self.drift_rate * self.t
        seasonality = np.sin(2 * np.pi * self.t / self.seasonal_period)

        # Calculate reward based on action and non-stationary factors
        if action == optimal_action:
            mean = 1 + 0.2 * context_factor + time_drift + seasonality
            std = 0.1  # Small variance for optimal action
        else:
            mean = 0 + 0.5 * context_factor + time_drift + seasonality
            std = 0.4  # Higher variance for non-optimal action
        
        # Sample reward from Gaussian distribution
        reward = np.random.normal(mean, std)

        # Increment time step for next call
        self.t += 1
        
        return reward
    

    def generate_rewards3(self, context, action, optimal_action, n=3):
        '''
        Generates rewards using a piecewise linear function with context-dependent intervals.
        '''
        # Sample `n` context features
        if len(context) >= n:
            sampled_context = np.random.choice(context, size=n, replace=False)
        else:
            sampled_context = context  # Use all features if fewer than n

        # Compute a context factor based on the sampled features
        context_factor = np.sum(sampled_context)
        
        # Piecewise modification based on context factor
        if context_factor < 0.5:
            context_factor = 1
        elif 0.5 <= context_factor < 0.6:
            context_factor = -10 * context_factor + 4
        else:
            context_factor = 0
        
        # Non-stationarity (drift and seasonality)
        time_drift = self.drift_rate * self.t
        seasonality = np.sin(2 * np.pi * self.t / self.seasonal_period)

        # Calculate reward based on action and non-stationary factors
        if action == optimal_action:
            mean = 1 + 0.3 * context_factor + time_drift + seasonality
            std = 0.1  # Small variance for optimal action
        else:
            mean = 0 + 0.6 * context_factor + time_drift + seasonality
            std = 0.3  # Higher variance for non-optimal action
        
        # Sample reward from Gaussian distribution
        reward = np.random.normal(mean, std)

        # Increment time step for next call
        self.t += 1
        
        return reward

    def generate_rewards4(self, context, action, optimal_action, n=3):
        '''
        Generates rewards with a weighted linear combination of context features, drift, and seasonality.
        '''
        # Sample `n` context features
        if len(context) >= n:
            sampled_context = np.random.choice(context, size=n, replace=False)
        else:
            sampled_context = context  # Use all features if fewer than n
        
        # Assign weights to context features
        context_weights = np.array([0.3, 0.5, 0.2])[:len(sampled_context)]
        # Weighted sum of context features
        context_factor = np.dot(sampled_context, context_weights)  

        # Non-stationary components: drift and seasonality
        time_drift = self.drift_rate * self.t  
        seasonality = 1.5 * np.sin(2 * np.pi * self.t / (self.seasonal_period * 4))

        # Calculate reward based on action and non-stationary factors
        if action == optimal_action:
            mean = 1.2 + 0.4 * context_factor + time_drift + seasonality
            std = 0.1  # Small variance for optimal action
        else:
            mean = 0.5 + 0.7 * context_factor + time_drift + seasonality
            std = 0.3  # Higher variance for non-optimal action

        # Sample reward from Gaussian distribution
        reward = np.random.normal(mean, std)

        # Increment time step for next call
        self.t += 1
        
        return reward

    def generate_rewards5(self, context, action, optimal_action, n=3):
        '''
        Reward function that models heteroscedastic rewards where reward variance depends on the context.
        '''
        # Randomly generate theta_heteroscedastic and theta_var
        theta_heteroscedastic = np.random.randn(n)  # Random vector for mean reward
        theta_var = np.random.randn(n)              # Random vector for variance

        # Sample `n` context features
        if len(context) >= n:
            sampled_context = np.random.choice(context, size=n, replace=False)
        else:
            sampled_context = context  # Use all features if fewer than n
    
        # Linear reward based on context sum
        context_factor = np.sum(sampled_context)
    
        # Calculate the mean reward (dot product with theta_heteroscedastic)
        base_reward = np.dot(sampled_context, theta_heteroscedastic)
    
        # Calculate the context-dependent variance (dot product with theta_var)
        context_variance = np.abs(np.dot(sampled_context, theta_var)) 
    
        # Add a base reward depending on whether the action is optimal
        optimal_bonus = 1.5 if action == optimal_action else 0.8
    
        # Linear reward calculation with noise
        reward = optimal_bonus + base_reward + 0.1 * context_factor + np.random.normal(0, context_variance)
        # reward = optimal_bonus + base_reward + np.random.normal(0, context_variance)
        
        return reward



In [205]:
# Example usage in your data generation function
def generate_data(context, total_samples, n_actions, optimal_actions, reward_type):
    rewards = torch.zeros(total_samples, n_actions)
    
    reward_gen = RewardGenerator()

    for i in range(total_samples):
        for action in range(n_actions):
            # Generate rewards based on the given logic
            if reward_type == 'lin1':
                rewards[i, action] = reward_gen.generate_rewards1(context[i], action,  optimal_actions[i])
            elif reward_type == 'lin2':
                rewards[i, action] = reward_gen.generate_rewards2(context[i], action,  optimal_actions[i])
            elif reward_type == 'lin3':
                rewards[i, action] = reward_gen.generate_rewards3(context[i], action,  optimal_actions[i])
            elif reward_type == 'lin4':
                rewards[i, action] = reward_gen.generate_rewards4(context[i], action,  optimal_actions[i])
            elif reward_type == 'lin5':
                rewards[i, action] = reward_gen.generate_rewards5(context[i], action,  optimal_actions[i])   

    return rewards


# Function to generate actions from a normal distribution and clip them to valid action range
# def generate_actions(total_samples):
#    return  np.random.choice([0, 1], size=total_samples)

def generate_actions(total_samples, bias=0.6):
    # Generate actions with the specified bias
    actions = np.random.choice([0, 1], size=total_samples, p=[1 - bias, bias])
    return actions

# Define the data generation and splitting function
def generate_data_and_split(X, total_samples, n_actions, optimal_actions, reward_type):
    
    rewards = generate_data(X, total_samples, n_actions, optimal_actions, reward_type)
    split_index = int(total_samples * 0.9)
    X_train = X[:split_index]
    historical_actions = generate_actions(split_index)
    historical_rewards = rewards[:split_index]
    
    X_val = X[split_index:]
    optimal_actions_val = optimal_actions[split_index:]
    rewards_val = rewards[split_index:]
    
    return X_train, X_val, historical_actions, historical_rewards, optimal_actions_val, rewards_val

## 3. LinUCB with DB

In [206]:
class LinUCB_DR:
    def __init__(self, n_actions, context_dim, alpha=0.5):
        self.n_actions = n_actions
        self.context_dim = context_dim
        self.alpha = alpha
        self.A = [np.eye(context_dim) for _ in range(n_actions)]  # Identity matrices for each action
        self.b = [np.zeros(context_dim) for _ in range(n_actions)]  # Zero vectors for each action
        self.propensities = None

    def calculate_propensity_scores(self, historical_actions):
        # Calculate propensity scores based on historical action frequencies
        total_samples = len(historical_actions)
        action_freq = np.bincount(historical_actions, minlength=self.n_actions) / total_samples
        self.propensities = action_freq[historical_actions]
        return self.propensities

    def update(self, action, reward, context):
        # Update A and b for the taken action
        self.A[action] += np.outer(context, context)
        self.b[action] += reward * context

    def predict(self, X):
        # Predict the reward for each action in the context X
        p = np.zeros((X.shape[0], self.n_actions))
        for a in range(self.n_actions):
            theta_a = np.linalg.solve(self.A[a], self.b[a])  # Solve A * theta = b
            p[:, a] = X @ theta_a + self.alpha * np.sqrt(np.sum(X @ np.linalg.inv(self.A[a]) * X, axis=1))
        return p

    def doubly_robust_estimator(self, X, chosen_actions, historical_rewards, historical_actions, all_true_rewards):
        # Calculate doubly robust reward estimates
        N = len(chosen_actions)
        dr_rewards = np.zeros(N)
        propensities = self.propensities if self.propensities is not None else self.calculate_propensity_scores(historical_actions)

        numerator_all = np.exp(self.predict(X))
        for i in range(N):
            Ai = chosen_actions[i]
            Ci = X[i]
            Ri = historical_rewards[i, historical_actions[i]]
            # pi_ratio = propensities[i] / propensities[chosen_actions[i]]
            # pi_ratio = propensities[chosen_actions[i]] / propensities[historical_actions[i]]
            
            # numerator = p/sum(p)  (softmax)
            # numerator = numerator_all[i] / np.sum(numerator_all[i])
            pi_ratio = numerator_all[i, Ai] / np.sum(numerator_all[i])
            # print(pi_ratio)

            predicted_reward = all_true_rewards[i, Ai]
            
            # dr_rewards[i] = Ri * pi_ratio + 1 / pi_ratio * predicted_reward
            dr_rewards[i] = pi_ratio * (Ri - predicted_reward) + predicted_reward

        return dr_rewards


## 4. Backtest

In [207]:
def run_backtest_DR(X, total_samples, n_actions, optimal_actions, context_dim, num_samples=10, lengthscale=1.0, outputscale=1.0, epochs=20, reward_type='lin1'):
    # Generate and split the dataset
    X_train, X_val, historical_actions, historical_rewards, optimal_actions_val, rewards_val = generate_data_and_split(
        X, total_samples, n_actions, optimal_actions, reward_type)

    # Convert PyTorch tensors to NumPy arrays if needed
    if isinstance(X_train, torch.Tensor):
        X_train = X_train.detach().numpy()
    if isinstance(X_val, torch.Tensor):
        X_val = X_val.detach().numpy()
    if isinstance(historical_rewards, torch.Tensor):
        historical_rewards = historical_rewards.detach().numpy()
    if isinstance(historical_actions, torch.Tensor):
        historical_actions = historical_actions.detach().numpy()

    # Initialize LinUCB_DR model
    lin_UCB_DR_model = LinUCB_DR(n_actions, context_dim, alpha=0.5)
    lin_UCB_DR_model.calculate_propensity_scores(historical_actions)  # Calculate propensities internally

    # Update the model with historical data
    for i in range(len(historical_actions)):
        lin_UCB_DR_model.update(historical_actions[i], historical_rewards[i, historical_actions[i]], X_train[i])

    # Generate predictions on validation data
    lin_UCB_preds = lin_UCB_DR_model.predict(X_val)
    lin_UCB_preds = np.argmax(lin_UCB_preds, axis=1)

    # Generate true rewards based on optimal actions
    all_true_rewards = np.zeros((X_val.shape[0], n_actions))
    reward_gen = RewardGenerator()
    
    for action in range(n_actions):
        for i, opt_action in enumerate(optimal_actions_val):
            if reward_type == 'lin1':
                all_true_rewards[i, action] = reward_gen.generate_rewards1(X_val[i], action, opt_action)
            elif reward_type == 'lin2':
                all_true_rewards[i, action] = reward_gen.generate_rewards2(X_val[i], action, opt_action)
            elif reward_type == 'lin3':
                all_true_rewards[i, action] = reward_gen.generate_rewards3(X_val[i], action, opt_action)
            elif reward_type == 'lin4':
                all_true_rewards[i, action] = reward_gen.generate_rewards4(X_val[i], action, opt_action)
            elif reward_type == 'lin5':
                all_true_rewards[i, action] = reward_gen.generate_rewards5(X_val[i], action, opt_action)

    # Calculate doubly robust rewards
    dr_rewards = lin_UCB_DR_model.doubly_robust_estimator(X_val, lin_UCB_preds, historical_rewards, historical_actions, all_true_rewards)

    # return average rewards
    print(f"Average Rewards_DR: {np.mean(dr_rewards):.4f}")
    print(f"LinUCB Accuracy: {accuracy_score(optimal_actions_val, lin_UCB_preds):.4f}")
    print(f"LinUCB F1 Score: {f1_score(optimal_actions_val, lin_UCB_preds, average='weighted'):.4f}")


In [208]:
# Example usage
X = df2.values
total_samples = X.shape[0]
n_actions = 2
context_dim = X.shape[1]   # Number of context features
optimal_actions = y

In [209]:
run_backtest_DR(X, total_samples, n_actions, optimal_actions, context_dim,  num_samples=10, lengthscale=1.0, outputscale=1.0, epochs=50)
run_backtest_DR(X, total_samples, n_actions, optimal_actions, context_dim,  num_samples=10, lengthscale=1.0, outputscale=1.0, epochs=50,reward_type='lin2')
run_backtest_DR(X, total_samples, n_actions, optimal_actions, context_dim,  num_samples=10, lengthscale=1.0, outputscale=1.0, epochs=50,reward_type='lin3')
run_backtest_DR(X, total_samples, n_actions, optimal_actions, context_dim,  num_samples=10, lengthscale=1.0, outputscale=1.0, epochs=50,reward_type='lin4')
run_backtest_DR(X, total_samples, n_actions, optimal_actions, context_dim,  num_samples=10, lengthscale=1.0, outputscale=1.0, epochs=50,reward_type='lin5')

Average Rewards_DR: 1.0415
LinUCB Accuracy: 0.7500
LinUCB F1 Score: 0.7334
Average Rewards_DR: 1.1209
LinUCB Accuracy: 0.6300
LinUCB F1 Score: 0.6397
Average Rewards_DR: 2.0445
LinUCB Accuracy: 0.5200
LinUCB F1 Score: 0.5355
Average Rewards_DR: 1.5758
LinUCB Accuracy: 0.6400
LinUCB F1 Score: 0.6471
Average Rewards_DR: 1.0236
LinUCB Accuracy: 0.6300
LinUCB F1 Score: 0.6409


## 5. Different alpha

In [210]:
def run_backtest_with_alpha(X, total_samples, n_actions, optimal_actions, context_dim, reward_type='linear', alpha_values=[0.1, 0.5, 1.0], epochs=20):
    X_train, X_val, historical_actions, historical_rewards, optimal_actions_val, rewards_val = generate_data_and_split(
        X, total_samples, n_actions, optimal_actions, reward_type)

    # Convert PyTorch tensors to NumPy arrays if needed
    if isinstance(X_train, torch.Tensor):
        X_train = X_train.detach().numpy()
    if isinstance(X_val, torch.Tensor):
        X_val = X_val.detach().numpy()
    if isinstance(historical_rewards, torch.Tensor):
        historical_rewards = historical_rewards.detach().numpy()
    if isinstance(historical_actions, torch.Tensor):
        historical_actions = historical_actions.detach().numpy()

    # Generate true rewards based on optimal actions
    all_true_rewards = np.zeros((X_val.shape[0], n_actions))
    reward_gen = RewardGenerator()
    
    for action in range(n_actions):
        for i, opt_action in enumerate(optimal_actions_val):
            if reward_type == 'lin1':
                all_true_rewards[i, action] = reward_gen.generate_rewards1(X_val[i], action, opt_action)
            elif reward_type == 'lin2':
                all_true_rewards[i, action] = reward_gen.generate_rewards2(X_val[i], action, opt_action)
            elif reward_type == 'lin3':
                all_true_rewards[i, action] = reward_gen.generate_rewards3(X_val[i], action, opt_action)
            elif reward_type == 'lin4':
                all_true_rewards[i, action] = reward_gen.generate_rewards4(X_val[i], action, opt_action)
            elif reward_type == 'lin5':
                all_true_rewards[i, action] = reward_gen.generate_rewards5(X_val[i], action, opt_action)

    # Loop over alpha values and calculate DR rewards
    for alpha in alpha_values:
        # Initialize LinUCB_DR model with the current alpha
        lin_UCB_DR_model = LinUCB_DR(n_actions, context_dim, alpha=alpha)
        lin_UCB_DR_model.calculate_propensity_scores(historical_actions)  # Calculate propensities internally
        
        # Update the model with historical data
        for i in range(len(historical_actions)):
            lin_UCB_DR_model.update(historical_actions[i], historical_rewards[i, historical_actions[i]], X_train[i])

        # Generate predictions on validation data
        lin_UCB_preds = lin_UCB_DR_model.predict(X_val)
        lin_UCB_preds = np.argmax(lin_UCB_preds, axis=1)

        # Calculate doubly robust rewards
        dr_rewards = lin_UCB_DR_model.doubly_robust_estimator(X_val, lin_UCB_preds, historical_rewards, historical_actions, all_true_rewards)
        
        # Output average DR rewards for the current alpha
        print(f"Alpha: {alpha}, Average DR Rewards: {np.mean(dr_rewards):.4f}")


In [211]:
alpha_values = [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 50.0]
run_backtest_with_alpha(X, total_samples=1000, n_actions=2, optimal_actions=optimal_actions, context_dim=X.shape[1], reward_type='lin1', alpha_values=alpha_values)

Alpha: 0.01, Average DR Rewards: 1.0497
Alpha: 0.1, Average DR Rewards: 1.0546
Alpha: 1.0, Average DR Rewards: 1.0529
Alpha: 2.0, Average DR Rewards: 1.0371
Alpha: 5.0, Average DR Rewards: 1.0228
Alpha: 10.0, Average DR Rewards: 1.0135
Alpha: 15.0, Average DR Rewards: 1.0086
Alpha: 20.0, Average DR Rewards: 1.0047
Alpha: 50.0, Average DR Rewards: 0.9949


In [212]:
run_backtest_with_alpha(X, total_samples=1000, n_actions=2, optimal_actions=optimal_actions, context_dim=X.shape[1], reward_type='lin2', alpha_values=alpha_values)

Alpha: 0.01, Average DR Rewards: 0.8731
Alpha: 0.1, Average DR Rewards: 0.8735
Alpha: 1.0, Average DR Rewards: 0.8894
Alpha: 2.0, Average DR Rewards: 0.8816
Alpha: 5.0, Average DR Rewards: 0.8670
Alpha: 10.0, Average DR Rewards: 0.8638
Alpha: 15.0, Average DR Rewards: 0.8504
Alpha: 20.0, Average DR Rewards: 0.8183
Alpha: 50.0, Average DR Rewards: 0.7546


In [213]:
run_backtest_with_alpha(X, total_samples=1000, n_actions=2, optimal_actions=optimal_actions, context_dim=X.shape[1], reward_type='lin3', alpha_values=alpha_values)

Alpha: 0.01, Average DR Rewards: 2.0868
Alpha: 0.1, Average DR Rewards: 2.0868
Alpha: 1.0, Average DR Rewards: 2.0839
Alpha: 2.0, Average DR Rewards: 2.0866
Alpha: 5.0, Average DR Rewards: 2.0866
Alpha: 10.0, Average DR Rewards: 2.0758
Alpha: 15.0, Average DR Rewards: 2.0971
Alpha: 20.0, Average DR Rewards: 2.0823
Alpha: 50.0, Average DR Rewards: 2.0913


In [214]:
run_backtest_with_alpha(X, total_samples=1000, n_actions=2, optimal_actions=optimal_actions, context_dim=X.shape[1], reward_type='lin4', alpha_values=alpha_values)

Alpha: 0.01, Average DR Rewards: 1.6129
Alpha: 0.1, Average DR Rewards: 1.6135
Alpha: 1.0, Average DR Rewards: 1.6079
Alpha: 2.0, Average DR Rewards: 1.5888
Alpha: 5.0, Average DR Rewards: 1.5885
Alpha: 10.0, Average DR Rewards: 1.5638
Alpha: 15.0, Average DR Rewards: 1.5766
Alpha: 20.0, Average DR Rewards: 1.5737
Alpha: 50.0, Average DR Rewards: 1.5982


In [215]:
run_backtest_with_alpha(X, total_samples=1000, n_actions=2, optimal_actions=optimal_actions, context_dim=X.shape[1], reward_type='lin5', alpha_values=alpha_values)

Alpha: 0.01, Average DR Rewards: 1.0490
Alpha: 0.1, Average DR Rewards: 1.0492
Alpha: 1.0, Average DR Rewards: 1.0568
Alpha: 2.0, Average DR Rewards: 1.0605
Alpha: 5.0, Average DR Rewards: 1.0847
Alpha: 10.0, Average DR Rewards: 1.0804
Alpha: 15.0, Average DR Rewards: 1.0638
Alpha: 20.0, Average DR Rewards: 1.0304
Alpha: 50.0, Average DR Rewards: 0.9163


## 6. Statistical Test

In [223]:
class LinUCB_DR_bias_variance:
    def __init__(self, n_actions, context_dim, alpha=0.5):
        self.n_actions = n_actions
        self.context_dim = context_dim
        self.alpha = alpha
        self.A = [np.eye(context_dim) for _ in range(n_actions)]
        self.b = [np.zeros(context_dim) for _ in range(n_actions)]
        self.propensities = None

    def update(self, action, reward, context):
        """Update A and b matrices for the selected action based on context and observed reward."""
        self.A[action] += np.outer(context, context)
        self.b[action] += reward * context

    def predict(self, X):
        """Predict expected rewards for each action given context X using LinUCB framework."""
        p = np.zeros((X.shape[0], self.n_actions))
        for a in range(self.n_actions):
            theta_a = np.linalg.solve(self.A[a], self.b[a])
            p[:, a] = X @ theta_a + self.alpha * np.sqrt(np.sum(X @ np.linalg.inv(self.A[a]) * X, axis=1))
        return p

    def calculate_propensity_scores(self, historical_actions, imbalance_ratio=None):
        """Calculate propensity scores based on action frequencies, optionally applying imbalance."""
        total_samples = len(historical_actions)
        action_freq = np.bincount(historical_actions, minlength=self.n_actions) / total_samples
        if imbalance_ratio is not None:
            action_freq = np.clip(action_freq * imbalance_ratio, 0, 1)
        self.propensities = action_freq[historical_actions]
        # Debug: print propensities
        print("Propensities:", self.propensities[:10])  # Print first 10 for a quick check
        return self.propensities

    def doubly_robust_estimator(self, X, chosen_actions, historical_rewards, historical_actions, propensities, all_true_rewards):
        """Compute DR rewards and calculate bias and variance of the estimator."""
        N = len(chosen_actions)
        dr_rewards = np.zeros(N)
        propensities = propensities if propensities is not None else self.propensities

        numerator_all = np.exp(self.predict(X))
        for i in range(N):
            Ai = chosen_actions[i]
            Ri = historical_rewards[i, historical_actions[i]]
            pi_ratio = numerator_all[i, Ai] / np.sum(numerator_all[i])
            predicted_reward = all_true_rewards[i, Ai]
            dr_rewards[i] = pi_ratio * (Ri - predicted_reward) + predicted_reward

        return dr_rewards
    
def calculate_true_policy_value(all_true_rewards):
    """Compute the true policy value as the mean of optimal rewards."""
    optimal_rewards = np.max(all_true_rewards, axis=1)
    return np.mean(optimal_rewards)

def bootstrap_variance(estimator_fn, X, chosen_actions, historical_rewards, historical_actions, propensities, all_true_rewards, n_bootstraps=50):
    """Calculate variance, bias, and MSE using bootstrapping for the given estimator."""
    estimates = []
    true_policy_value = calculate_true_policy_value(all_true_rewards)
    
    for _ in range(n_bootstraps):
        # Resample the data with replacement
        sample_indices = np.random.choice(len(X), len(X), replace=True)
        X_sample = X[sample_indices]
        chosen_actions_sample = chosen_actions[sample_indices]
        historical_rewards_sample = historical_rewards[sample_indices]
        historical_actions_sample = historical_actions[sample_indices]
        propensities_sample = propensities[sample_indices]  # Ensure propensity scores align with resampled data

        # Calculate the estimator's output for the resampled data
        estimate = estimator_fn(X_sample, chosen_actions_sample, historical_rewards_sample, historical_actions_sample, propensities_sample, all_true_rewards)
        estimates.append(np.mean(estimate))

    estimates = np.array(estimates)
    variance = np.var(estimates)
    bias = abs(np.mean(estimates) - true_policy_value)
    mse = bias ** 2 + variance
    
    # # Debug: Print final bias, variance, and mse for validation
    # print("Bias:", bias, "Variance:", variance, "MSE:", mse)
    return variance, bias, mse


In [224]:
def run_backtest_with_alpha_test(X, total_samples, n_actions, optimal_actions, context_dim, reward_type='linear', alpha_values=[0.1, 0.5, 1.0], epochs=20, imbalance_ratio=None):
    X_train, X_val, historical_actions, historical_rewards, optimal_actions_val, rewards_val = generate_data_and_split(
        X, total_samples, n_actions, optimal_actions, reward_type)
    # Convert PyTorch tensors to NumPy arrays if needed
    if isinstance(X_train, torch.Tensor):
        X_train = X_train.detach().numpy()
    if isinstance(X_val, torch.Tensor):
        X_val = X_val.detach().numpy()
    if isinstance(historical_rewards, torch.Tensor):
        historical_rewards = historical_rewards.detach().numpy()
    if isinstance(historical_actions, torch.Tensor):
        historical_actions = historical_actions.detach().numpy()

    # Generate true rewards based on optimal actions
    all_true_rewards = np.zeros((X_val.shape[0], n_actions))
    reward_gen = RewardGenerator()
    
    for action in range(n_actions):
        for i, opt_action in enumerate(optimal_actions_val):
            if reward_type == 'lin1':
                all_true_rewards[i, action] = reward_gen.generate_rewards1(X_val[i], action, opt_action)
            elif reward_type == 'lin2':
                all_true_rewards[i, action] = reward_gen.generate_rewards2(X_val[i], action, opt_action)
            elif reward_type == 'lin3':
                all_true_rewards[i, action] = reward_gen.generate_rewards3(X_val[i], action, opt_action)
            elif reward_type == 'lin4':
                all_true_rewards[i, action] = reward_gen.generate_rewards4(X_val[i], action, opt_action)
            elif reward_type == 'lin5':
                all_true_rewards[i, action] = reward_gen.generate_rewards5(X_val[i], action, opt_action)
    
    results_df = pd.DataFrame(columns=['Reward Type', 'Alpha', 'Avg DR Rewards', 'Avg LinUCB Rewards', 
                                       'Paired t-stat', 'Paired p-value', 'Observed Mean Diff', 
                                       'Bootstrap p-value', 'Bias DR', 'Bias Non-DR', 'Variance DR', 'Variance Non-DR'])
    
    for alpha in alpha_values:
        lin_UCB_DR_model = LinUCB_DR_bias_variance(n_actions, context_dim, alpha=alpha)
        lin_UCB_DR_model.calculate_propensity_scores(historical_actions, imbalance_ratio=imbalance_ratio)
        
        # Train the LinUCB model
        for i in range(len(historical_actions)):
            lin_UCB_DR_model.update(historical_actions[i], historical_rewards[i, historical_actions[i]], X_train[i])
        
        # Predict actions using LinUCB
        lin_UCB_preds = np.argmax(lin_UCB_DR_model.predict(X_val), axis=1)

        # Calculate DR rewards
        dr_rewards = lin_UCB_DR_model.doubly_robust_estimator(X_val, lin_UCB_preds, historical_rewards, historical_actions, lin_UCB_DR_model.propensities, all_true_rewards)
        dr_rewards_avg = np.mean(dr_rewards)
        
        # Calculate rewards for Non-DR estimator (LinUCB)
        lin_UCB_rewards = np.array([all_true_rewards[i, lin_UCB_preds[i]] for i in range(len(lin_UCB_preds))])
        lin_UCB_rewards_avg = np.mean(lin_UCB_rewards)
        
        # Perform bootstrapping to calculate bias, variance, and MSE for DR and Non-DR estimators
        variance_dr, bias_dr, mse_dr = bootstrap_variance(
            lin_UCB_DR_model.doubly_robust_estimator,
            X_val, lin_UCB_preds, historical_rewards, historical_actions, 
            lin_UCB_DR_model.propensities, all_true_rewards
        )

        # For Non-DR estimator (ensure resampling is consistent)
        variance_non_dr, bias_non_dr, mse_non_dr = bootstrap_variance(
            lambda X, chosen_actions, hr, ha, ps, atr: np.array([atr[i, chosen_actions[i]] for i in range(len(chosen_actions))]),
            X_val, lin_UCB_preds, historical_rewards, historical_actions,
            lin_UCB_DR_model.propensities, all_true_rewards
        )
        
        # Paired t-test between DR and Non-DR rewards
        t_stat, p_value_ttest = stats.ttest_rel(dr_rewards, lin_UCB_rewards)

        # Bootstrap Test
        n_bootstrap = 10000
        differences = dr_rewards - lin_UCB_rewards
        observed_mean_diff = np.mean(differences)
        
        bootstrap_means = np.array([
            np.mean(np.random.choice(differences, size=len(differences), replace=True)) 
            for _ in range(n_bootstrap)
        ])
        
        p_value_bootstrap = np.mean(bootstrap_means >= observed_mean_diff)
        
        # Append results to the DataFrame
        temp_df = pd.DataFrame([{
            'Reward Type': reward_type,
            'Alpha': alpha,
            'Avg DR Rewards': dr_rewards_avg,
            'Avg LinUCB Rewards': lin_UCB_rewards_avg,
            'Paired t-stat': t_stat,
            'Paired p-value': p_value_ttest,
            'Observed Mean Diff': observed_mean_diff,
            'Bootstrap p-value': p_value_bootstrap,
            'Bias DR': bias_dr,
            'Bias Non-DR': bias_non_dr,
            'Variance DR': variance_dr,
            'Variance Non-DR': variance_non_dr
        }])

        results_df = pd.concat([results_df, temp_df], ignore_index=True)

    return results_df

In [225]:
# Running the backtest
alpha_values = [0.01, 0.1, 1.0, 10.0, 50.0]
imbalance_ratio = 1.5  # Test different imbalance levels as desired
results_all = pd.DataFrame()

for reward_type in ['lin1', 'lin2', 'lin3', 'lin4', 'lin5']:
    result_df = run_backtest_with_alpha_test(
        X, total_samples=1000, n_actions=2, optimal_actions=optimal_actions, 
        context_dim=X.shape[1], reward_type=reward_type, 
        alpha_values=alpha_values, imbalance_ratio=imbalance_ratio)
    results_all = pd.concat([results_all, result_df], ignore_index=True)

# Display or save final results DataFrame
results_all

Propensities: [0.93833333 0.56166667 0.93833333 0.56166667 0.56166667 0.93833333
 0.56166667 0.56166667 0.56166667 0.93833333]
Propensities: [0.93833333 0.56166667 0.93833333 0.56166667 0.56166667 0.93833333
 0.56166667 0.56166667 0.56166667 0.93833333]
Propensities: [0.93833333 0.56166667 0.93833333 0.56166667 0.56166667 0.93833333
 0.56166667 0.56166667 0.56166667 0.93833333]
Propensities: [0.93833333 0.56166667 0.93833333 0.56166667 0.56166667 0.93833333
 0.56166667 0.56166667 0.56166667 0.93833333]
Propensities: [0.93833333 0.56166667 0.93833333 0.56166667 0.56166667 0.93833333
 0.56166667 0.56166667 0.56166667 0.93833333]
Propensities: [0.56833333 0.93166667 0.93166667 0.93166667 0.93166667 0.56833333
 0.56833333 0.93166667 0.93166667 0.56833333]
Propensities: [0.56833333 0.93166667 0.93166667 0.93166667 0.93166667 0.56833333
 0.56833333 0.93166667 0.93166667 0.56833333]
Propensities: [0.56833333 0.93166667 0.93166667 0.93166667 0.93166667 0.56833333
 0.56833333 0.93166667 0.93166

Unnamed: 0,Reward Type,Alpha,Avg DR Rewards,Avg LinUCB Rewards,Paired t-stat,Paired p-value,Observed Mean Diff,Bootstrap p-value,Bias DR,Bias Non-DR,Variance DR,Variance Non-DR
0,lin1,0.01,1.001675,1.145896,-4.353978,3.3e-05,-0.14422,0.496,0.397187,0.306272,0.00075,0.001048
1,lin1,0.1,1.00141,1.145896,-4.354879,3.3e-05,-0.144485,0.501,0.401991,0.313613,0.000731,0.001064
2,lin1,1.0,0.99257,1.13371,-4.18511,6.2e-05,-0.14114,0.4993,0.389679,0.280463,0.000505,0.000861
3,lin1,10.0,0.966647,1.140609,-3.916799,0.000165,-0.173962,0.5085,0.393209,0.227028,0.000648,0.0
4,lin1,50.0,0.916783,1.140609,-3.916858,0.000165,-0.223826,0.4974,0.454456,0.227028,0.000935,0.0
5,lin2,0.01,0.926252,1.144727,-2.139412,0.034861,-0.218474,0.5001,0.899162,0.754881,0.008052,0.004548
6,lin2,0.1,0.926505,1.144727,-2.137245,0.035041,-0.218222,0.5013,0.902592,0.744951,0.007235,0.007413
7,lin2,1.0,0.942434,1.170252,-2.27397,0.025127,-0.227818,0.5013,0.911055,0.770378,0.005194,0.007971
8,lin2,10.0,0.982394,1.21341,-2.232377,0.027843,-0.231016,0.5024,0.870362,0.745131,0.006265,0.00694
9,lin2,50.0,0.89693,1.096884,-1.511622,0.133816,-0.199954,0.5025,0.916258,0.750676,0.006678,0.000341


In [219]:
import numpy as np
import pandas as pd
import scipy.stats as stats

def run_backtest_with_alpha_test_with_exploration(X, total_samples, n_actions, optimal_actions, context_dim, reward_type='linear', alpha_values=[0.1, 0.5, 1.0], epochs=20):
    X_train, X_val, historical_actions, historical_rewards, optimal_actions_val, rewards_val = generate_data_and_split(
        X, total_samples, n_actions, optimal_actions, reward_type)

    # Convert PyTorch tensors to NumPy arrays if needed
    if isinstance(X_train, torch.Tensor):
        X_train = X_train.detach().numpy()
    if isinstance(X_val, torch.Tensor):
        X_val = X_val.detach().numpy()
    if isinstance(historical_rewards, torch.Tensor):
        historical_rewards = historical_rewards.detach().numpy()
    if isinstance(historical_actions, torch.Tensor):
        historical_actions = historical_actions.detach().numpy()

    # Generate true rewards based on optimal actions
    all_true_rewards = np.zeros((X_val.shape[0], n_actions))
    reward_gen = RewardGenerator()
    
    for action in range(n_actions):
        for i, opt_action in enumerate(optimal_actions_val):
            if reward_type == 'lin1':
                all_true_rewards[i, action] = reward_gen.generate_rewards1(X_val[i], action, opt_action)
            elif reward_type == 'lin2':
                all_true_rewards[i, action] = reward_gen.generate_rewards2(X_val[i], action, opt_action)
            elif reward_type == 'lin3':
                all_true_rewards[i, action] = reward_gen.generate_rewards3(X_val[i], action, opt_action)
            elif reward_type == 'lin4':
                all_true_rewards[i, action] = reward_gen.generate_rewards4(X_val[i], action, opt_action)
            elif reward_type == 'lin5':
                all_true_rewards[i, action] = reward_gen.generate_rewards5(X_val[i], action, opt_action)

    # Create an empty DataFrame to store results
    results_df = pd.DataFrame(columns=['Reward Type', 'Alpha Pair', 'Avg DR Rewards (No Exploration)', 'Avg DR Rewards (With Exploration)', 'Paired t-stat', 'Paired p-value', 'Observed Mean Diff', 'Bootstrap p-value'])

    # Store DR rewards for each alpha to compare later
    dr_rewards_dict = {}

    for alpha in alpha_values:
        # Initialize LinUCB model with current alpha
        lin_UCB_DR_model = LinUCB_DR(n_actions, context_dim, alpha=alpha)
        lin_UCB_DR_model.calculate_propensity_scores(historical_actions)  # Calculate propensities internally
        
        # Train LinUCB model on historical data
        for i in range(len(historical_actions)):
            lin_UCB_DR_model.update(historical_actions[i], historical_rewards[i, historical_actions[i]], X_train[i])
        
        # Predict with LinUCB on validation set
        lin_UCB_preds = lin_UCB_DR_model.predict(X_val)
        lin_UCB_preds = np.argmax(lin_UCB_preds, axis=1)
        
        # Calculate doubly robust rewards
        dr_rewards = lin_UCB_DR_model.doubly_robust_estimator(X_val, lin_UCB_preds, historical_rewards, historical_actions, all_true_rewards)
        dr_rewards_avg = np.mean(dr_rewards)
        
        # Store the DR rewards for the current alpha value
        dr_rewards_dict[alpha] = dr_rewards

    # Perform paired t-test and bootstrap test between alpha=0.01 (without exploration) and other alphas
    alpha_no_exploration = 0.01
    dr_rewards_no_exploration = dr_rewards_dict[alpha_no_exploration]

    for alpha_with_exploration in alpha_values:
        if alpha_with_exploration == alpha_no_exploration:
            continue  # Skip comparison of 0.01 with itself

        dr_rewards_with_exploration = dr_rewards_dict[alpha_with_exploration]

        # Paired t-test between DR rewards with and without exploration
        t_stat, p_value_ttest = stats.ttest_rel(dr_rewards_no_exploration, dr_rewards_with_exploration)

        # Bootstrap Test
        n_bootstrap = 10000
        differences = dr_rewards_with_exploration - dr_rewards_no_exploration
        observed_mean_diff = np.mean(differences)
        
        # Bootstrap sampling
        bootstrap_means = np.array([
            np.mean(np.random.choice(differences, size=len(differences), replace=True)) 
            for _ in range(n_bootstrap)
        ])
        
        # Calculate the p-value as the proportion of bootstrap samples with a mean greater than or equal to the observed mean difference
        p_value_bootstrap = np.mean(bootstrap_means >= observed_mean_diff)
        
        # Append comparison results to the DataFrame
        temp_df = pd.DataFrame([{
            'Reward Type': reward_type,
            'Alpha Pair': f"{alpha_no_exploration} vs {alpha_with_exploration}",
            'Avg DR Rewards (No Exploration)': np.mean(dr_rewards_no_exploration),
            'Avg DR Rewards (With Exploration)': np.mean(dr_rewards_with_exploration),
            'Paired t-stat': t_stat,
            'Paired p-value': p_value_ttest,
            'Observed Mean Diff': observed_mean_diff,
            'Bootstrap p-value': p_value_bootstrap
        }])
        
        results_df = pd.concat([results_df, temp_df], ignore_index=True)

    return results_df

# Running the function with a loop over different reward types and alpha values
alpha_values = [0.01, 5.0, 20.0, 50.0]
results_all = pd.DataFrame()
for reward_type in ['lin1', 'lin2', 'lin3', 'lin4', 'lin5']:
    result_df = run_backtest_with_alpha_test_with_exploration(X, total_samples=1000, n_actions=2, optimal_actions=optimal_actions, context_dim=X.shape[1], reward_type=reward_type, alpha_values=alpha_values)
    results_all = pd.concat([results_all, result_df], ignore_index=True)

# Display the final results DataFrame
display(results_all)


Unnamed: 0,Reward Type,Alpha Pair,Avg DR Rewards (No Exploration),Avg DR Rewards (With Exploration),Paired t-stat,Paired p-value,Observed Mean Diff,Bootstrap p-value
0,lin1,0.01 vs 5.0,1.041305,1.008954,2.418891,0.017394,-0.032351,0.4974
1,lin1,0.01 vs 20.0,1.041305,0.993178,2.790936,0.006307,-0.048127,0.5019
2,lin1,0.01 vs 50.0,1.041305,0.985886,2.684119,0.008526,-0.05542,0.4957
3,lin2,0.01 vs 5.0,1.012937,1.012134,0.044447,0.964638,-0.000803,0.4963
4,lin2,0.01 vs 20.0,1.012937,0.948804,1.843242,0.068286,-0.064133,0.5052
5,lin2,0.01 vs 50.0,1.012937,0.963839,1.496359,0.137741,-0.049098,0.5082
6,lin3,0.01 vs 5.0,2.033489,2.044111,-0.553802,0.580963,0.010622,0.5063
7,lin3,0.01 vs 20.0,2.033489,2.033063,0.016283,0.987041,-0.000426,0.5031
8,lin3,0.01 vs 50.0,2.033489,2.042518,-0.365081,0.71583,0.009029,0.5049
9,lin4,0.01 vs 5.0,1.598786,1.599276,-0.022186,0.982344,0.00049,0.5218
