In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from sklearn.preprocessing import StandardScaler

  from pandas.core import (


### Entropy Balancing

#### Outcome: Physical activity

In [2]:
dfg= pd.read_csv("testdata/2.PA_treatment.csv")
df = pd.read_csv("testdata/2.PA_control.csv")

In [3]:
df.columns

Index(['Age', 'Gender', 'BMI', 'Depression', 'Employment_status',
       'pain_medication_baseline', 'Baseline_Physical_Activity',
       'Baseline_Pain', 'Baseline_Quality_of_life', 'PA_change'],
      dtype='object')

In [4]:
dfg.columns

Index(['Age', 'Gender', 'BMI', 'Depression', 'Employment_status',
       'pain_medication_baseline', 'Baseline_Physical_Activity',
       'Baseline_Pain', 'Baseline_Quality_of_life', 'PA_change'],
      dtype='object')

In [5]:
df.shape

(1154, 10)

In [6]:
dfg.shape

(7603, 10)

In [7]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,pain_medication_baseline,Baseline_Physical_Activity,Baseline_Pain,Baseline_Quality_of_life,PA_change
0,75.0,1.0,27.2,1.0,3.0,0.0,1,0.0,75.00,3
1,77.0,1.0,33.6,1.0,3.0,0.0,1,50.0,37.50,3
2,76.0,1.0,31.0,2.0,3.0,0.0,1,10.0,56.25,6
3,69.0,1.0,31.8,2.0,1.0,0.0,1,30.0,62.50,2
4,74.0,1.0,29.6,1.0,1.0,0.0,1,70.0,50.00,4
...,...,...,...,...,...,...,...,...,...,...
1149,46.0,2.0,26.9,2.0,1.0,0.0,10,60.0,56.25,0
1150,54.0,1.0,27.6,2.0,1.0,0.0,10,20.0,68.75,0
1151,56.0,2.0,27.5,1.0,1.0,0.0,10,40.0,43.75,-4
1152,57.0,1.0,33.6,2.0,1.0,1.0,10,40.0,68.75,0


In [8]:
# Add treatment indicator
df['Treatment'] = 0  # Control group
dfg['Treatment'] = 1  # Treatment group

# Combine the datasets
combined_data = pd.concat([df, dfg], ignore_index=True)

# Define covariates and outcome
# Include 'Baseline_Physical_Activity' for calculating weights
covariates_for_weighting = ['Age', 'Gender', 'BMI', 'Depression', 'Employment_status', 
                            'Baseline_Physical_Activity', 'Baseline_Quality_of_life', 'Baseline_Pain']
# Exclude 'Baseline_Physical_Activity' and 'PA_change' in the final balancing
covariates_for_balancing = ['Age', 'Gender', 'BMI', 'Depression', 'Employment_status', 
                            'Baseline_Physical_Activity','Baseline_Quality_of_life', 'Baseline_Pain']
outcome = 'PA_change'
treatment = 'Treatment'

# Separate treatment and control groups
control_data = combined_data[combined_data[treatment] == 0]
treatment_data = combined_data[combined_data[treatment] == 1]

# Calculate target moments (means) from the treatment group
target_means = treatment_data[covariates_for_weighting].mean()

# Define the objective function for optimization
def objective(weights):
    # Apply weights to the control group's covariates
    weighted_covariates = control_data[covariates_for_weighting].multiply(weights, axis=0)
    # Calculate the weighted means of the covariates
    weighted_means = weighted_covariates.mean()
    # Objective is to minimize the sum of squared differences between weighted means and target means
    return np.sum((weighted_means - target_means) ** 2)

# Initial weights (starting with equal weights for all control group observations)
initial_weights = np.ones(len(control_data))

# Constraints: weights should sum to the number of control units
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - len(control_data)})

# Bounds: weights should be non-negative
bounds = [(0, None) for _ in range(len(control_data))]

# Optimize weights using Sequential Least Squares Programming (SLSQP)
result = minimize(objective, initial_weights, method='SLSQP', constraints=constraints, bounds=bounds)

# Get the optimized weights from the result
optimized_weights = result.x

# Ensure weights are positive and normalized to sum to the number of control units
control_data['weights'] = np.abs(optimized_weights)
control_data['weights'] = control_data['weights'] / np.sum(control_data['weights']) * len(control_data)

# Apply the weights to the control data covariates for balancing check, excluding Baseline_Physical_Activity and PA_change
weighted_control_data = control_data[covariates_for_balancing].multiply(control_data['weights'], axis=0)
weighted_control_data['Baseline_Physical_Activity'] = control_data['Baseline_Physical_Activity']
weighted_control_data['PA_change'] = control_data['PA_change']

# Calculate the weighted means of the covariates in the control group
weighted_means = weighted_control_data[covariates_for_balancing].mean()
print("Weighted Control Group Means:\n", weighted_means)
print("\nTarget Treatment Group Means:\n", target_means[covariates_for_balancing])

# For the final balanced control group, we use the repetitions approach excluding Baseline_Physical_Activity and PA_change
control_data['reps'] = np.round(control_data['weights']).astype(int)

# Adjust repetitions to match the original control group size
while control_data['reps'].sum() != len(control_data):
    diff = len(control_data) - control_data['reps'].sum()
    idx = np.random.choice(control_data.index)
    control_data.loc[idx, 'reps'] += np.sign(diff)

# Create the balanced control group by repeating rows according to the reps
repeated_rows = control_data.index.repeat(control_data['reps'])
balanced_control_data = control_data.loc[repeated_rows].reset_index(drop=True)

# Ensure Baseline_Physical_Activity remains unchanged
balanced_control_data['Baseline_Physical_Activity'] = np.repeat(control_data['Baseline_Physical_Activity'].values, control_data['reps'].values)

# Remove the temporary columns used for weighting
balanced_control_data = balanced_control_data.drop(columns=['weights', 'reps'])

# Check the means of the covariates to ensure they match the target means
balanced_means = balanced_control_data[covariates_for_balancing].mean()
print("Balanced Control Group Means:\n", balanced_means)
print("\nTarget Treatment Group Means (for balancing):\n", target_means[covariates_for_balancing])

# Combine the balanced control group with the treatment group
balanced_combined_data = pd.concat([balanced_control_data, treatment_data], ignore_index=True)

# Display the balanced control group dataframe
print("Balanced Control Group Data:")
print(balanced_control_data)

print("balanced_control_data.shape", balanced_control_data.shape)
print("control_data.shape", df.shape)

Weighted Control Group Means:
 Age                           56.694363
Gender                         1.641701
BMI                           29.591453
Depression                     1.124355
Employment_status              1.152457
Baseline_Physical_Activity     5.662045
Baseline_Quality_of_life      45.054497
Baseline_Pain                 47.219001
dtype: float64

Target Treatment Group Means:
 Age                           56.692227
Gender                         1.700250
BMI                           29.592791
Depression                     1.069578
Employment_status              1.156649
Baseline_Physical_Activity     5.797843
Baseline_Quality_of_life      45.055406
Baseline_Pain                 47.219387
dtype: float64
Balanced Control Group Means:
 Age                           56.643847
Gender                         1.642114
BMI                           29.621664
Depression                     1.123917
Employment_status              1.145581
Baseline_Physical_Activity     5.820

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data['weights'] = np.abs(optimized_weights)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data['weights'] = control_data['weights'] / np.sum(control_data['weights']) * len(control_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data['reps'] = np.round(control_data['wei

In [9]:
dfg

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,pain_medication_baseline,Baseline_Physical_Activity,Baseline_Pain,Baseline_Quality_of_life,PA_change,Treatment
0,50,2,22.68,1,1.0,2.0,10.0,65.0,31.25,0.0,1
1,45,1,29.07,1,1.0,1.0,7.0,96.0,25.00,3.0,1
2,46,2,26.10,1,1.0,1.0,3.0,39.0,68.75,6.0,1
3,53,2,28.03,1,1.0,2.0,4.0,15.0,62.50,0.0,1
4,68,2,27.82,1,1.0,1.0,8.0,15.0,87.50,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...
7598,55,2,35.63,1,1.0,1.0,8.0,57.0,50.00,2.0,1
7599,61,2,29.34,1,1.0,1.0,3.0,55.0,25.00,3.0,1
7600,62,1,25.88,1,1.0,2.0,3.0,26.0,50.00,0.0,1
7601,47,2,26.51,1,1.0,2.0,4.0,47.0,43.75,-2.0,1


In [10]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,pain_medication_baseline,Baseline_Physical_Activity,Baseline_Pain,Baseline_Quality_of_life,PA_change,Treatment
0,75.0,1.0,27.2,1.0,3.0,0.0,1,0.0,75.00,3,0
1,77.0,1.0,33.6,1.0,3.0,0.0,1,50.0,37.50,3,0
2,76.0,1.0,31.0,2.0,3.0,0.0,1,10.0,56.25,6,0
3,69.0,1.0,31.8,2.0,1.0,0.0,1,30.0,62.50,2,0
4,74.0,1.0,29.6,1.0,1.0,0.0,1,70.0,50.00,4,0
...,...,...,...,...,...,...,...,...,...,...,...
1149,46.0,2.0,26.9,2.0,1.0,0.0,10,60.0,56.25,0,0
1150,54.0,1.0,27.6,2.0,1.0,0.0,10,20.0,68.75,0,0
1151,56.0,2.0,27.5,1.0,1.0,0.0,10,40.0,43.75,-4,0
1152,57.0,1.0,33.6,2.0,1.0,1.0,10,40.0,68.75,0,0


In [11]:
balanced_control_data

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,pain_medication_baseline,Baseline_Physical_Activity,Baseline_Pain,Baseline_Quality_of_life,PA_change,Treatment
0,74.0,1.0,29.6,1.0,1.0,0.0,1.0,70.0,50.00,4.0,0
1,49.0,2.0,33.8,1.0,1.0,0.0,1.0,60.0,56.25,3.0,0
2,49.0,2.0,33.8,1.0,1.0,0.0,1.0,60.0,56.25,3.0,0
3,49.0,2.0,33.8,1.0,1.0,0.0,1.0,60.0,56.25,3.0,0
4,66.0,2.0,33.2,1.0,3.0,1.0,1.0,60.0,68.75,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...
1149,56.0,2.0,27.5,1.0,1.0,0.0,10.0,40.0,43.75,-4.0,0
1150,56.0,2.0,27.5,1.0,1.0,0.0,10.0,40.0,43.75,-4.0,0
1151,56.0,2.0,27.5,1.0,1.0,0.0,10.0,40.0,43.75,-4.0,0
1152,53.0,2.0,30.1,1.0,1.0,0.0,10.0,50.0,50.00,-1.0,0


In [12]:
weighted_control_data

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,PA_change
0,1.358127e+01,1.810836e-01,4.925474e+00,1.810836e-01,5.432508e-01,1.0,1.358127e+01,0.000000e+00,3.0
1,5.339690e-02,6.934662e-04,2.330046e-02,6.934662e-04,2.080399e-03,1.0,2.600498e-02,3.467331e-02,3.0
2,2.879858e-15,3.789287e-17,1.174679e-15,7.578573e-17,1.136786e-16,1.0,2.131474e-15,3.789287e-16,6.0
3,4.532833e+00,6.569323e-02,2.089045e+00,1.313865e-01,6.569323e-02,1.0,4.105827e+00,1.970797e+00,2.0
4,6.622398e+01,8.949187e-01,2.648959e+01,8.949187e-01,8.949187e-01,1.0,4.474593e+01,6.264431e+01,4.0
...,...,...,...,...,...,...,...,...,...
1149,3.688118e+01,1.603529e+00,2.156747e+01,1.603529e+00,8.017647e-01,10.0,4.509927e+01,4.810588e+01,0.0
1150,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,10.0,0.000000e+00,0.000000e+00,0.0
1151,1.465625e+02,5.234374e+00,7.197264e+01,2.617187e+00,2.617187e+00,10.0,1.145019e+02,1.046875e+02,-4.0
1152,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,10.0,0.000000e+00,0.000000e+00,0.0


In [13]:
# Control group (Real one)
mean_age_control = df['Age'].mean()
mean_gender_control = df['Gender'].mean()
mean_bmi_control = df['BMI'].mean()
mean_depression_control = df['Depression'].mean()
mean_employment_status_control = df['Employment_status'].mean()
mean_quality_of_life_control = df['Baseline_Quality_of_life'].mean()
mean_pain_control = df['Baseline_Pain'].mean()
mean_PA_control = df['Baseline_Physical_Activity'].mean()

# Weighted Control group 
weighted_mean_age_control = weighted_control_data['Age'].mean()
weighted_mean_gender_control = weighted_control_data['Gender'].mean()
weighted_mean_bmi_control = weighted_control_data['BMI'].mean()
weighted_mean_depression_control = weighted_control_data['Depression'].mean()
weighted_mean_employment_status_control = weighted_control_data['Employment_status'].mean()
weighted_mean_quality_of_life_control = weighted_control_data['Baseline_Quality_of_life'].mean()
weighted_mean_pain_control = weighted_control_data['Baseline_Pain'].mean()
weighted_mean_PA_control = weighted_control_data['Baseline_Physical_Activity'].mean()

# Balanced Control group
balanced_mean_age_control = balanced_control_data['Age'].mean()
balanced_mean_gender_control = balanced_control_data['Gender'].mean()
balanced_mean_bmi_control = balanced_control_data['BMI'].mean()
balanced_mean_depression_control = balanced_control_data['Depression'].mean()
balanced_mean_employment_status_control = balanced_control_data['Employment_status'].mean()
balanced_mean_quality_of_life_control = balanced_control_data['Baseline_Quality_of_life'].mean()
balanced_mean_pain_control = balanced_control_data['Baseline_Pain'].mean()
balanced_mean_PA_control = balanced_control_data['Baseline_Physical_Activity'].mean()

# Treatment group (unweighted)
mean_age_treatment = dfg['Age'].mean()
mean_gender_treatment = dfg['Gender'].mean()
mean_bmi_treatment = dfg['BMI'].mean()
mean_Depression_treatment = dfg['Depression'].mean()
mean_employment_status_treatment = dfg['Employment_status'].mean()
mean_QOL_treatment = dfg['Baseline_Quality_of_life'].mean()
mean_pain_treatment = dfg['Baseline_Pain'].mean()
mean_PA_treatment = dfg['Baseline_Physical_Activity'].mean()

# Result
print("Mean in control group:")
print(f"Mean Age (Control Group): {mean_age_control}")
print(f"Mean Gender (Control Group): {mean_gender_treatment}")
print(f"Mean BMI (Control Group): {mean_bmi_control}")
print(f"Mean Depression (Control Group): {mean_depression_control}")
print(f"Mean Employment Status (Control Group): {mean_employment_status_control}")
print(f"Mean Physical Activity (Control Group): {mean_PA_control}")
print(f"Mean QOL (Control Group): {mean_quality_of_life_control}")
print(f"Mean Pain (Control Group): {mean_pain_control}")



print("--------------------------------------------------")
print("Weighted mean in control group:")
print(f"Weighted Mean Age (Control Group): {weighted_mean_age_control}")
print(f"Weighted Mean Gender (Control Group): {weighted_mean_gender_control}")
print(f"Weighted Mean BMI (Control Group): {weighted_mean_bmi_control}")
print(f"Weighted Mean Depression (Control Group): {weighted_mean_depression_control}")
print(f"Weighted Mean Employment Status (Control Group): {weighted_mean_employment_status_control}")
print(f"Weighted Mean PA (Control Group): {weighted_mean_PA_control}")
print(f"Weighted Mean QOL (Control Group): {weighted_mean_quality_of_life_control}")
print(f"Weighted Mean Pain (Control Group): {weighted_mean_pain_control}")


print("--------------------------------------------------")
print("Balanced mean in control group:")
print(f"Balanced Mean Age (Control Group): {balanced_mean_age_control}")
print(f"Balanced Mean Gender (Control Group): {balanced_mean_gender_control}")
print(f"Balanced Mean BMI (Control Group): {balanced_mean_bmi_control}")
print(f"Balanced Mean Depression (Control Group): {balanced_mean_depression_control}")
print(f"Balanced Mean Employment Status (Control Group): {weighted_mean_employment_status_control}")
print(f"Balanced Mean PA (Control Group): {balanced_mean_PA_control}")
print(f"Balanced Mean QOL (Control Group): {balanced_mean_quality_of_life_control}")
print(f"Balanced Mean Pain (Control Group): {balanced_mean_pain_control}")

print("--------------------------------------------------")
print("Mean in treatnement group:")
print(f"Mean Age (Treatment Group): {mean_age_treatment}")
print(f"Mean Gender (Treatment Group): {mean_gender_control}")
print(f"Mean BMI (Treatment Group): {mean_bmi_treatment}")
print(f"Mean Depression (Treatment Group): {mean_Depression_treatment}")
print(f"Mean Employment Status (Treatment Group): {mean_employment_status_treatment}")
print(f"Mean Physical Activity (Treatment Group): {mean_PA_treatment}")
print(f"Mean QOL (Treatment Group): {mean_QOL_treatment}")
print(f"Mean Pain (Treatment Group): {mean_pain_treatment}")

Mean in control group:
Mean Age (Control Group): 61.149913344887345
Mean Gender (Control Group): 1.7002499013547283
Mean BMI (Control Group): 30.09974003466205
Mean Depression (Control Group): 1.3067590987868285
Mean Employment Status (Control Group): 1.6949740034662044
Mean Physical Activity (Control Group): 5.662045060658579
Mean QOL (Control Group): 51.83600519930676
Mean Pain (Control Group): 50.346620450606586
--------------------------------------------------
Weighted mean in control group:
Weighted Mean Age (Control Group): 56.69436308960669
Weighted Mean Gender (Control Group): 1.6417007273819746
Weighted Mean BMI (Control Group): 29.591453257849373
Weighted Mean Depression (Control Group): 1.1243553227498964
Weighted Mean Employment Status (Control Group): 1.152456727765731
Weighted Mean PA (Control Group): 5.662045060658579
Weighted Mean QOL (Control Group): 45.05449705099014
Weighted Mean Pain (Control Group): 47.219001378523195
----------------------------------------------

In [14]:
balanced_control_data.to_csv('testdata/3.PA_balanced.csv', index=False)

In [15]:
df.PA_change.value_counts()

PA_change
 0    354
 1    208
-1    190
-2    119
 2    114
-3     60
 3     55
-4     20
 4     15
 5      7
-6      4
 6      3
-5      3
 7      2
Name: count, dtype: int64

In [16]:
balanced_control_data.PA_change.value_counts()

PA_change
 0.0    344
 1.0    209
-1.0    169
 2.0    123
-2.0     91
-3.0     77
 3.0     66
 4.0     22
-4.0     22
 5.0     13
 7.0      6
-5.0      5
-6.0      4
 6.0      3
Name: count, dtype: int64

In [17]:
x = pd.read_csv('testdata/3.PA_balanced.csv')

In [18]:
x

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,pain_medication_baseline,Baseline_Physical_Activity,Baseline_Pain,Baseline_Quality_of_life,PA_change,Treatment
0,74.0,1.0,29.6,1.0,1.0,0.0,1.0,70.0,50.00,4.0,0
1,49.0,2.0,33.8,1.0,1.0,0.0,1.0,60.0,56.25,3.0,0
2,49.0,2.0,33.8,1.0,1.0,0.0,1.0,60.0,56.25,3.0,0
3,49.0,2.0,33.8,1.0,1.0,0.0,1.0,60.0,56.25,3.0,0
4,66.0,2.0,33.2,1.0,3.0,1.0,1.0,60.0,68.75,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...
1149,56.0,2.0,27.5,1.0,1.0,0.0,10.0,40.0,43.75,-4.0,0
1150,56.0,2.0,27.5,1.0,1.0,0.0,10.0,40.0,43.75,-4.0,0
1151,56.0,2.0,27.5,1.0,1.0,0.0,10.0,40.0,43.75,-4.0,0
1152,53.0,2.0,30.1,1.0,1.0,0.0,10.0,50.0,50.00,-1.0,0


In [19]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,pain_medication_baseline,Baseline_Physical_Activity,Baseline_Pain,Baseline_Quality_of_life,PA_change,Treatment
0,75.0,1.0,27.2,1.0,3.0,0.0,1,0.0,75.00,3,0
1,77.0,1.0,33.6,1.0,3.0,0.0,1,50.0,37.50,3,0
2,76.0,1.0,31.0,2.0,3.0,0.0,1,10.0,56.25,6,0
3,69.0,1.0,31.8,2.0,1.0,0.0,1,30.0,62.50,2,0
4,74.0,1.0,29.6,1.0,1.0,0.0,1,70.0,50.00,4,0
...,...,...,...,...,...,...,...,...,...,...,...
1149,46.0,2.0,26.9,2.0,1.0,0.0,10,60.0,56.25,0,0
1150,54.0,1.0,27.6,2.0,1.0,0.0,10,20.0,68.75,0,0
1151,56.0,2.0,27.5,1.0,1.0,0.0,10,40.0,43.75,-4,0
1152,57.0,1.0,33.6,2.0,1.0,1.0,10,40.0,68.75,0,0
