In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from sklearn.preprocessing import StandardScaler

  from pandas.core import (


### Entropy Balancing

#### Outcome: Physical activity

In [2]:
dfg= pd.read_csv("data/4.PA_treatment.csv")
df = pd.read_csv("data/4.PA_control.csv")

In [3]:
df.columns

Index(['Age', 'Gender', 'BMI', 'Depression', 'Employment_status',
       'Baseline_Physical_Activity', 'Baseline_Quality_of_life',
       'Baseline_Pain', 'PA_change'],
      dtype='object')

In [4]:
dfg.columns

Index(['Age', 'Gender', 'BMI', 'Depression', 'Employment_status',
       'Baseline_Physical_Activity', 'Baseline_Quality_of_life',
       'Baseline_Pain', 'PA_change'],
      dtype='object')

In [5]:
# Add treatment indicator
df['Treatment'] = 0  # Control group
dfg['Treatment'] = 1  # Treatment group

# Combine the datasets
combined_data = pd.concat([df, dfg], ignore_index=True)

# Define covariates and outcome
# Include 'Baseline_Physical_Activity' for calculating weights
covariates_for_weighting = ['Age', 'Gender', 'BMI', 'Depression', 'Employment_status', 
                            'Baseline_Physical_Activity', 'Baseline_Quality_of_life', 'Baseline_Pain']
# Exclude 'Baseline_Physical_Activity' and 'PA_change' in the final balancing
covariates_for_balancing = ['Age', 'Gender', 'BMI', 'Depression', 'Employment_status', 
                            'Baseline_Physical_Activity','Baseline_Quality_of_life', 'Baseline_Pain']
outcome = 'PA_change'
treatment = 'Treatment'

# Separate treatment and control groups
control_data = combined_data[combined_data[treatment] == 0]
treatment_data = combined_data[combined_data[treatment] == 1]

# Calculate target moments (means) from the treatment group
target_means = treatment_data[covariates_for_weighting].mean()

# Define the objective function for optimization
def objective(weights):
    # Apply weights to the control group's covariates
    weighted_covariates = control_data[covariates_for_weighting].multiply(weights, axis=0)
    # Calculate the weighted means of the covariates
    weighted_means = weighted_covariates.mean()
    # Objective is to minimize the sum of squared differences between weighted means and target means
    return np.sum((weighted_means - target_means) ** 2)

# Initial weights (starting with equal weights for all control group observations)
initial_weights = np.ones(len(control_data))

# Constraints: weights should sum to the number of control units
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - len(control_data)})

# Bounds: weights should be non-negative
bounds = [(0, None) for _ in range(len(control_data))]

# Optimize weights using Sequential Least Squares Programming (SLSQP)
result = minimize(objective, initial_weights, method='SLSQP', constraints=constraints, bounds=bounds)

# Get the optimized weights from the result
optimized_weights = result.x

# Ensure weights are positive and normalized to sum to the number of control units
control_data['weights'] = np.abs(optimized_weights)
control_data['weights'] = control_data['weights'] / np.sum(control_data['weights']) * len(control_data)

# Apply the weights to the control data covariates for balancing check, excluding Baseline_Physical_Activity and PA_change
weighted_control_data = control_data[covariates_for_balancing].multiply(control_data['weights'], axis=0)
weighted_control_data['Baseline_Physical_Activity'] = control_data['Baseline_Physical_Activity']
weighted_control_data['PA_change'] = control_data['PA_change']

# Calculate the weighted means of the covariates in the control group
weighted_means = weighted_control_data[covariates_for_balancing].mean()
print("Weighted Control Group Means:\n", weighted_means)
print("\nTarget Treatment Group Means:\n", target_means[covariates_for_balancing])

# For the final balanced control group, we use the repetitions approach excluding Baseline_Physical_Activity and PA_change
control_data['reps'] = np.round(control_data['weights']).astype(int)

# Adjust repetitions to match the original control group size
while control_data['reps'].sum() != len(control_data):
    diff = len(control_data) - control_data['reps'].sum()
    idx = np.random.choice(control_data.index)
    control_data.loc[idx, 'reps'] += np.sign(diff)

# Create the balanced control group by repeating rows according to the reps
repeated_rows = control_data.index.repeat(control_data['reps'])
balanced_control_data = control_data.loc[repeated_rows].reset_index(drop=True)

# Ensure Baseline_Physical_Activity remains unchanged
balanced_control_data['Baseline_Physical_Activity'] = np.repeat(control_data['Baseline_Physical_Activity'].values, control_data['reps'].values)

# Remove the temporary columns used for weighting
balanced_control_data = balanced_control_data.drop(columns=['weights', 'reps'])

# Check the means of the covariates to ensure they match the target means
balanced_means = balanced_control_data[covariates_for_balancing].mean()
print("Balanced Control Group Means:\n", balanced_means)
print("\nTarget Treatment Group Means (for balancing):\n", target_means[covariates_for_balancing])

# Combine the balanced control group with the treatment group
balanced_combined_data = pd.concat([balanced_control_data, treatment_data], ignore_index=True)

# Display the balanced control group dataframe
print("Balanced Control Group Data:")
print(balanced_control_data)

print("balanced_control_data.shape", balanced_control_data.shape)
print("control_data.shape", df.shape)

Weighted Control Group Means:
 Age                           56.691017
Gender                         1.592205
BMI                           29.588865
Depression                     1.249382
Employment_status              1.285521
Baseline_Physical_Activity     4.663520
Baseline_Quality_of_life      45.058284
Baseline_Pain                 47.220971
dtype: float64

Target Treatment Group Means:
 Age                           56.692227
Gender                         1.700250
BMI                           29.592791
Depression                     1.069578
Employment_status              1.156649
Baseline_Physical_Activity     4.797843
Baseline_Quality_of_life      45.055406
Baseline_Pain                 47.219387
dtype: float64
Balanced Control Group Means:
 Age                           56.708061
Gender                         1.591382
BMI                           29.579521
Depression                     1.253207
Employment_status              1.288308
Baseline_Physical_Activity     4.790

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data['weights'] = np.abs(optimized_weights)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data['weights'] = control_data['weights'] / np.sum(control_data['weights']) * len(control_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data['reps'] = np.round(control_data['wei

In [6]:
dfg

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,PA_change,Treatment
0,50,2,22.68,1.0,1.0,9,31.25,65.0,0,1
1,45,1,29.07,1.0,1.0,6,25.00,96.0,3,1
2,46,2,26.10,1.0,1.0,2,68.75,39.0,6,1
3,53,2,28.03,1.0,1.0,3,62.50,15.0,0,1
4,68,2,27.82,1.0,1.0,7,87.50,15.0,2,1
...,...,...,...,...,...,...,...,...,...,...
7598,55,2,35.63,1.0,1.0,7,50.00,57.0,2,1
7599,61,2,29.34,1.0,1.0,2,25.00,55.0,3,1
7600,62,1,25.88,1.0,1.0,2,50.00,26.0,0,1
7601,47,2,26.51,1.0,1.0,3,43.75,47.0,-2,1


In [7]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,PA_change,Treatment
0,78.0,1.0,27.6,1.0,3.0,0,50.00,80.0,0,0
1,76.0,2.0,27.4,1.0,3.0,0,100.00,0.0,2,0
2,69.0,1.0,29.8,1.0,3.0,0,81.25,10.0,3,0
3,68.0,2.0,30.1,2.0,3.0,0,75.00,30.0,0,0
4,76.0,1.0,32.4,1.0,2.0,0,75.00,30.0,0,0
...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,100.00,0.0,-1,0
4127,59.0,2.0,30.5,1.0,1.0,9,43.75,70.0,-4,0
4128,70.0,1.0,33.9,1.0,3.0,9,100.00,0.0,-5,0
4129,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0


In [8]:
balanced_control_data

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,PA_change,Treatment
0,70.0,1.0,28.9,1.0,1.0,0,81.25,30.0,0,0
1,77.0,1.0,33.6,1.0,3.0,0,37.50,50.0,3,0
2,49.0,1.0,23.5,1.0,1.0,0,81.25,20.0,6,0
3,67.0,2.0,34.7,1.0,3.0,0,62.50,0.0,3,0
4,56.0,2.0,23.9,1.0,1.0,0,100.00,0.0,5,0
...,...,...,...,...,...,...,...,...,...,...
4126,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0
4127,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0
4128,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0
4129,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0


In [9]:
weighted_control_data

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,PA_change
0,3.481549e-15,4.463525e-17,1.231933e-15,4.463525e-17,1.339057e-16,0,2.231762e-15,3.570820e-15,0
1,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0.000000e+00,0.000000e+00,2
2,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0.000000e+00,0.000000e+00,3
3,7.422446e+00,2.183072e-01,3.285524e+00,2.183072e-01,3.274608e-01,0,8.186521e+00,3.274608e+00,0
4,4.789204e+00,6.301585e-02,2.041713e+00,6.301585e-02,1.260317e-01,0,4.726189e+00,1.890475e+00,0
...,...,...,...,...,...,...,...,...,...
4126,4.833758e-15,8.631710e-17,2.080242e-15,8.631710e-17,8.631710e-17,9,8.631710e-15,0.000000e+00,-1
4127,1.361072e+02,4.613804e+00,7.036051e+01,2.306902e+00,2.306902e+00,9,1.009270e+02,1.614831e+02,-4
4128,1.254773e-17,1.792533e-19,6.076688e-18,1.792533e-19,5.377600e-19,9,1.792533e-17,0.000000e+00,-5
4129,2.065817e+02,3.972725e+00,1.219627e+02,3.972725e+00,3.972725e+00,9,1.241477e+02,1.986362e+02,-2


In [10]:
# Control group (Real one)
mean_age_control = df['Age'].mean()
mean_gender_control = df['Gender'].mean()
mean_bmi_control = df['BMI'].mean()
mean_depression_control = df['Depression'].mean()
mean_employment_status_control = df['Employment_status'].mean()
mean_quality_of_life_control = df['Baseline_Quality_of_life'].mean()
mean_pain_control = df['Baseline_Pain'].mean()
mean_PA_control = df['Baseline_Physical_Activity'].mean()

# Weighted Control group 
weighted_mean_age_control = weighted_control_data['Age'].mean()
weighted_mean_gender_control = weighted_control_data['Gender'].mean()
weighted_mean_bmi_control = weighted_control_data['BMI'].mean()
weighted_mean_depression_control = weighted_control_data['Depression'].mean()
weighted_mean_employment_status_control = weighted_control_data['Employment_status'].mean()
weighted_mean_quality_of_life_control = weighted_control_data['Baseline_Quality_of_life'].mean()
weighted_mean_pain_control = weighted_control_data['Baseline_Pain'].mean()
weighted_mean_PA_control = weighted_control_data['Baseline_Physical_Activity'].mean()

# Balanced Control group
balanced_mean_age_control = balanced_control_data['Age'].mean()
balanced_mean_gender_control = balanced_control_data['Gender'].mean()
balanced_mean_bmi_control = balanced_control_data['BMI'].mean()
balanced_mean_depression_control = balanced_control_data['Depression'].mean()
balanced_mean_employment_status_control = balanced_control_data['Employment_status'].mean()
balanced_mean_quality_of_life_control = balanced_control_data['Baseline_Quality_of_life'].mean()
balanced_mean_pain_control = balanced_control_data['Baseline_Pain'].mean()
balanced_mean_PA_control = balanced_control_data['Baseline_Physical_Activity'].mean()

# Treatment group (unweighted)
mean_age_treatment = dfg['Age'].mean()
mean_gender_treatment = dfg['Gender'].mean()
mean_bmi_treatment = dfg['BMI'].mean()
mean_Depression_treatment = dfg['Depression'].mean()
mean_employment_status_treatment = dfg['Employment_status'].mean()
mean_QOL_treatment = dfg['Baseline_Quality_of_life'].mean()
mean_pain_treatment = dfg['Baseline_Pain'].mean()
mean_PA_treatment = dfg['Baseline_Physical_Activity'].mean()

# Result
print("Mean in control group:")
print(f"Mean Age (Control Group): {mean_age_control}")
print(f"Mean Gender (Control Group): {mean_gender_treatment}")
print(f"Mean BMI (Control Group): {mean_bmi_control}")
print(f"Mean Depression (Control Group): {mean_depression_control}")
print(f"Mean Employment Status (Control Group): {mean_employment_status_control}")
print(f"Mean Physical Activity (Control Group): {mean_PA_control}")
print(f"Mean QOL (Control Group): {mean_quality_of_life_control}")
print(f"Mean Pain (Control Group): {mean_pain_control}")



print("--------------------------------------------------")
print("Weighted mean in control group:")
print(f"Weighted Mean Age (Control Group): {weighted_mean_age_control}")
print(f"Weighted Mean Gender (Control Group): {weighted_mean_gender_control}")
print(f"Weighted Mean BMI (Control Group): {weighted_mean_bmi_control}")
print(f"Weighted Mean Depression (Control Group): {weighted_mean_depression_control}")
print(f"Weighted Mean Employment Status (Control Group): {weighted_mean_employment_status_control}")
print(f"Weighted Mean PA (Control Group): {weighted_mean_PA_control}")
print(f"Weighted Mean QOL (Control Group): {weighted_mean_quality_of_life_control}")
print(f"Weighted Mean Pain (Control Group): {weighted_mean_pain_control}")


print("--------------------------------------------------")
print("Balanced mean in control group:")
print(f"Balanced Mean Age (Control Group): {balanced_mean_age_control}")
print(f"Balanced Mean Gender (Control Group): {balanced_mean_gender_control}")
print(f"Balanced Mean BMI (Control Group): {balanced_mean_bmi_control}")
print(f"Balanced Mean Depression (Control Group): {balanced_mean_depression_control}")
print(f"Balanced Mean Employment Status (Control Group): {weighted_mean_employment_status_control}")
print(f"Balanced Mean PA (Control Group): {balanced_mean_PA_control}")
print(f"Balanced Mean QOL (Control Group): {balanced_mean_quality_of_life_control}")
print(f"Balanced Mean Pain (Control Group): {balanced_mean_pain_control}")

print("--------------------------------------------------")
print("Mean in treatnement group:")
print(f"Mean Age (Treatment Group): {mean_age_treatment}")
print(f"Mean Gender (Treatment Group): {mean_gender_control}")
print(f"Mean BMI (Treatment Group): {mean_bmi_treatment}")
print(f"Mean Depression (Treatment Group): {mean_Depression_treatment}")
print(f"Mean Employment Status (Treatment Group): {mean_employment_status_treatment}")
print(f"Mean Physical Activity (Treatment Group): {mean_PA_treatment}")
print(f"Mean QOL (Treatment Group): {mean_QOL_treatment}")
print(f"Mean Pain (Treatment Group): {mean_pain_treatment}")

Mean in control group:
Mean Age (Control Group): 61.08545146453643
Mean Gender (Control Group): 1.7002499013547283
Mean BMI (Control Group): 28.563519728879204
Mean Depression (Control Group): 1.2677317840716533
Mean Employment Status (Control Group): 1.6986201888162673
Mean Physical Activity (Control Group): 4.663519728879206
Mean QOL (Control Group): 67.79381505688696
Mean Pain (Control Group): 32.946017913338174
--------------------------------------------------
Weighted mean in control group:
Weighted Mean Age (Control Group): 56.69101749776496
Weighted Mean Gender (Control Group): 1.592205221203052
Weighted Mean BMI (Control Group): 29.588864762096406
Weighted Mean Depression (Control Group): 1.2493817086657293
Weighted Mean Employment Status (Control Group): 1.285521298788185
Weighted Mean PA (Control Group): 4.663519728879206
Weighted Mean QOL (Control Group): 45.05828386180154
Weighted Mean Pain (Control Group): 47.220971314856726
-----------------------------------------------

In [11]:
balanced_control_data.to_csv('data/5.PA_balanced.csv', index=False)

In [12]:
x = pd.read_csv('data/5.PA_balanced.csv')

In [13]:
x

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,PA_change,Treatment
0,70.0,1.0,28.9,1.0,1.0,0,81.25,30.0,0,0
1,77.0,1.0,33.6,1.0,3.0,0,37.50,50.0,3,0
2,49.0,1.0,23.5,1.0,1.0,0,81.25,20.0,6,0
3,67.0,2.0,34.7,1.0,3.0,0,62.50,0.0,3,0
4,56.0,2.0,23.9,1.0,1.0,0,100.00,0.0,5,0
...,...,...,...,...,...,...,...,...,...,...
4126,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0
4127,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0
4128,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0
4129,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0
