In [136]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from sklearn.preprocessing import StandardScaler

In [137]:
# OAI dataset
df = pd.read_csv("preprocessed_OAI.csv")
# GLAD dataset
dfg = pd.read_csv("preprocessed_Glad.csv")
df.shape
print("OAI shape:", df.shape, "GLAD shape:", dfg.shape)

OAI shape: (4131, 11) GLAD shape: (7603, 11)


In [138]:
dfg

Unnamed: 0,age,gender,fysb_BMI,ptb_14618_depression,ptb_3777_employment,ptb_4145,pt12_4145,ptb_koos_qol_score,pt12_koos_qol_score,ptb_3764,pt12_3764
0,50,2,22.68,1.0,1.0,10.0,10.0,31.25,56.25,65.0,17.0
1,45,1,29.07,1.0,1.0,7.0,10.0,25.00,43.75,96.0,52.0
2,46,2,26.10,1.0,1.0,3.0,9.0,68.75,62.50,39.0,25.0
3,53,2,28.03,1.0,1.0,4.0,4.0,62.50,87.50,15.0,2.0
4,68,2,27.82,1.0,1.0,8.0,10.0,87.50,93.75,15.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
7598,55,2,35.63,1.0,1.0,8.0,10.0,50.00,56.25,57.0,9.0
7599,61,2,29.34,1.0,1.0,3.0,6.0,25.00,68.75,55.0,8.0
7600,62,1,25.88,1.0,1.0,3.0,3.0,50.00,25.00,26.0,77.0
7601,47,2,26.51,1.0,1.0,4.0,2.0,43.75,18.75,47.0,92.0


In [139]:
df

Unnamed: 0,V00AGE,P02SEX,P01BMI,V00CESD6,V00CEMPLOY_employment,PA_baseline_category,PA_followup_category,V00KOOSQOL,V01KOOSQOL,knee_pain_baseline,knee_pain_follow
0,78.0,1.0,27.6,1.0,3.0,1,1,50.00,68.75,80.0,50.0
1,76.0,2.0,27.4,1.0,3.0,1,3,100.00,62.50,0.0,40.0
2,69.0,1.0,29.8,1.0,3.0,1,4,81.25,100.00,10.0,0.0
3,68.0,2.0,30.1,2.0,3.0,1,1,75.00,81.25,30.0,30.0
4,76.0,1.0,32.4,1.0,2.0,1,1,75.00,68.75,30.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,10,9,100.00,100.00,0.0,0.0
4127,59.0,2.0,30.5,1.0,1.0,10,6,43.75,68.75,70.0,70.0
4128,70.0,1.0,33.9,1.0,3.0,10,5,100.00,100.00,0.0,0.0
4129,52.0,1.0,30.7,1.0,1.0,10,8,31.25,37.50,50.0,70.0


In [140]:
# Renaming columns in df for consistency
df.rename(columns={
    'V00AGE': 'Age',
    'P02SEX': 'Gender',
    'P01BMI': 'BMI',
    'V00CESD6': 'Depression',
    'V00CEMPLOY_employment': 'Employment_status',
    'V00KOOSQOL': 'Baseline_Quality_of_life',
    'V01KOOSQOL': 'Followup_Quality_of_life',
    'knee_pain_baseline': 'Baseline_Pain',
    'knee_pain_follow': 'Followup_Pain',
    'PA_baseline_category': 'Baseline_Physical_Activity',
    'PA_followup_category': 'Followup_Physical_Activity'
}, inplace=True)

dfg.rename(columns={
    'age': 'Age',
    'gender': 'Gender',
    'fysb_BMI': 'BMI',
    'ptb_14618_depression': 'Depression',
    'ptb_3777_employment': 'Employment_status',
    'ptb_koos_qol_score': 'Baseline_Quality_of_life',
    'pt12_koos_qol_score': 'Followup_Quality_of_life',
    'ptb_3764': 'Baseline_Pain',
    'pt12_3764': 'Followup_Pain',
    'ptb_4145': 'Baseline_Physical_Activity',
    'pt12_4145': 'Followup_Physical_Activity'
}, inplace=True)


In [141]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain
0,78.0,1.0,27.6,1.0,3.0,1,1,50.00,68.75,80.0,50.0
1,76.0,2.0,27.4,1.0,3.0,1,3,100.00,62.50,0.0,40.0
2,69.0,1.0,29.8,1.0,3.0,1,4,81.25,100.00,10.0,0.0
3,68.0,2.0,30.1,2.0,3.0,1,1,75.00,81.25,30.0,30.0
4,76.0,1.0,32.4,1.0,2.0,1,1,75.00,68.75,30.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,10,9,100.00,100.00,0.0,0.0
4127,59.0,2.0,30.5,1.0,1.0,10,6,43.75,68.75,70.0,70.0
4128,70.0,1.0,33.9,1.0,3.0,10,5,100.00,100.00,0.0,0.0
4129,52.0,1.0,30.7,1.0,1.0,10,8,31.25,37.50,50.0,70.0


In [142]:
dfg

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain
0,50,2,22.68,1.0,1.0,10.0,10.0,31.25,56.25,65.0,17.0
1,45,1,29.07,1.0,1.0,7.0,10.0,25.00,43.75,96.0,52.0
2,46,2,26.10,1.0,1.0,3.0,9.0,68.75,62.50,39.0,25.0
3,53,2,28.03,1.0,1.0,4.0,4.0,62.50,87.50,15.0,2.0
4,68,2,27.82,1.0,1.0,8.0,10.0,87.50,93.75,15.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
7598,55,2,35.63,1.0,1.0,8.0,10.0,50.00,56.25,57.0,9.0
7599,61,2,29.34,1.0,1.0,3.0,6.0,25.00,68.75,55.0,8.0
7600,62,1,25.88,1.0,1.0,3.0,3.0,50.00,25.00,26.0,77.0
7601,47,2,26.51,1.0,1.0,4.0,2.0,43.75,18.75,47.0,92.0


##### Outcomes:

In [143]:
# OAI dataset
##### Physical activity
# Ensure the columns are of type 'category'
df['Followup_Physical_Activity'] = df['Followup_Physical_Activity'].astype('category')
df['Baseline_Physical_Activity'] = df['Baseline_Physical_Activity'].astype('category')

# Convert categories to codes if they are ordinal
df['Followup_Physical_Activity'] = df['Followup_Physical_Activity'].cat.codes
df['Baseline_Physical_Activity'] = df['Baseline_Physical_Activity'].cat.codes

df['PA_change'] = df['Followup_Physical_Activity'] - df['Baseline_Physical_Activity']

##### Quality of life
df['QOL_change'] = df['Followup_Quality_of_life'] - df['Baseline_Quality_of_life']

##### Pain
df['pain_change'] = df['Followup_Pain'] - df['Baseline_Pain']

In [144]:
# GLAD Dataset
##### Physical activity
dfg['Baseline_Physical_Activity'] = dfg['Baseline_Physical_Activity'].astype('category')
dfg['Baseline_Physical_Activity'] = dfg['Baseline_Physical_Activity'].cat.codes

dfg['Followup_Physical_Activity'] = dfg['Followup_Physical_Activity'].astype('category')
dfg['Followup_Physical_Activity'] = dfg['Followup_Physical_Activity'].cat.codes

dfg['PA_change'] = dfg['Followup_Physical_Activity'] - dfg['Baseline_Physical_Activity']

##### Quality of life
dfg['QOL_change'] = dfg['Followup_Quality_of_life'] - dfg['Baseline_Quality_of_life']

##### Pain
dfg['pain_change'] = dfg['Followup_Pain'] - dfg['Baseline_Pain']

In [145]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change
0,78.0,1.0,27.6,1.0,3.0,0,0,50.00,68.75,80.0,50.0,0,18.75,-30.0
1,76.0,2.0,27.4,1.0,3.0,0,2,100.00,62.50,0.0,40.0,2,-37.50,40.0
2,69.0,1.0,29.8,1.0,3.0,0,3,81.25,100.00,10.0,0.0,3,18.75,-10.0
3,68.0,2.0,30.1,2.0,3.0,0,0,75.00,81.25,30.0,30.0,0,6.25,0.0
4,76.0,1.0,32.4,1.0,2.0,0,0,75.00,68.75,30.0,40.0,0,-6.25,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,8,100.00,100.00,0.0,0.0,-1,0.00,0.0
4127,59.0,2.0,30.5,1.0,1.0,9,5,43.75,68.75,70.0,70.0,-4,25.00,0.0
4128,70.0,1.0,33.9,1.0,3.0,9,4,100.00,100.00,0.0,0.0,-5,0.00,0.0
4129,52.0,1.0,30.7,1.0,1.0,9,7,31.25,37.50,50.0,70.0,-2,6.25,20.0


In [146]:
dfg

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change
0,50,2,22.68,1.0,1.0,9,9,31.25,56.25,65.0,17.0,0,25.00,-48.0
1,45,1,29.07,1.0,1.0,6,9,25.00,43.75,96.0,52.0,3,18.75,-44.0
2,46,2,26.10,1.0,1.0,2,8,68.75,62.50,39.0,25.0,6,-6.25,-14.0
3,53,2,28.03,1.0,1.0,3,3,62.50,87.50,15.0,2.0,0,25.00,-13.0
4,68,2,27.82,1.0,1.0,7,9,87.50,93.75,15.0,1.0,2,6.25,-14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7598,55,2,35.63,1.0,1.0,7,9,50.00,56.25,57.0,9.0,2,6.25,-48.0
7599,61,2,29.34,1.0,1.0,2,5,25.00,68.75,55.0,8.0,3,43.75,-47.0
7600,62,1,25.88,1.0,1.0,2,2,50.00,25.00,26.0,77.0,0,-25.00,51.0
7601,47,2,26.51,1.0,1.0,3,1,43.75,18.75,47.0,92.0,-2,-25.00,45.0


In [147]:
df = df.drop(columns=["Followup_Physical_Activity", "Followup_Quality_of_life", "Followup_Pain", "QOL_change", "PA_change"])
dfg = dfg.drop(columns=["Followup_Physical_Activity", "Followup_Quality_of_life", "Followup_Pain", "QOL_change", "PA_change"])

In [148]:
df.to_csv('Pain_control.csv', index=False)
dfg.to_csv('Pain_treatment.csv', index=False)

In [134]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,QOL_change
0,78.0,1.0,27.6,1.0,3.0,0,50.00,80.0,18.75
1,76.0,2.0,27.4,1.0,3.0,0,100.00,0.0,-37.50
2,69.0,1.0,29.8,1.0,3.0,0,81.25,10.0,18.75
3,68.0,2.0,30.1,2.0,3.0,0,75.00,30.0,6.25
4,76.0,1.0,32.4,1.0,2.0,0,75.00,30.0,-6.25
...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,100.00,0.0,0.00
4127,59.0,2.0,30.5,1.0,1.0,9,43.75,70.0,25.00
4128,70.0,1.0,33.9,1.0,3.0,9,100.00,0.0,0.00
4129,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,6.25


In [135]:
dfg

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,QOL_change
0,50,2,22.68,1.0,1.0,9,31.25,65.0,25.00
1,45,1,29.07,1.0,1.0,6,25.00,96.0,18.75
2,46,2,26.10,1.0,1.0,2,68.75,39.0,-6.25
3,53,2,28.03,1.0,1.0,3,62.50,15.0,25.00
4,68,2,27.82,1.0,1.0,7,87.50,15.0,6.25
...,...,...,...,...,...,...,...,...,...
7598,55,2,35.63,1.0,1.0,7,50.00,57.0,6.25
7599,61,2,29.34,1.0,1.0,2,25.00,55.0,43.75
7600,62,1,25.88,1.0,1.0,2,50.00,26.0,-25.00
7601,47,2,26.51,1.0,1.0,3,43.75,47.0,-25.00


##### Run Entropy Balancing and calculater weights

In [21]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize


# Add treatment indicator
df['Treatment'] = 0  # Control group
dfg['Treatment'] = 1  # Treatment group

# Combine the datasets
combined_data = pd.concat([df, dfg], ignore_index=True)

# Define covariates and outcome
covariates = ['Age', 'Gender', 'BMI', 'Depression', 'Employment_status', 
              'Baseline_Physical_Activity', 'Baseline_Quality_of_life', 'Baseline_Pain']
outcome = 'PA_change'
treatment = 'Treatment'

# Separate treatment and control groups
control_data = combined_data[combined_data[treatment] == 0]
treatment_data = combined_data[combined_data[treatment] == 1]

# Calculate target moments (means) from the treatment group
target_means = treatment_data[covariates].mean()

# Define the objective function for optimization
def objective(weights):
    # Apply weights to the control group's covariates
    weighted_covariates = control_data[covariates].multiply(weights, axis=0)
    # Calculate the weighted means of the covariates
    weighted_means = weighted_covariates.mean()
    # Objective is to minimize the sum of squared differences between weighted means and target means
    return np.sum((weighted_means - target_means) ** 2)

# Initial weights (starting with equal weights for all control group observations)
initial_weights = np.ones(len(control_data))

# Constraints: weights should sum to the number of control units
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - len(control_data)})

# Bounds: weights should be non-negative
bounds = [(0, None) for _ in range(len(control_data))]

# Optimize weights using Sequential Least Squares Programming (SLSQP)
result = minimize(objective, initial_weights, method='SLSQP', constraints=constraints, bounds=bounds)

# Get the optimized weights from the result
optimized_weights = result.x

# Ensure weights are positive and normalized
control_data['weights'] = np.abs(optimized_weights)
control_data['weights'] = control_data['weights'] / np.sum(control_data['weights']) * len(control_data)

# Print the control_data DataFrame with the weights column
print(control_data)


       Age  Gender   BMI  Depression  Employment_status  \
0     78.0     1.0  27.6         1.0                3.0   
1     76.0     2.0  27.4         1.0                3.0   
2     69.0     1.0  29.8         1.0                3.0   
3     68.0     2.0  30.1         2.0                3.0   
4     76.0     1.0  32.4         1.0                2.0   
...    ...     ...   ...         ...                ...   
4126  56.0     1.0  24.1         1.0                1.0   
4127  59.0     2.0  30.5         1.0                1.0   
4128  70.0     1.0  33.9         1.0                3.0   
4129  52.0     1.0  30.7         1.0                1.0   
4130  62.0     1.0  31.2         1.0                1.0   

      Baseline_Physical_Activity  Followup_Physical_Activity  \
0                              0                           0   
1                              0                           2   
2                              0                           3   
3                              0   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data['weights'] = np.abs(optimized_weights)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data['weights'] = control_data['weights'] / np.sum(control_data['weights']) * len(control_data)


In [None]:
# Calculate the weighted mean of the outcome variable 'Physical_Activity' in the control group
weighted_physical_activity_control = np.average(df['PA_change'], weights=df['weights'])

# Calculate the mean of the outcome variable 'Physical_Activity' in the treatment group
mean_physical_activity_treatment = dfg['PA_change'].mean()

# Display the results
print(f"Weighted Mean Physical Activity (Control Group): {weighted_physical_activity_control}")
print(f"Mean Physical Activity (Treatment Group): {mean_physical_activity_treatment}")
# Perform a statistical test to compare the weighted control group and treatment group
from scipy.stats import ttest_ind

# Expand the control group's physical activity data by weights to simulate a larger dataset
expanded_physical_activity_control = np.repeat(df['PA_change'], np.round(df['weights'] * len(df)).astype(int))

# Perform the t-test
t_stat, p_value = ttest_ind(expanded_physical_activity_control, dfg['PA_change'])

print(f"T-test: t-statistic = {t_stat}, p-value = {p_value}")


Weighted Mean Physical Activity (Control Group): 0.12713501512536113
Mean Physical Activity (Treatment Group): 0.34236485597790345
T-test: t-statistic = -9.752145364722248, p-value = 1.8077731134708035e-22


In [None]:
# Adding the weights to the control group dataframe
df['weights'] = optimal_weights

# Display the control group with weights
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change,weights
0,78.0,1.0,27.6,1.0,3.0,0,0,50.00,68.75,80.0,50.0,0,18.75,-30.0,0.025113
1,76.0,2.0,27.4,1.0,3.0,0,2,100.00,62.50,0.0,40.0,2,-37.50,40.0,0.009835
2,69.0,1.0,29.8,1.0,3.0,0,3,81.25,100.00,10.0,0.0,3,18.75,-10.0,0.012095
3,68.0,2.0,30.1,2.0,3.0,0,0,75.00,81.25,30.0,30.0,0,6.25,0.0,0.006005
4,76.0,1.0,32.4,1.0,2.0,0,0,75.00,68.75,30.0,40.0,0,-6.25,10.0,0.029793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,8,100.00,100.00,0.0,0.0,-1,0.00,0.0,0.022899
4127,59.0,2.0,30.5,1.0,1.0,9,5,43.75,68.75,70.0,70.0,-4,25.00,0.0,0.326603
4128,70.0,1.0,33.9,1.0,3.0,9,4,100.00,100.00,0.0,0.0,-5,0.00,0.0,0.002114
4129,52.0,1.0,30.7,1.0,1.0,9,7,31.25,37.50,50.0,70.0,-2,6.25,20.0,0.403195


In [None]:
weighted_mean_age = np.average(df['Age'], weights=df['weights'])
print(f"Weighted Mean Age: {weighted_mean_age}")

Weighted Mean Age: 56.6934865048841


In [None]:
# Calculate the weighted mean of the outcome variable 'Physical_Activity' in the control group
weighted_Age_control = np.average(df['Age'], weights=df['weights'])

# Calculate the mean of the outcome variable 'Physical_Activity' in the treatment group
mean_Age_treatment = dfg['Age'].mean()

print(f"Weighted Mean Physical Activity (Control Group): {weighted_Age_control}")
print(f"Mean Physical Activity (Treatment Group): {mean_Age_treatment}")

Weighted Mean Physical Activity (Control Group): 56.6934865048841
Mean Physical Activity (Treatment Group): 56.69222675259766


In [None]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change,weights
0,78.0,1.0,27.6,1.0,3.0,0,0,50.00,68.75,80.0,50.0,0,18.75,-30.0,0.025113
1,76.0,2.0,27.4,1.0,3.0,0,2,100.00,62.50,0.0,40.0,2,-37.50,40.0,0.009835
2,69.0,1.0,29.8,1.0,3.0,0,3,81.25,100.00,10.0,0.0,3,18.75,-10.0,0.012095
3,68.0,2.0,30.1,2.0,3.0,0,0,75.00,81.25,30.0,30.0,0,6.25,0.0,0.006005
4,76.0,1.0,32.4,1.0,2.0,0,0,75.00,68.75,30.0,40.0,0,-6.25,10.0,0.029793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,8,100.00,100.00,0.0,0.0,-1,0.00,0.0,0.022899
4127,59.0,2.0,30.5,1.0,1.0,9,5,43.75,68.75,70.0,70.0,-4,25.00,0.0,0.326603
4128,70.0,1.0,33.9,1.0,3.0,9,4,100.00,100.00,0.0,0.0,-5,0.00,0.0,0.002114
4129,52.0,1.0,30.7,1.0,1.0,9,7,31.25,37.50,50.0,70.0,-2,6.25,20.0,0.403195


In [None]:
def calculate_weighted_mean(df, variable, weights_column='weights'):
    weighted_mean = np.average(df[variable], weights=df[weights_column])
    return weighted_mean

# Control group (Real one)
mean_age_control = df['Age'].mean()
mean_gender_control = df['Gender'].mean()
mean_bmi_control = df['BMI'].mean()
mean_depression_control = df['Depression'].mean()
mean_employment_status_control = df['Employment_status'].mean()
mean_quality_of_life_control = df['Quality_of_life'].mean()
mean_pain_control = df['Pain'].mean()
mean_PA_control = df['Physical_Activity'].mean()

# Control group (with weights)
weighted_mean_age_control = calculate_weighted_mean(df, 'Age')
weighted_mean_gender_control = calculate_weighted_mean(df, 'Gender')
weighted_mean_bmi_control = calculate_weighted_mean(df, 'BMI')
weighted_mean_depression_control = calculate_weighted_mean(df, 'Depression')
weighted_mean_employment_status_control = calculate_weighted_mean(df, 'Employment_status')
weighted_mean_quality_of_life_control = calculate_weighted_mean(df, 'Quality_of_life')
weighted_mean_pain_control = calculate_weighted_mean(df, 'Pain')
weighted_mean_PA_control = calculate_weighted_mean(df, 'Physical_Activity')

# Treatment group (unweighted)
mean_age_treatment = dfg['Age'].mean()
mean_gender_treatment = dfg['Gender'].mean()
mean_bmi_treatment = dfg['BMI'].mean()
mean_Depression_treatment = dfg['Depression'].mean()
mean_employment_status_treatment = dfg['Employment_status'].mean()
mean_QOL_treatment = dfg['Quality_of_life'].mean()
mean_pain_treatment = dfg['Pain'].mean()
mean_PA_treatment = dfg['Physical_Activity'].mean()

# Result
print("Mean in control group:")
print(f"Mean Age (Control Group): {mean_age_control}")
print(f"Mean Gender (Control Group): {mean_gender_treatment}")
print(f"Mean BMI (Control Group): {mean_bmi_control}")
print(f"Mean Depression (Control Group): {mean_depression_control}")
print(f"Mean Employment Status (Control Group): {mean_employment_status_control}")
print(f"Mean Physical Activity (Control Group): {mean_PA_control}")
print(f"Mean QOL (Control Group): {mean_quality_of_life_control}")
print(f"Mean Pain (Control Group): {mean_pain_control}")

print("--------------------------------------------------")
print("Mean in treatnement group:")
print(f"Mean Age (Treatment Group): {mean_age_treatment}")
print(f"Mean Gender (Treatment Group): {mean_gender_control}")
print(f"Mean BMI (Treatment Group): {mean_bmi_treatment}")
print(f"Mean Depression (Treatment Group): {mean_Depression_treatment}")
print(f"Mean Employment Status (Treatment Group): {mean_employment_status_treatment}")
print(f"Mean Physical Activity (Treatment Group): {mean_PA_treatment}")
print(f"Mean QOL (Treatment Group): {mean_QOL_treatment}")
print(f"Mean Pain (Treatment Group): {mean_pain_treatment}")

print("--------------------------------------------------")
print("Weighted mean in control group:")
print(f"Weighted Mean Age (Control Group): {weighted_mean_age_control}")
print(f"Weighted Mean Gender (Control Group): {weighted_mean_gender_control}")
print(f"Weighted Mean BMI (Control Group): {weighted_mean_bmi_control}")
print(f"Weighted Mean Depression (Control Group): {weighted_mean_depression_control}")
print(f"Weighted Mean Employment Status (Control Group): {weighted_mean_employment_status_control}")
print(f"Weighted Mean PA (Control Group): {weighted_mean_PA_control}")
print(f"Weighted Mean QOL (Control Group): {weighted_mean_quality_of_life_control}")
print(f"Weighted Mean Pain (Control Group): {weighted_mean_pain_control}")


In [None]:
df.Gender.describe()

In [None]:
df.Gender.value_counts()

##### New DataFrame with rows replicated according to weights

In [None]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change,weights
0,78.0,1.0,27.6,1.0,3.0,0,0,50.00,68.75,80.0,50.0,0,18.75,-30.0,0.025113
1,76.0,2.0,27.4,1.0,3.0,0,2,100.00,62.50,0.0,40.0,2,-37.50,40.0,0.009835
2,69.0,1.0,29.8,1.0,3.0,0,3,81.25,100.00,10.0,0.0,3,18.75,-10.0,0.012095
3,68.0,2.0,30.1,2.0,3.0,0,0,75.00,81.25,30.0,30.0,0,6.25,0.0,0.006005
4,76.0,1.0,32.4,1.0,2.0,0,0,75.00,68.75,30.0,40.0,0,-6.25,10.0,0.029793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,8,100.00,100.00,0.0,0.0,-1,0.00,0.0,0.022899
4127,59.0,2.0,30.5,1.0,1.0,9,5,43.75,68.75,70.0,70.0,-4,25.00,0.0,0.326603
4128,70.0,1.0,33.9,1.0,3.0,9,4,100.00,100.00,0.0,0.0,-5,0.00,0.0,0.002114
4129,52.0,1.0,30.7,1.0,1.0,9,7,31.25,37.50,50.0,70.0,-2,6.25,20.0,0.403195


In [None]:
# Function to create a new DataFrame with rows replicated according to weights
def create_weighted_df(df, weight_column='weights'):
    # Normalize weights to integer values
    max_weight = df[weight_column].max()
    normalized_weights = (df[weight_column] / max_weight * 100).round().astype(int)
    
    # Create a list to store repeated rows
    weighted_rows = []
    
    # Repeat each row according to its normalized weight
    for i, weight in enumerate(normalized_weights):
        repeated_rows = [df.iloc[i]] * weight
        weighted_rows.extend(repeated_rows)
    
    # Convert the list of rows into a new DataFrame
    weighted_df = pd.DataFrame(weighted_rows).reset_index(drop=True)
    return weighted_df

# Create the new weighted DataFrame
weighted_control_df = create_weighted_df(df)

In [None]:
weighted_control_df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change,weights
0,76.0,1.0,32.4,1.0,2.0,0.0,0.0,75.00,68.75,30.0,40.0,0.0,-6.25,10.0,0.029793
1,70.0,1.0,28.9,1.0,1.0,0.0,0.0,81.25,100.00,30.0,50.0,0.0,18.75,20.0,0.070179
2,77.0,1.0,33.6,1.0,3.0,0.0,3.0,37.50,50.00,50.0,30.0,3.0,12.50,-20.0,0.055870
3,62.0,1.0,33.9,1.0,1.0,0.0,2.0,62.50,100.00,80.0,10.0,2.0,37.50,-70.0,0.128902
4,62.0,1.0,33.9,1.0,1.0,0.0,2.0,62.50,100.00,80.0,10.0,2.0,37.50,-70.0,0.128902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11321,52.0,1.0,30.7,1.0,1.0,9.0,7.0,31.25,37.50,50.0,70.0,-2.0,6.25,20.0,0.403195
11322,52.0,1.0,30.7,1.0,1.0,9.0,7.0,31.25,37.50,50.0,70.0,-2.0,6.25,20.0,0.403195
11323,52.0,1.0,30.7,1.0,1.0,9.0,7.0,31.25,37.50,50.0,70.0,-2.0,6.25,20.0,0.403195
11324,52.0,1.0,30.7,1.0,1.0,9.0,7.0,31.25,37.50,50.0,70.0,-2.0,6.25,20.0,0.403195


In [None]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change,weights
0,78.0,1.0,27.6,1.0,3.0,0,0,50.00,68.75,80.0,50.0,0,18.75,-30.0,0.025113
1,76.0,2.0,27.4,1.0,3.0,0,2,100.00,62.50,0.0,40.0,2,-37.50,40.0,0.009835
2,69.0,1.0,29.8,1.0,3.0,0,3,81.25,100.00,10.0,0.0,3,18.75,-10.0,0.012095
3,68.0,2.0,30.1,2.0,3.0,0,0,75.00,81.25,30.0,30.0,0,6.25,0.0,0.006005
4,76.0,1.0,32.4,1.0,2.0,0,0,75.00,68.75,30.0,40.0,0,-6.25,10.0,0.029793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,8,100.00,100.00,0.0,0.0,-1,0.00,0.0,0.022899
4127,59.0,2.0,30.5,1.0,1.0,9,5,43.75,68.75,70.0,70.0,-4,25.00,0.0,0.326603
4128,70.0,1.0,33.9,1.0,3.0,9,4,100.00,100.00,0.0,0.0,-5,0.00,0.0,0.002114
4129,52.0,1.0,30.7,1.0,1.0,9,7,31.25,37.50,50.0,70.0,-2,6.25,20.0,0.403195


In [None]:
# Assuming df is the control group and dfg is the treatment group, and weights are applied to df

# Calculate the weighted mean of the outcome variable 'Physical_Activity' in the control group
weighted_mean_physical_activity_control = np.average(df['PA_change'], weights=df['weights'])

# Calculate the mean of the outcome variable 'Physical_Activity' in the treatment group
mean_physical_activity_treatment = dfg['PA_change'].mean()

# Display the results
print(f"Weighted Mean Physical Activity (Control Group): {weighted_mean_physical_activity_control}")
print(f"Mean Physical Activity (Treatment Group): {mean_physical_activity_treatment}")

# Perform a statistical test to compare the weighted control group and treatment group
# Expand the control group's physical activity data by weights to simulate a larger dataset
expanded_physical_activity_control = np.repeat(df['PA_change'], np.round(df['weights'] * len(df)).astype(int))

# Perform the t-test
t_stat, p_value = ttest_ind(expanded_physical_activity_control, dfg['PA_change'])

print(f"T-test: t-statistic = {t_stat}, p-value = {p_value}")


Weighted Mean Physical Activity (Control Group): 0.12713501512536113
Mean Physical Activity (Treatment Group): 0.34236485597790345
T-test: t-statistic = -9.752145364722248, p-value = 1.8077731134708035e-22


##### Balanced dataframe:

In [None]:
# Function to create a new DataFrame with rows replicated according to weights
def create_weighted_df(df, weight_column='weights'):
    # Normalize weights to integer values
    max_weight = df[weight_column].max()
    normalized_weights = (df[weight_column] / max_weight * 100).round().astype(int)
    
    # Create a list to store repeated rows
    weighted_rows = []
    
    # Repeat each row according to its normalized weight
    for i, weight in enumerate(normalized_weights):
        repeated_rows = [df.iloc[i]] * weight
        weighted_rows.extend(repeated_rows)
    
    # Convert the list of rows into a new DataFrame
    weighted_df = pd.DataFrame(weighted_rows).reset_index(drop=True)
    return weighted_df

# Create the new weighted DataFrame
weighted_control_df = create_weighted_df(df)


In [None]:
weighted_control_df.to_csv('weighted_control.csv', index=False)

In [None]:
df.isna().sum().sum()

0

In [None]:
weighted_control_df.isna().sum().sum()

0

In [None]:
weighted_df = df.copy()

# Apply weights only to baseline covariates
for column in features:
    weighted_df[column] = weighted_df[column] * weighted_df['weights']

# Keep follow-up and outcome variables as they are
follow_up_columns = ['Followup_Physical_Activity', 'Followup_Quality_of_life', 'Followup_Pain', 'PA_change', 'QOL_change', 'pain_change']
for column in follow_up_columns:
    weighted_df[column] = df[column]

# Display the new DataFrame with weighted baseline covariates


In [None]:
weighted_df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change,weights
0,1.958800,0.025113,0.693114,0.025113,0.075338,0.000000,0,1.255641,68.75,2.009025,50.0,0,18.75,-30.0,0.025113
1,0.747436,0.019669,0.269470,0.009835,0.029504,0.000000,2,0.983468,62.50,0.000000,40.0,2,-37.50,40.0,0.009835
2,0.834537,0.012095,0.360423,0.012095,0.036284,0.000000,3,0.982698,100.00,0.120947,0.0,3,18.75,-10.0,0.012095
3,0.408335,0.012010,0.180748,0.012010,0.018015,0.000000,0,0.450369,81.25,0.180148,30.0,0,6.25,0.0,0.006005
4,2.264266,0.029793,0.965292,0.029793,0.059586,0.000000,0,2.234473,68.75,0.893789,40.0,0,-6.25,10.0,0.029793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,1.282321,0.022899,0.551856,0.022899,0.022899,0.206087,8,2.289859,100.00,0.000000,0.0,-1,0.00,0.0,0.022899
4127,19.269592,0.653206,9.961399,0.326603,0.326603,2.939429,5,14.288892,68.75,22.862227,70.0,-4,25.00,0.0,0.326603
4128,0.147958,0.002114,0.071654,0.002114,0.006341,0.019023,4,0.211369,100.00,0.000000,0.0,-5,0.00,0.0,0.002114
4129,20.966166,0.403195,12.378102,0.403195,0.403195,3.628759,7,12.599859,37.50,20.159775,70.0,-2,6.25,20.0,0.403195


In [None]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change,weights
0,78.0,1.0,27.6,1.0,3.0,0,0,50.00,68.75,80.0,50.0,0,18.75,-30.0,0.025113
1,76.0,2.0,27.4,1.0,3.0,0,2,100.00,62.50,0.0,40.0,2,-37.50,40.0,0.009835
2,69.0,1.0,29.8,1.0,3.0,0,3,81.25,100.00,10.0,0.0,3,18.75,-10.0,0.012095
3,68.0,2.0,30.1,2.0,3.0,0,0,75.00,81.25,30.0,30.0,0,6.25,0.0,0.006005
4,76.0,1.0,32.4,1.0,2.0,0,0,75.00,68.75,30.0,40.0,0,-6.25,10.0,0.029793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,8,100.00,100.00,0.0,0.0,-1,0.00,0.0,0.022899
4127,59.0,2.0,30.5,1.0,1.0,9,5,43.75,68.75,70.0,70.0,-4,25.00,0.0,0.326603
4128,70.0,1.0,33.9,1.0,3.0,9,4,100.00,100.00,0.0,0.0,-5,0.00,0.0,0.002114
4129,52.0,1.0,30.7,1.0,1.0,9,7,31.25,37.50,50.0,70.0,-2,6.25,20.0,0.403195


In [None]:
weighted_mean_age = np.average(df['Age'], weights=df['weights'])
print(f"Weighted Mean Age: {weighted_mean_age}")

Weighted Mean Age: 56.6934865048841


In [None]:
weighted_mean_age = np.average(weighted_df['Age'], weights=weighted_df['weights'])
print(f"Weighted Mean Age: {weighted_mean_age}")

Weighted Mean Age: 41.88798083467781


In [None]:
weighted_mean_age = np.average(weighted_control_df['Age'], weights=weighted_control_df['weights'])
print(f"Weighted Mean Age: {weighted_mean_age}")

Weighted Mean Age: 54.79791603308498


In [101]:
balance = pd.read_csv("1.Balanced_control_data.csv")
weighted_control = pd.read_csv("1.Weighted_control_data.csv")
control = pd.read_csv("control.csv")
treatment = pd.read_csv("treatment.csv")

In [102]:
weighted_control

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Quality_of_life,Baseline_Pain,Baseline_Physical_Activity,PA_change
0,3.481549e-15,4.463525e-17,1.231933e-15,4.463525e-17,1.339057e-16,2.231762e-15,3.570820e-15,0,0
1,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,2
2,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,3
3,7.422446e+00,2.183072e-01,3.285524e+00,2.183072e-01,3.274608e-01,8.186521e+00,3.274608e+00,0,0
4,4.789204e+00,6.301585e-02,2.041713e+00,6.301585e-02,1.260317e-01,4.726189e+00,1.890475e+00,0,0
...,...,...,...,...,...,...,...,...,...
4126,4.833758e-15,8.631710e-17,2.080242e-15,8.631710e-17,8.631710e-17,8.631710e-15,0.000000e+00,9,-1
4127,1.361072e+02,4.613804e+00,7.036051e+01,2.306902e+00,2.306902e+00,1.009270e+02,1.614831e+02,9,-4
4128,1.254773e-17,1.792533e-19,6.076688e-18,1.792533e-19,5.377600e-19,1.792533e-17,0.000000e+00,9,-5
4129,2.065817e+02,3.972725e+00,1.219627e+02,3.972725e+00,3.972725e+00,1.241477e+02,1.986362e+02,9,-2


In [103]:
balance

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,PA_change,Treatment,Followup_Physical_Activity,Followup_Quality_of_life,Followup_Pain,QOL_change,pain_change
0,70.0,1.0,28.9,1.0,1.0,0,81.25,30.0,0,0,,,,,
1,77.0,1.0,33.6,1.0,3.0,0,37.50,50.0,3,0,,,,,
2,49.0,1.0,23.5,1.0,1.0,0,81.25,20.0,6,0,,,,,
3,67.0,2.0,34.7,1.0,3.0,0,62.50,0.0,3,0,,,,,
4,56.0,2.0,23.9,1.0,1.0,0,100.00,0.0,5,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,59.0,2.0,30.5,1.0,1.0,9,43.75,70.0,-4,0,,,,,
4127,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0,,,,,
4128,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0,,,,,
4129,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2,0,,,,,


In [104]:
control

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,PA_change
0,78.0,1.0,27.6,1.0,3.0,0,50.00,80.0,0
1,76.0,2.0,27.4,1.0,3.0,0,100.00,0.0,2
2,69.0,1.0,29.8,1.0,3.0,0,81.25,10.0,3
3,68.0,2.0,30.1,2.0,3.0,0,75.00,30.0,0
4,76.0,1.0,32.4,1.0,2.0,0,75.00,30.0,0
...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,100.00,0.0,-1
4127,59.0,2.0,30.5,1.0,1.0,9,43.75,70.0,-4
4128,70.0,1.0,33.9,1.0,3.0,9,100.00,0.0,-5
4129,52.0,1.0,30.7,1.0,1.0,9,31.25,50.0,-2


In [105]:
treatment

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Quality_of_life,Baseline_Pain,PA_change
0,50,2,22.68,1.0,1.0,9,31.25,65.0,0
1,45,1,29.07,1.0,1.0,6,25.00,96.0,3
2,46,2,26.10,1.0,1.0,2,68.75,39.0,6
3,53,2,28.03,1.0,1.0,3,62.50,15.0,0
4,68,2,27.82,1.0,1.0,7,87.50,15.0,2
...,...,...,...,...,...,...,...,...,...
7598,55,2,35.63,1.0,1.0,7,50.00,57.0,2
7599,61,2,29.34,1.0,1.0,2,25.00,55.0,3
7600,62,1,25.88,1.0,1.0,2,50.00,26.0,0
7601,47,2,26.51,1.0,1.0,3,43.75,47.0,-2


In [106]:
weighted_mean_age = np.average(weighted_control['Age'], weights=weighted_control['weights'])
print(f"Weighted Mean Age: {weighted_mean_age}")

KeyError: 'weights'