In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize

In [14]:
# OAI dataset
df = pd.read_csv("preprocessed_OAI.csv")
# GLAD dataset
dfg = pd.read_csv("preprocessed_Glad.csv")
df.shape
print("OAI shape:", df.shape, "GLAD shape:", dfg.shape)

OAI shape: (4131, 11) GLAD shape: (7603, 11)


In [15]:
dfg

Unnamed: 0,age,gender,fysb_BMI,ptb_14618_depression,ptb_3777_employment,ptb_4145,pt12_4145,ptb_koos_qol_score,pt12_koos_qol_score,ptb_3764,pt12_3764
0,50,2,22.68,1.0,1.0,10.0,10.0,31.25,56.25,65.0,17.0
1,45,1,29.07,1.0,1.0,7.0,10.0,25.00,43.75,96.0,52.0
2,46,2,26.10,1.0,1.0,3.0,9.0,68.75,62.50,39.0,25.0
3,53,2,28.03,1.0,1.0,4.0,4.0,62.50,87.50,15.0,2.0
4,68,2,27.82,1.0,1.0,8.0,10.0,87.50,93.75,15.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
7598,55,2,35.63,1.0,1.0,8.0,10.0,50.00,56.25,57.0,9.0
7599,61,2,29.34,1.0,1.0,3.0,6.0,25.00,68.75,55.0,8.0
7600,62,1,25.88,1.0,1.0,3.0,3.0,50.00,25.00,26.0,77.0
7601,47,2,26.51,1.0,1.0,4.0,2.0,43.75,18.75,47.0,92.0


In [16]:
df

Unnamed: 0,V00AGE,P02SEX,P01BMI,V00CESD6,V00CEMPLOY_employment,PA_baseline_category,PA_followup_category,V00KOOSQOL,V01KOOSQOL,knee_pain_baseline,knee_pain_follow
0,78.0,1.0,27.6,1.0,3.0,1,1,50.00,68.75,80.0,50.0
1,76.0,2.0,27.4,1.0,3.0,1,3,100.00,62.50,0.0,40.0
2,69.0,1.0,29.8,1.0,3.0,1,4,81.25,100.00,10.0,0.0
3,68.0,2.0,30.1,2.0,3.0,1,1,75.00,81.25,30.0,30.0
4,76.0,1.0,32.4,1.0,2.0,1,1,75.00,68.75,30.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,10,9,100.00,100.00,0.0,0.0
4127,59.0,2.0,30.5,1.0,1.0,10,6,43.75,68.75,70.0,70.0
4128,70.0,1.0,33.9,1.0,3.0,10,5,100.00,100.00,0.0,0.0
4129,52.0,1.0,30.7,1.0,1.0,10,8,31.25,37.50,50.0,70.0


In [17]:
# Renaming columns in df for consistency
df.rename(columns={
    'V00AGE': 'Age',
    'P02SEX': 'Gender',
    'P01BMI': 'BMI',
    'V00CESD6': 'Depression',
    'V00CEMPLOY_employment': 'Employment_status',
    'V00KOOSQOL': 'Baseline_Quality_of_life',
    'V01KOOSQOL': 'Followup_Quality_of_life',
    'knee_pain_baseline': 'Baseline_Pain',
    'knee_pain_follow': 'Followup_Pain',
    'PA_baseline_category': 'Baseline_Physical_Activity',
    'PA_followup_category': 'Followup_Physical_Activity'
}, inplace=True)

dfg.rename(columns={
    'age': 'Age',
    'gender': 'Gender',
    'fysb_BMI': 'BMI',
    'ptb_14618_depression': 'Depression',
    'ptb_3777_employment': 'Employment_status',
    'ptb_koos_qol_score': 'Baseline_Quality_of_life',
    'pt12_koos_qol_score': 'Followup_Quality_of_life',
    'ptb_3764': 'Baseline_Pain',
    'pt12_3764': 'Followup_Pain',
    'ptb_4145': 'Baseline_Physical_Activity',
    'pt12_4145': 'Followup_Physical_Activity'
}, inplace=True)


In [18]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain
0,78.0,1.0,27.6,1.0,3.0,1,1,50.00,68.75,80.0,50.0
1,76.0,2.0,27.4,1.0,3.0,1,3,100.00,62.50,0.0,40.0
2,69.0,1.0,29.8,1.0,3.0,1,4,81.25,100.00,10.0,0.0
3,68.0,2.0,30.1,2.0,3.0,1,1,75.00,81.25,30.0,30.0
4,76.0,1.0,32.4,1.0,2.0,1,1,75.00,68.75,30.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,10,9,100.00,100.00,0.0,0.0
4127,59.0,2.0,30.5,1.0,1.0,10,6,43.75,68.75,70.0,70.0
4128,70.0,1.0,33.9,1.0,3.0,10,5,100.00,100.00,0.0,0.0
4129,52.0,1.0,30.7,1.0,1.0,10,8,31.25,37.50,50.0,70.0


In [19]:
dfg

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain
0,50,2,22.68,1.0,1.0,10.0,10.0,31.25,56.25,65.0,17.0
1,45,1,29.07,1.0,1.0,7.0,10.0,25.00,43.75,96.0,52.0
2,46,2,26.10,1.0,1.0,3.0,9.0,68.75,62.50,39.0,25.0
3,53,2,28.03,1.0,1.0,4.0,4.0,62.50,87.50,15.0,2.0
4,68,2,27.82,1.0,1.0,8.0,10.0,87.50,93.75,15.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
7598,55,2,35.63,1.0,1.0,8.0,10.0,50.00,56.25,57.0,9.0
7599,61,2,29.34,1.0,1.0,3.0,6.0,25.00,68.75,55.0,8.0
7600,62,1,25.88,1.0,1.0,3.0,3.0,50.00,25.00,26.0,77.0
7601,47,2,26.51,1.0,1.0,4.0,2.0,43.75,18.75,47.0,92.0


##### Outcomes:

In [20]:
# OAI dataset
##### Physical activity
# Ensure the columns are of type 'category'
df['Followup_Physical_Activity'] = df['Followup_Physical_Activity'].astype('category')
df['Baseline_Physical_Activity'] = df['Baseline_Physical_Activity'].astype('category')

# Convert categories to codes if they are ordinal
df['Followup_Physical_Activity'] = df['Followup_Physical_Activity'].cat.codes
df['Baseline_Physical_Activity'] = df['Baseline_Physical_Activity'].cat.codes

df['PA_change'] = df['Followup_Physical_Activity'] - df['Baseline_Physical_Activity']

##### Quality of life
df['QOL_change'] = df['Followup_Quality_of_life'] - df['Baseline_Quality_of_life']

##### Pain
df['pain_change'] = df['Followup_Pain'] - df['Baseline_Pain']

In [21]:
# GLAD Dataset
##### Physical activity
dfg['Baseline_Physical_Activity'] = dfg['Baseline_Physical_Activity'].astype('category')
dfg['Baseline_Physical_Activity'] = dfg['Baseline_Physical_Activity'].cat.codes

dfg['Followup_Physical_Activity'] = dfg['Followup_Physical_Activity'].astype('category')
dfg['Followup_Physical_Activity'] = dfg['Followup_Physical_Activity'].cat.codes

dfg['PA_change'] = dfg['Followup_Physical_Activity'] - dfg['Baseline_Physical_Activity']

##### Quality of life
dfg['QOL_change'] = dfg['Followup_Quality_of_life'] - dfg['Baseline_Quality_of_life']

##### Pain
dfg['pain_change'] = dfg['Followup_Pain'] - dfg['Baseline_Pain']

In [22]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change
0,78.0,1.0,27.6,1.0,3.0,0,0,50.00,68.75,80.0,50.0,0,18.75,-30.0
1,76.0,2.0,27.4,1.0,3.0,0,2,100.00,62.50,0.0,40.0,2,-37.50,40.0
2,69.0,1.0,29.8,1.0,3.0,0,3,81.25,100.00,10.0,0.0,3,18.75,-10.0
3,68.0,2.0,30.1,2.0,3.0,0,0,75.00,81.25,30.0,30.0,0,6.25,0.0
4,76.0,1.0,32.4,1.0,2.0,0,0,75.00,68.75,30.0,40.0,0,-6.25,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,8,100.00,100.00,0.0,0.0,-1,0.00,0.0
4127,59.0,2.0,30.5,1.0,1.0,9,5,43.75,68.75,70.0,70.0,-4,25.00,0.0
4128,70.0,1.0,33.9,1.0,3.0,9,4,100.00,100.00,0.0,0.0,-5,0.00,0.0
4129,52.0,1.0,30.7,1.0,1.0,9,7,31.25,37.50,50.0,70.0,-2,6.25,20.0


In [23]:
dfg

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change
0,50,2,22.68,1.0,1.0,9,9,31.25,56.25,65.0,17.0,0,25.00,-48.0
1,45,1,29.07,1.0,1.0,6,9,25.00,43.75,96.0,52.0,3,18.75,-44.0
2,46,2,26.10,1.0,1.0,2,8,68.75,62.50,39.0,25.0,6,-6.25,-14.0
3,53,2,28.03,1.0,1.0,3,3,62.50,87.50,15.0,2.0,0,25.00,-13.0
4,68,2,27.82,1.0,1.0,7,9,87.50,93.75,15.0,1.0,2,6.25,-14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7598,55,2,35.63,1.0,1.0,7,9,50.00,56.25,57.0,9.0,2,6.25,-48.0
7599,61,2,29.34,1.0,1.0,2,5,25.00,68.75,55.0,8.0,3,43.75,-47.0
7600,62,1,25.88,1.0,1.0,2,2,50.00,25.00,26.0,77.0,0,-25.00,51.0
7601,47,2,26.51,1.0,1.0,3,1,43.75,18.75,47.0,92.0,-2,-25.00,45.0


##### Run Entropy Balancing and calculater weights

In [24]:
# Extract features and treatment indicator
features = ['Age', 'Gender', 'BMI', 'Depression', 'Employment_status', 'Baseline_Physical_Activity', 'Baseline_Quality_of_life', 'Baseline_Pain']
X_control = df[features].values
X_treatment = dfg[features].values

# Balance conditions: means of control and treatment groups should match
mean_treatment = X_treatment.mean(axis=0)

# Entropy balancing objective and constraints
def entropy_balancing(weights, X_control, mean_treatment):
    weighted_mean_control = np.average(X_control, axis=0, weights=weights)
    constraint = weighted_mean_control - mean_treatment
    return np.sum(weights * np.log(weights)), constraint

# Initial weights
initial_weights = np.ones(len(X_control)) / len(X_control)

# Constraint function
def constraints(weights):
    return entropy_balancing(weights, X_control, mean_treatment)[1]

# Optimization
result = minimize(
    lambda w: entropy_balancing(w, X_control, mean_treatment)[0],
    initial_weights,
    constraints={'type': 'eq', 'fun': constraints},
    bounds=[(0, None)] * len(X_control)
)

optimal_weights = result.x

# Apply weights to control group
df['weights'] = optimal_weights

# Display weights
print(df[['weights']])

  return np.sum(weights * np.log(weights)), constraint
  return np.sum(weights * np.log(weights)), constraint


       weights
0     0.025113
1     0.009835
2     0.012095
3     0.006005
4     0.029793
...        ...
4126  0.022899
4127  0.326603
4128  0.002114
4129  0.403195
4130  0.067924

[4131 rows x 1 columns]


In [25]:
df

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Followup_Physical_Activity,Baseline_Quality_of_life,Followup_Quality_of_life,Baseline_Pain,Followup_Pain,PA_change,QOL_change,pain_change,weights
0,78.0,1.0,27.6,1.0,3.0,0,0,50.00,68.75,80.0,50.0,0,18.75,-30.0,0.025113
1,76.0,2.0,27.4,1.0,3.0,0,2,100.00,62.50,0.0,40.0,2,-37.50,40.0,0.009835
2,69.0,1.0,29.8,1.0,3.0,0,3,81.25,100.00,10.0,0.0,3,18.75,-10.0,0.012095
3,68.0,2.0,30.1,2.0,3.0,0,0,75.00,81.25,30.0,30.0,0,6.25,0.0,0.006005
4,76.0,1.0,32.4,1.0,2.0,0,0,75.00,68.75,30.0,40.0,0,-6.25,10.0,0.029793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,56.0,1.0,24.1,1.0,1.0,9,8,100.00,100.00,0.0,0.0,-1,0.00,0.0,0.022899
4127,59.0,2.0,30.5,1.0,1.0,9,5,43.75,68.75,70.0,70.0,-4,25.00,0.0,0.326603
4128,70.0,1.0,33.9,1.0,3.0,9,4,100.00,100.00,0.0,0.0,-5,0.00,0.0,0.002114
4129,52.0,1.0,30.7,1.0,1.0,9,7,31.25,37.50,50.0,70.0,-2,6.25,20.0,0.403195


In [26]:
# Calculate the weighted mean of the outcome variable 'Physical_Activity' in the control group
weighted_physical_activity_control = np.average(df['Physical_Activity'], weights=df['weights'])

# Calculate the mean of the outcome variable 'Physical_Activity' in the treatment group
mean_physical_activity_treatment = dfg['Physical_Activity'].mean()

# Display the results
print(f"Weighted Mean Physical Activity (Control Group): {weighted_physical_activity_control}")
print(f"Mean Physical Activity (Treatment Group): {mean_physical_activity_treatment}")
# Perform a statistical test to compare the weighted control group and treatment group
from scipy.stats import ttest_ind

# Expand the control group's physical activity data by weights to simulate a larger dataset
expanded_physical_activity_control = np.repeat(df['Physical_Activity'], np.round(df['weights'] * len(df)).astype(int))

# Perform the t-test
t_stat, p_value = ttest_ind(expanded_physical_activity_control, dfg['Physical_Activity'])

print(f"T-test: t-statistic = {t_stat}, p-value = {p_value}")


KeyError: 'Physical_Activity'

In [None]:
# Adding the weights to the control group dataframe
df['weights'] = optimal_weights

# Display the control group with weights
df

In [None]:
weighted_mean_age = np.average(df['Age'], weights=df['weights'])
print(f"Weighted Mean Age: {weighted_mean_age}")

In [None]:
# Calculate the weighted mean of the outcome variable 'Physical_Activity' in the control group
weighted_Age_control = np.average(df['Age'], weights=df['weights'])

# Calculate the mean of the outcome variable 'Physical_Activity' in the treatment group
mean_Age_treatment = dfg['Age'].mean()

print(f"Weighted Mean Physical Activity (Control Group): {weighted_Age_control}")
print(f"Mean Physical Activity (Treatment Group): {mean_Age_treatment}")

In [None]:
df

In [None]:
df

In [None]:
def calculate_weighted_mean(df, variable, weights_column='weights'):
    weighted_mean = np.average(df[variable], weights=df[weights_column])
    return weighted_mean

# Control group (Real one)
mean_age_control = df['Age'].mean()
mean_gender_control = df['Gender'].mean()
mean_bmi_control = df['BMI'].mean()
mean_depression_control = df['Depression'].mean()
mean_employment_status_control = df['Employment_status'].mean()
mean_quality_of_life_control = df['Quality_of_life'].mean()
mean_pain_control = df['Pain'].mean()
mean_PA_control = df['Physical_Activity'].mean()

# Control group (with weights)
weighted_mean_age_control = calculate_weighted_mean(df, 'Age')
weighted_mean_gender_control = calculate_weighted_mean(df, 'Gender')
weighted_mean_bmi_control = calculate_weighted_mean(df, 'BMI')
weighted_mean_depression_control = calculate_weighted_mean(df, 'Depression')
weighted_mean_employment_status_control = calculate_weighted_mean(df, 'Employment_status')
weighted_mean_quality_of_life_control = calculate_weighted_mean(df, 'Quality_of_life')
weighted_mean_pain_control = calculate_weighted_mean(df, 'Pain')
weighted_mean_PA_control = calculate_weighted_mean(df, 'Physical_Activity')

# Treatment group (unweighted)
mean_age_treatment = dfg['Age'].mean()
mean_gender_treatment = dfg['Gender'].mean()
mean_bmi_treatment = dfg['BMI'].mean()
mean_Depression_treatment = dfg['Depression'].mean()
mean_employment_status_treatment = dfg['Employment_status'].mean()
mean_QOL_treatment = dfg['Quality_of_life'].mean()
mean_pain_treatment = dfg['Pain'].mean()
mean_PA_treatment = dfg['Physical_Activity'].mean()

# Result
print("Mean in control group:")
print(f"Mean Age (Control Group): {mean_age_control}")
print(f"Mean Gender (Control Group): {mean_gender_treatment}")
print(f"Mean BMI (Control Group): {mean_bmi_control}")
print(f"Mean Depression (Control Group): {mean_depression_control}")
print(f"Mean Employment Status (Control Group): {mean_employment_status_control}")
print(f"Mean Physical Activity (Control Group): {mean_PA_control}")
print(f"Mean QOL (Control Group): {mean_quality_of_life_control}")
print(f"Mean Pain (Control Group): {mean_pain_control}")

print("--------------------------------------------------")
print("Mean in treatnement group:")
print(f"Mean Age (Treatment Group): {mean_age_treatment}")
print(f"Mean Gender (Treatment Group): {mean_gender_control}")
print(f"Mean BMI (Treatment Group): {mean_bmi_treatment}")
print(f"Mean Depression (Treatment Group): {mean_Depression_treatment}")
print(f"Mean Employment Status (Treatment Group): {mean_employment_status_treatment}")
print(f"Mean Physical Activity (Treatment Group): {mean_PA_treatment}")
print(f"Mean QOL (Treatment Group): {mean_QOL_treatment}")
print(f"Mean Pain (Treatment Group): {mean_pain_treatment}")

print("--------------------------------------------------")
print("Weighted mean in control group:")
print(f"Weighted Mean Age (Control Group): {weighted_mean_age_control}")
print(f"Weighted Mean Gender (Control Group): {weighted_mean_gender_control}")
print(f"Weighted Mean BMI (Control Group): {weighted_mean_bmi_control}")
print(f"Weighted Mean Depression (Control Group): {weighted_mean_depression_control}")
print(f"Weighted Mean Employment Status (Control Group): {weighted_mean_employment_status_control}")
print(f"Weighted Mean PA (Control Group): {weighted_mean_PA_control}")
print(f"Weighted Mean QOL (Control Group): {weighted_mean_quality_of_life_control}")
print(f"Weighted Mean Pain (Control Group): {weighted_mean_pain_control}")


In [None]:
df.Gender.describe()

In [None]:
df.Gender.value_counts()

In [None]:
# Function to create a new DataFrame with rows replicated according to weights
def create_weighted_df(df, weight_column='weights'):
    # Normalize weights to integer values
    max_weight = df[weight_column].max()
    normalized_weights = (df[weight_column] / max_weight * 100).round().astype(int)
    
    # Create a list to store repeated rows
    weighted_rows = []
    
    # Repeat each row according to its normalized weight
    for i, weight in enumerate(normalized_weights):
        repeated_rows = [df.iloc[i]] * weight
        weighted_rows.extend(repeated_rows)
    
    # Convert the list of rows into a new DataFrame
    weighted_df = pd.DataFrame(weighted_rows).reset_index(drop=True)
    return weighted_df

# Create the new weighted DataFrame
weighted_control_df = create_weighted_df(df)

In [None]:
dfg

In [None]:
weighted_control_df

In [None]:
import numpy as np
from scipy.stats import ttest_ind

# Assuming df is the control group and dfg is the treatment group, and weights are applied to df

# Calculate the weighted mean of the outcome variable 'Physical_Activity' in the control group
weighted_mean_physical_activity_control = np.average(df['Physical_Activity'], weights=df['weights'])

# Calculate the mean of the outcome variable 'Physical_Activity' in the treatment group
mean_physical_activity_treatment = dfg['Physical_Activity'].mean()

# Display the results
print(f"Weighted Mean Physical Activity (Control Group): {weighted_mean_physical_activity_control}")
print(f"Mean Physical Activity (Treatment Group): {mean_physical_activity_treatment}")

# Perform a statistical test to compare the weighted control group and treatment group
# Expand the control group's physical activity data by weights to simulate a larger dataset
expanded_physical_activity_control = np.repeat(df['Physical_Activity'], np.round(df['weights'] * len(df)).astype(int))

# Perform the t-test
t_stat, p_value = ttest_ind(expanded_physical_activity_control, dfg['Physical_Activity'])

print(f"T-test: t-statistic = {t_stat}, p-value = {p_value}")


##### Balanced dataframe:

In [None]:
# Function to create a new DataFrame with rows replicated according to weights
def create_weighted_df(df, weight_column='weights'):
    # Normalize weights to integer values
    max_weight = df[weight_column].max()
    normalized_weights = (df[weight_column] / max_weight * 100).round().astype(int)
    
    # Create a list to store repeated rows
    weighted_rows = []
    
    # Repeat each row according to its normalized weight
    for i, weight in enumerate(normalized_weights):
        repeated_rows = [df.iloc[i]] * weight
        weighted_rows.extend(repeated_rows)
    
    # Convert the list of rows into a new DataFrame
    weighted_df = pd.DataFrame(weighted_rows).reset_index(drop=True)
    return weighted_df

# Create the new weighted DataFrame
weighted_control_df = create_weighted_df(df)


In [None]:
weighted_control_df.to_csv('weighted_control.csv', index=False)

In [None]:
df