In [31]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest

import matplotlib.pyplot as plt
import seaborn as sns


#### Physical activity

In [32]:
# Read the three physical-activity cohorts. The balanced file carries a
# 'Treatment' column that the other two frames do not use here, so it is
# dropped right after loading.
dfPA_treatment = pd.read_csv("testdata/2.PA_treatment.csv")
dfPA_balanced = pd.read_csv("testdata/3.PA_balanced.csv")
dfPA_balanced = dfPA_balanced.drop(columns='Treatment')
dfPA_control = pd.read_csv("testdata/2.PA_control.csv")

# Report (rows, columns) for every cohort as a quick sanity check.
for label, frame in (
    ("Physical activity Treatment group:", dfPA_treatment),
    ("Physical activity Balanced group: ", dfPA_balanced),
    ("Physical activity Control group: ", dfPA_control),
):
    print(label, frame.shape)

Physical activity Treatment group: (7603, 9)
Physical activity Balanced group:  (1156, 9)
Physical activity Control group:  (1156, 9)


In [33]:
# Summary statistics of Age, printed cohort by cohort for side-by-side reading.
for heading, cohort in (("Balanced:", dfPA_balanced), ("Treatment:", dfPA_treatment)):
    print(heading)
    print(cohort["Age"].describe())

Balanced:
count    1156.000000
mean       56.586505
std         7.814203
min        45.000000
25%        51.000000
50%        55.000000
75%        61.000000
max        79.000000
Name: Age, dtype: float64
Treatment:
count    7603.000000
mean       56.692227
std         6.818423
min        23.000000
25%        53.000000
50%        57.000000
75%        61.000000
max        85.000000
Name: Age, dtype: float64


##### Create a categorical outcome column:

In [34]:
# Map a numeric physical-activity change score to a direction label.
def categorize_change(PA_change):
    """Label a physical-activity change score by its sign.

    Parameters
    ----------
    PA_change : scalar
        Change score for one row. Positive values are labelled as an
        increase, negative values as a decrease.

    Returns
    -------
    str or float
        'Increased' when the score is > 0, 'Decreased' when it is < 0,
        'Unchanged' when it is exactly 0, and NaN when the score is missing.
    """
    # A missing score fails both comparisons below and would otherwise be
    # reported as 'Unchanged'; propagate the missing value instead.
    if pd.isna(PA_change):
        return np.nan
    if PA_change > 0:
        return 'Increased'
    if PA_change < 0:
        return 'Decreased'
    return 'Unchanged'

# Derive the categorical outcome column on both cohorts that enter the
# chi-squared comparison (treatment and balanced).
for cohort in (dfPA_treatment, dfPA_balanced):
    cohort['PA_change_category'] = cohort['PA_change'].apply(categorize_change)

##### Calculate Chi2:

In [35]:
# Tag every row with its cohort so the groups stay distinguishable after stacking.
for cohort, label in ((dfPA_treatment, 'Treatment'), (dfPA_balanced, 'Balanced')):
    cohort['Group'] = label

# Stack both cohorts into one long frame.
df_combined = pd.concat([dfPA_treatment, dfPA_balanced])

# Observed counts: one row per outcome category, one column per cohort.
contingency_table = pd.crosstab(
    index=df_combined['PA_change_category'],
    columns=df_combined['Group'],
)

# Chi-squared test of independence on the observed counts.
chi2, p, dof, ex = chi2_contingency(contingency_table)

# Report the statistic, then the expected and observed tables.
print(f"Chi-squared Test for PA_change_category across Treatment and Balanced groups")
print(f"Chi2: {chi2}, p-value: {p}, Degrees of Freedom: {dof}")
print("\nExpected Frequencies:", ex, sep="\n")
print("\nObserved Frequencies:", contingency_table, sep="\n")


Chi-squared Test for PA_change_category across Treatment and Balanced groups
Chi2: 8.689487923389134, p-value: 0.012974829875972458, Degrees of Freedom: 2

Expected Frequencies:
[[ 327.57072725 2154.42927275]
 [ 471.42733189 3100.57266811]
 [ 357.00194086 2347.99805914]]

Observed Frequencies:
Group               Balanced  Treatment
PA_change_category                     
Decreased                369       2113
Increased                441       3131
Unchanged                346       2359


#### Quality of life

In [36]:
# Read the three quality-of-life cohorts in one pass:
# treatment, control and balanced, in that order.
dfQ_treatment, dfQ_control, dfQ_balanced = (
    pd.read_csv(csv_path)
    for csv_path in (
        "testdata/2.QOL_treatment.csv",
        "testdata/2.QOL_control.csv",
        "testdata/3.QOL_balanced.csv",
    )
)

In [37]:
# (csv path, label) pairs for the quality-of-life cohorts.
# NOTE(review): the same three files are already read into dfQ_treatment,
# dfQ_control and dfQ_balanced in the preceding cell; this cell reads them a
# second time and only reports their shapes.
files = [
    ("testdata/2.QOL_treatment.csv", "dfQ_treatment"),
    ("testdata/2.QOL_control.csv", "dfQ_control"),
    ("testdata/3.QOL_balanced.csv", "dfQ_balanced"),
]

# Label -> freshly loaded frame.
dataframes = {}

# Load each file, register it under its label, and print its (rows, columns).
for file, name in files:
    dataframes.update({name: pd.read_csv(file)})
    print(f"{name}: {dataframes[name].shape}")

dfQ_treatment: (7603, 9)
dfQ_control: (1156, 9)
dfQ_balanced: (1156, 10)


In [38]:
# Map a numeric quality-of-life change score to a "significant change" label.
def QOL_categorize_significant_change(QOL_change, threshold=10):
    """Label a quality-of-life change score relative to a point threshold.

    Parameters
    ----------
    QOL_change : scalar
        Change score for one row.
    threshold : number, default 10
        Minimum absolute change (inclusive) that counts as a significant
        increase or decrease. Expected to be positive. The default reproduces
        the original 10-point cut-off and labels.

    Returns
    -------
    str or float
        'Increased {threshold}+ points' when the score is >= threshold,
        'Decreased {threshold}+ points' when it is <= -threshold,
        'Maintained' otherwise, and NaN when the score is missing.
    """
    # A missing score fails both comparisons below and would otherwise be
    # reported as 'Maintained'; propagate the missing value instead.
    if pd.isna(QOL_change):
        return np.nan
    if QOL_change >= threshold:
        return f'Increased {threshold}+ points'
    if QOL_change <= -threshold:
        return f'Decreased {threshold}+ points'
    return 'Maintained'

# Derive the categorical outcome column on all three quality-of-life cohorts.
for cohort in (dfQ_treatment, dfQ_control, dfQ_balanced):
    cohort['QOL_change_category'] = cohort['QOL_change'].apply(QOL_categorize_significant_change)

In [39]:
# Preview the balanced quality-of-life cohort, which by this point also
# carries the derived QOL_change_category column.
dfQ_balanced

Unnamed: 0,Age,Gender,BMI,Depression,Employment_status,Baseline_Physical_Activity,Baseline_Pain,Baseline_Quality_of_life,QOL_change,Treatment,QOL_change_category
0,74.0,1.0,29.6,1.0,1.0,1.0,70.0,50.00,-6.25,0,Maintained
1,49.0,2.0,33.8,1.0,1.0,1.0,60.0,56.25,-18.75,0,Decreased 10+ points
2,49.0,2.0,33.8,1.0,1.0,1.0,60.0,56.25,-18.75,0,Decreased 10+ points
3,49.0,2.0,33.8,1.0,1.0,1.0,60.0,56.25,-18.75,0,Decreased 10+ points
4,74.0,1.0,27.1,1.0,3.0,1.0,60.0,12.50,18.75,0,Increased 10+ points
...,...,...,...,...,...,...,...,...,...,...,...
1151,56.0,2.0,27.5,1.0,1.0,10.0,40.0,43.75,12.50,0,Increased 10+ points
1152,56.0,2.0,27.5,1.0,1.0,10.0,40.0,43.75,12.50,0,Increased 10+ points
1153,56.0,2.0,27.5,1.0,1.0,10.0,40.0,43.75,12.50,0,Increased 10+ points
1154,53.0,2.0,30.1,1.0,1.0,10.0,50.0,50.00,6.25,0,Maintained


In [40]:
# Tag every row with its cohort so the groups stay distinguishable after stacking.
for cohort, label in ((dfQ_treatment, 'Treatment'), (dfQ_balanced, 'Balanced')):
    cohort['Group'] = label

# Stack both cohorts into one long frame.
dfQ_combined = pd.concat([dfQ_treatment, dfQ_balanced])

# Observed counts: one row per outcome category, one column per cohort.
# NOTE(review): the dfQ_balanced preview shown earlier in the notebook contains
# runs of identical rows; if the balanced set was built by resampling, the
# independence assumption of the chi-squared test may not hold - confirm.
contingency_table = pd.crosstab(
    index=dfQ_combined['QOL_change_category'],
    columns=dfQ_combined['Group'],
)

# Chi-squared test of independence on the observed counts.
chi2, p, dof, ex = chi2_contingency(contingency_table)

# Report the statistic, then the expected and observed tables.
print(f"Chi-squared Test for QOL_change_category across Treatment and Balanced groups")
print(f"Chi2: {chi2}, p-value: {p}, Degrees of Freedom: {dof}")
print("\nExpected Frequencies:", ex, sep="\n")
print("\nObserved Frequencies:", contingency_table, sep="\n")


Chi-squared Test for QOL_change_category across Treatment and Balanced groups
Chi2: 30.68986309802045, p-value: 2.1666107971020013e-07, Degrees of Freedom: 2

Expected Frequencies:
[[ 141.21703391  928.78296609]
 [ 547.3149903  3599.6850097 ]
 [ 467.4679758  3074.5320242 ]]

Observed Frequencies:
Group                 Balanced  Treatment
QOL_change_category                      
Decreased 10+ points       157        913
Increased 10+ points       460       3687
Maintained                 539       3003


#### Pain

In [41]:
# Read the three pain cohorts in one pass:
# treatment, control and balanced, in that order.
dfpain_treatment, dfpain_control, dfpain_balanced = (
    pd.read_csv(csv_path)
    for csv_path in (
        "testdata/2.Pain_treatment.csv",
        "testdata/2.Pain_control.csv",
        "testdata/3.Pain_balanced.csv",
    )
)

In [42]:
# (csv path, label) pairs for the pain cohorts.
# NOTE(review): the same three files are already read into dfpain_treatment,
# dfpain_control and dfpain_balanced in the preceding cell; this cell reads
# them a second time and only reports their shapes.
files = [
    ("testdata/2.Pain_treatment.csv", "dfpain_treatment"),
    ("testdata/2.Pain_control.csv", "dfpain_control"),
    ("testdata/3.Pain_balanced.csv", "dfpain_balanced"),
]

# Label -> freshly loaded frame.
dataframes = {}

# Load each file, register it under its label, and print its (rows, columns).
for file, name in files:
    dataframes.update({name: pd.read_csv(file)})
    print(f"{name}: {dataframes[name].shape}")


dfpain_treatment: (7603, 9)
dfpain_control: (1156, 9)
dfpain_balanced: (1156, 10)


In [43]:
# Map a numeric pain change score to a "significant change" label.
def pain_categorize_significant_change(pain_change, threshold=20):
    """Label a pain change score relative to a point threshold.

    Parameters
    ----------
    pain_change : scalar
        Change score for one row.
    threshold : number, default 20
        Minimum absolute change (inclusive) that counts as a significant
        decrease or increase. Expected to be positive. The default reproduces
        the original 20-point cut-off and labels.

    Returns
    -------
    str or float
        'Decreased {threshold}+ points' when the score is <= -threshold,
        'Increased {threshold}+ points' when it is >= threshold,
        'Maintained' otherwise, and NaN when the score is missing.
    """
    # A missing score fails both comparisons below and would otherwise be
    # reported as 'Maintained', which is meant to describe an observed change
    # smaller than the threshold; propagate the missing value instead.
    if pd.isna(pain_change):
        return np.nan
    if pain_change <= -threshold:
        return f'Decreased {threshold}+ points'
    if pain_change >= threshold:
        return f'Increased {threshold}+ points'
    # The score is present but moved by less than `threshold` points either way.
    return 'Maintained'

# Derive the categorical outcome column on all three pain cohorts.
for cohort in (dfpain_treatment, dfpain_control, dfpain_balanced):
    cohort['pain_change_category'] = cohort['pain_change'].apply(pain_categorize_significant_change)


In [44]:
# Tag every row with its cohort so the groups stay distinguishable after stacking.
for cohort, label in ((dfpain_treatment, 'Treatment'), (dfpain_balanced, 'Balanced')):
    cohort['Group'] = label

# Stack both cohorts into one long frame.
dfpain_combined = pd.concat([dfpain_treatment, dfpain_balanced])

# Observed counts: one row per outcome category, one column per cohort.
contingency_table = pd.crosstab(
    index=dfpain_combined['pain_change_category'],
    columns=dfpain_combined['Group'],
)

# Chi-squared test of independence on the observed counts.
chi2, p, dof, ex = chi2_contingency(contingency_table)

# Report the statistic, then the expected and observed tables.
print(f"Chi-squared Test for pain_change_category across Treatment and Balanced groups")
print(f"Chi2: {chi2}, p-value: {p}, Degrees of Freedom: {dof}")
print("\nExpected Frequencies:", ex, sep="\n")
print("\nObserved Frequencies:", contingency_table, sep="\n")


Chi-squared Test for pain_change_category across Treatment and Balanced groups
Chi2: 273.6757423957542, p-value: 3.733082805701768e-60, Degrees of Freedom: 2

Expected Frequencies:
[[ 436.05708414 2867.94291586]
 [ 113.76549834  748.23450166]
 [ 606.17741751 3986.82258249]]

Observed Frequencies:
Group                 Balanced  Treatment
pain_change_category                     
Decreased 20+ points       312       2992
Increased 20+ points       265        597
Maintained                 579       4014
