### Data comparison
- **Normality test:** Shapiro–Wilk when n <= 50, otherwise D’Agostino’s $K^2$ test.
- **Reporting:** median (IQR), since none of the OfCom values are normally distributed.
- **Comparisons:**
  - unrelated samples: Mann–Whitney U test
  - related samples: Wilcoxon signed-rank test

In [1]:

#import libraries
import pandas as pd
import os
import utils

### Read files

In [2]:
# Resolve the folder this notebook is run from and derive the input-data folder
# by swapping 'scripts' for 'input_data' in that path.
# NOTE(review): `dir` shadows the builtin and str.replace rewrites *every*
# occurrence of 'scripts' in the path; both are kept unchanged here because
# other cells may rely on these names/values.
path = os.path.realpath("BAMParsedCsvProcessor.ipynb")
dir = os.path.dirname(path).replace('scripts', 'input_data')

# User (survey) export.
FC2022_user_data = pd.read_csv(f"{dir}/230328_FC2022_user_data.csv")

# GraphDB export; the id column is renamed so it matches the
# "external_id" key used when merging with the user data.
FC2022_graphdb_data = pd.read_csv(
    f"{dir}/graphdb_data/230328_FC2022_GraphDB.csv"
).rename(columns={"externalId": "external_id"})

# Actual energy in kcal: productSize / 100 * energyKcalValue
# (presumably energyKcalValue is given per 100 units of product - TODO confirm).
FC2022_graphdb_data["actual_energy_kcal"] = (
    FC2022_graphdb_data["productSize"] / 100 * FC2022_graphdb_data["energyKcalValue"]
)

# Diet-coach category lookup table.
dietcoach_categories = pd.read_csv(f"{dir}/eatfit_dietcoach_categories.csv")


### Clean the data

In [3]:
# Unix timestamp passed to utils.clean_user_data as the cut-off; kept in one
# place so the call and the printed message cannot drift apart.
STUDY_START_EPOCH = 1668002400

cleaned_user_data = utils.clean_user_data(FC2022_user_data, STUDY_START_EPOCH)

# Participants per study arm, according to the `type` column.
n_experiment_participants = (cleaned_user_data["type"] == "Experiment").sum()
n_control_participants = (cleaned_user_data["type"] == "Control").sum()

# The original message contained a stray double quote after "9 weeks" and the
# invalid time "15pm"; both are corrected below.
print(f"""{cleaned_user_data.shape[0]} participants joined the study after 2022-11-09 15:00 ({STUDY_START_EPOCH}),
 and finished 9 weeks. 
 
 Experiment participants: {n_experiment_participants}

 Control participants: {n_control_participants}""")

176 participants joined the study after 2022-11-09 15pm (1668002400),
 and finished 9 weeks". 
 
 Experiment participants: 92

 Control participants: 84


### Merge surveys with receipts

In [4]:

# Inner-join the receipt data with the cleaned user data on external_id, so
# only ids present in both frames are kept.
FC2022_graphdb_data_with_survey = pd.merge(
    FC2022_graphdb_data, cleaned_user_data, how="inner", on="external_id"
)
n_merged_users = len(FC2022_graphdb_data_with_survey["external_id"].unique())
print(f"""
Total users with data: {n_merged_users}
""")

# Drop rows of experiment participants who never verified their email account,
# i.e., never logged into the app.
is_experiment_row = FC2022_graphdb_data_with_survey["type"] == "Experiment"
is_unverified_row = FC2022_graphdb_data_with_survey["verified"] == 0
valid_FC2022_graphdb_data_with_survey = FC2022_graphdb_data_with_survey[
    ~(is_experiment_row & is_unverified_row)
]
n_valid_users = len(valid_FC2022_graphdb_data_with_survey["external_id"].unique())
print(f"""
After filtering out experiment users who did not verify their email:
Total users with data: {n_valid_users}
""")

# Split the remaining rows by study arm.
row_group = valid_FC2022_graphdb_data_with_survey["type"]
control_graphdb_data_with_survey = valid_FC2022_graphdb_data_with_survey[row_group == "Control"]
experiment_graphdb_data_with_survey = valid_FC2022_graphdb_data_with_survey[row_group == "Experiment"]

n_control_users = len(control_graphdb_data_with_survey["external_id"].unique())
n_experiment_users = len(experiment_graphdb_data_with_survey["external_id"].unique())
print(f"""
After filtering out users who did not donate their receipts:
Control users with data: {n_control_users}
Experiment users with data: {n_experiment_users}
""")
      





Total users with data: 62


After filtering out experiment users who did not verify their email:
Total users with data: 61


After filtering out users who did not donate their receipts:
Control users with data: 33
Experiment users with data: 28



### Export valid users list for analyzing post-study surveys

In [5]:

# One row per distinct (bamId, external_id, type, post_survey_id) combination
# found in the valid merged data.
survey_id_columns = ["bamId", "external_id", "type", "post_survey_id"]
valid_FC2022_users_with_survey_ids = (
    valid_FC2022_graphdb_data_with_survey[survey_id_columns].drop_duplicates()
)

# Export is currently disabled; uncomment to write the list to the working directory.
# valid_FC2022_users_with_survey_ids.to_csv("230328_valid_FC2022_users_with_survey_ids.csv", index = False)


In [7]:
# Sanity check: (rows, columns) of the de-duplicated user / survey-id table.
valid_FC2022_users_with_survey_ids.shape

(61, 4)

### Check data in the last month

In [8]:
# Window handed to the helper as its second argument.
# NOTE(review): presumably a number of days ("last_xd"); the section title says
# "last month" while the value is 45 - confirm which is intended.
last_x_days = 45
utils.get_valid_users_with_last_xd_data(valid_FC2022_graphdb_data_with_survey, last_x_days)

(285210, 49) (0, 49)


(61, 0)

In [9]:
# Onboarding surveys: once for every cleaned participant, once restricted to
# the participants that remain in the valid merged receipt data.
surveys = utils.get_valid_onboarding_survey(cleaned_user_data)
valid_surveys = utils.get_valid_onboarding_survey(valid_FC2022_graphdb_data_with_survey)

# Separate the valid surveys into control and experiment groups via `usertype`.
survey_group = valid_surveys["usertype"]
valid_control_surveys = valid_surveys.loc[survey_group == "Control"]
valid_experiment_surveys = valid_surveys.loc[survey_group == "Experiment"]


### Get the sample demographics data

In [10]:
# Demographics for the whole sample, the control group and the experiment group.
# Each call yields a (median/IQR summary, value-count summary) pair; the calls
# run in the same order as the frames are listed.
(
    (overall_median_iqr_summary, overall_value_count_summary),
    (control_median_iqr_summary, control_value_count_summary),
    (experiment_median_iqr_summary, experiment_value_count_summary),
) = (
    utils.get_demographic_summary(group_surveys)
    for group_surveys in (valid_surveys, valid_control_surveys, valid_experiment_surveys)
)

In [11]:
# Display the value-count summary returned by utils.get_demographic_summary
# for all valid surveys.
overall_value_count_summary

{'loyaltyCards':         loyaltyCards_count  loyaltyCards (%)
 both                    37         60.655738
 migros                  14         22.950820
 coop                    10         16.393443,
 'gender':         gender_count  gender (%)
 Female            34   55.737705
 Male              27   44.262295,
 'fromSanitas':        fromSanitas_count  fromSanitas (%)
 True                  48        78.688525
 False                 13        21.311475,
 'historyApps':                        historyApps_count  historyApps (%)
                                       48        78.688525
 Lifesum                                4         6.557377
 Yazio                                  3         4.918033
 MyFitnessPal                           2         3.278689
 Fddb                                   1         1.639344
 Lifsum                                 1         1.639344
 MyFitnesspal                           1         1.639344
 Lifesum, MyFitnessPal                  1         1.63

In [12]:
# Display the median/IQR summary returned by utils.get_demographic_summary
# for all valid surveys.
overall_median_iqr_summary

Unnamed: 0,count,median,IQR
bmi,61,23.5,4.8
percShoppingMigros,61,50.0,60.0
percShoppingCoop,61,30.0,58.0
usageMigros,61,99.0,20.0
usageCoop,61,95.0,80.0
percFruits,61,80.0,74.0
percVegetables,61,80.0,70.0
percProteinFoods,61,90.0,65.0
percProcessedFoods,61,80.0,85.0
percCarbs,61,90.0,70.0


### Between group and within group ofcom comparisons

In [13]:
# Check whether there are significant differences between the OfCom values of the 2 groups.
# Only 'OfComValue' is analysed; further fields that were listed but disabled:
# 'ofComNSalt', 'ofComNSaturatedFat', 'ofComNSugar', 'ofComPDietaryFiber',
# 'ofComPFVPN', 'ofComPProtein'
ofcom_field_ls = ['OfComValue']
separator = "-------"
for ofcom_field in ofcom_field_ls:
    ofcom_median_iqr, ofcom_dfs = utils.get_ofcom_group_specific_data(
        valid_FC2022_graphdb_data_with_survey, ofcom_field
    )
    significant_diff, insignificant_diff = utils.ofcom_comparisons(ofcom_dfs)
    # Field name, summary table, then the significant and the insignificant
    # comparison results, each section closed by a separator line.
    print(
        ofcom_field,
        ofcom_median_iqr,
        separator,
        significant_diff,
        separator,
        insignificant_diff,
        separator,
        sep="\n",
    )


OfComValue
    num_users  is_normal  median/mean  iqr/std
0          53      False          5.5      6.6
1          29      False          6.2      6.5
2          24      False          4.7      7.0
3          52      False          5.0      7.8
4          28      False          4.5      9.3
5          24      False          5.0      6.3
6          50      False          5.4      8.4
7          27      False          6.1      8.7
8          23      False          3.3      8.2
9          42      False          6.0      7.0
10         21      False          5.9      4.8
11         21      False          6.1     11.8
-------
[]
-------
[('T0 control', 'T1 control', False, 0.30549813024041295), ('T0 control', 'T2 control', False, 0.10232187258757604), ('T0 control', 'T3 control', False, 0.3765220642089844), ('T0 experiment', 'T1 experiment', False, 0.20015525817871094), ('T0 experiment', 'T2 experiment', False, 0.2428417205810547), ('T0 experiment', 'T3 experiment', False, 0.47490501403808