## Basic cleaning

- Data from 2022
- Keep only users that have both biometrics and exercise data
- Drop records where we don't have a timestamp
- Drop duplicates

In [None]:
import os
import pandas as pd
# Read every CSV that sits directly inside the data folder into a dict keyed
# by the file name without its ".csv" extension (e.g. "bio1.csv" -> "bio1").
data_folder = 'data/'
dataframes = {}
for filename in os.listdir(data_folder):
    if not filename.endswith('.csv'):
        continue  # skip anything that is not a CSV file
    file_path = os.path.join(data_folder, filename)
    df = pd.read_csv(file_path)
    # Strip the trailing ".csv" to build the dictionary key.
    dataframes[filename[:-len('.csv')]] = df

### Check ranges

In [None]:
# Stack the two biometrics extracts, parse the measurement timestamp and keep
# only the rows whose timestamp could be parsed.
# NOTE(review): errors='coerce' turns every value pandas cannot parse into NaT,
# and those rows are dropped right after -- confirm no valid timestamps written
# in a different format are being lost silently.
biometrics = (
    pd.concat([dataframes['bio1'], dataframes['bio2']])
    .assign(MeasuredOnUTC=lambda frame: pd.to_datetime(frame['MeasuredOnUTC'], errors='coerce'))
    .dropna(subset=['MeasuredOnUTC'])
)

# Earliest and latest measurement, as a sanity check on the covered period.
time_range = (
    biometrics['MeasuredOnUTC'].min(),
    biometrics['MeasuredOnUTC'].max(),
)
print(f"Time range for the combined dataframe: {time_range[0]} to {time_range[1]}")

Time range for the combined dataframe: 2022-01-02 01:34:04.924000+00:00 to 2022-12-31 23:27:32.374000+00:00


In [None]:
# Stack the two exercise extracts, parse the completion timestamp and keep
# only the rows whose timestamp could be parsed.
# NOTE(review): errors='coerce' turns every value pandas cannot parse into NaT,
# and those rows are dropped right after -- confirm no valid timestamps written
# in a different format are being lost silently.
exercises = (
    pd.concat([dataframes['ex1'], dataframes['ex2']])
    .assign(DoneOnUTC=lambda frame: pd.to_datetime(frame['DoneOnUTC'], errors='coerce'))
    .dropna(subset=['DoneOnUTC'])
)

# Earliest and latest exercise, as a sanity check on the covered period.
time_range = (
    exercises['DoneOnUTC'].min(),
    exercises['DoneOnUTC'].max(),
)
print(f"Time range for the combined dataframe: {time_range[0]} to {time_range[1]}")

Time range for the combined dataframe: 2022-01-02 00:00:45.421000+00:00 to 2022-12-31 23:59:42.223000+00:00


### Keep only users for which we have both biometrics and exercise data

In [None]:
# Report how many distinct users each dataset contains before filtering.
distinct_users_exercises = exercises['CloudId'].nunique()
print(f"Number of distinct users in exercises dataframe: {distinct_users_exercises}")
distinct_users_biometrics = biometrics['CloudId'].nunique()
print(f"Number of distinct users in biometrics dataframe: {distinct_users_biometrics}")

# Users that appear in BOTH datasets. Missing CloudIds are removed before the
# sets are built: a NaN id is not a user, and if it ended up in the
# intersection, isin() below would keep every row that has no user id.
common_users = set(biometrics['CloudId'].dropna()).intersection(
    set(exercises['CloudId'].dropna())
)
num_common_users = len(common_users)
print(f"Number of common users in both dataframes: {num_common_users}")

# Keep only the records that belong to those shared users. Each frame is
# filtered once; boolean indexing returns a new frame and leaves the source
# frame untouched.
biometrics_filtered = biometrics[biometrics['CloudId'].isin(common_users)]
exercises_filtered = exercises[exercises['CloudId'].isin(common_users)]

# The original cell built these with the exact same filter a second time.
# They are kept as aliases (no extra copy of the data) in case other cells
# still refer to the names.
biometrics_common = biometrics_filtered
exercises_common = exercises_filtered

print(f"Filtered biometrics dataframe shape: {biometrics_filtered.shape}")
print(f"Filtered exercises dataframe shape: {exercises_filtered.shape}")

Number of distinct users in exercises dataframe: 6360
Number of distinct users in biometrics dataframe: 8312


#### Drop duplicates

In [None]:
# Remove duplicate rows from both filtered frames (drop_duplicates() is called
# with pandas' default settings), then rebind the short names `exercises` and
# `biometrics` to the cleaned results.
exercises_filtered = exercises_filtered.drop_duplicates()
biometrics_filtered = biometrics_filtered.drop_duplicates()
exercises, biometrics = exercises_filtered, biometrics_filtered

# Report the final shapes, followed by the number of distinct users left in
# each frame (exercises first, then biometrics).
print(f"Updated exercises dataframe shape: {exercises.shape}")
print(f"Updated biometrics dataframe shape: {biometrics.shape}")
print(exercises['CloudId'].nunique(), biometrics['CloudId'].nunique(), sep='\n')

# Persist the cleaned datasets to the working directory, without the index.
biometrics.to_csv(path_or_buf='biometrics_cleaned.csv', index=False)
exercises.to_csv(path_or_buf='exercises_cleaned.csv', index=False)

biometrics_filtered shape after removing duplicates: (4889207, 7)
exercises_filtered shape after removing duplicates: (3315940, 13)
