Filtering the Samsung Health (SH) app export and understanding how its data files are stored and updated

In [6]:
import pandas as pd
import numpy as np
import os
import re
import json

Saving the list of file names from individual backups

In [7]:
# backup folders path
# NOTE(review): machine-specific absolute path -- the notebook only runs where this folder exists.
main_folder_path = r"D:\Projects\Project_7\FitnessTracker\data\raw\Samsung Health"
# backup folder name -> list of CSV file names found directly inside that folder
backup_files = {}

# saving names of all the backups and the respective csv files in them in a dict 
def savebackups(main_folder_path):
    """Record the CSV files of every backup folder in ``backup_files`` and print the counts.

    Every sub-directory of ``main_folder_path`` is treated as one backup. For
    each of them the names of the regular files whose extension is ``.csv``
    (compared case-insensitively) are stored under the folder name in the
    module-level ``backup_files`` dict. Entries already present in the dict
    are kept unless the same folder name is scanned again.

    Args:
        main_folder_path: Directory that contains one sub-directory per backup.

    Returns:
        None. The result is stored in ``backup_files`` and a summary is printed;
        returning nothing keeps a trailing call in a cell from echoing the dict.

    Raises:
        OSError: If ``main_folder_path`` (or a backup folder) cannot be listed.
    """
    # os.listdir gives no ordering guarantee, so sort to make the mapping (and
    # anything serialised from it) reproducible from run to run.
    for backup_name in sorted(os.listdir(main_folder_path)):
        backup_path = os.path.join(main_folder_path, backup_name)
        if not os.path.isdir(backup_path):
            continue
        backup_files[backup_name] = sorted(
            entry
            for entry in os.listdir(backup_path)
            # Only real files count: a directory named "something.csv" is skipped.
            if entry.lower().endswith('.csv')
            and os.path.isfile(os.path.join(backup_path, entry))
        )
    # Checking number of files in each backup
    print("Number of CSV files in each Backup:")
    for backup_name, csv_names in backup_files.items():
        print(f"{backup_name} : {len(csv_names)}")

# Scan the backup folders under main_folder_path, fill backup_files and print the per-backup CSV counts
savebackups(main_folder_path)

Number of CSV files in each Backup:
samsunghealth_shaikhmubashir197_20250705144506 : 11
samsunghealth_shaikhmubashir197_20250705150287 : 11
samsunghealth_shaikhmubashir197_20250705150978 : 16
samsunghealth_shaikhmubashir197_20250710185427 : 18
samsunghealth_shaikhmubashir197_20250718195331 : 26
samsunghealth_shaikhmubashir197_20250719170440 : 33
samsunghealth_shaikhmubashir197_20250723145868 : 38


Cleaning file names and saving the records in a JSON file

In [8]:
# Removing dates/timestamp from filenames for normalization and ease of comparing
def remove_timestamp(filename):
    """Drop a ``.<14 digits>`` stamp that sits directly before the ``.csv`` extension.

    Args:
        filename: File name to normalise.

    Returns:
        ``filename`` without the 14-digit stamp. Names that do not end in
        ``.<14 digits>.csv`` are returned unchanged.
    """
    # A dot, exactly 14 digits, then the extension at the end of the name.
    # The extension is matched case-insensitively because savebackups also
    # accepts it case-insensitively, and it is captured so that it is put
    # back exactly as it was written.
    return re.sub(r'\.\d{14}(\.csv)$', r'\1', filename, flags=re.IGNORECASE)

# Normalised view of the backups: backup name -> file names after remove_timestamp
norm_backup_csv = {}
def normalise_filenames(non_normalised_files):
    """Fill ``norm_backup_csv`` with the cleaned file names of every backup.

    Each name in ``non_normalised_files[backup]`` is passed through
    ``remove_timestamp`` and the resulting list is stored under the same key
    in the module-level ``norm_backup_csv`` dict. Afterwards the number of
    names held for every key of ``norm_backup_csv`` is printed.

    Args:
        non_normalised_files: Mapping of backup name to a list of file names.

    Returns:
        None.
    """
    for backup_name, raw_names in non_normalised_files.items():
        norm_backup_csv[backup_name] = [remove_timestamp(raw) for raw in raw_names]

    # Per-backup count of the names now stored in norm_backup_csv
    print("Number of CSV files in each Backup:")
    for backup_name, clean_names in norm_backup_csv.items():
        print(f"{backup_name}:{len(clean_names)}")

def savefilenames(filenameslist, path="filenames.json"):
    """Write ``filenameslist`` to a JSON file with 4-space indentation.

    Args:
        filenameslist: JSON-serialisable object to store.
        path: Destination file; defaults to ``filenames.json`` in the current
            working directory. An existing file is overwritten.

    Returns:
        None.

    Raises:
        OSError: If ``path`` cannot be opened for writing.
        TypeError: If ``filenameslist`` contains values ``json`` cannot serialise.
    """
    # adding filenames in json
    # Explicit encoding so the file does not depend on the platform default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(filenameslist, f, indent=4)

In [9]:
# Fill norm_backup_csv with the cleaned names, then write that mapping to filenames.json
normalise_filenames(backup_files)
savefilenames(norm_backup_csv)

Number of CSV files in each Backup:
samsunghealth_shaikhmubashir197_20250705144506:11
samsunghealth_shaikhmubashir197_20250705150287:11
samsunghealth_shaikhmubashir197_20250705150978:16
samsunghealth_shaikhmubashir197_20250710185427:18
samsunghealth_shaikhmubashir197_20250718195331:26
samsunghealth_shaikhmubashir197_20250719170440:33
samsunghealth_shaikhmubashir197_20250723145868:38


Finding newly added files in the backups

In [12]:
# Finding new (UNIQUE) files in normalised file list
def findnewfiles(normalised_files):
    """Print, for each backup, the file names that no earlier backup contained.

    Backups are visited in sorted key order. A name is reported under the
    first backup in which it occurs and never again; within a backup the
    reported names are listed in sorted order and numbered from 1.

    Args:
        normalised_files: Mapping of backup name to a list of file names.

    Returns:
        None. The report is printed only.
    """
    already_listed = set()
    print("Newly Added files:")
    for backup_name in sorted(normalised_files):
        print(f"\n {backup_name}:")

        # Distinct names of this backup that have not been reported so far
        first_seen = sorted(set(normalised_files[backup_name]) - already_listed)
        already_listed.update(first_seen)

        for position, name in enumerate(first_seen, 1):
            print(f"    {position}: {name}")
        
# Print, backup by backup in sorted name order, the file names not seen in any earlier backup
findnewfiles(norm_backup_csv)

Newly Added files:

 samsunghealth_shaikhmubashir197_20250705144506:
    1: com.samsung.health.device_profile.csv
    2: com.samsung.health.user_profile.csv
    3: com.samsung.shealth.activity.day_summary.csv
    4: com.samsung.shealth.badge.csv
    5: com.samsung.shealth.calories_burned.details.csv
    6: com.samsung.shealth.exercise.periodization_training_program.csv
    7: com.samsung.shealth.exercise.periodization_training_schedule.csv
    8: com.samsung.shealth.preferences.csv
    9: com.samsung.shealth.service_preferences.csv
    10: com.samsung.shealth.social.service_status.csv
    11: com.samsung.shealth.tracker.floors_day_summary.csv

 samsunghealth_shaikhmubashir197_20250705150287:
    1: com.samsung.shealth.exercise.csv

 samsunghealth_shaikhmubashir197_20250705150978:
    1: com.samsung.shealth.hsp.references.csv
    2: com.samsung.shealth.step_daily_trend.csv
    3: com.samsung.shealth.tracker.pedometer_day_summary.csv
    4: com.samsung.shealth.tracker.pedometer_step_coun