In [9]:
import json
import os
import sys
import warnings
import pandas as pd

sys.path.append(os.path.join(os.getcwd()))
from feature_processor import generate_train_feature

# Suppress one specific warning only: DeprecationWarnings whose message matches the
# "Conversion of an array with ndim > 0 to a scalar is deprecated" text are hidden;
# every other warning is still shown.
# NOTE(review): presumably this message originates from NumPy calls inside feature_processor — confirm.
warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*Conversion of an array with ndim > 0 to a scalar is deprecated.*")

In [10]:
# Configuration: sampling parameters and the locations of the Excel profile sheets / PPG recordings.
# sample_freq * sample_window_minute * 60 is the minimum number of samples a subject
# must have to be kept by generate_dataset (presumably sample_freq is in Hz — TODO confirm).
sample_freq = 125
sample_window_minute = 0.5
# Excel workbooks holding the per-subject profile data for each cohort.
# NOTE(review): these are absolute, machine-specific Windows paths; the notebook will not run elsewhere unchanged.
university_userProfile_file = r'C:\Users\Administrator\Desktop\PPG_dataset\Xinlian_dataset\label\healthy_scale_results.xlsx'
hospital_userProfile_file = r'C:\Users\Administrator\Desktop\Backend-Algorithm\Algorithm\Data\unhealthy_clinic_results.xlsx'
seven_grade_dir = r'D:\index_calculation_ppg\seven.xlsx'
eight_grade_dir = r'D:\index_calculation_ppg\eight.xlsx'

# Directories that are scanned for '<subject>*.json' waveform recordings.
university_file_dir = r'C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\healthy'
hospital_file_dir = r'C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\unhealthy'
middle_file_dir = r'C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\middle_school\ppg_data'

In [20]:
### READ THE UNIVERSITY SUBJECTS (sheet 'SDS')
df_university = pd.read_excel(university_userProfile_file, sheet_name='SDS')

# Every column from the fourth onwards is treated as a questionnaire item.
# NOTE(review): assumes the first three columns are non-item metadata — confirm against the sheet.
valid_cols = df_university.columns.values[3:]

# Row-wise item total rescaled by 100/80.
df_university['score'] = df_university[valid_cols].sum(axis=1) / 80 * 100

# Subjects at or above the cut-off of 60 are labelled positive, the rest negative.
university_positive_subjects_SDS = df_university.loc[df_university['score'] >= 60, 'ID'].tolist()
print("University Positive Depression subjects: ", university_positive_subjects_SDS)

university_negative_subjects_SDS = df_university.loc[df_university['score'] < 60, 'ID'].tolist()
print("University Negative Depression subjects: ", university_negative_subjects_SDS)

University Positive Depression subjects:  ['A04', 'A07', 'A09', 'A11', 'A15', 'A21', 'A25', 'A27', 'A29', 'A32', 'A33', 'A39', 'A48', 'A52']
University Negative Depression subjects:  ['A01', 'A02', 'A03', 'A05', 'A06', 'A08', 'A10', 'A12', 'A13', 'A14', 'A16', 'A17', 'A18', 'A19', 'A20', 'A22', 'A23', 'A24', 'A26', 'A28', 'A30', 'A34', 'A35', 'A36', 'A37', 'A38', 'A40', 'A41', 'A42', 'A43', 'A44', 'A45', 'A46', 'A47', 'A49', 'A50', 'A51', 'A53', 'A54', 'A55']


In [11]:
### READ THE HOSPITAL SUBJECTS
# Only three columns are loaded: bed number (床号), sex (性别) and age (年龄).
# The bed number is used as the subject id for the hospital cohort.
df_hospital = pd.read_excel(hospital_userProfile_file, usecols=['床号','性别','年龄'])
hospital_patients_name = df_hospital['床号'].to_list()

### READ THE UNIVERSITY SUBJECTS
# Re-binds df_university to the 'POMS' sheet, keeping only the id, sex and age columns.
df_university = pd.read_excel(university_userProfile_file, sheet_name='POMS', usecols=['ID','Sex','Age (years)'])
# university_students_name = df_university['ID'].to_list()
# Hard-coded subset of university ids used instead of the full 'ID' column above.
# NOTE(review): this list matches the "University Negative Depression subjects" printed by the SDS cell;
# it will silently go stale if the sheet or the cut-off changes.
university_students_name = ['A01', 'A02', 'A03', 'A05', 'A06', 'A08', 'A10', 'A12', 'A13', 'A14', 'A16', 'A17', 'A18', 'A19', 'A20', 'A22', 'A23', 'A24', 'A26', 'A28', 'A30', 'A34', 'A35', 'A36', 'A37', 'A38', 'A40', 'A41', 'A42', 'A43', 'A44', 'A45', 'A46', 'A47', 'A49', 'A50', 'A51', 'A53', 'A54', 'A55']
# Ids A01-A30; generate_dataset reads these subjects' files with load_wave_bachelors_university.
bachelors_university_students_name = ['A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A09', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A17', 'A18', 'A19', 'A20', 'A21', 'A22', 'A23', 'A24', 'A25', 'A26', 'A27', 'A28', 'A29', 'A30']
# Selected university ids that are not in the bachelors list. Built from a set difference,
# so the resulting list order is not guaranteed.
graduate_university_students = list(set(university_students_name) - set(bachelors_university_students_name))


In [12]:
### READING CLASS  7 AND CLASS 8 DATA
# Each grade sheet contributes user_id and gender; age is a fixed value per grade (13 / 14).
seven = pd.read_excel(seven_grade_dir, usecols=['user_id', 'gender']).assign(age=13)
eight = pd.read_excel(eight_grade_dir, usecols=['user_id', 'gender']).assign(age=14)

middle_school = pd.concat([seven, eight])

# Directory entries with everything from the first '.json' onwards stripped off.
middle_school_dir_names = [str(entry).split('.json')[0] for entry in os.listdir(middle_file_dir)]

# Keep only the students whose user_id exactly equals one of those stripped entry names.
middle_school = middle_school[middle_school['user_id'].isin(middle_school_dir_names)]

middle_school_students_name = middle_school['user_id'].to_list()

In [13]:
def load_wave_university_or_hospital(path):
    """Return the samples stored under ['MesureData']['Datas'] of a JSON recording.

    Args:
        path: Path of a UTF-8 encoded JSON file.

    Returns:
        A new list containing the elements of data['MesureData']['Datas'] in order.

    Raises:
        KeyError: If either key is missing from the document.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    # 'MesureData' (sic) is the key spelling used by the recording files.
    return list(payload['MesureData']['Datas'])
    
def load_wave_bachelors_university(path):
    """Return one flat sample list from a JSON recording made of nested segments.

    data['MesureData']['Datas'] is a sequence of segments, each of which carries
    its own 'Datas' sequence; the per-segment sequences are concatenated in order.

    Args:
        path: Path of a UTF-8 encoded JSON file.

    Returns:
        A list with every element of every segment's 'Datas', in file order.

    Raises:
        KeyError: If 'MesureData', the outer 'Datas', or a segment's 'Datas' is missing.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    flattened = []
    for segment in payload['MesureData']['Datas']:
        flattened.extend(segment['Datas'])
    return flattened

def load_wave_middleSchool(path):
    """Load a middle-school recording, supporting the two JSON layouts seen in the data.

    The file is parsed once. The primary layout stores samples under
    data['RawResults'][0]['rawDatas'] and each sample is divided by 100000.
    If that layout cannot be read (missing key, empty 'RawResults', or a value
    of the wrong type), the function falls back to returning data['datas'] as is.

    Args:
        path: Path of a UTF-8 encoded JSON file.

    Returns:
        A list of scaled samples (primary layout) or the raw data['datas'] value
        (fallback layout).

    Raises:
        KeyError: If neither layout is present in the document.
    """
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    try:
        return [sample / 100000 for sample in data['RawResults'][0]['rawDatas']]
    except (KeyError, IndexError, TypeError):
        # NOTE(review): the fallback values are returned without the 1/100000 scaling
        # applied above — confirm that files in this layout are already scaled.
        return data["datas"]

In [14]:
# Build one lookup table, subject id -> (gender, age), covering all three cohorts.
# generate_dataset indexes it by subject id to obtain the gender and age passed to
# generate_train_feature. If an id occurs in more than one cohort, the later loop wins.
profile_dict = dict()
# Middle-school cohort: gender is stored exactly as it appears in the grade sheets.
# NOTE(review): unlike the two loops below, no mapping to 'Male'/'Female' is applied here —
# confirm the sheets already use the labels generate_train_feature expects.
for _, row in middle_school.iterrows():
    userId = row['user_id']
    gender = row['gender']
    age = row['age']
    profile_dict[userId] = (gender, age)

# Hospital cohort: keyed by bed number (床号); '男' (male) maps to 'Male', anything else to 'Female'.
for _, row in df_hospital.iterrows():
    userId = row['床号']
    gender = 'Male' if row['性别'] == '男' else 'Female'
    age = row['年龄']
    profile_dict[userId] = (gender, age)

# University cohort: same mapping rule as the hospital loop.
# NOTE(review): this sheet uses English column names; if 'Sex' is not recorded as '男',
# every student (including missing values) ends up labelled 'Female' — confirm the cell values.
for _, row in df_university.iterrows():
    userId = row['ID']
    gender = 'Male' if row['Sex'] == '男' else 'Female'
    age = row['Age (years)']
    profile_dict[userId] = (gender, age)

In [15]:
def _load_subject_wave(subject, directory, loader, label, listing_cache):
    """Concatenate the samples of every '<subject>*.json' recording found in `directory`.

    The directory listing is read once per directory (memoised in `listing_cache`)
    and sorted, so multi-part recordings are concatenated in file-name order.
    """
    if directory not in listing_cache:
        listing_cache[directory] = sorted(os.listdir(directory))

    subject_wave = []
    for file in listing_cache[directory]:
        # NOTE(review): plain prefix match — an id that is a prefix of another id
        # would also pick up the other subject's files; confirm the naming scheme.
        if file.startswith(subject) and file.endswith('.json'):
            file_path = os.path.join(directory, file)
            print(f"{label} file path: {file_path}")
            subject_wave += loader(file_path)
    return subject_wave


def generate_dataset(subject_collection, data_aug_times=1):
    """Build the training feature table for a collection of subjects.

    For each subject the matching cohort (graduate university, bachelors
    university, hospital, middle school) decides which directory is scanned and
    which loader parses the files. Subjects whose concatenated recording is
    shorter than `sample_freq * sample_window_minute * 60` samples are skipped
    and reported on stdout; the others are passed to `generate_train_feature`.

    Args:
        subject_collection: Iterable of subject ids.
        data_aug_times: Forwarded unchanged to `generate_train_feature`.

    Returns:
        A DataFrame with the per-subject feature frames stacked under a fresh
        integer index; an empty DataFrame when no subject qualifies.

    Raises:
        KeyError: If a qualifying subject has no entry in `profile_dict`.
    """
    ignored_id = []
    subject_frames = []
    listing_cache = {}
    min_samples = sample_freq * sample_window_minute * 60

    for subject in subject_collection:
        # Exactly one cohort is used per subject, so a recording is never loaded twice.
        if subject in graduate_university_students:
            source = (university_file_dir, load_wave_university_or_hospital, "university")
        elif subject in bachelors_university_students_name:
            source = (university_file_dir, load_wave_bachelors_university, "university")
        elif subject in hospital_patients_name:
            source = (hospital_file_dir, load_wave_university_or_hospital, "hospital")
        elif subject in middle_school_students_name:
            source = (middle_file_dir, load_wave_middleSchool, "middle")
        else:
            source = None

        if source is None:
            subject_wave = []
        else:
            directory, loader, label = source
            subject_wave = _load_subject_wave(subject, directory, loader, label, listing_cache)

        if len(subject_wave) < min_samples:
            # Record the subject id itself together with the available duration.
            ignored_id.append({'id': subject, 'Time Length of data in minute ': f"{len(subject_wave) / (sample_freq * 60):.02f}"})
            continue

        subject_frames.append(
            generate_train_feature(
                subject,
                subject_wave,
                profile_dict[subject][0],
                profile_dict[subject][1],
                sample_freq,
                sample_window_minute,
                data_aug_times,
            )
        )

    if ignored_id:
        print(f"Ignored subjects (recording too short): {ignored_id}")

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not subject_frames:
        return pd.DataFrame()
    return pd.concat(subject_frames, ignore_index=True)





In [16]:
# Build the feature table for the middle-school cohort; it is kept in memory here
# and written to Excel by the final export cell.
middle_school_dataset = generate_dataset(middle_school_students_name)

In [17]:
# to_excel does not create missing parent folders, so make sure the output folder
# exists before starting the (slow) feature generation.
os.makedirs('./samples', exist_ok=True)

# Build the feature table for the selected university subjects and persist it.
university_dataset = generate_dataset(university_students_name)
university_dataset.to_excel('./samples/university_dataset.xlsx', index= False)


university file path: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\healthy\A01_PPG.json
university file path: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\healthy\A02_PPG.json
university file path: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\healthy\A03_PPG.json
university file path: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\healthy\A05_PPG_01.json
university file path: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\healthy\A05_PPG_02.json
university file path: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\healthy\A06_PPG.json
university file path: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\healthy\A08_PPG.json
university file path: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\healthy\A10_PPG.json
university file path: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\healthy\A12_PPG.json
university file path: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset

In [18]:
# Build the feature table for the hospital cohort (subject ids are the bed numbers);
# it is written to Excel by the final export cell.
hospital_dataset = generate_dataset(hospital_patients_name)

In [None]:
# to_excel does not create missing parent folders; create the output folder first
# so the export cannot fail on a fresh checkout.
os.makedirs('./samples', exist_ok=True)

# Persist the middle-school and hospital feature tables without the index column.
middle_school_dataset.to_excel('./samples/middle_school_dataset.xlsx', index = False)

hospital_dataset.to_excel('./samples/hospital_dataset.xlsx', index = False)