In [1]:
import datetime
import json
import os
import random
import sys
import warnings
import pandas as pd

sys.path.append(os.path.join(os.getcwd(), '../../../common'))
from ppg_feature_processor import generate_train_feature

# Suppress specific warnings
warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*Conversion of an array with ndim > 0 to a scalar is deprecated.*")

In [8]:
# Questionnaire / profile workbooks.
# NOTE(review): the two middle-school paths are absolute paths on one specific
# Windows machine; they must be adjusted before running anywhere else.

# Newer middle-school cohort workbook.
middleSchool_userProfile_file_new = (
    r'C:\Users\Administrator\Desktop\Backend-Algorithm\Algorithm\PPG\ppg_depression'
    r'\data_preprocess\new_data_2024_09_20\outdir'
    r'\MiddleSchool_Depression_Anxiety.xlsx'
)
# Older middle-school cohort workbook.
middleSchool_userProfile_file = (
    r'C:\Users\Administrator\Desktop\Backend-Algorithm\Algorithm\PPG\ppg_depression'
    r'\data_preprocess\outdir'
    r'\MiddleSchool_Depression_Anxiety.xlsx'
)
# Hospital (clinic) results workbook, relative to the working directory.
hospital_userProfile_file = '../../../../Data/unhealthy_clinic_results.xlsx'

In [9]:
# Cut-offs applied to the `anxiety_sum` column when selecting subjects.
# Subjects whose score lies strictly between the two cut-offs end up in
# neither group.
anxiety_negative_threshold = 6    # anxiety_sum <= 6  -> negative group
anxiety_positive_threshold = 15   # anxiety_sum >= 15 -> positive group

In [12]:
### READ THE MIDDLESCHOOL SUBJECTS old
# Older middle-school cohort: keep only the subjects whose `anxiety_sum` is at
# or above the positive cut-off, or at or below the negative cut-off.
df_middle_old = pd.read_excel(middleSchool_userProfile_file)
df_middle_old_pos = df_middle_old[df_middle_old['anxiety_sum'] >= anxiety_positive_threshold]
df_middle_old_neg = df_middle_old[df_middle_old['anxiety_sum'] <= anxiety_negative_threshold]

middle_subjects_old_pos = df_middle_old_pos['user_id'].values.tolist()
middle_subjects_old_neg = df_middle_old_neg['user_id'].values.tolist()
# The selection is made on the anxiety score, and the second line reports the
# negative group, so the labels say exactly that.
print(f"middle anxiety positive subjects count old: {len(middle_subjects_old_pos)}")
print(f"middle anxiety negative subjects count old: {len(middle_subjects_old_neg)}")

middle depression positive subjects count old: 161
middle depression positive subjects count old: 105


In [13]:
### READ THE MIDDLESCHOOL SUBJECTS
# Newer middle-school cohort, taken from the 'INMHT' sheet. The same
# `anxiety_sum` cut-offs are applied as for the older cohort; the ID column is
# called `subjectid` in this workbook.
df_middle = pd.read_excel(middleSchool_userProfile_file_new, sheet_name='INMHT')

anxiety_scores_new = df_middle['anxiety_sum']
is_positive_new = anxiety_scores_new >= anxiety_positive_threshold
is_negative_new = anxiety_scores_new <= anxiety_negative_threshold

middle_pos_subjects = df_middle[is_positive_new]['subjectid'].values.tolist()
middle_neg_subjects = df_middle[is_negative_new]['subjectid'].values.tolist()
print(f"middle anxiety positive subjects count: {len(middle_pos_subjects)}")
print(f"middle anxiety negative subjects count: {len(middle_neg_subjects)}")

middle anxiety positive subjects count: 15
middle anxiety negative subjects count: 26


In [19]:
# Lookup table: subject ID -> (gender, age).
# The newer cohort is inserted first and the older cohort second, so when an ID
# occurs in both workbooks the entry from the older workbook is the one kept.
profile_dict = {}

for frame, id_column in ((df_middle, 'subjectid'), (df_middle_old, 'user_id')):
    for _, record in frame.iterrows():
        subject_key = record[id_column]
        profile_dict[subject_key] = (record['gender'], record['age'])



In [20]:
def load_wave_middleSchool(path):
    """Load the samples stored under ``"ppg_raw_data"`` in a JSON recording.

    Loading is best-effort: a file that cannot be opened, is not valid JSON or
    lacks the ``"ppg_raw_data"`` key is reported on stdout and yields an empty
    list, so callers can keep extending their wave buffer with ``+=``.

    Args:
        path: Path of the JSON file to read (opened as UTF-8 text).

    Returns:
        The value stored under ``"ppg_raw_data"``, or ``[]`` when the file
        could not be loaded.
    """
    try:
        with open(path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data["ppg_raw_data"]
    except (OSError, ValueError, KeyError, TypeError) as exc:
        # OSError: unreadable file; ValueError: malformed JSON / bad encoding
        # (json.JSONDecodeError and UnicodeDecodeError both derive from it);
        # KeyError / TypeError: unexpected JSON layout.
        # Other exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        print(f"Subject Not Loaded: {path} ({exc!r})")
        return []

def load_wave_middleSchool_old(path):
    """Load the PPG samples of an older-cohort JSON recording.

    Two file layouts are handled:

    * ``data['RawResults'][0]['rawDatas']`` - every sample is divided by
      100000 before being returned;
    * ``data['datas']`` - used when the first layout is absent or malformed;
      returned unchanged.

    The file is opened and parsed only once; the fallback reuses the parsed
    document instead of reading the file a second time.

    Args:
        path: Path of the JSON file to read (opened as UTF-8 text).

    Returns:
        list: The samples as described above.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON.
        KeyError, TypeError: If neither layout is present.
    """
    # Divisor applied to `rawDatas` samples.
    # NOTE(review): presumably brings them onto the scale of the `datas`
    # layout - confirm with the data owners.
    raw_scale = 100000

    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    try:
        return [sample / raw_scale for sample in data['RawResults'][0]['rawDatas']]
    except (KeyError, IndexError, TypeError):
        return data["datas"]
    

In [16]:
# Sampling configuration.
# `sample_freq` is in samples per second: recording length in minutes is
# computed as len(wave) / (sample_freq * 60).
sample_freq = 125
# Window length in minutes; also the minimum recording length a subject needs.
sample_window_minute = 0.5

# Directories holding the raw PPG recordings.
# NOTE(review): absolute paths on one specific Windows machine.
middle_file_old_dir = (
    r'C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL'
    r'\middle_school\ppg_data'
)
middle_file_dir = (
    r'C:\Users\Administrator\Desktop'
    r'\middle_school_data\ppg_data'
)
hospital_file_dir = (
    r'C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL'
    r'\unhealthy'
)

In [21]:
# Pool both cohorts per class; older-cohort IDs come first in each list.
all_subjects_pos = [*middle_subjects_old_pos, *middle_pos_subjects]
all_subjects_neg = [*middle_subjects_old_neg, *middle_neg_subjects]

In [22]:
def generate_dataset(subject_collection, label, data_aug_times=1):
    """Build the feature table for a group of subjects sharing one label.

    For each subject, every ``.json`` file whose name starts with the subject
    ID is loaded from the directory of the cohort the subject belongs to, and
    the waves are concatenated. The newer cohort is checked first. Subjects
    with fewer than ``sample_freq * sample_window_minute * 60`` samples are
    skipped; the others are passed to ``generate_train_feature``.

    Relies on module-level names: the four subject lists, ``middle_file_dir``,
    ``middle_file_old_dir``, both ``load_wave_middleSchool*`` loaders,
    ``profile_dict``, ``sample_freq`` and ``sample_window_minute``.

    Args:
        subject_collection: Iterable of subject IDs. IDs must be strings, as
            they are matched against file names with ``str.startswith``.
        label: Class label forwarded to ``generate_train_feature``.
        data_aug_times: Forwarded to ``generate_train_feature``.

    Returns:
        pandas.DataFrame: The per-subject frames concatenated with a fresh
        index, or an empty DataFrame when no subject had enough data.

    Raises:
        KeyError: If a subject with enough data has no ``profile_dict`` entry.
    """
    # Build the cohort membership sets once instead of concatenating the
    # subject lists for every subject.
    new_cohort = set(middle_pos_subjects) | set(middle_neg_subjects)
    old_cohort = set(middle_subjects_old_neg) | set(middle_subjects_old_pos)

    samples_per_minute = sample_freq * 60
    min_samples = sample_freq * sample_window_minute * 60

    # Directory listings are read lazily and at most once per directory.
    dir_listing = {}

    def recordings_of(directory, subject):
        """Return paths of the `.json` files in `directory` starting with `subject`."""
        if directory not in dir_listing:
            dir_listing[directory] = os.listdir(directory)
        # NOTE(review): prefix matching would also pick up files of another
        # subject whose ID merely starts with this ID - confirm the file
        # naming scheme rules that out.
        return [
            os.path.join(directory, name)
            for name in dir_listing[directory]
            if name.startswith(subject) and name.endswith('.json')
        ]

    ignored_id = []
    subject_frames = []
    for subject in subject_collection:
        subject_wave = []

        if subject in new_cohort:
            for file_path in recordings_of(middle_file_dir, subject):
                print(f"middle file path: {file_path}")
                tmp_wave = load_wave_middleSchool(file_path)
                # The loader is best-effort and may return nothing for an
                # unreadable file; skip it instead of crashing on `+=`.
                if tmp_wave:
                    subject_wave += tmp_wave
        elif subject in old_cohort:
            for file_path in recordings_of(middle_file_old_dir, subject):
                print(f"middle file path old: {file_path}")
                tmp_wave = load_wave_middleSchool_old(file_path)
                if tmp_wave:
                    subject_wave += tmp_wave

        recorded_minutes = len(subject_wave) / samples_per_minute
        if len(subject_wave) < min_samples:
            print(f"Small time for subj: {subject} is: {recorded_minutes}")
            # Record the subject itself (not the builtin `id`) and derive the
            # duration from `sample_freq` rather than a hard-coded 125.
            ignored_id.append({'id': subject, 'Time Length of data in minute ': f"{recorded_minutes:.02f}"})
            continue

        print(f"Recording time for subj: {subject} is: {recorded_minutes}")
        profile = profile_dict[subject]
        subject_dataset = generate_train_feature(
            subject, subject_wave, label, profile[0], profile[1],
            sample_freq, sample_window_minute, data_aug_times, if_gender=True,
        )
        if subject_dataset is not None:
            subject_frames.append(subject_dataset)

    if ignored_id:
        skipped = [entry['id'] for entry in ignored_id]
        print(f"Ignored {len(ignored_id)} subject(s) with too little data: {skipped}")

    # Concatenate once at the end; growing a DataFrame inside the loop copies
    # all accumulated rows on every iteration.
    if not subject_frames:
        return pd.DataFrame()
    return pd.concat(subject_frames, ignore_index=True)




In [23]:
# Build one feature table per class. Positives are generated with
# data_aug_times=4 and label 1, negatives with data_aug_times=1 and label 0.
pos_dataset = generate_dataset(
    subject_collection=all_subjects_pos,
    label=1,
    data_aug_times=4,
)
neg_dataset = generate_dataset(
    subject_collection=all_subjects_neg,
    label=0,
    data_aug_times=1,
)

middle file path old: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\middle_school\ppg_data\138681313570679646396533197328690174535.json
Recording time for subj: 138681313570679646396533197328690174535 is: 2.95
middle file path old: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\middle_school\ppg_data\82498448730405369670702745834853438447.json
Recording time for subj: 82498448730405369670702745834853438447 is: 2.966666666666667
middle file path old: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\middle_school\ppg_data\70055199254646997137347024168883235522.json
Recording time for subj: 70055199254646997137347024168883235522 is: 2.9833333333333334
middle file path old: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\middle_school\ppg_data\85636101400926141949023704298355735329.json
Recording time for subj: 85636101400926141949023704298355735329 is: 3.0
middle file path old: C:\Users\Administrator\Desktop\PPG_dataset\PPG_Dataset_ALL\middle_

In [28]:
# Drop feature columns that are zero in more than a quarter of the positive
# rows. The decision is made on the positive set only and then applied to both
# sets so they keep identical columns.
# NOTE(review): the last four columns are excluded from the check, while
# `profile_columns` lists five names - confirm that the slice is intended.
zero_limit = len(pos_dataset) // 4
columns_drop = [
    column
    for column in pos_dataset.columns[:-4]
    if (pos_dataset[column] == 0).sum() > zero_limit
]

pos_dataset_final = pos_dataset.drop(columns=columns_drop)
neg_dataset_final = neg_dataset.drop(columns=columns_drop)

In [29]:
# Report the table shapes after the zero-heavy columns were removed.
pos_shape = pos_dataset_final.shape
neg_shape = neg_dataset_final.shape
print(f"After remove the invalid columns, train pos shape: {pos_shape}")
print(f"After remove the invalid columns, train neg shape: {neg_shape}")

After remove the invalid columns, train pos shape: (4541, 29)
After remove the invalid columns, train neg shape: (1197, 29)


In [30]:
# Non-feature columns; they are excluded when the columns to normalise are
# selected.
profile_columns = [
    'wave_quality',
    'age',
    'gender',
    'label',
    'id',
]

In [31]:
# Combine both classes; used here only to obtain the shared column set.
train_combined = pd.concat([pos_dataset_final, neg_dataset_final])
# Columns to normalise: every column that is not a profile column.
columns_to_normalize = train_combined.columns.difference(profile_columns)
current_date = datetime.datetime.now().strftime("%Y-%m-%d")
outdir = os.path.join(os.getcwd(), 'outdir')

# Read the per-column minimum and maximum values from CSV.
# The file name carries a date stamp. Today's file is preferred; when it does
# not exist the most recent stamped file is used instead, so the cell also
# works on days other than the one on which the statistics were written.
minmax_prefix = 'ppg_anxiety_norm_minmax_values_'
minmax_path = f'{outdir}/{minmax_prefix}{current_date}.csv'
if not os.path.exists(minmax_path):
    # %Y-%m-%d stamps are zero-padded, so lexicographic order is date order.
    dated_files = sorted(
        name
        for name in os.listdir(outdir)
        if name.startswith(minmax_prefix) and name.endswith('.csv')
    )
    if not dated_files:
        raise FileNotFoundError(f"No '{minmax_prefix}*.csv' file found in {outdir}")
    minmax_path = os.path.join(outdir, dated_files[-1])
    print(f"No min/max file for {current_date}; using most recent one: {minmax_path}")

min_max_df = pd.read_csv(minmax_path, index_col=0)
min_values = min_max_df['min']
max_values = min_max_df['max']

# Columns without stored statistics would silently turn into NaN during
# normalisation; make that visible.
missing_stats = columns_to_normalize.difference(min_max_df.index)
if len(missing_stats) > 0:
    print(f"Warning: no min/max values for columns {list(missing_stats)}; they will become NaN")

# Min-max normalisation helper
def normalize(df, min_values, max_values):
    """Scale `df` with per-column min-max normalisation.

    Computes ``(df - min_values) / (max_values - min_values)``. When `df` is a
    DataFrame and the bounds are Series, pandas aligns the bounds to the
    columns of `df` by label.

    Args:
        df: Values to scale.
        min_values: Per-column minimum.
        max_values: Per-column maximum.

    Returns:
        The scaled values. A column whose maximum equals its minimum is
        divided by zero and therefore yields inf/NaN.
    """
    value_range = max_values - min_values
    shifted = df - min_values
    return shifted / value_range


# Normalise the test sets.
# NOTE: the frames are overwritten in place, so running this cell twice
# normalises the already-normalised values again.
for frame in (pos_dataset_final, neg_dataset_final):
    frame[columns_to_normalize] = normalize(frame[columns_to_normalize], min_values, max_values)


In [32]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [33]:
# Output folder for the pickled samples, created under the current working
# directory if it does not exist yet.
working_dir = os.getcwd()
sample_folder = os.path.join(working_dir, 'samples')
os.makedirs(sample_folder, exist_ok=True)

In [34]:
# Target files for the positive and negative test sets.
test_pos_path, test_neg_path = (
    os.path.join(sample_folder, file_name)
    for file_name in ('testing_positive.pkl', 'testing_negative.pkl')
)

In [35]:
# Persist both normalised tables as pickle files.
for frame, target_path in (
    (pos_dataset_final, test_pos_path),
    (neg_dataset_final, test_neg_path),
):
    frame.to_pickle(target_path)