In [5]:
import pickle
import os 
import pandas as pd

In [6]:
file_path = './'
data_set = 'Tongji'
data_file = 'tjh_dataset_formatted.csv'
data_path = os.path.join(file_path, data_set, data_file)
each_cond_data_num = 100

In [7]:
def load_data(path):
    df = pd.read_csv(path)
    return df

data_df = load_data(data_path)


In [8]:
columns = data_df.columns.to_list()
columns

['PatientID',
 'RecordTime',
 'AdmissionTime',
 'DischargeTime',
 'Outcome',
 'LOS',
 'Sex',
 'Age',
 'Hypersensitive cardiac troponinI',
 'hemoglobin',
 'Serum chloride',
 'Prothrombin time',
 'procalcitonin',
 'eosinophils(%)',
 'Interleukin 2 receptor',
 'Alkaline phosphatase',
 'albumin',
 'basophil(%)',
 'Interleukin 10',
 'Total bilirubin',
 'Platelet count',
 'monocytes(%)',
 'antithrombin',
 'Interleukin 8',
 'indirect bilirubin',
 'Red blood cell distribution width ',
 'neutrophils(%)',
 'total protein',
 'Quantification of Treponema pallidum antibodies',
 'Prothrombin activity',
 'HBsAg',
 'mean corpuscular volume',
 'hematocrit',
 'White blood cell count',
 'Tumor necrosis factorα',
 'mean corpuscular hemoglobin concentration',
 'fibrinogen',
 'Interleukin 1β',
 'Urea',
 'lymphocyte count',
 'PH value',
 'Red blood cell count',
 'Eosinophil count',
 'Corrected calcium',
 'Serum potassium',
 'glucose',
 'neutrophils count',
 'Direct bilirubin',
 'Mean platelet volume',
 'ferr

In [9]:
# 根据列名进行简单的筛选
ans_dict = {}
def filter_columns(data_df, max_column_num):
    import random
    for i in range(1, max_column_num + 1):
        instruct_template  = "please select data from {} datasets which columns' name are in {}"
        column_name_list = random.sample(columns, i)
        instruct = instruct_template.format(data_file, column_name_list)
        data = data_df[column_name_list]
        ans_dict[instruct] = data
for i in range(each_cond_data_num):
    filter_columns(data_df, 3)
print(ans_dict)

{"please select data from tjh_dataset_formatted.csv datasets which columns' name are in ['Direct bilirubin']":       Direct bilirubin
0                  4.0
1                  NaN
2                  2.9
3                  5.5
4                  3.6
...                ...
1699               NaN
1700               NaN
1701               9.9
1702               NaN
1703              52.8

[1704 rows x 1 columns], "please select data from tjh_dataset_formatted.csv datasets which columns' name are in ['lymphocyte count', 'Age']":       lymphocyte count   Age
0                 0.80  73.0
1                  NaN  73.0
2                 1.79  73.0
3                 1.32  73.0
4                 2.60  73.0
...                ...   ...
1699               NaN  68.0
1700               NaN  68.0
1701               NaN  68.0
1702               NaN  68.0
1703              0.55  68.0

[1704 rows x 2 columns], "please select data from tjh_dataset_formatted.csv datasets which columns' name are in ['Total c

In [10]:
# 判断每个特征列的类型信息
def check_column_types(df):
    column_types = {}
    for column in df.columns:
        column_data = df[column]
        # 首先检查是否为数值型数据
        if pd.api.types.is_numeric_dtype(column_data):
            unique_values = column_data.dropna().nunique()
            if unique_values <= 10:  # 假设数字种类较少时为类别类型
                column_types[column] = 'Categorical'
            else:
                column_types[column] = 'Numeric'
        else:
            # 对于非数值型数据，尝试将其转换为日期时间类型
            try:
                pd.to_datetime(column_data, errors='raise')
                column_types[column] = 'Datetime'
            except ValueError:
                column_types[column] = 'Other'
    return column_types

check_column_types = check_column_types(data_df)

In [11]:
type_to_columns = {}
for column, cur_type in check_column_types.items():
    type_to_columns.setdefault(cur_type, []).append(column)
type_to_columns

{'Numeric': ['PatientID',
  'LOS',
  'Age',
  'Hypersensitive cardiac troponinI',
  'hemoglobin',
  'Serum chloride',
  'Prothrombin time',
  'procalcitonin',
  'eosinophils(%)',
  'Interleukin 2 receptor',
  'Alkaline phosphatase',
  'albumin',
  'basophil(%)',
  'Interleukin 10',
  'Total bilirubin',
  'Platelet count',
  'monocytes(%)',
  'antithrombin',
  'Interleukin 8',
  'indirect bilirubin',
  'Red blood cell distribution width ',
  'neutrophils(%)',
  'total protein',
  'Quantification of Treponema pallidum antibodies',
  'Prothrombin activity',
  'HBsAg',
  'mean corpuscular volume',
  'hematocrit',
  'White blood cell count',
  'Tumor necrosis factorα',
  'mean corpuscular hemoglobin concentration',
  'fibrinogen',
  'Interleukin 1β',
  'Urea',
  'lymphocyte count',
  'PH value',
  'Red blood cell count',
  'Eosinophil count',
  'Corrected calcium',
  'Serum potassium',
  'glucose',
  'neutrophils count',
  'Direct bilirubin',
  'Mean platelet volume',
  'ferritin',
  'RBC d

In [12]:
def calculate_static_info(data_df, type_to_columns):
    statistic_info  = {}
    for column in data_df.columns:
        if column in type_to_columns['Numeric']:
            statistic_info[column] = {}
            statistic_info[column]['min'] = data_df[column].dropna().min()
            statistic_info[column]['max'] = data_df[column].dropna().max()
            statistic_info[column]['std'] = data_df[column].dropna().std()
            statistic_info[column]['mean'] = data_df[column].dropna().mean()
        elif column in type_to_columns['Datetime']:
            print(column)
            statistic_info[column] = {}
            earliest_time = pd.to_datetime(data_df[column], format= '%Y-%m-%d').min()
            latest_time = pd.to_datetime(data_df[column], format= '%Y-%m-%d').max()
            statistic_info[column]['earliest_time'] = earliest_time
            statistic_info[column]['latest_time'] = latest_time
        elif column in type_to_columns['Categorical']:
            statistic_info[column] = {}
            types = data_df[column].dropna().unique().tolist()
            statistic_info[column]['types'] = types
    return statistic_info

statistic_infos = calculate_static_info(data_df= data_df, type_to_columns= type_to_columns)
statistic_infos

RecordTime
AdmissionTime
DischargeTime


{'PatientID': {'min': 1.0,
  'max': 375.0,
  'std': 103.71568509108091,
  'mean': 165.91784037558685},
 'RecordTime': {'earliest_time': Timestamp('2020-01-10 00:00:00'),
  'latest_time': Timestamp('2020-02-18 00:00:00')},
 'AdmissionTime': {'earliest_time': Timestamp('2020-01-10 00:00:00'),
  'latest_time': Timestamp('2020-02-17 00:00:00')},
 'DischargeTime': {'earliest_time': Timestamp('2020-01-23 00:00:00'),
  'latest_time': Timestamp('2020-03-04 00:00:00')},
 'Outcome': {'types': [0.0, 1.0]},
 'LOS': {'min': 0.0,
  'max': 35.0,
  'std': 6.829252340292876,
  'mean': 7.61443661971831},
 'Sex': {'types': [1.0, 0.0]},
 'Age': {'min': 18.0,
  'max': 95.0,
  'std': 16.32498057724461,
  'mean': 57.499413145539904},
 'Hypersensitive cardiac troponinI': {'min': 1.9,
  'max': 50000.0,
  'std': 5398.389001269163,
  'mean': 1196.0896921017402},
 'hemoglobin': {'min': 6.4,
  'max': 178.0,
  'std': 20.06704063090589,
  'mean': 124.10998942917549},
 'Serum chloride': {'min': 71.5,
  'max': 140.2,


In [13]:
def generate_random_time_between(start_time, end_time):
    """
    生成一个在 start_time 和 end_time 之间的随机时间
    :param start_time: 开始时间（datetime 对象）
    :param end_time: 结束时间（datetime 对象）
    :return: 生成的随机时间（datetime 对象）
    """
    import numpy as np
    start_timestamp = pd.Timestamp(start_time).timestamp()
    end_timestamp = pd.Timestamp(end_time).timestamp()
    random_timestamp = np.random.uniform(start_timestamp, end_timestamp)
    random_time = pd.Timestamp.fromtimestamp(random_timestamp)
    return random_time

def filter_columns_by_condition(data_df, ans_dict):
    import random
    for condition_num in range(1, 4):
        total_columns = data_df.columns.to_list()
        columns = random.sample(total_columns, condition_num)
        # print(columns)
        instruct = "please select data from {} which ".format(data_file)
        filter_df = data_df
        for idx, column in enumerate(columns):
            if column in type_to_columns['Numeric']:
                rand1 = random.uniform(statistic_infos[column]['min'], statistic_infos[column]['max'])
                rand2 = random.uniform(statistic_infos[column]['min'], statistic_infos[column]['max'])
                sub_instruct = "the number of column {} is between {} and {}, ".format(column, min(rand1, rand2), max(rand1, rand2))
                filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[column] > min(rand1, rand2))]
            elif column in type_to_columns['Datetime']:
                rand_time1 = generate_random_time_between(statistic_infos[column]['earliest_time'], statistic_infos[column]['latest_time'])
                rand_time2 = generate_random_time_between(statistic_infos[column]['earliest_time'], statistic_infos[column]['latest_time'])
                sub_instruct = "the date of column {} is between {} and {}, ".format(column, min(rand_time1, rand_time2), max(rand_time1, rand_time2))
                filter_df = filter_df[(pd.to_datetime(data_df[column], format= '%Y-%m-%d') < max(rand_time1, rand_time2)) & ( pd.to_datetime(data_df[column], format= '%Y-%m-%d') > min(rand_time1, rand_time2))]
            elif column in type_to_columns['Categorical']:
                types = statistic_infos[column]['types']
                rand_class = random.sample(types, 1)
                sub_instruct = "the type of column {} is {}, ".format(column, rand_class)
                filter_df = filter_df[data_df[column].isin(rand_class)]
            else:
                print("hh")
                continue
            if idx == len(columns) - 1:
                sub_instruct = sub_instruct[: len(sub_instruct) - 2] + '.'
            instruct += sub_instruct
        # print(f"instruct is {instruct}")
        ans_dict[instruct] = filter_df
for i in range(each_cond_data_num):
    filter_columns_by_condition(data_df, ans_dict)

  filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[column] > min(rand1, rand2))]
  filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[column] > min(rand1, rand2))]
  filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[column] > min(rand1, rand2))]
  filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[column] > min(rand1, rand2))]
  filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[column] > min(rand1, rand2))]
  filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[column] > min(rand1, rand2))]
  filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[column] > min(rand1, rand2))]
  filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[column] > min(rand1, rand2))]
  filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[column] > min(rand1, rand2))]
  filter_df = filter_df[(data_df[column] < max(rand2, rand1)) & (data_df[

In [14]:
def clear_folder(folder_path):
    """
    使用 os 和 os.path 模块清空文件夹
    :param folder_path: 要清空的文件夹路径
    """
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        try:
            if os.path.isfile(file_path):
                os.remove(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")

In [15]:
ans_dir = os.path.join(file_path, 'data_analyse_result')
os.makedirs(ans_dir, exist_ok= True)
clear_folder(ans_dir)
map_file_name = 'question_to_answer.json'
map_file_dict = {}
for idx, (question, df) in enumerate(ans_dict.items()):
    tar_file_name = str(idx) + '.csv'
    tar_file_path = os.path.join(ans_dir, tar_file_name)
    map_file_dict[question] = tar_file_name
    df.to_csv(tar_file_path)
import json
with open(os.path.join(ans_dir, map_file_name), 'w') as file:
    json.dump(map_file_dict, file)