In [1]:
import pandas as pd

# Let pandas print up to 500 columns and use a very wide line so frames
# shown with print() are not truncated or wrapped.
pd.options.display.max_columns = 500
pd.options.display.width = 9999999

In [2]:
def generate_feature(row):
    """Combine the two numeric risk-factor columns of a row into one list.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that provides the
            'hypertension' and 'heart_disease' keys.

    Returns:
        list: ``[row['hypertension'], row['heart_disease']]`` in that order.
    """
    # Build the list with a literal instead of a local named `list`, which
    # would shadow the builtin inside this function.
    return [row['hypertension'], row['heart_disease']]

def generate_text(row):
    """Render the descriptive columns of a row as ``name: value`` lines.

    Args:
        row: Mapping-like record that provides every column listed below.

    Returns:
        str: One ``column: value`` line per column, joined with newlines,
        with leading and trailing whitespace stripped from the whole text.
    """
    # Order matters: it fixes the line order of the generated text.
    columns = (
        'gender',
        'age',
        'ever_married',
        'work_type',
        'Residence_type',
        'avg_glucose_level',
        'bmi',
        'smoking_status',
    )
    rendered_lines = [f"{column}: {row[column]}" for column in columns]
    return "\n".join(rendered_lines).strip()

def generate_label(row):
    """Return the value stored under the row's 'stroke' key (the label)."""
    label = row['stroke']
    return label

In [3]:
# Fixed seed for every .sample() call below so the split and the
# down-sampling are reproducible across kernel restarts.
RANDOM_SEED = 42

# Load the two stroke CSV files.
df_1 = pd.read_csv('./dataset.csv') # source: https://www.kaggle.com/datasets/shashwatwork/cerebral-stroke-predictionimbalaced-dataset
df_2 = pd.read_csv('./healthcare-dataset-stroke-data.csv') # source: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/

# Both files share the same columns: stack them row-wise, drop exact
# duplicate rows and renumber the index.
df = pd.concat([df_1, df_2]).drop_duplicates().reset_index(drop=True)

# Derive the three model columns from every raw row.
new_df = pd.DataFrame()
new_df['feature'] = df.apply(generate_feature, axis=1)
new_df['text'] = df.apply(generate_text, axis=1)
new_df['label'] = df.apply(generate_label, axis=1)

# Split into train and eval at a 9:1 ratio; eval is every row not drawn
# into train.
train_df = new_df.sample(frac=0.9, random_state=RANDOM_SEED)
eval_df = new_df.drop(train_df.index)

# train: keep 1000 randomly chosen label-0 rows, keep all label-1 rows.
train_df = pd.concat([
    train_df[train_df['label'] == 0].sample(n=1000, random_state=RANDOM_SEED),
    train_df[train_df['label'] == 1],
])
# eval: keep 80 randomly chosen label-0 rows, keep all label-1 rows.
eval_df = pd.concat([
    eval_df[eval_df['label'] == 0].sample(n=80, random_state=RANDOM_SEED),
    eval_df[eval_df['label'] == 1],
])

# Preview the processed frames.
print(train_df.head())
print(eval_df.head())


      feature                                               text  label
8547   [0, 0]  gender: Female\nage: 76.0\never_married: Yes\n...      0
24886  [0, 0]  gender: Male\nage: 0.48\never_married: No\nwor...      0
20659  [0, 0]  gender: Female\nage: 39.0\never_married: Yes\n...      0
22981  [0, 0]  gender: Male\nage: 28.0\never_married: Yes\nwo...      0
7909   [1, 0]  gender: Male\nage: 34.0\never_married: Yes\nwo...      0
      feature                                               text  label
20992  [0, 0]  gender: Male\nage: 82.0\never_married: Yes\nwo...      0
11319  [0, 0]  gender: Male\nage: 47.0\never_married: Yes\nwo...      0
22406  [0, 0]  gender: Male\nage: 1.48\never_married: No\nwor...      0
11157  [0, 0]  gender: Female\nage: 46.0\never_married: Yes\n...      0
19729  [0, 0]  gender: Female\nage: 16.0\never_married: No\nw...      0


In [4]:
# 将新数据以datasets库中的数据集的形式保存
from datasets import Dataset, DatasetDict, load_from_disk
# Convert both pandas frames to `datasets.Dataset` objects.
# NOTE(review): the printed features include '__index_level_0__', i.e. the
# pandas index is carried along as a column; confirm whether that is wanted
# (Dataset.from_pandas has a preserve_index argument).
train_dataset, eval_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df)
)

# Bundle the two splits under the names 'train' and 'eval'.
dataset_dict = DatasetDict(train=train_dataset, eval=eval_dataset)

In [5]:
# Show each split's column names and row count.
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['feature', 'text', 'label', '__index_level_0__'],
        num_rows: 1932
    })
    eval: Dataset({
        features: ['feature', 'text', 'label', '__index_level_0__'],
        num_rows: 153
    })
})


In [6]:
# Save the DatasetDict (both splits) to the ./healthcare_stroke directory.
dataset_dict.save_to_disk('./healthcare_stroke')

Saving the dataset (0/1 shards):   0%|          | 0/1932 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/153 [00:00<?, ? examples/s]

In [7]:
# Reload the saved dataset to check it. The directory is read once and both
# splits are taken from the same loaded object instead of loading it twice.
reloaded_splits = load_from_disk('healthcare_stroke')
train_dataset = reloaded_splits['train']
print(train_dataset)
eval_dataset = reloaded_splits['eval']
print(eval_dataset)

Dataset({
    features: ['feature', 'text', 'label', '__index_level_0__'],
    num_rows: 1932
})
Dataset({
    features: ['feature', 'text', 'label', '__index_level_0__'],
    num_rows: 153
})


In [8]:
# Class distribution of the train split: rows with label 0 versus all
# other rows.
train_labels = train_dataset['label']
zero = sum(label == 0 for label in train_labels)
one = sum(label != 0 for label in train_labels)
print('train_dataset:', zero, one)

train_dataset: 1000 932


In [9]:
# Class distribution of the eval split: rows with label 0 versus all
# other rows.
eval_labels = eval_dataset['label']
zero = sum(label == 0 for label in eval_labels)
one = sum(label != 0 for label in eval_labels)
print('eval_dataset:', zero, one)

eval_dataset: 80 73
