In [1]:
import pandas as pd
import numpy as np

In [2]:
def convert_effect_to_int(effect: str) -> int:
    if effect == 'Extremely Severe Side Effects':
        return 1
    elif effect == 'Severe Side Effects':
        return 2
    elif effect == 'Moderate Side Effects':
        return 3
    elif effect == 'Mild Side Effects':
        return 4
    elif effect == 'No Side Effects':
        return 5

    print("没有该 side effect")
    return -1

In [3]:
# 获取标签最多的那一类
def get_most_label(data):
    data_label = data.iloc[:, -1]
    label_sort = data_label.value_counts(sort=True)
    return label_sort.keys()[0]

In [4]:
train_path = "../决策树数据集/training.csv"
train_data = pd.read_csv(train_path)
# 预处理，删除无用的列
train_data = train_data.drop(columns=['recordId', 'drugName', 'condition', 'reviewComment', 'date'])
train_data['sideEffects'] = train_data['sideEffects'].apply(lambda effect: convert_effect_to_int(effect))
print(train_data)
# 统计每个特征的取值情况
column_count = dict([(ds, list(pd.unique(train_data[ds]))) for ds in train_data.iloc[:, :-1].columns])
print(column_count)
# 先给出连续的特征
continuous_features = ['usefulCount']
most_label = get_most_label(train_data)

      usefulCount  sideEffects  rating
0              22            4       5
1              17            2       4
2               3            5       5
3              35            4       5
4               4            2       5
...           ...          ...     ...
6994           11            4       5
6995            0            5       4
6996           79            4       5
6997            1            5       1
6998            2            5       3

[6999 rows x 3 columns]
{'usefulCount': [22, 17, 3, 35, 4, 13, 1, 32, 21, 7, 57, 19, 44, 14, 26, 24, 9, 15, 43, 27, 10, 18, 47, 38, 42, 0, 12, 55, 75, 20, 6, 49, 2, 5, 104, 16, 36, 125, 51, 33, 88, 228, 80, 45, 34, 159, 58, 31, 60, 11, 224, 63, 84, 8, 28, 48, 62, 82, 81, 23, 25, 30, 59, 41, 142, 110, 29, 52, 37, 90, 68, 175, 40, 66, 171, 73, 150, 101, 70, 117, 470, 61, 157, 86, 69, 145, 50, 206, 77, 46, 76, 113, 283, 89, 199, 67, 182, 91, 212, 134, 121, 124, 193, 126, 163, 39, 64, 114, 154, 54, 108, 152, 65, 191, 85, 116, 381

In [5]:
# 计算信息熵
def cal_information_entropy(data):
    data_label = data.iloc[:, -1]
    # 总共有多少类
    label_class = data_label.value_counts()
    ent = 0
    for k in label_class.keys():
        p_k = label_class[k] / len(data_label)
        ent += -p_k * np.log2(p_k)
    return ent

In [6]:
# 对于离散特征a，计算给定数据属性a的信息增益
def cal_information_gain(data, a):
    ent = cal_information_entropy(data)
    # 特征有多少种可能
    feature_class = data[a].value_counts()
    gain = 0
    for v in feature_class.keys():
        weight = feature_class[v] / data.shape[0]
        ent_val = cal_information_entropy(data.loc[data[a] == v])
        gain += weight * ent_val
    return ent - gain

In [7]:
# 对于连续特征b，计算给定数据属性b的信息增益
def cal_information_gain_continuous(data, a):
    # 总共有n条数据，会产生n-1个划分点，选择信息增益最大的作为最优划分点
    n = len(data)
    # 从小到大排序
    data_a_value = sorted(data[a].values)
    # 原始数据集的信息熵Ent(D)
    ent = cal_information_entropy(data)
    select_points = []
    for i in range(n - 1):
        # 两个值中间取值为划分点
        val = (data_a_value[i] + data_a_value[i + 1]) / 2
        data_left = data.loc[data[a] < val]
        data_right = data.loc[data[a] > val]
        ent_left = cal_information_entropy(data_left)
        ent_right = cal_information_entropy(data_right)
        result = ent - len(data_left) / n * ent_left - len(data_right) / n * ent_right
        select_points.append([val, result])
    # 按照信息增益排序
    select_points.sort(key=lambda x: x[1], reverse=True)
    # 返回信息增益最大的点, 以及对应的信息增益
    return select_points[0][0], select_points[0][1]

In [8]:
# 获取最佳划分特征
def get_best_feature(data):
    features = data.columns[:-1]
    res = {}
    for a in features:
        if a in continuous_features:
            temp_val, temp = cal_information_gain_continuous(data, a)
            res[a] = [temp_val, temp]
        else:
            temp = cal_information_gain(data, a)
            res[a] = [-1, temp]  #离散值没有划分点，用-1代替

    res = sorted(res.items(), key=lambda x: x[1][1], reverse=True)
    return res[0][0], res[0][1][0]

In [9]:
# 将数据转化为（属性值：数据）的元组形式返回，并删除之前的特征列，只针对离散数据
def drop_exist_feature(data, best_feature):
    attr = pd.unique(data[best_feature])
    new_data = [(nd, data[data[best_feature] == nd]) for nd in attr]
    new_data = [(n[0], n[1].drop([best_feature], axis=1)) for n in new_data]
    return new_data

In [10]:
# 创建决策树
def create_tree(data):
    if len(data) == 0:
        # 没有数据，设置为 examples 中最普遍的数据值
        return most_label
    data_label = data.iloc[:, -1]
    # 只有一类
    if len(data_label.value_counts()) == 1:
        return data_label.values[0]
    # 所有数据的特征值一样，选样本最多的类作为分类结果
    if all(len(data[i].value_counts()) == 1 for i in data.iloc[:, :-1].columns):
        return get_most_label(data)
    # 根据信息增益得到的最优划分特征
    best_feature, best_feature_val = get_best_feature(data)
    # 连续值
    if best_feature in continuous_features:
        node_name = best_feature + '<' + str(best_feature_val)
        # 用字典形式存储决策树
        tree = {node_name: {}}
        tree[node_name]['是'] = create_tree(data.loc[data[best_feature] < best_feature_val])
        tree[node_name]['否'] = create_tree(data.loc[data[best_feature] > best_feature_val])
    else:
        tree = {best_feature: {}}
        # 当前数据下最佳特征的取值
        exist_vals = pd.unique(data[best_feature])
        # 如果特征的取值相比于原来的少了
        if len(exist_vals) != len(column_count[best_feature]):
            # 少的那些特征
            no_exist_attr = set(column_count[best_feature]) - set(exist_vals)
            for no_feat in no_exist_attr:
                # 缺失的特征分类为当前类别最多的
                tree[best_feature][no_feat] = get_most_label(data)
        # 根据特征值的不同递归创建决策树
        for item in drop_exist_feature(data, best_feature):
            tree[best_feature][item[0]] = create_tree(item[1])
    return tree

In [11]:
#根据创建的决策树进行分类
def predict(tree, predict_data):
    first_feature = list(tree.keys())[0]
    if (feature_name := first_feature.split('<')[0]) in continuous_features:
        second_dict = tree[first_feature]
        val = float(first_feature.split('<')[-1])
        input_first = predict_data.get(feature_name)
        if input_first < val:
            input_value = second_dict['是']
        else:
            input_value = second_dict['否']
    else:
        second_dict = tree[first_feature]
        input_first = predict_data.get(first_feature)
        input_value = second_dict[input_first]
    # 判断分支还是不是字典
    if isinstance(input_value, dict):
        class_label = predict(input_value, predict_data)
    else:
        class_label = input_value
    return class_label

In [12]:
decision_tree = create_tree(train_data)
print(decision_tree)

{'usefulCount<5.0': {'是': {'usefulCount<3.0': {'是': {'usefulCount<2.0': {'是': {'usefulCount<0.0': {'是': 5, '否': {'usefulCount<1.0': {'是': 5, '否': 5}}}}, '否': 5}}, '否': {'usefulCount<4.0': {'是': 5, '否': 5}}}}, '否': {'usefulCount<7.0': {'是': {'usefulCount<6.0': {'是': 5, '否': 5}}, '否': {'usefulCount<8.0': {'是': 5, '否': {'usefulCount<9.0': {'是': 5, '否': {'usefulCount<12.0': {'是': {'usefulCount<10.0': {'是': 5, '否': {'usefulCount<11.0': {'是': 5, '否': 5}}}}, '否': {'usefulCount<14.0': {'是': {'usefulCount<13.0': {'是': 5, '否': 5}}, '否': {'usefulCount<15.0': {'是': 5, '否': {'usefulCount<17.0': {'是': {'usefulCount<16.0': {'是': 5, '否': 5}}, '否': {'usefulCount<18.0': {'是': 5, '否': {'usefulCount<22.0': {'是': {'usefulCount<20.0': {'是': {'usefulCount<19.0': {'是': 5, '否': 5}}, '否': {'usefulCount<21.0': {'是': 5, '否': 5}}}}, '否': {'usefulCount<23.0': {'是': 5, '否': {'usefulCount<25.0': {'是': {'usefulCount<24.0': {'是': 5, '否': 5}}, '否': {'usefulCount<28.0': {'是': {'usefulCount<27.0': {'是': {'usefulCount<26.0

In [15]:
def convert_data_for_predict(data: pd.Series) -> dict:
    drop_data = data.drop(labels=['recordId', 'drugName', 'condition', 'reviewComment', 'date', 'rating'])
    drop_data['sideEffects'] = convert_effect_to_int(drop_data['sideEffects'])
    data_dict = drop_data.to_dict()

    return data_dict

In [17]:
# 测试集测试
test_path = '../决策树数据集/testing.csv'
test_data = pd.read_csv(test_path)
test_data['rating'] = test_data.apply(lambda data: predict(decision_tree, convert_data_for_predict(data)), axis=1)
print(test_data)

# 存储到 testing_output.csv 中
test_output_path = '../决策树数据集/testing_output.csv'
test_data.to_csv(test_output_path)

      recordId                 drugName                          condition  \
0       219597  Microgestin Fe 1.5 / 30                      Birth Control   
1       134044               Prednisone                  Cluster Headaches   
2        68176                   Plan B            Emergency Contraception   
3       200538              Varenicline                  Smoking Cessation   
4        46409                Modafinil                         Narcolepsy   
...        ...                      ...                                ...   
1793    132278                   Ativan                           Insomnia   
1794    126842             Erythromycin  Upper Respiratory Tract Infection   
1795     68153                   Plan B            Emergency Contraception   
1796    126865                 Dilaudid                               Pain   
1797     40841               Leuprolide                      Endometriosis   

                                          reviewComment       d

In [24]:
from sklearn import metrics

# 使用 micro 和 macro f1 来进行验证
valid_path = '../决策树数据集/validation.csv'
valid_data = pd.read_csv(valid_path)
true_labels = valid_data['rating'].copy()
predict_labels = valid_data.apply(lambda data : predict(decision_tree, convert_data_for_predict(data)), axis=1)
micro_score = metrics.f1_score(true_labels, predict_labels, average='micro')
macro_score = metrics.f1_score(true_labels, predict_labels, average='macro')
print("Micro_F1=" + str(micro_score))
print("Macro_F1=" + str(macro_score))

Micro_F1=0.493744787322769
Macro_F1=0.13221663874930206
