In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import random

## 0、数据处理成csv形式

In [46]:
# Column names handed to read_csv through `names=` for both raw files.
columns = ['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
          'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry', 'income']
df_train_set = pd.read_csv('./adult.data', names=columns)
df_test_set = pd.read_csv('./adult.test', names=columns, skiprows=1) # the first line is not a valid data record, so skip it
# NOTE(review): values are read without stripping whitespace; later cells match strings
# with a leading space (e.g. ' >50K', ' Private'), so changing the parsing options here
# would presumably break them - confirm before touching.

# Preview both frames, then write them back out as CSV files without the row index.
print(df_train_set.head())
print(df_test_set.head())
df_train_set.to_csv('./train_adult.csv', index=False)
df_test_set.to_csv('./test_adult.csv', index=False)

In [None]:
df_test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capitalLoss,hoursPerWeek,nativeCountry,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [None]:
len(df_train_set), len(df_test_set), len(df_test_set.columns)

## 1、数据读取

In [None]:
df_train_set = pd.read_csv('./train_adult.csv')
df_train_set

## 2、数据预处理

### 2.1 删除对应属性

In [None]:
df_train_set.drop(['fnlwgt', 'educationNum'], axis=1, inplace=True) # fnlwgt列用处不大，educationNum与education类似
print(df_train_set.columns)

### 2.2 重复行记录处理

In [7]:
df_train_set.drop_duplicates(inplace=True) # 去除重复行

In [None]:
df_train_set

### 2.3 缺失值处理

In [None]:
# Show the rows that contain at least one missing value.
# `isna().any(axis=1)` yields one boolean per row, so every affected row is listed exactly
# once; indexing with the 2-D `isna().values` array instead can repeat a row once per
# missing cell it contains.
df_train_set[df_train_set.isna().any(axis=1)]

In [10]:
df_train_set.dropna(inplace=True) # 去除空行 

### 2.4 查看列类型

In [None]:
df_train_set.dtypes

### 2.5 异常值处理

In [None]:
df_train_set[df_train_set['workclass'].str.contains(r'\?', regex=True)] # 查找异常值, 避免与正则表达式的?冲突需要转义

In [None]:
df_train_set=df_train_set[~df_train_set['workclass'].str.contains(r'\?', regex=True)]
df_train_set

In [None]:
# Drop every row in which any of the string-valued columns contains the '?' placeholder.
new_columns = ['workclass', 'education', 'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
               'nativeCountry', 'income']
for col in new_columns:
    # '?' is a regex metacharacter, hence the escape.
    has_placeholder = df_train_set[col].str.contains(r'\?', regex=True)
    df_train_set = df_train_set.loc[~has_placeholder]
df_train_set.head()

### 2.6 数据可视化，以年龄为例

In [None]:
df_train_set['age'].plot.hist()

In [None]:
df_train_set['age'].value_counts().sort_index().plot(kind="line")

In [None]:
# Plot the number of people per age and, overlaid on it, how many of them earn >50K.

df_train_set = df_train_set.reset_index(drop=True)  # renumber the rows 0..n-1
# Head count per age, ordered by age so the x axis reads left to right.
s = df_train_set['age'].value_counts().sort_index()
# Head count per age within the ' >50K' group, aligned to the same ages as `s`
# (0 where nobody qualifies). Without this alignment the two bar series have different
# category sets, so the second series is drawn at shifted x positions.
k = (df_train_set['age'][df_train_set['income']==' >50K']
     .value_counts()
     .reindex(s.index, fill_value=0))
age_order = list(s.index)
sns.set_style("whitegrid")
f, ax = plt.subplots(figsize=(18, 9))
sns.set_color_codes("pastel")
# Both layers share the same explicit category order and the same Axes.
sns.barplot(x=s.index, y=s.values, order=age_order, label='total', color="b", ax=ax)
sns.barplot(x=k.index, y=k.values, order=age_order, label='income>50K', color="g", ax=ax)
ax.set(xlabel='age', ylabel='count', title='Age distribution: total vs. income >50K')
ax.legend(ncol=2, loc="upper left", frameon=True);

### 2.7 连续型变量处理

In [18]:
continuous_column = ['age', 'capitalGain', 'capitalLoss', 'hoursPerWeek']

In [None]:
df_train_set['age'].max(), df_train_set['age'].min()

In [None]:
df_train_set['age'].head() 

In [21]:
bins = [0, 25, 50, 75, 100] # 分箱区间左开右闭 (0, 25], (25, 50], ...
df_train_set['age'] = pd.cut(df_train_set['age'], bins, labels=False)

In [None]:
df_train_set['age'].head() 

In [None]:
# Replace every continuous column except the first entry of `continuous_column`
# with the ordinal index of one of four equal-width bins.
for colname in continuous_column[1:]:
    column = df_train_set[colname]
    print(colname)
    max_, min_ = column.max(), column.min()
    print(max_, min_)
    # Five edges spanning [min-1, max+1], truncated to integers; pd.cut intervals are
    # left-open, so starting one below the minimum keeps the smallest value inside.
    bins = [int(edge) for edge in np.linspace(min_ - 1, max_ + 1, 5)]
    print("分箱:", bins)
    df_train_set[colname] = pd.cut(column, bins, labels=False)
    # Show how many rows fell into each bin.
    print(df_train_set[colname].value_counts().sort_index())

### 2.8 离散型变量处理

In [24]:
discrete_column = ['workclass', 'education', 'maritalStatus', 'occupation', 'relationship', 'race', 'sex', 'nativeCountry', 'income']
discrete_map = dict()

In [None]:
df_train_set['workclass'].value_counts()

In [None]:
df_train_set['workclass'].head() #展示前五条

In [None]:
df_train_set['workclass'].value_counts().keys()

In [28]:
workclass_mapping = {' Private': 0, ' Self-emp-not-inc': 1, ' Self-emp-inc': 1, ' Local-gov': 2, 
                     ' State-gov': 2, ' Federal-gov': 2, ' Without-pay': 3, ' Never-worked': 3}
df_train_set['workclass'] = df_train_set['workclass'].map(workclass_mapping)

In [29]:
df_train_set['workclass'].head()
discrete_map['workclass'] = workclass_mapping

In [None]:
# Integer-encode every discrete column except the first entry of `discrete_column`,
# recording each label -> code table in `discrete_map`.
for colname in discrete_column[1:]:
    banner = '*' * 10
    print(banner, colname, banner)
    raw_values = df_train_set[colname]
    print(raw_values.head().tolist())
    # Codes follow the order in which the labels first appear in the column.
    codes = {label: code for code, label in enumerate(pd.unique(raw_values))}
    discrete_map[colname] = codes
    df_train_set[colname] = raw_values.map(codes)
    print(df_train_set[colname].head().tolist())

### 数据数值化完成

In [None]:
print(df_train_set.dtypes) 

## 3. 构造决策树，进行训练

In [32]:
def calc_gini(df:pd.DataFrame)->float:
    """
    Gini impurity of the ``income`` column of ``df``.

    Args:
        df (pd.DataFrame): samples; must contain an ``income`` column.

    Returns:
        float: 1 minus the sum of squared class shares, where a class share is
        that class's row count divided by the number of rows in ``df``.
    """
    total = len(df)
    squared_share_sum = 0
    for class_size in df['income'].value_counts():
        squared_share_sum += (class_size / total) ** 2
    return 1 - squared_share_sum
    

def split_dataset(df:pd.DataFrame, index:str, value:int)->tuple[pd.DataFrame, pd.DataFrame]:
    """
    Partition ``df`` by comparing one column against a threshold.

    Args:
        df (pd.DataFrame): frame to partition.
        index (str): name of the column to compare.
        value (int): threshold.

    Returns:
        tuple(df, df): rows where the column is ``<= value``, then rows where it
        is ``> value``. Original row labels are kept.
    """
    column = df.loc[:, index]
    at_or_below = column <= value
    above = column > value
    return df.loc[at_or_below], df.loc[above]
    
    
def choose_best_feature_to_split(df:pd.DataFrame, columns:list[str])->tuple:
    """
    Search every (feature, threshold) pair for the split with the largest Gini gain.

    Each distinct value of each candidate column is tried as a ``<=`` / ``>``
    threshold; splits that leave one side empty are ignored.

    Args:
        df (pd.DataFrame): samples to split; must not be empty.
        columns (list[str]): candidate feature names followed by ``'income'``
            as the last entry.

    Raises:
        ValueError: if ``df`` has no rows.
        ValueError: if the last entry of ``columns`` is not ``'income'``.

    Returns:
        tuple: ``((feature, threshold), (left_df, right_df), gain)``. When no split
        improves on the current impurity the first two items are ``(None, None)``
        and the gain is 0.
    """
    if len(df) == 0:
        raise ValueError("df is empty")
    if columns[-1] != 'income':
        raise ValueError("columns[-1] is not 'income'")

    total = len(df)
    parent_gini = calc_gini(df)

    best_gain = 0
    best_value = (None, None)
    best_df = (None, None)

    for feature in columns[:-1]:
        for threshold in df.loc[:, feature].unique():
            lower, upper = split_dataset(df, feature, threshold)
            if not (len(lower) and len(upper)):
                # A one-sided split separates nothing.
                continue
            lower_gini = calc_gini(lower)
            upper_gini = calc_gini(upper)
            # Impurity after the split: size-weighted average of the two halves.
            weighted_gini = (len(lower)/total)*lower_gini + (len(upper)/total)*upper_gini

            # A lower weighted impurity means purer halves, i.e. a larger gain.
            gain = parent_gini - weighted_gini
            if gain > best_gain:
                best_gain, best_value, best_df = gain, (feature, threshold), (lower, upper)

    return best_value, best_df, best_gain
            


def build_decision_tree(df:pd.DataFrame, columns:list[str])->dict:
    """
    Recursively grow a binary decision tree using Gini-based splits.

    Args:
        df (pd.DataFrame): samples reaching the current node.
        columns (list[str]): features still available for splitting, with
            ``'income'`` as the last entry.

    Returns:
        dict: ``{"feature_<name>": {"sep_colname", "sep_value", "<= v", "> v"}}``
        for an internal node, where the last two entries hold the subtrees.
        A leaf is returned as a bare label instead of a dict.
    """
    labels = df['income']

    # Leaf 1: every sample carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Leaf 2: no feature left to split on - fall back to the most frequent label.
    if len(columns) == 1 and columns[0] == 'income':
        return labels.mode()[0]

    (index, value), (left_df, right_df), _ = choose_best_feature_to_split(df, columns.copy())

    # Leaf 3: no candidate split reduced the impurity.
    if index is None:
        return labels.mode()[0]

    # The chosen feature is removed from the candidates of both subtrees.
    used_at = columns.index(index)
    remaining = [name for pos, name in enumerate(columns) if pos != used_at]

    node = {
        "sep_colname": index,
        "sep_value": value,
        f"<= {value}": build_decision_tree(left_df, remaining.copy()),
        f"> {value}": build_decision_tree(right_df, remaining.copy()),
    }
    return {f"feature_{index}": node}
    
    
    
    
def save_decision_tree(cart, path='cart.npy'):
    """
    Persist a trained decision tree to disk.

    The object is written with ``np.save``, which pickles non-array Python
    objects (such as a nested dict) inside the ``.npy`` file.

    :param cart: trained decision tree
    :param path: destination file, 'cart.npy' by default; ``np.save`` appends
                 '.npy' when the name does not already end with it
    :return: None
    """
    np.save(path, cart, allow_pickle=True)
    
    
def load_decision_tree(path='cart.npy'):
    """
    Load a decision tree previously written with ``np.save``.

    :param path: file to read, 'cart.npy' by default
    :return: the stored tree, unwrapped from the 0-d array ``np.load`` returns
    """
    # SECURITY: allow_pickle=True unpickles the file content, which can execute
    # arbitrary code - only load files produced by a trusted source.
    cart = np.load(path, allow_pickle=True)
    return cart.item()


In [33]:
df_train = df_train_set.copy() #防止预处理重新来
# df_train.head()

In [34]:
a = [1,2,3]


In [35]:
columns = df_train.columns.to_list()
flags = [0 for i in range(len(columns))]
flags[-1] = 1

In [36]:
cart = build_decision_tree(df_train, columns)
save_decision_tree(cart)

In [None]:
a = type(df_train.iloc[0, -1])
print(a)
b = 1
print(isinstance(b,a))

## 4. 评估

In [38]:
def classify(cart:dict, df_row:pd.Series, targettype:type)->int:
    """
    Classify one row by walking the decision tree from the root to a leaf.

    Args:
        cart (dict): decision tree; an internal node is a single-key dict whose
            value holds 'sep_colname', 'sep_value' and the two branches keyed
            '<= {sep_value}' and '> {sep_value}'. A bare label (a tree that is
            a single leaf) is accepted as well.
        df_row (pd.Series): the row to classify, indexable by column name.
        targettype (type): type of a leaf label; reaching an instance of it
            stops the walk.

    Returns:
        int: the label stored in the leaf that was reached.
    """
    node = cart
    # Anything that is not a dict is a leaf, so the walk also stops on a label whose
    # type differs from `targettype` instead of failing on a missing `.keys()`.
    while isinstance(node, dict) and not isinstance(node, targettype):
        split = node[list(node)[0]]  # the single "feature_<name>" entry
        sep_colname = split['sep_colname']
        sep_value = split['sep_value']
        if df_row[sep_colname] <= sep_value:
            node = split[f'<= {sep_value}']
        else:
            node = split[f'> {sep_value}']
    return node
        
    
    
def predict(cart:dict, df)->list[np.int64]:
    """
    Predict a label for every row of ``df``.

    Args:
        cart (dict): decision tree to walk.
        df (pd.DataFrame): rows to classify; the type of the value in the first
            row's last column is used as the leaf-label type.

    Returns:
        list: one predicted label per row, in row order; empty when ``df`` has
        no rows.
    """
    # Without rows there is no first row to take the label type from.
    if len(df) == 0:
        return []

    targettype = type(df.iloc[0, -1])
    pred_list = []
    for i in range(len(df)):
        pred_label = classify(cart, df.iloc[i, :], targettype)
        if pred_label == -1:
            # Defensive fallback kept from the original: a -1 result is replaced
            # by a random guess. NOTE(review): classify is not expected to return -1.
            pred_label = random.randint(0, 1)
        pred_list.append(pred_label)
    return pred_list

def calc_acc(pred_list:list[np.int64], test_list:list[np.int64])->float:
    """
    Fraction of predictions that equal the true labels.

    Args:
        pred_list (list[np.int64]): predicted labels.
        test_list (list[np.int64]): true labels, same length as ``pred_list``.

    Raises:
        ValueError: if the two inputs do not have the same shape.

    Returns:
        float: accuracy in [0, 1]; NaN when both inputs are empty.
    """
    # Compare the arrays, not the raw arguments: `list == list` is a single bool,
    # which would make the accuracy 0 or 1/len regardless of the actual matches.
    pred = np.asarray(pred_list)
    test = np.asarray(test_list)
    if pred.shape != test.shape:
        raise ValueError(
            f"pred_list and test_list differ in shape: {pred.shape} vs {test.shape}"
        )
    if test.size == 0:
        return float('nan')
    return float(np.mean(pred == test))


## 5. Demo

In [None]:
print(type(df_train.iloc[0, -1]))

#### (1) 下面只是测试模型能不能跑通,实际操作时需要用测试集进行评估
#### (2) 此外，测试集应该需要做和训练集一样的预处理操作

In [40]:
columns = df_train.columns.to_list()

In [41]:
cart = load_decision_tree() # 加载模型

In [42]:
test_list = df_train['income'].to_numpy()
pred_list = predict(cart, df_train)

In [43]:
acc = calc_acc(pred_list, test_list)

In [None]:
acc

In [None]:
def pre_dfprocess(dfin:pd.data)