In [50]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np 
import pandas as pd 

In [51]:
# metrics
def accuracy(y_true, y_pred):
    return sum(y_true == y_pred) / len(y_true)

In [74]:
# model
class DTN:  # node of decision Tree
    def __init__(self, D, A):
        self.D = D
        self.A = A
        self.attr = None
        self.vals = []
        self.children = []
        self.typeid = -1
    
    def settype(self, type):
        self.typeid = type
        
    def show(self):
        return f"DecisionTreeNode:data = {self.D} attributes = {self.A}"
        
class DecisionTreeClassifier:
    def __init__(self) -> None:
        self.tree = None

    def fit(self, X: pd.DataFrame, y: np.ndarray) -> None:
        # X: [n_samples_train, n_features], 
        # y: [n_samples_train, ],
        # TODO: implement decision tree algorithm to train the model
        self.X = X.reset_index(drop = True)
        self.y = y
        self.isContinuous = {}
        index = list(self.X.index)
        attrs = list(self.X.columns)
        for a in attrs:
            Xa = X.loc[:, a]
            diffvals = len(np.unique(Xa))
            if diffvals > 10:
                self.isContinuous[a] = True
            else:
                self.isContinuous[a] = False
        self.tree = self.tree_generate(index, attrs)
        self.X = None
        self.y = None
        
    def predict(self, X: pd.DataFrame):
        # X: [n_samples_test, n_features],
        # return: y: [n_samples_test, ]
        y = np.zeros(X.shape[0])
        # TODO:
        id = 0
        for _, row in X.iterrows():
            node = self.tree
            while node.typeid == -1: # not a leaf
                if self.isContinuous[node.attr]:
                    val = row[node.attr]
                    pivot = node.vals[0]
                    if val >= pivot:
                        node = node.children[0]
                    else:
                        node = node.children[1]
                else:
                    val = row[node.attr]
                    try:
                        val_id = node.vals.index(val)
                        node = node.children[val_id]
                    except Exception as e:
                        print("当前id" + str(id) + "\n")
                        print("有训练集中未出现的离散值！\n") 
                        print("出错属性:" + node.attr + "\n")  
                        print("结点是否连续:" + str(self.isContinuous[node.attr]) + "\n")  
                        print("node结点子孩子:" + ' '.join(map(str, node.vals)))   
            y[id] = node.typeid
            id += 1
        return y
    
    def tree_generate(self, D: list, A: list) -> DTN:
        node = DTN(D, A)
        X = self.X.loc[D]
        y = self.y[D]
        if len(np.unique(y)) == 1: 
            node.typeid = y[0]
            return node
        if len(A) == 0 or len(X.loc[:, A].drop_duplicates()) == 1:
            _, counts = np.unique(y, return_counts=True)
            node.typeid = y[np.argmax(counts)]
            return node
        t, node.attr = self.best_spilt(D, A)
        Xa = X.loc[:, node.attr]
        if self.isContinuous[node.attr] == True:
            D_plus = list(Xa[Xa >= t].index)
            D_minus = list(Xa[Xa < t].index)
            node.vals = [t]
            node_plus = self.tree_generate(D_plus, A)
            node_minus = self.tree_generate(D_minus, A)
            node.children = [node_plus, node_minus]
        else:
            A.remove(node.attr)
            vals, _ = np.unique(Xa, return_counts=True)
            if node.attr == 'Gender':
                print(vals)
            for v in vals:
                node.vals.append(v)
                D_new = list(Xa[Xa == v].index)
                if len(D_new) == 0:
                    node_new = DTN(D_new, A)
                    _, counts = np.unique(y, return_counts=True)
                    node_new.settype(y[np.argmax(counts)])
                else:
                    node_new = self.tree_generate(D_new, A)
                node.children.append(node_new)
        return node
    
    def Ent(self, D):
        _, counts = np.unique(self.y[D], return_counts=True)
        probabilities = counts / len(self.y)
        return -np.sum(probabilities * np.log2(probabilities))
    
    def infogain(self, D: list, a: list):
        ent_pos = self.Ent(D)
        ent_neg = 0
        pivot = 0
        X = self.X.loc[D]
        Xa = X.loc[:, a]
        if self.isContinuous[a] == True:
            vals = np.sort(Xa)
            maxt = 0
            minent = np.inf
            for i in range(len(vals) - 1):
                t = (vals[i] + vals[i+1]) / 2
                D_plus = list(Xa[Xa >= t].index)
                D_minus = list(Xa[Xa < t].index)
                totminus = (self.Ent(D_plus) * len(D_plus) + self.Ent(D_minus) * len(D_minus) ) / len(D)
                if totminus < minent:
                    minent = totminus
                    maxt = t
            pivot = maxt
            ent_neg = minent    
        else:
            vals, cnts = np.unique(Xa, return_counts=True)
            pros = cnts / len(D)
            for i, val in enumerate(vals):
                D_new = list(Xa[Xa == val].index)
                ent_neg += self.Ent(D_new) * pros[i]
        return pivot, ent_pos - ent_neg
    
    def best_spilt(self, D, A):
        max_info_gain = -np.inf
        max_attr = None
        pivot = 0
        for a in A:
            t, gain = self.infogain(D, a)
            if gain > max_info_gain:
                max_info_gain = gain
                max_attr = a
                pivot = t
        return pivot, max_attr
            
        

In [53]:
def load_data(datapath:str='./data/ObesityDataSet_raw_and_data_sinthetic.csv'):
    df = pd.read_csv(datapath)
    continue_features = ['Age', 'Height', 'Weight', ]
    discrete_features = ['Gender', 'CALC', 'FAVC', 'FCVC', 'NCP', 'SCC', 'SMOKE', 'CH2O', 'family_history_with_overweight', 'FAF', 'TUE', 'CAEC', 'MTRANS']
    
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    # encode discrete str to number, eg. male&female to 0&1
    labelencoder = LabelEncoder()
    for col in discrete_features:
        X[col] = labelencoder.fit(X[col]).transform(X[col])
    y = labelencoder.fit(y).fit_transform(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

In [54]:
X_train, X_test, y_train, y_test = load_data('./data/ObesityDataSet_raw_and_data_sinthetic.csv')

In [55]:
X_train.head(20)

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS
162,21.0,0,1.63,60.0,2,1,809,477,0,1,549,1,1071,0,0,3
2001,20.924956,0,1.752531,133.618706,2,1,809,477,0,0,1208,1,825,630,2,3
1435,22.89974,0,1.661715,82.595793,2,1,29,93,0,0,1115,1,114,1045,2,3
649,21.837996,0,1.588046,44.236067,3,0,809,142,0,0,951,0,649,0,1,3
1280,25.994746,1,1.811602,106.042142,2,1,809,477,0,0,1185,1,973,536,2,3
1697,23.327836,1,1.754439,119.441207,2,1,136,477,0,0,549,1,1010,788,2,3
1378,32.895637,1,1.783901,103.771371,2,1,716,152,0,0,980,1,1100,456,2,0
998,24.679807,1,1.7,84.687554,2,1,170,477,0,0,579,1,0,840,2,3
1855,26.0,0,1.644141,111.942544,2,1,809,477,0,0,1112,1,0,95,2,3
705,22.99368,0,1.741377,54.877111,3,1,809,477,0,0,567,1,1071,63,1,3


In [75]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

[0 1]


In [76]:
y_pred = clf.predict(X_test)
print(accuracy(y_test, y_pred))

0.950354609929078


In [81]:
X_test.shape

(423, 16)

In [15]:
type(X_train)

pandas.core.frame.DataFrame

In [17]:
X_train

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS
162,21.000000,0,1.630000,60.000000,2,1,809,477,0,1,549,1,1071,0,0,3
2001,20.924956,0,1.752531,133.618706,2,1,809,477,0,0,1208,1,825,630,2,3
1435,22.899740,0,1.661715,82.595793,2,1,29,93,0,0,1115,1,114,1045,2,3
649,21.837996,0,1.588046,44.236067,3,0,809,142,0,0,951,0,649,0,1,3
1280,25.994746,1,1.811602,106.042142,2,1,809,477,0,0,1185,1,973,536,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,32.290160,1,1.754956,120.098812,2,1,779,477,0,0,937,1,548,939,2,0
1095,23.000000,1,1.718981,81.669950,2,1,170,151,0,0,230,1,493,844,2,3
1130,22.989846,0,1.650000,80.000000,3,1,170,477,0,0,549,1,129,1128,2,3
1294,23.000000,0,1.628168,84.497980,3,1,228,432,0,0,569,1,465,493,2,3


In [18]:
X_train1 = X_train.reset_index(drop=True)
X_train1

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS
0,21.000000,0,1.630000,60.000000,2,1,809,477,0,1,549,1,1071,0,0,3
1,20.924956,0,1.752531,133.618706,2,1,809,477,0,0,1208,1,825,630,2,3
2,22.899740,0,1.661715,82.595793,2,1,29,93,0,0,1115,1,114,1045,2,3
3,21.837996,0,1.588046,44.236067,3,0,809,142,0,0,951,0,649,0,1,3
4,25.994746,1,1.811602,106.042142,2,1,809,477,0,0,1185,1,973,536,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683,32.290160,1,1.754956,120.098812,2,1,779,477,0,0,937,1,548,939,2,0
1684,23.000000,1,1.718981,81.669950,2,1,170,151,0,0,230,1,493,844,2,3
1685,22.989846,0,1.650000,80.000000,3,1,170,477,0,0,549,1,129,1128,2,3
1686,23.000000,0,1.628168,84.497980,3,1,228,432,0,0,569,1,465,493,2,3


In [13]:
list(X_train.columns)

['Age',
 'Gender',
 'Height',
 'Weight',
 'CALC',
 'FAVC',
 'FCVC',
 'NCP',
 'SCC',
 'SMOKE',
 'CH2O',
 'family_history_with_overweight',
 'FAF',
 'TUE',
 'CAEC',
 'MTRANS']

In [16]:
type(y_train)

numpy.ndarray

In [71]:
y_test[0]

0

In [22]:
y_train

array([1, 4, 2, ..., 6, 2, 5])

In [21]:
ids = [1,2,3]
y_train[ids]

array([4, 2, 0])

In [9]:
data = {
    'A': [1, 1, 3, 1],
    'B': [2, 2, 'z', 2],
    'C': [10, 20, 30, 1],
    'D': [1.1 ,2.1, 3.1, 1.3]
}

In [10]:
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,1,2,10,1.1
1,1,2,20,2.1
2,3,z,30,3.1
3,1,2,1,1.3


In [14]:
df[:,:]

InvalidIndexError: (slice(None, None, None), slice(None, None, None))

In [59]:
a1 = ['A', 'B']
df1 = df.loc[[0, 1, 3] , a1]
df1

Unnamed: 0,A,B
0,1,2
1,1,2
3,1,2


In [56]:
len(df1.drop_duplicates())

1

In [64]:
arr1, arr2 = np.unique(df.loc[:, 'D'], return_counts=True)
arr1

array([1.1, 1.3, 2.1, 3.1])

In [66]:
strs = ['A', 'B', 'C', 'D']
ARRS = np.array(strs)
ARRS

array(['A', 'B', 'C', 'D'], dtype='<U1')

In [80]:
for id, row in df.iterrows():
    print(id)

0
1
2
3


In [72]:
type(row1['A'])

int

In [77]:
arrrr = [1, 2]
arr4 = [3, 4]
arrrr.append([3, 4])
arrrr

[1, 2, [3, 4]]

In [67]:
A = ['ABC', 'DEF', 'GHI']
print(type(A))
for i in range(3):
    for a in A:
        print(a, end=", ")
    print("")
    first = A[0]
    print(first)
    A.remove(first)
print(A)

<class 'list'>
ABC, DEF, GHI, 
ABC
DEF, GHI, 
DEF
GHI, 
GHI
[]
