In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor, plot_tree

plt.style.use('dark_background')

#### Omissions

In [97]:
df_full = pd.read_csv('data/cardio.csv', sep=';')

features = ['height', 'age', 'gender']
target = ['cardio']

df1 = df_full[:][features+target]
df1.shape

(70000, 4)

In [98]:
# insert target values
def gini(labels):
    labels = list(labels)
    set_labels = set(labels)

    gini = 1
    for label_name in set_labels:
        p = labels.count(label_name) / len(labels)
        gini -= p ** 2

    return gini

gini0 = gini(df1['cardio'])
gini0

0.49999982000000004

In [99]:
def gain(left_branch, right_branch, root_criterion, criterion):
    # e.g. gini for left & right branches, ===> need minimum values
    crit_right = criterion(right_branch)
    crit_left = criterion(left_branch)

    # probability for every branch (left & right)
    p_left = left_branch.shape[0] / (left_branch.shape[0] + left_branch.shape[0])
    p_right = 1 - p_left

    # x + y = n
    # e.g. gini(parent) - x/n * gini(left_child) - y/n * gini(right_child) # alternative gini => entropy
    result = root_criterion - p_left * crit_left - p_right * crit_right
    return result

# threshold
t = 50.5

df2_l = df1[df1['age'] <= t]
df2_r = df1[df1['age'] > t]
display(df2_l, df2_r)

Unnamed: 0,height,age,gender,cardio
0,168,50,2,0
3,169,48,2,1
4,156,47,1,0
8,158,48,1,0
12,165,40,2,0
...,...,...,...,...
69981,182,47,2,1
69984,168,49,2,1
69985,156,49,1,1
69986,180,49,2,0


Unnamed: 0,height,age,gender,cardio
1,156,55,1,1
2,165,51,1,1
5,151,60,1,0
6,157,60,1,0
7,178,61,2,1
...,...,...,...,...
69995,168,52,2,0
69996,158,61,1,1
69997,183,52,2,1
69998,163,61,1,1


In [100]:
gain(df2_l['cardio'], df2_r['cardio'], gini0, gini)

0.021584169997685476

In [101]:
# imitation of passes
random_pass = np.random.randint(0, df1.shape[0], 1000)  # may be repetitions
df1.loc[random_pass, ['height']] = np.nan

In [102]:
df1['height'].isna().sum()

992

In [103]:
root_gini = gini(df1['cardio'])
root_gini

0.49999982000000004

In [104]:
t = df1['height'].median()
df1_isna = df1['height'].isna()

# data without nan
df1_clean = df1[~df1_isna]
df1_clean.shape

(69008, 4)

In [105]:
# data_clean = left_clean_data + right_clean_data

# left_clean_data without nan
df2_l = df1_clean[df1_clean['height'] <= t]
# right_clean_data without nan
df2_r = df1_clean[df1_clean['height'] > t]

In [106]:
# information gain with clean data
gain1 = gain(df2_l['cardio'], df2_r['cardio'], root_gini, gini)
gain1

2.3792462962857197e-05

In [107]:
# correction for loss of information
df1_clean.shape[0] / df1.shape[0] * gain1

2.3455289773440704e-05

In [108]:
# if a split the best => add data_with_nan in branches
print(df2_l.shape[0], df2_r.shape[0])
df2_l = pd.concat([df2_l, df1[df1_isna]])
df2_r = pd.concat([df2_r, df1[df1_isna]])
print(df2_l.shape[0], df2_r.shape[0])

39723 29285
40715 30277


#### Categories

In [109]:
colors = ['gray', 'blue', 'green']
new_feature = []

for i in range(df1.shape[0]):
    new_feature.append(np.random.choice(colors, p=['0.5', '0.2', '0.3']))
df1['color_eye'] = new_feature

In [110]:
print('All')
display(df1['color_eye'].value_counts())
print('cardio = 0')
display(df1[df1['cardio'] == 0]['color_eye'].value_counts())
print('cardio = 1')
display(df1[df1['cardio'] == 1]['color_eye'].value_counts())
# for c in df1.columns:
#     if c == 'cardio':
#         print(df1[df1[c] == 1]['color_eye'].value_counts())

All


gray     34981
green    21065
blue     13954
Name: color_eye, dtype: int64

cardio = 0


gray     17493
green    10517
blue      7011
Name: color_eye, dtype: int64

cardio = 1


gray     17488
green    10548
blue      6943
Name: color_eye, dtype: int64

In [112]:
df1['color_eye'].replace({'gray': 3, 'green': 2, 'blue': 1}, inplace=True)
display(df1['color_eye'].value_counts())

3    34981
2    21065
1    13954
Name: color_eye, dtype: int64