In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [3]:
df = pd.read_csv("dt.csv")
df.columns = list(map(lambda x: "_".join(x.strip().lower().split()), df.columns))
df.head(10)

Unnamed: 0,loves_popcorn,loves_soda,age,loves_cool_as_ice
0,Yes,Yes,7,No
1,Yes,No,12,No
2,No,Yes,18,Yes
3,No,Yes,35,Yes
4,Yes,Yes,38,Yes
5,Yes,No,50,No
6,No,No,83,No


## Method 1: Entropy + Information Gain

Entropy, entropy_S, for the entire dataset

- We first compute the entropy, entropy_S, for the target variable (`loves_cool_as_ice`). 
- To do this, we count the number of 'Yes' == 3 and do the same for 'No' == 4.
$$ entropy\_S = -\left(\frac{yes\_count}{N} \cdot \log_2\left(\frac{yes\_count}{N}\right) \quad + \quad \frac{no\_count}{N} \cdot \log_2\left(\frac{no\_count}{N}\right)\right)$$
- Where:
  - `yes_count` is the count of occurences where the target variable was 'Yes'
  - `no_count` is the count of occurences where the target variable was 'No'
  - `N` is the size of the dataset (count of all target variables)

In [4]:
entropy_S = -(3/7 * np.log2(3/7) + 4/7 * np.log2(4/7))
entropy_S

np.float64(0.9852281360342515)

Entropy, p, after splitting on feature, `ft1`, i.e., loves_popcorn

loves_popcorn == Yes

In [5]:
# People who love popcorn
df[df['loves_popcorn'] == 'Yes']

Unnamed: 0,loves_popcorn,loves_soda,age,loves_cool_as_ice
0,Yes,Yes,7,No
1,Yes,No,12,No
4,Yes,Yes,38,Yes
5,Yes,No,50,No


In [6]:
entropy_ft1_y = -(1/4 * np.log2(1/4) + 3/4 * np.log2(3/4))
entropy_ft1_y

np.float64(0.8112781244591328)

loves_popcorn == No

In [7]:
# People who don't love popcorn
df[df['loves_popcorn'] == 'No']

Unnamed: 0,loves_popcorn,loves_soda,age,loves_cool_as_ice
2,No,Yes,18,Yes
3,No,Yes,35,Yes
6,No,No,83,No


In [8]:
entropy_ft1_n = -(2/3 * np.log2(2/3) + 1/3 * np.log2(1/3))
entropy_ft1_n

np.float64(0.9182958340544896)

Because the number of people who love popcorn != to those who don't love it, we have to calculate the weigted entropy, entropy_ft1, as follows:

The total number of people who love popcorn * entropy_ft1_y + total number of people who don't love popcorn * entropy_ft1_n

In [9]:
entropy_ft1 = 4/7 * entropy_ft1_y + 3/7 * entropy_ft1_n
entropy_ft1

np.float64(0.8571428571428571)

Information Gain (Loves Popcorn) = Entropy(S) - entropy(loves popcorn)

In [10]:
ig_ft1 = entropy_S - entropy_ft1
ig_ft1

np.float64(0.12808527889139443)

In [11]:
df.head(10)

Unnamed: 0,loves_popcorn,loves_soda,age,loves_cool_as_ice
0,Yes,Yes,7,No
1,Yes,No,12,No
2,No,Yes,18,Yes
3,No,Yes,35,Yes
4,Yes,Yes,38,Yes
5,Yes,No,50,No
6,No,No,83,No


In [17]:
# Define features and target
X = df.drop(columns=["loves_cool_as_ice"])
y = df["loves_cool_as_ice"]

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
type(X_train), type(y_train)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [None]:
# Notice that we have numeric column (age) in our features. 
# We don't need to encode age because it's already numeric
X_train

Unnamed: 0,loves_popcorn,loves_soda,age
5,Yes,No,50
2,No,Yes,18
4,Yes,Yes,38
3,No,Yes,35
6,No,No,83


In [19]:
df.dtypes

loves_popcorn        object
loves_soda           object
age                   int64
loves_cool_as_ice    object
dtype: object

In [21]:
df.drop(columns=["loves_cool_as_ice"]).select_dtypes(include=["object", "category"])

Unnamed: 0,loves_popcorn,loves_soda
0,Yes,Yes
1,Yes,No
2,No,Yes
3,No,Yes
4,Yes,Yes
5,Yes,No
6,No,No


In [22]:
df.drop(columns=["loves_cool_as_ice"]).select_dtypes(include=["number"])

Unnamed: 0,age
0,7
1,12
2,18
3,35
4,38
5,50
6,83


In [24]:
# Define categorical and numerical columns
cat_cols = df.drop(columns=["loves_cool_as_ice"]).select_dtypes(include=["object", "category"]).columns
num_cols = df.drop(columns=["loves_cool_as_ice"]).select_dtypes(include=["number"]).columns

cat_cols, num_cols

(Index(['loves_popcorn', 'loves_soda'], dtype='object'),
 Index(['age'], dtype='object'))

In [None]:
# Encode features
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
ohe.fit(X_train[cat_cols])  # Only fit on the training data, NEVER fit on the test set

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [30]:
X_train_cat = ohe.transform(X_train[cat_cols])
X_test_cat = ohe.transform(X_test[cat_cols])
X_train_cat

array([[0., 1., 1., 0.],
       [1., 0., 0., 1.],
       [0., 1., 0., 1.],
       [1., 0., 0., 1.],
       [1., 0., 1., 0.]])

In [43]:
X_train_num = X_train[num_cols].values
X_test_num = X_test[num_cols].values
X_train_num

array([[50],
       [18],
       [38],
       [35],
       [83]])

In [46]:
X_train_cat.shape, X_train_num.shape

((5, 4), (5, 1))

In [50]:
X_train

Unnamed: 0,loves_popcorn,loves_soda,age
5,Yes,No,50
2,No,Yes,18
4,Yes,Yes,38
3,No,Yes,35
6,No,No,83


In [52]:
ohe.categories_

[array(['No', 'Yes'], dtype=object), array(['No', 'Yes'], dtype=object)]

In [54]:
ohe.get_feature_names_out()

array(['loves_popcorn_No', 'loves_popcorn_Yes', 'loves_soda_No',
       'loves_soda_Yes'], dtype=object)

In [49]:
# Combine encoded features + numeric column (age) to form the complete dataset
np.hstack([X_train_cat, X_train_num])

array([[ 0.,  1.,  1.,  0., 50.],
       [ 1.,  0.,  0.,  1., 18.],
       [ 0.,  1.,  0.,  1., 38.],
       [ 1.,  0.,  0.,  1., 35.],
       [ 1.,  0.,  1.,  0., 83.]])

In [39]:
X_train_cat.shape, y_train.shape

((5, 4), (5,))

In [12]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X, y)

ValueError: could not convert string to float: 'Yes'