In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
def highlight(s):
    """Return one CSS background rule per element of ``s``.

    Elements equal to "No" get a red background; every other element gets
    green. The result has the same length and order as ``s``, which is the
    shape ``DataFrame.style.apply`` expects from a column-wise styler.
    """
    red, green = 'background-color: red', 'background-color: green'
    styles = []
    for value in s:
        if value == "No":
            styles.append(red)
        else:
            styles.append(green)
    return styles

In [None]:
# Load the dataset, using the 'Day' column as the row index.
data = pd.read_csv("PlayTennis.csv", index_col='Day')
# Colour the PlayTennis column via highlight(): red for "No", green otherwise.
data.style.apply(highlight, subset=['PlayTennis'])

In [None]:
# Relative frequency of each PlayTennis value. Despite the name, normalize=True
# makes these proportions rather than raw counts.
target_count = data['PlayTennis'].value_counts(normalize=True)
target_count

In [None]:
# Report the empirical probability of each PlayTennis outcome.
for outcome, probability in target_count.items():
    print(f"Probability(Play={outcome}) = {probability:.4f}")

In [None]:
# Pull out the two class proportions by label; a label missing from
# target_count raises KeyError here.
prob_yes, prob_no = target_count['Yes'], target_count['No']

In [None]:
# First term of the entropy sum only: p(Yes) * log2 p(Yes).
# This is an intermediate value, not yet the entropy itself.
node_entropy = prob_yes*np.log2(prob_yes)
node_entropy

In [None]:
# Entropy of the target at the root node:
#   H = -(p_yes * log2(p_yes) + p_no * log2(p_no))
# Assigned in a single expression (instead of `+=` followed by a sign flip) so
# that re-running this cell always produces the same value.
node_entropy = -(prob_yes*np.log2(prob_yes) + prob_no*np.log2(prob_no))
node_entropy

In [None]:
# Rows where Humidity is 'High' (displayed only; the result is not stored).
data.loc[data['Humidity']=='High']

In [None]:
# Rows where Humidity is 'Normal' (displayed only; the result is not stored).
data.loc[data['Humidity']=='Normal']

In [None]:
# Keep the Humidity == 'High' subset under a name.
# NOTE(review): df_a1 is not referenced again in the cells visible here.
df_a1 = data.loc[data['Humidity']=='High']

In [None]:
# Every column label except the last one; the loops below iterate over these
# as the features.
data.columns[:-1]

In [None]:
# List the distinct values taken by each column except the last.
cols_to_show = data.columns[:-1]
for name in cols_to_show:
    distinct = data[name].unique()
    print(f"{name}: {distinct}")

In [None]:
# For every feature, print the rows that belong to each of its values.
for col in data.columns[:-1]:
    for value in data[col].unique():
        print(f"for feature: {col} --> {value}")
        # tempDf stays bound to the last subset once the loops finish.
        tempDf = data.loc[data[col] == value]
        print(tempDf)

In [None]:
def cal_entropy(df, target=None):
    """Shannon entropy (base 2) of a DataFrame's target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to measure.
    target : column label, optional
        Column holding the class labels. When omitted, the last column of
        ``df`` is used.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct values of the target column,
        where ``p`` is each value's relative frequency. A column with a single
        distinct value, or a frame with no rows, gives 0.

    Notes
    -----
    Missing values in the target column are left out of the proportions,
    because ``value_counts`` drops them by default.
    """
    if target is None:
        target = df.columns[-1]
    # Compute the class proportions once rather than re-running value_counts
    # for every class inside a loop.
    probabilities = df[target].value_counts(normalize=True)
    # Categorical columns can report unobserved categories with p == 0; drop
    # them so 0 * log2(0) never turns the result into NaN.
    probabilities = probabilities[probabilities > 0]
    # Subtracting from 0.0 keeps a pure node at +0.0 instead of -0.0.
    return 0.0 - (probabilities * np.log2(probabilities)).sum()

In [None]:
# Quick check of cal_entropy on a subset. tempDf is not defined in this cell:
# it is whatever an earlier loop assigned on its final pass, so that loop has
# to be run first.
cal_entropy(tempDf)

In [None]:
# Entropy of the target within each value of each feature.
separator = '*' * 30
for ix, col in enumerate(data.columns[:-1], start=1):
    print(separator)
    for value in data[col].unique():
        print(f"for Feature {ix}: {col} --> {value}")
        tempDf = data.loc[data[col] == value]
        E = cal_entropy(tempDf)
        print(f"Entropy E = {E:.4f}")

In [None]:
# Total number of rows in the dataset; the loops below divide subset sizes by
# this to get each subset's weight.
instances = data.shape[0]
instances

In [None]:
# Weighted average entropy of the target after splitting on each feature.
for ix, col in enumerate(data.columns[:-1], start=1):
    print('*' * 30)
    weighted_terms = []
    for value in data[col].unique():
        tempDf = data.loc[data[col] == value]
        # Each subset contributes its entropy scaled by its share of all rows.
        weighted_terms.append(cal_entropy(tempDf) * (tempDf.shape[0] / instances))
    entropy_attribute = sum(weighted_terms)
    print(f"For Feature {ix}: {col}, Entropy = {entropy_attribute:.4f}")

In [None]:
# Information gain of each feature: entropy of the whole dataset minus the
# weighted entropy that remains after splitting on that feature.
# The baseline is computed here with cal_entropy(data) instead of being read
# from a variable assembled step by step in earlier cells, so the result does
# not change if those cells are re-run and is not tied to a two-class formula.
parent_entropy = cal_entropy(data)
for ix, col in enumerate(data.columns[:-1],1):
    print('*'*30)
    entropy_attribute = 0
    for value in data[col].unique():
        tempDf = data.loc[data[col]==value]
        # Share of all rows that fall in this subset.
        weight = tempDf.shape[0]/instances
        entropy_attribute += cal_entropy(tempDf)*weight
    print(f"For Feature {ix}: {col}, IG = {parent_entropy-entropy_attribute:.4f}")

In [None]:
# Construct a cross-tabulation of Outlook against PlayTennis; margins=True
# appends an 'All' row and column holding the totals.
play_outlook = pd.crosstab(data['Outlook'], data['PlayTennis'], margins=True)
play_outlook
# Alternative view of the same counts:
# data.groupby('Outlook')['PlayTennis'].value_counts()

In [None]:
# Get the values of the last row, i.e. the 'All' totals row added by margins=True.
play_outlook.loc['All']
# Positional equivalent:
# play_outlook.iloc[-1]

In [None]:
# Print a count cross-table (with totals) of every non-target column against PlayTennis.
banner = "*****************************************"
for col in data.drop(['PlayTennis'], axis=1).columns:
    # cross_tab stays bound to the last column's table after the loop ends.
    cross_tab = pd.crosstab(data[col], data['PlayTennis'], margins=True)
    print(banner)
    print(f"Cross table for {col!r}")
    print(repr(cross_tab))
    print(banner)

In [None]:
# import seaborn as sns

In [None]:
# ctd = {}
# for col in data.drop(['PlayTennis'], axis=1).columns:
# #     print("*****************************************")
# #     print(f"Cross table for {col!r} in percentage (probability)")
#     cross_tab = pd.crosstab(data[col], data['PlayTennis'], normalize='columns')
#     ctd[col] = cross_tab.style.background_gradient(cmap='viridis')    

In [None]:
# ctd

In [None]:
# Walk the rows of cross_tab, printing each row label followed by that row.
# cross_tab is not defined in this cell: it is whatever table an earlier
# crosstab loop left behind on its final pass, so that loop has to run first.
for row in cross_tab.index:
    print(row)
    print(cross_tab.loc[row])

In [None]:
# Label of the last column.
# NOTE(review): despite the name `features`, the following cells use this
# column to compute class probabilities, i.e. as the target. Consider renaming.
features = data.columns[-1]
features

In [None]:
# Distinct values of the column named by `features`.
features_level = data[features].unique()
features_level

In [None]:
# Look up the relative frequency of each distinct value in turn.
# class_probability is overwritten on every pass, so after the loop it holds
# only the frequency of the last value in features_level.
for val in features_level:
    class_probability = data[features].value_counts(normalize=True)[val]

In [None]:
# Relative frequency of the last value visited by the loop that assigns
# class_probability.
class_probability

In [None]:
# Entropy of the last column over the full dataset.
print("Entropy of PlayTennis:", cal_entropy(data))

In [None]:
# Rows with High humidity, displayed with the PlayTennis column colour-coded
# by highlight().
df_humid1 = data.loc[data.Humidity=="High"]
df_humid1.style.apply(highlight, subset=['PlayTennis'], axis=0)

In [None]:
# Entropy of the last column within the Humidity == "High" subset.
print("Entropy of Humidity-High:", cal_entropy(df_humid1))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every column, the last (target) column included.
# One LabelEncoder instance is created here and its fit_transform is re-run on
# each column; the encoder is not bound to a name, so the fitted mappings are
# not available afterwards for decoding the integer codes.
data_encoded = data.apply(LabelEncoder().fit_transform)

data_encoded

In [None]:
import sklearn

In [None]:
# Show the installed scikit-learn version.
sklearn.__version__

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the encoded frame into features (all columns but the last) and the
# target (the last column).
feature_cols = data_encoded.columns[:-1]
target_col = data_encoded.columns[-1]
X = data_encoded[feature_cols]
y = data_encoded[target_col]

In [None]:
# Encoded feature matrix (every column of data_encoded except the last).
X

In [None]:
# Hold out 5% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): on a small dataset 5% may amount to a single test row, which
# makes the reported score very coarse. Confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

In [None]:
# Feature rows assigned to the training split.
X_train

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree using the entropy split criterion; random_state is fixed at 42
# so repeated fits give the same tree.
model = DecisionTreeClassifier(random_state=42, criterion='entropy')

In [None]:
# Manual integer mapping of Outlook, shown for inspection only: the result is
# not assigned, so neither `data` nor the encoded frame used for training changes.
data['Outlook'].map({'Overcast':1 , 'Sunny':2, 'Rain': 3})

In [None]:
# Fit the tree on the training split.
model.fit(X_train,y_train)

In [None]:
# Predicted (integer-encoded) class for each held-out row.
model.predict(X_test)

In [None]:
# Mean accuracy of the fitted tree on the held-out rows.
model.score(X_test, y_test)

In [None]:
# Column labels of every column except the last; passed to plot_tree below as
# the feature names.
features_col = data.columns[:-1]

In [None]:
from sklearn.tree import plot_tree 
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(10, 8))
# Plot the tree that was already fitted in an earlier cell instead of calling
# fit() again inside the plotting call: the refit used the same data and seed,
# so it was redundant and hid a training step inside a visualisation cell.
# feature_names is passed as a plain list because some scikit-learn releases
# validate this argument and reject a pandas Index.
# NOTE(review): class_names assumes the encoded target maps 0 -> 'No' and
# 1 -> 'Yes'. Confirm against the encoder's class order.
plot_tree(model,
          feature_names=list(features_col),
          class_names=['No', 'Yes'],
          filled=True, rounded=True)
plt.show()