In [None]:
import math

def entropy(p1):
  """Binary entropy, in bits, of a two-outcome distribution.

  Args:
    p1: probability of the first outcome; the second is taken as 1 - p1.

  Returns:
    0 when p1 equals 1 or is not greater than 0 (no uncertainty), otherwise
    -(p1 * log2(p1) + (1 - p1) * log2(1 - p1)).
  """
  # Degenerate distributions carry no uncertainty; returning early also keeps
  # log2 away from a zero argument.
  if p1 == 1 or not p1 > 0:
    return 0
  complement = 1 - p1
  return - (p1 * math.log2(p1) + complement * math.log2(complement))

In [None]:
from google.colab import drive, files
import os

# Google Drive folder (as seen from inside Colab) that becomes the working directory.
PROJECT_FOLDER = "/content/gdrive/MyDrive/Colab Notebooks"
# Mount Drive first; the folder above only exists once the mount has succeeded.
drive.mount('/content/gdrive/')
# Switch into the project folder so relative paths (e.g. "eyes.csv") resolve there.
os.chdir(PROJECT_FOLDER)
print("Current dir: ", os.getcwd())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np

In [None]:
# Load the dataset from the current working directory and preview its first rows.
df = pd.read_csv("eyes.csv")
df.head()

In [None]:
# One-hot encode 'Age' into integer 0/1 indicator columns named "Age_<value>".
# NOTE: this rebinds `df`; from here on the frame has the indicator columns
# instead of the original 'Age' column.
df = pd.get_dummies(df, columns=['Age'],prefix=["Age"], dtype=int)
df.head()

Exercise B

In [None]:
def eye_attr_entropy_calc(data, attr, subset_param):
  """Entropy (in bits) of the 'Class' label among rows where data[attr] == subset_param.

  Args:
    data: DataFrame containing the `attr` column and a 'Class' column.
    attr: name of the column to filter on.
    subset_param: value of `attr` that selects the subset.

  Returns:
    0 when the subset is empty or holds no 'A' rows or no 'B' rows; otherwise
    -(y1 * log2(y1) + y2 * log2(y2)), where y1 and y2 are the fractions of the
    subset labelled 'A' and 'B'.
  """
  subset = data[data[attr] == subset_param]
  s = subset.shape[0]
  s1 = subset[subset['Class'] == 'A'].shape[0]
  s2 = subset[subset['Class'] == 'B'].shape[0]
  # Check before dividing: an empty subset has s == 0 (and therefore s1 == 0),
  # which previously raised ZeroDivisionError. Empty and single-class subsets
  # both have zero entropy.
  if s1 == 0 or s2 == 0:
    return 0
  y1 = s1 / s
  y2 = s2 / s
  return - (y1 * np.log2(y1) + y2 * np.log2(y2))


In [None]:
# Class entropy among rows whose Age_Middle-aged indicator equals 1.
eye_attr_entropy_calc(df, "Age_Middle-aged", 1)




In [None]:
# Class entropy among rows with Vision == "Myopia".
eye_attr_entropy_calc(df, "Vision", "Myopia")

In [None]:
# Class entropy among rows with Vision == "Farsightedness".
eye_attr_entropy_calc(df, "Vision", "Farsightedness")

In [None]:
# Class entropy among rows with Astigmatism == "Yes".
eye_attr_entropy_calc(df, "Astigmatism", "Yes")

In [None]:
# Class entropy among rows with UseOfGlasses == "Rare".
eye_attr_entropy_calc(df, "UseOfGlasses", "Rare")

Exercise C

In [None]:
# Baseline: entropy of the 'Class' label over the whole dataset, before any split.
total_instances = len(df)
print(total_instances)

# Row counts per class label.
class_a_instances = len(df[df['Class'] == 'A'])
class_b_instances = len(df[df['Class'] == 'B'])

# Class proportions; p2 is printed for inspection only, entropy() takes p1 alone.
p1 = class_a_instances / total_instances
p2 = class_b_instances / total_instances
print(p1)
print(p2)

initial_entropy = entropy(p1)
print(f"Initial Entropy: {initial_entropy}")


In [None]:
# Class proportions inside the Astigmatism == 'Yes' subset, printed for inspection only.
subset = df[df['Astigmatism'] == 'Yes']
s = subset.shape[0]
s1 = subset[subset['Class'] == 'A'].shape[0]
s2 = subset[subset['Class'] == 'B'].shape[0]
y1 = s1 / s
y2 = s2 / s
print(y1)
print(y2)

# Information gain = H(S) - sum_v (|S_v| / |S|) * H(S_v).
# Each branch entropy must be weighted by the share of ALL rows that take that
# attribute value. The class proportions y1/y2 above describe a single branch
# and are not valid weights, so they are no longer used in the formula.
astigmatism_weights = df['Astigmatism'].value_counts(normalize=True)
astigmatism_split_entropy = sum(
  weight * eye_attr_entropy_calc(df, "Astigmatism", value)
  for value, weight in astigmatism_weights.items()
)
Information_Gain_Astigmatism = initial_entropy - astigmatism_split_entropy
print(f"Information Gain Astigmatism: {Information_Gain_Astigmatism}")

In [None]:
# Class proportions inside the Age_Middle-aged == 1 subset, printed for inspection only.
subset = df[df['Age_Middle-aged'] == 1]
s = subset.shape[0]
s1 = subset[subset['Class'] == 'A'].shape[0]
s2 = subset[subset['Class'] == 'B'].shape[0]
y1 = s1 / s
y2 = s2 / s

print(y1)
print(y2)

# Information gain = H(S) - sum_v (|S_v| / |S|) * H(S_v).
# 'Age' was one-hot encoded into "Age_<value>" indicator columns, so every such
# column is one branch of the split. Summing over all of them (instead of two
# hard-coded ones) keeps any further age level in the calculation, and each
# branch is weighted by the fraction of all rows whose indicator equals 1.
# NOTE(review): assumes no unrelated column name starts with "Age_" — confirm.
age_columns = [column for column in df.columns if column.startswith("Age_")]
age_split_entropy = sum(
  (df[column] == 1).mean() * eye_attr_entropy_calc(df, column, 1)
  for column in age_columns
)
Information_Gain_Age = initial_entropy - age_split_entropy
print(f"Information Gain Age: {Information_Gain_Age}")

In [None]:
# Information gain of splitting on 'Vision': H(S) - sum_v (|S_v| / |S|) * H(S_v).
# Each branch entropy is weighted by the share of all rows taking that value.
# The previous version weighted by the class proportions inside the 'Myopia'
# subset, which is not the information-gain formula; that unused subset
# computation has been removed.
vision_weights = df['Vision'].value_counts(normalize=True)
vision_split_entropy = sum(
  weight * eye_attr_entropy_calc(df, "Vision", value)
  for value, weight in vision_weights.items()
)
Information_Gain_Vision = initial_entropy - vision_split_entropy
print(f"Information Gain Vision: {Information_Gain_Vision}")

In [None]:
# Information gain of splitting on 'UseOfGlasses': H(S) - sum_v (|S_v| / |S|) * H(S_v).
# Each branch entropy is weighted by the share of all rows taking that value, and
# every value present in the column is included. The previous version weighted by
# the class proportions inside the 'Rare' subset, which is not the
# information-gain formula; that unused subset computation has been removed.
use_of_glasses_weights = df['UseOfGlasses'].value_counts(normalize=True)
use_of_glasses_split_entropy = sum(
  weight * eye_attr_entropy_calc(df, "UseOfGlasses", value)
  for value, weight in use_of_glasses_weights.items()
)
Information_Gain_UseOfGlasses = initial_entropy - use_of_glasses_split_entropy
print(f"Information Gain Use Of Glasses: {Information_Gain_UseOfGlasses}")

In [None]:
# One-hot encode the remaining string/categorical columns as integer indicators,
# dropping the first level of each (presumably turning 'Class' into the single
# 'Class_B' column used by the following cells).
# NOTE(review): this rebinds `df`, so earlier cells that filter on string values
# such as df['Class'] == 'A' will not work if re-run after this cell — consider
# assigning the encoded frame to a new name.
df = pd.get_dummies(df, drop_first = True, dtype=int)


In [None]:
# Split the dataset into features (X) and target (y)
# 'Class_B' is the 0/1 target indicator (presumably 1 = class B); every other
# column is used as a feature.
X = df.drop('Class_B', axis=1)
y = df['Class_B']

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
y

Exercise D

In [None]:
# Fit a decision tree that splits on the entropy criterion, limited to depth 5,
# on the full X / y (no hold-out split in this cell). random_state is fixed so
# the fitted tree is reproducible across runs.
ClassAorB_tree = DecisionTreeClassifier(random_state=2, max_depth = 5, criterion = "entropy")
ClassAorB_tree.fit(X, y)

In [None]:
# Plot the decision tree
# class_names are matched to the target values in ascending order, so target 0
# is labelled 'ClassA' and target 1 'ClassB'.
plt.figure(figsize=(8, 8))
plot_tree(ClassAorB_tree, feature_names=X.columns, class_names=['ClassA', 'ClassB'], filled=True, rounded=True)
plt.show()

Exercise E

In [None]:
# Tune max_depth with a 10-fold cross-validated grid search scored on accuracy.
# NOTE(review): no `criterion` is passed here, so scikit-learn's default applies,
# whereas the Exercise D tree uses criterion="entropy" (and random_state=2) —
# confirm the difference is intended.
dt_model = DecisionTreeClassifier(random_state=42)
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7]}

grid_search = GridSearchCV(dt_model, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X, y)

# Get best hyperparameter and accuracy
# best_score_ is the mean cross-validated accuracy of the best max_depth.
best_depth = grid_search.best_params_['max_depth']
best_accuracy = grid_search.best_score_

# Print results
print(f"Best max_depth: {best_depth}")
print(f"10-Fold Cross-Validation Accuracy: {best_accuracy:.3f}")