In [None]:
# Mount Google Drive at /content/drive; the dataset path read later in this
# notebook lives under that mount point. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd 
import numpy as np 
import math

In [None]:
# Load the drug dataset from the mounted Drive folder into a DataFrame.
dfdata = pd.read_csv('/content/drive/MyDrive/TH1391_NLMH/Week8_Thuchanh2/drug.csv')
# Bare expression: displays the whole frame as the cell output.
dfdata

In [None]:
# Training frame: every row except the last one and every column except the first.
# Later cells read 'Drug' from it as the label and treat its last column as the class.
data_train=dfdata.iloc[:-1,1:]
data_train

In [None]:
# All column names except the first one. Note this still contains the last column,
# which other cells slice off with attrs[:-1] when they want feature names only.
attrs = dfdata.columns[1:]
attrs

In [None]:
class Tree:
  """One node of the decision tree.

  Attributes:
    observationIDs: row identifiers of the observations that reach this node.
    features: feature names still available for splitting at this node.
    currLvl: depth of the node (0 for the root).
    subTree: dict mapping a value of ``bestFeature`` to the child ``Tree``.
    bestFeature: name of the feature this node splits on, or None for a leaf.
    bestFeatureID: position of ``bestFeature`` inside an observation list;
      kept in sync automatically whenever ``bestFeature`` is assigned.
    majorityLabel: label predicted at this node.
    parentMajorityLabel: label of the parent node (used to break ties).
  """

  # Position of each known feature inside an observation list; any other
  # value (including None) maps to _DEFAULT_FEATURE_ID.
  _FEATURE_IDS = {'Age': 0, 'Sex': 1, 'BP': 2}
  _DEFAULT_FEATURE_ID = 3

  def __init__(self,observationIDs,features,currLvl=0,subTree=None,bestFeature=None,majorityLabel=None,parentMajorityLabel=None):
    self.observationIDs = observationIDs
    self.features = features
    self.currLvl = currLvl
    # A fresh dict per node: a `{}` default would be created once and then
    # shared by every node constructed without an explicit subTree.
    self.subTree = {} if subTree is None else subTree
    self.majorityLabel = majorityLabel
    self.parentMajorityLabel = parentMajorityLabel
    # Goes through the property setter, which also sets bestFeatureID.
    self.bestFeature = bestFeature

  @property
  def bestFeature(self):
    """Name of the feature this node splits on (None for a leaf)."""
    return self._bestFeature

  @bestFeature.setter
  def bestFeature(self, feature):
    # Keep the positional index consistent with the feature name so that a
    # later `node.bestFeature = name` cannot leave bestFeatureID stale.
    self._bestFeature = feature
    self.setBestFeatureID(feature)

  def setBestFeatureID(self, feature): # Choose the index of the best feature
    """Set ``bestFeatureID`` to the observation-list position of ``feature``.

    'Age' -> 0, 'Sex' -> 1, 'BP' -> 2, anything else (including None) -> 3.
    """
    self.bestFeatureID = int(self._FEATURE_IDS.get(feature, self._DEFAULT_FEATURE_ID))

In [None]:
# Prediction function
def predict(tree, obs): # predict the label of an arbitrary observation
    """Walk the tree from `tree` downwards and return the predicted label.

    `obs` is indexed with each node's `bestFeatureID`. The walk stops at the
    first node that has no split feature, or whose children do not cover the
    observation's value, and returns that node's `majorityLabel`.
    """
    node = tree
    while node.bestFeature is not None:
        branch_key = obs[node.bestFeatureID]
        if branch_key in node.subTree:
            # descend into the matching child
            node = node.subTree[branch_key]
        else:
            # value with no subtree: fall back to this node's label
            break
    return node.majorityLabel

In [None]:
# Tree display function
def displayDecisionTree(tree):
    """Print the tree rooted at `tree`, one tab of indentation per level.

    Each node prints its level and majority label; a split node additionally
    prints its feature name followed by every branch value (in sorted order)
    and the corresponding subtree.
    """
    pad = '\t' * tree.currLvl
    print(pad + '(lvl {}) {}'.format(tree.currLvl, tree.majorityLabel))
    if tree.bestFeature is None:
        return

    print(pad + '{}'.format(tree.bestFeature) + ': ')
    branch_pad = '\t' * (tree.currLvl + 1)
    for val in sorted(tree.subTree):
        print(branch_pad + 'choice: {}'.format(val))
        displayDecisionTree(tree.subTree[val])

In [None]:
# Entropy function
def Entropy(ns):
    """Return the Shannon entropy (base 2) of a list of class counts.

    Args:
        ns: iterable of non-negative counts, one per class.

    Returns:
        The entropy in bits as a float. Zero counts contribute nothing
        (the usual 0 * log(0) = 0 convention), and an empty or all-zero
        input yields 0.0 instead of raising.
    """
    ns = list(ns)
    total = sum(ns)
    if total <= 0:
        return 0.0
    entropy = 0.0
    for x in ns:
        if x <= 0:
            # math.log is undefined at 0; such a class carries no probability mass
            continue
        p = 1.0 * x / total
        entropy -= p * math.log2(p)
    return entropy

In [None]:
# ID3 - Information Gain
def IG(observationIDs, feature, dfdata):
    """Information gain of `feature` with respect to the 'Drug' column.

    Only the rows of `dfdata` selected by label through `observationIDs` are
    considered. Returns H(Drug) - H(Drug | feature), both computed with
    `Entropy` from per-label counts.
    """
    # restrict to the observations of interest
    subset = dfdata.loc[list(observationIDs)]

    # label_totals: label -> count
    # labels_by_value: feature value -> {label -> count}
    label_totals = {}
    labels_by_value = {}
    for _, record in subset.iterrows():
        drug = record['Drug'] # the target column
        label_totals[drug] = label_totals.get(drug, 0) + 1
        per_label = labels_by_value.setdefault(record[feature], {})
        per_label[drug] = per_label.get(drug, 0) + 1

    # entropy of the label before splitting
    prior_entropy = Entropy(list(label_totals.values()))

    # entropy after splitting, weighted by the share of rows in each branch
    n_rows = len(subset)
    conditional_entropy = 0.0
    for per_label in labels_by_value.values():
        tallies = list(per_label.values())
        conditional_entropy += 1.0*sum(tallies)/n_rows*Entropy(tallies)
    return prior_entropy - conditional_entropy

In [None]:
# C4.5 - Gain Ratio
def GR(observationIDs, feature, dfdata):
    """Gain ratio of `feature`: IG divided by the entropy of the feature's own values.

    Returns 0 without further work when the information gain is 0.
    """
    gain = IG(observationIDs, feature, dfdata)
    if gain == 0:
        return 0

    subset = dfdata.loc[list(observationIDs)]
    # number of selected rows per feature value
    rows_per_value = {}
    for _, record in subset.iterrows():
        key = record[feature]
        rows_per_value[key] = rows_per_value.get(key, 0) + 1

    split_entropy = Entropy(list(rows_per_value.values()))
    return float(gain)/split_entropy

In [None]:
# Build the decision tree
def fillDecisionTree(tree, decisionTreeAlgo, dfdata):
    """Recursively grow `tree` in place from the rows listed in `tree.observationIDs`.

    Args:
        tree: node to fill; its majorityLabel, bestFeature, bestFeatureID and
            subTree are updated.
        decisionTreeAlgo: 'ID3' (information gain) or 'C45' (gain ratio).
        dfdata: DataFrame holding the feature columns and the 'Drug' label;
            rows are selected by index label.

    Raises:
        ValueError: if `decisionTreeAlgo` is not 'ID3' or 'C45'.

    Notes:
        * The split feature is the one with the highest score (ties are broken
          by feature name so the result is reproducible).
        * The label column itself is never used as a split feature.
        * When the most frequent labels are tied, the parent's majority label
          is used if there is one; otherwise the first most frequent label.
    """
    if decisionTreeAlgo not in ('ID3', 'C45'):
        raise ValueError("decisionTreeAlgo must be 'ID3' or 'C45', got {!r}".format(decisionTreeAlgo))
    target = 'Drug'

    # find the majorityLabel
    df = dfdata.loc[list(tree.observationIDs)] # smaller df
    if df.empty:
        # nothing to learn from: inherit the parent's label
        tree.majorityLabel = tree.parentMajorityLabel
        return
    counts = df[target].value_counts()
    majorityLabel = counts.idxmax()
    # Tie between the most frequent labels, whatever those labels are called.
    tied = int((counts == counts.max()).sum()) > 1
    if tied and tree.parentMajorityLabel is not None:
        majorityLabel = tree.parentMajorityLabel
    tree.majorityLabel = majorityLabel

    # exit if only one label
    if len(counts) == 1:
        return
    # exit if no features left (the label column is not a feature)
    candidates = sorted(f for f in tree.features if f != target)
    if not candidates:
        return

    # find best feature: highest score wins, first name wins among equals
    scorer = IG if decisionTreeAlgo == 'ID3' else GR
    scores = {f: scorer(tree.observationIDs, f, dfdata) for f in candidates}
    bestFeature = max(candidates, key=scores.get)
    # exit if IG or GR is 0 (or negative through rounding)
    if scores[bestFeature] <= 0.0:
        return
    tree.bestFeature = bestFeature
    # keep the positional index used by predict() in sync with the chosen feature
    tree.setBestFeatureID(bestFeature)

    # remaining features for the children
    subFeatures = set(candidates) - {bestFeature}

    # group the observations by their value of the best feature,
    # reading the column by name rather than by a hard-coded position
    subObservationsDict = {}
    for obs, val in df[bestFeature].items():
        subObservationsDict.setdefault(val, set()).add(obs)

    for val, obs in subObservationsDict.items():
        child = Tree(obs, subFeatures, tree.currLvl + 1, {}, None, None, majorityLabel)
        tree.subTree[val] = child
        fillDecisionTree(child, decisionTreeAlgo, dfdata)

In [None]:
# Row identifiers of every training observation.
initialObservationIDs = set(range(len(data_train)))
# Candidate split features. `attrs` also contains the 'Drug' label column, which
# must not be offered as a feature (splitting on the label trivially "explains" it).
initialFeatures = set(attrs) - {'Drug'}

In [None]:
# Choose the algorithm
# (interactive selection kept for reference; the choice is hard-coded below)
# algoChoice = str(input(("Which decision tree algorithm would you like to use ('ID3' or 'C45)?")))
# if algoChoice not in {'ID3','C45'}:
#     print("Invalid algorithm choice. You must choose 'ID3' or 'C45'")
#     exit()

# 'C45' makes fillDecisionTree score features with GR; 'ID3' would use IG.
algoChoice = 'C45'
print("choice: {}".format(algoChoice))

In [None]:
# Build and display the tree
# Pass a fresh dict for subTree explicitly so that re-running this cell never
# reuses branches left over in a shared default dictionary.
MyTree = Tree(initialObservationIDs, initialFeatures, subTree={})
fillDecisionTree(MyTree, algoChoice, data_train) # grow the tree on the training rows

print('My Decision Tree:')
displayDecisionTree(MyTree)

In [None]:
# Draw the tree
from IPython.display import Image
import pydotplus
import graphviz

def visualize_tree(tree, feature_names=attrs[:-1], class_names=data_train[data_train.columns[-1]].unique(), label='root'):
    """Build a graphviz Digraph for the tree rooted at `tree` and return it.

    A node without a split feature becomes a green plaintext node showing its
    majority label. A split node becomes a blue box showing the feature name,
    with one labelled edge per branch value to the recursively built child.
    `label` is the graph node id; child ids are "<label>-<value>".
    `feature_names` and `class_names` are only forwarded to recursive calls.
    """
    def _styled_graph(shape, fill):
        # common setup: filled nodes on a transparent PNG canvas
        styled = graphviz.Digraph(node_attr={'shape': shape, 'style': 'filled', 'fillcolor': fill}, format='png')
        styled.attr(bgcolor='transparent')
        return styled

    if tree.bestFeature is None:
        leaf = _styled_graph('plaintext', 'limegreen')
        leaf.node(label, f"{tree.majorityLabel}")
        return leaf

    dot = _styled_graph('box', '#00B2FC')
    dot.node(label, tree.bestFeature)

    for val in tree.subTree:
        child_label = f"{label}-{val}"
        child_graph = visualize_tree(tree.subTree[val], feature_names, class_names, child_label)
        dot.subgraph(child_graph)
        dot.edge(label, child_label, label=f"{val}", color='black')

    return dot

# Build the graph for the fitted tree, hand its DOT source to pydotplus,
# and show the rendered PNG as the cell output.
dot = visualize_tree(MyTree)
graph = pydotplus.graph_from_dot_data(dot.source)
Image(graph.create_png())

In [None]:
# Predict for the second-to-last row of data_train, using every column but the last.
# NOTE(review): the name X_p14 suggests the 14th patient, but iloc[-2] is the row
# before the last training row — confirm whether iloc[-1] was intended.
X_p14 = data_train.iloc[-2, :-1]
obs_row = X_p14.tolist()
pred = predict(MyTree, obs_row)
print(f'Prediction for {obs_row}: {pred}')

In [None]:
# Display the full dataset again for reference.
dfdata

In [None]:
# Training rows with the last column removed, i.e. the inputs passed to predict().
data_test_train = data_train.iloc[:, :-1]
# data_test_train

In [None]:
# Predict every training row and print the result next to its feature values.
for _, row in data_test_train.iterrows():
    # Convert the row to a plain list by position. Integer keys such as row[0]
    # on a Series with string labels rely on a deprecated positional fallback.
    obs_row = row.tolist()
    pred = predict(MyTree, obs_row)
    print(f'Prediction for {obs_row}: {pred}')

In [None]:
# Predict for the row at position 8 of data_train (the 9th row), features only.
X_p9 = data_train.iloc[8, :-1]
obs_row = X_p9.tolist()
pred = predict(MyTree, obs_row)
print(f'Prediction for {obs_row}: {pred}')

In [None]:
# Predict for the second-to-last row of data_train, features only.
# NOTE(review): this cell repeats an earlier cell verbatim, and iloc[-2] may not be
# the row the name X_p14 implies — confirm the intended position.
X_p14 = data_train.iloc[-2, :-1]
obs_row = X_p14.tolist()
pred = predict(MyTree, obs_row)
print(f'Prediction for {obs_row}: {pred}')

In [None]:
# Predict p15
# Takes the row at position 14 of the full dataset, dropping the first and last columns.
X_p15 = dfdata.iloc[14, 1:-1]
obs_row = X_p15.tolist()
pred = predict(MyTree, obs_row)
print(f'Prediction for {obs_row}: {pred}')