In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

In [3]:
dfdata = pd.read_csv('/content/drive/MyDrive/BaiKiemTra/activity.csv')
dfdata.head()

Unnamed: 0,ID,Lazy,Deadline,Party,Activity (class)
0,1,yes,Urgent,yes,Party
1,2,yes,Urgent,no,Study
2,3,yes,Near,yes,Party
3,4,no,,yes,Party
4,5,yes,,no,Pub


In [4]:
data_train=dfdata.iloc[:,1:]
data_train.head(5)

Unnamed: 0,Lazy,Deadline,Party,Activity (class)
0,yes,Urgent,yes,Party
1,yes,Urgent,no,Study
2,yes,Near,yes,Party
3,no,,yes,Party
4,yes,,no,Pub


In [5]:
attrs = dfdata.columns[1:]
attrs

Index(['Lazy', 'Deadline', 'Party', 'Activity (class)'], dtype='object')

In [6]:
class Tree:
  def __init__(self,observationIDs,features,currLvl=0,subTree={},bestFeature=None,majorityLabel=None,parentMajorityLabel=None):
    self.observationIDs = observationIDs
    self.features = features
    self.currLvl = currLvl
    self.subTree = subTree
    self.bestFeature = bestFeature
    self.majorityLabel = majorityLabel
    self.parentMajorityLabel = parentMajorityLabel
    self.setBestFeatureID(bestFeature)

  def setBestFeatureID(self, feature):
    idx = None
    if feature == 'Lazy':
      idx = 0
    elif feature == 'Deadline':
      idx = 1
    elif feature == 'Party':
      idx = 2
    else:
      idx = 3
    self.bestFeatureID = int(idx)

In [7]:
def predict(tree, obs):
	if tree.bestFeature == None:
		return tree.majorityLabel
	featVal = obs[tree.bestFeatureID]
	if not featVal in tree.subTree: # val with no subtree
		return tree.majorityLabel
	else: # recurse on subtree
		return predict(tree.subTree[featVal],obs)

In [8]:
def displayDecisionTree(tree):
	print('\t'*tree.currLvl + '(lvl {}) {}'.format(tree.currLvl,tree.majorityLabel))
	if tree.bestFeature == None:
		return

	print('\t'*tree.currLvl + '{}'.format(tree.bestFeature) + ': ')
	for [val,subTree] in sorted(tree.subTree.items()):
		print('\t'*(tree.currLvl+1) + 'choice: {}'.format(val))
		displayDecisionTree(subTree)

In [9]:
import math
def Entropy(ns):
	entropy = 0.0
	total = sum(ns)
	for x in ns:
		entropy += -1.0*x/total*math.log(1.0*x/total,2)
	return entropy

In [10]:
def IG(observationIDs, feature, dfdata):
	# get smaller dataframe
	df = dfdata.loc[list(observationIDs)]
	# populate counts for Wins/Losses for each category of the feature
	labelCountDict = {}
	valueLabelCountDict = {}
	for index, row in df.iterrows():
		label = row['Activity (class)']
		if not label in labelCountDict:
			labelCountDict[label] = 0 # this specific label was not found so insert 0 count
		labelCountDict[label] += 1
		featureValue = row[feature]
		if not featureValue in valueLabelCountDict:
			valueLabelCountDict[featureValue] = {} # this specific feature value not found so insert empty dict
		if not label in valueLabelCountDict[featureValue]:
			valueLabelCountDict[featureValue][label] = 0 # this specific label was not found for this feature value so insert 0 count
		valueLabelCountDict[featureValue][label] += 1

	ns = []
	for [label,count] in labelCountDict.items():
		ns.append(count)

	H_Y = Entropy(ns)

	H_Y_X = 0.0
	for [featureValue, labelCountDict] in valueLabelCountDict.items():
		nsHYX = []
		for [label,count] in labelCountDict.items():
			nsHYX.append(count)
		H_Y_X += 1.0*sum(nsHYX)/len(df)*Entropy(nsHYX)
	return H_Y - H_Y_X

In [11]:
def GR(observationIDs, feature,dfdata):
	ig = IG(observationIDs,feature)
	if ig == 0:
		return 0
	df = dfdata.loc[list(observationIDs)]
	valueLabelDict = {}
	for index, row in df.iterrows():
		label = row['Activity (class)']
		featureValue = row[feature]
		if featureValue not in valueLabelDict:
			valueLabelDict[featureValue] = 0
		valueLabelDict[featureValue] += 1
	ns = []
	for [val,count] in valueLabelDict.items():
		ns.append(count)
	ent = Entropy(ns)
	return float(ig)/ent

In [12]:
def fillDecisionTree(tree, decisionTreeAlgo, dfdata):
    # find the majorityLabel
    df = dfdata.loc[list(tree.observationIDs)]  # smaller df
    counts = df['Activity (class)'].value_counts()
    majorityLabel = df['Activity (class)'].value_counts().idxmax()
    if len(counts) > 1:
        if 'Party' in counts and 'Study' in counts and 'Pub' in counts:  # Kiểm tra nếu 'Drug A' và 'Drug B' có trong counts
            if counts['Party'] == counts['Study'] == counts['Pub']:
                majorityLabel = tree.parentMajorityLabel
    tree.majorityLabel = majorityLabel

    # exit if only one label
    if len(counts) == 1:
        return
    # exit if no features left
    if len(tree.features) == 0:
        return

    # find best feature
    featureValueDict = {}
    for feature in tree.features:
        if decisionTreeAlgo == 'ID3':
            metricScore = IG(tree.observationIDs, feature, dfdata)
        if decisionTreeAlgo == 'C45':
            metricScore = GR(tree.observationIDs, feature, dfdata)
        featureValueDict[feature] = metricScore
    bestFeature, bestFeatureValue = sorted(featureValueDict.items(), reverse=True)[0]
    # exit if IG or GR is 0
    if bestFeatureValue == 0.0:
        return
    tree.bestFeature = bestFeature

    # find subset of features
    subFeatures = set()
    for feature in tree.features:
        if feature == bestFeature:  # skip the current best feature
            continue
        subFeatures.add(feature)

    # find best feature id
    bestFeatureIdx = 0
    if bestFeature == 'Lazy':
        bestFeatureIdx = 0
    elif bestFeature == 'Deadline':
        bestFeatureIdx = 1
    elif bestFeature == 'Party':
        bestFeatureIdx = 2
    else:
        bestFeatureIdx = 3

    # find subset of observations
    subObservationsDict = {}
    for obs in tree.observationIDs:
        val = dfdata.values[obs][bestFeatureIdx]
        if val not in subObservationsDict:
            subObservationsDict[val] = set()
        subObservationsDict[val].add(obs)

    for [val, obs] in subObservationsDict.items():
        tree.subTree[val] = Tree(obs, subFeatures, tree.currLvl + 1, {}, None, None, majorityLabel)

        fillDecisionTree(tree.subTree[val], decisionTreeAlgo, dfdata)

In [13]:
initialObservationIDs = set(range(len(data_train)))
initialFeatures = set(attrs[:-1])

In [14]:
algoChoice = 'ID3'
print("choice: {}".format(algoChoice))

choice: ID3


In [15]:
MyTree = Tree(initialObservationIDs, initialFeatures)
fillDecisionTree(MyTree, algoChoice, data_train)

print('My Decision Tree:')
displayDecisionTree(MyTree)

My Decision Tree:
(lvl 0) Party
Party: 
	choice: no
	(lvl 1) Study
	Lazy: 
		choice: no
		(lvl 2) Study
		choice: yes
		(lvl 2) Study
		Deadline: 
			choice: Near
			(lvl 3) Watch TV
			choice: None
			(lvl 3) Pub
			choice: Urgent
			(lvl 3) Study
	choice: yes
	(lvl 1) Party


In [16]:
X = dfdata.iloc[:, 1:-1]
X

Unnamed: 0,Lazy,Deadline,Party
0,yes,Urgent,yes
1,yes,Urgent,no
2,yes,Near,yes
3,no,,yes
4,yes,,no
5,no,,yes
6,no,Near,no
7,yes,Near,no
8,yes,Near,yes
9,no,Urgent,no


In [17]:
y = dfdata.iloc[:, -1]  # cột cuối là nhãn
y

0       Party
1       Study
2       Party
3       Party
4         Pub
5       Party
6       Study
7    Watch TV
8       Party
9       Study
Name: Activity (class), dtype: object

In [18]:
from sklearn.preprocessing import LabelEncoder

# Dictionary để lưu trữ mapping của các cột đã được mã hóa
encoded_values = {}

# Tạo và copy data_train
X_le = X.copy()

# Áp dụng LabelEncoder cho từng cột và lấy các giá trị đã được mã hóa
for col in X_le:
    labelencoder_X = LabelEncoder()
    encoded_labels = labelencoder_X.fit_transform(X[col])
    encoded_values[col] = labelencoder_X.classes_
    X_le[col] = encoded_labels

# In Mapping of encoded values
for col, values in encoded_values.items():
    print(f"Mapping of encoded values for column {col}:")
    for i, value in enumerate(values):
        print(f"{value} -> {i}")
    print()

Mapping of encoded values for column Lazy:
no -> 0
yes -> 1

Mapping of encoded values for column Deadline:
Near -> 0
None -> 1
Urgent -> 2

Mapping of encoded values for column Party:
no -> 0
yes -> 1



In [19]:
y = dfdata.iloc[:, -1]  # cột cuối là nhãn
y

0       Party
1       Study
2       Party
3       Party
4         Pub
5       Party
6       Study
7    Watch TV
8       Party
9       Study
Name: Activity (class), dtype: object

In [20]:
labelencoder_y = LabelEncoder()
encoded_labels_y = labelencoder_y.fit_transform(y)
y_le = encoded_labels_y

In [21]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_le, y_le)

In [22]:
feature_names = X_le.columns.tolist()
class_names = ['Party', 'Study']

In [23]:
feature_names

['Lazy', 'Deadline', 'Party']

In [24]:
class_names

['Party', 'Study']

In [25]:
import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(dt,feature_names=feature_names,
                                class_names=class_names,
                                rounded=True,filled=True)
graph = graphviz.Source(dot_data, format="png")
graph

IndexError: ignored