In [551]:
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score # Import train_test_split function
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

In [552]:
# function to save the observed values (to be compared later)
def save_observations(dt,flag):
    file1 = open('./results.txt','a')

    if(flag>=0):
        file1.write(str(dt.formulation)+'\t\t\t'
                    +str(dt.criterion)+'\t\t\t'
                    +str(dt.max_depth)+'\t\t\t'
                    +str(dt.max_features)+'\t\t\t'
                    +str(dt.accuracy)+'\t\t\t'
                    +str(dt.f1_micro)+'\t\t\t'
                    +str(dt.f1_macro)+'\t\t\t'
                    +str(dt.precision)+'\t\t\t'
                    +str(dt.recall)+'\n')
    else:
        file1.write('\n')
    file1.close()


In [553]:
# class for powerset formulation
class DecisionTree:
    
    def __init__(self,max_depth, max_features, criterion,formulation):
        self.formulation = formulation
        self.max_depth = max_depth
        self.max_features = max_features
        self.criterion = criterion
        self.classtoid = {
            'electronics':0,
            'clothing':1,
            'sports':2,
            'furniture':3,
            'beauty':4,
            'food':5,
            'home':6,
            'books':7
        }
        
    def loadDataset(self, file_name):
        self.data = pd.read_csv(file_name)
        
    def generatePowerset(self):
        # assigning 8 labels to the 8 simple classes
        labels = [0,1,2,3,4,5,6,7]
        n = len(labels)
        
        # creating a powerset (list of sets)
        power_set = []
        
        for i in range(2**n):
            subset = set()
            for j in range(n):
                if (i >> j) & 1:
                    val = labels[j]
                    subset.add(val)
            power_set.append(subset)

        # returns the list of sets - powerset (each set is a class)
        return power_set
    
    def convertCategoricalToLabels(self,list1):
        labels = set()
        index = 0
        mapping = {} 
        
        for element in list1:
            if(isinstance(element,set)):
                labels.add(frozenset(element))
            else:
                labels.add(element)
            
        for element in labels:
            # assigning integer labels to categorical data instances
            mapping[element] = index
            index = index + 1
        # if(isinstance(list1[0],set)):
        #     print('id1 = ',str(mapping[frozenset({'beauty','books'})]))
        #     print('id2 = ',str(mapping[frozenset({'books','beauty'})]))
            
        # for each row entry of the column, append its id to a list
        labels_list = []
        for element in list1:
            if(isinstance(element,set)):
                labels_list.append(mapping[frozenset(element)])
            else:
                labels_list.append(mapping[element])

        return labels_list
    
    def getMultioutput(self):
        labels = self.data['labels']
        labels_list = []
        for label in labels:
            words = label.split()
            val = 0
            for word in words:
                i = self.classtoid[word]
                val = val + 2**i
            labels_list.append(val)
        return labels_list
    
    def fit_powerset(self):
        # creating a classifier object
        clf = DecisionTreeClassifier(
            criterion=self.criterion,
            max_depth = self.max_depth,
            max_features = self.max_features
        )
        
        # converting X_train to numerical features (where required -> object only)
        features = ['age', 'gender', 'income', 'education', 'married', 'children', 'city', 'occupation', 'purchase_amount', 'most bought item']
        
        for feature in features:    
            col_data_type = self.data[feature].dtype
            if(col_data_type=='object'):
                converted = self.convertCategoricalToLabels(self.data[feature])
                self.data[feature] = converted
        
        X_train = self.data[features]
        
        # now converting Y_train to numerical data (max 256 values)
        ps = self.generatePowerset()
        
        column_index = 10 # column of Y_train
        labels_list = []
        for label in self.data.iloc[:,column_index]:
            labels_list.append(label)
        
        Y_train = []
        for label in labels_list:
            words = label.split()
            temp = set()
            for word in words:    
                index = self.classtoid[word]
                temp.add(index)
            Y_train.append(temp)
            
        Y_train = self.convertCategoricalToLabels(Y_train)
        
        
        self.X = X_train
        self.Y = Y_train
        
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)
        
        # now fitting the classifier with obtained X_train and Y_train and storing it
        # print(X_train)
        # print(Y_train)
        clf.fit(self.X_train,self.Y_train)
        self.classifier = clf
        
        
    def fit_multioutput(self):
        # creating a classifier object
        clf = DecisionTreeClassifier(
            criterion=self.criterion,
            max_depth = self.max_depth,
            max_features = self.max_features
        )
        
        # converting X_train to numerical features (where required -> object only)
        features = ['age', 'gender', 'income', 'education', 'married', 'children', 'city', 'occupation', 'purchase_amount', 'most bought item']
        
        for feature in features:    
            col_data_type = self.data[feature].dtype
            if(col_data_type=='object'):
                converted = self.convertCategoricalToLabels(self.data[feature])
                self.data[feature] = converted
        
        X_train = self.data[features]
        
        # now converting Y_train to numerical data (max 256 values)
        Y_train = self.getMultioutput()
        
        self.X = X_train
        self.Y = Y_train
        
                
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)
        
        # now fitting the classifier with obtained X_train and Y_train and storing it
        # print(X_train)
        # print(Y_train)
        clf.fit(self.X_train,self.Y_train)
        self.classifier = clf


        
    
    def fit(self):
        if(self.formulation=='powerset'):
            self.fit_powerset()
        else:
            self.fit_multioutput()    
    
    def _predict(self):
        self.Y_pred = self.classifier.predict(self.X_test)
        # print(self.Y_test)
        # print(Y_pred)
        # bool_list = (self.Y_test==self.Y_pred)
        # int_list = [1 if value else 0 for value in bool_list]
        # print('accuracy = ',str(100*sum(int_list)/len(int_list)),'%')

    def calc_performance(self):
        self.accuracy = round(accuracy_score(self.Y_test, self.Y_pred),3)
        self.f1_micro = round(f1_score(self.Y_test, self.Y_pred, average='micro',zero_division=1),3)
        self.f1_macro = round(f1_score(self.Y_test, self.Y_pred, average='macro',zero_division=1),3)
        self.precision = round(precision_score(self.Y_test, self.Y_pred, average='macro',zero_division=1),3)
        self.recall = round(recall_score(self.Y_test, self.Y_pred, average='macro',zero_division=1),3)
        self.confusion_mat = confusion_matrix(self.Y_test, self.Y_pred)
        
    def print_performance(self):
        print("Accuracy:", self.accuracy)
        print("F1 Score (Micro):", self.f1_micro)
        print("F1 Score (Macro):", self.f1_macro)
        print("Precision:", self.precision)
        print("Recall:", self.recall)
        print("Confusion Matrix:")
        print(self.confusion_mat)
        
    def getKfold(self,k):
        
        kth_fold_metric = cross_val_score(self.classifier, self.X, self.Y, cv=k, scoring='accuracy')

        # Print the accuracy for each fold and the mean accuracy
        for i, acc in enumerate(kth_fold_metric, 1):
            print(f"Fold {i}: Accuracy = {acc:.4f}")

        # Calculate and print the mean accuracy across all folds
        avg_acc = np.mean(kth_fold_metric)
        print(f"Mean Accuracy across {k}-fold cross-validation: {avg_acc:.4f}")
        return avg_acc

        

In [554]:
# creating dt object
dt1 = DecisionTree(5,5,'entropy','multioutput')
dt1.loadDataset('./advertisement.csv')
dt1.fit()



In [555]:

dt1._predict()
dt1.calc_performance()
dt1.print_performance()



Accuracy: 0.06
F1 Score (Micro): 0.06
F1 Score (Macro): 0.013
Precision: 0.81
Recall: 0.078
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [556]:
dt1.getKfold(24)



Fold 1: Accuracy = 0.0476
Fold 2: Accuracy = 0.0476
Fold 3: Accuracy = 0.0476
Fold 4: Accuracy = 0.0238
Fold 5: Accuracy = 0.0000
Fold 6: Accuracy = 0.0714
Fold 7: Accuracy = 0.0238
Fold 8: Accuracy = 0.0476
Fold 9: Accuracy = 0.0476
Fold 10: Accuracy = 0.0238
Fold 11: Accuracy = 0.0476
Fold 12: Accuracy = 0.1190
Fold 13: Accuracy = 0.0238
Fold 14: Accuracy = 0.0714
Fold 15: Accuracy = 0.0238
Fold 16: Accuracy = 0.0238
Fold 17: Accuracy = 0.0732
Fold 18: Accuracy = 0.0488
Fold 19: Accuracy = 0.0732
Fold 20: Accuracy = 0.0000
Fold 21: Accuracy = 0.0244
Fold 22: Accuracy = 0.0244
Fold 23: Accuracy = 0.0000
Fold 24: Accuracy = 0.0244
Mean Accuracy across 24-fold cross-validation: 0.0399


0.039948703058459156

In [505]:
form = ['powerset','multioutput']
crit = ['gini','entropy']
max_dp = [3,5,10,20,30]
max_feat = [3,5,7,9,10]

for fr in form:
    for c in crit:
        for d in max_dp:
            for f in max_feat:
                dt1.formulation = fr
                dt1.criterion = c
                dt1.max_depth = d
                dt1.max_features = f
                dt1.fit()
                dt1._predict()
                dt1.calc_performance()
                save_observations(dt1,1)