In [1]:
import pandas as pd
import numpy  as np
from pprint import pprint
from sklearn.model_selection import train_test_split
from winsound import PlaySound

In [2]:
def entropy(data):
    """
        entropy(data)
        = -sum val in class P(data = val) * log2(P(data = val))
    
    """
    
    total_count = len(data)
    entropy_val = 0
    
    _,counts    = np.unique(data,return_counts=True)
    for count in counts:
        Prob = count/total_count
        entropy_val += (Prob * np.log2(Prob))
        
    return -entropy_val


def information_gain(attribute,data,target = "class"):
    
    """
        Information_Gain(attribute,data)
        = entropy(data) - entropy(feature,data)
        
        entropy(feature,data) 
        = weighted sum of entropy for every value of fuature
    """
    
    total_entropy = entropy(data[target])
    
    total_count      = len(data)
    weighted_entropy = 0
    
    values,counts = np.unique(data[attribute],return_counts = True)
    for value,count in zip(values,counts):
        # data for a specific value
        data_for_value = data.loc[data[attribute] == value][target]
        weighted_entropy += (count/total_count * entropy(data_for_value))
        
    return total_entropy - weighted_entropy



In [3]:
# ID3 Algo 

def plurality_value(examples,target):
    """
        input: 
                examples - the example
                target   - the name of target
        output:
                the most occuring target in examples
    """
    values,counts = np.unique(examples[target],return_counts=True)
    max_index     = np.argmax(counts)
    return values[max_index]


def decision_tree(examples,attributes,parent_examples,depth):
    # target name is at the last column name of example
    target = examples.columns[-1]
    
    # cases
    
    # stopping criteria
    #-----------------------------------------------------
    # 1. examples is empty
    if len(examples) == 0:
        return plurality_value(parent_examples,target)
    
    # 2. all the example are same
    unique_values = np.unique(examples[target])
    if len(unique_values) <= 1:
        return unique_values[0]
    
    # 3. attributes is empty or depth limit reached
    if len(attributes) == 0 or depth == 0:
        return plurality_value(examples,target)
    #-----------------------------------------------------
    
    # 4. grow the tree
    
    #information gain for each attributes
    importance = [ information_gain(attribute,examples,target) for attribute in attributes ]
    
    # best information gain
    best_attribute = attributes[np.argmax(importance)]
    
    # tree with best feature as root
    tree = {best_attribute:{}}
    
    #attributes_minus_best = attributes - best_attribute
    attributes_minus_best = [ a for a in attributes if a != best_attribute] 
    
    # for each value in best_attribute
    for value in np.unique(examples[best_attribute]):
        
        # split the examples on attrivute values
        split_examples = examples.loc[examples[best_attribute] == value]
        
        # recursively build the subtree
        subtree = decision_tree(split_examples,attributes_minus_best,examples,depth - 1)
        
        # ddd the sub tree under root 
        tree[best_attribute][value] = subtree
    
    # if no value match then default = plurality in the node
    tree[best_attribute]["__default__"] = plurality_value(examples,target)

    return tree
    
    

In [4]:
# predict a query

def get_tree_node(query,tree):
    """
        return - matching feature node of query in the tree
    """
    for key in list(query.keys()):
        if key in list(tree.keys()):
            try:
                result = tree[key][query[key]] 
            except:
                # if the value does not exist in tree branch
                # that means new data
                # return the default branch
                return tree[key]["__default__"]
            
            # written again for python scope -_-
            result = tree[key][query[key]]
            return result 

        
def is_leaf(node):
    """
        in this design if node is a dictionary its not a leaf
        if its just a value then its the class type
    """
    return not isinstance(node,dict)


def dt_predict(query,tree):
    """
    input:
        query - in for if a dictionary {feature1: value1,feature2: value2,....}
        tree  - decsion tree in the form of dictionary of dictionaries
        
    output:
        a target class
    """
    
    # get the corresponding tree node
    node = get_tree_node(query,tree)
    
    # if the node is leaf just return the result
    if is_leaf(node):
        return node
    
    # recurse on the node 
    return dt_predict(query,node)
    


In [5]:
def dt_get_pred_true(test_examples,tree):
    # removes last column that contains class
    # converts each row as a dictionary of columns
    queries = test_examples.iloc[:,:-1].to_dict(orient='records')
    
    y_pred = [] 
    y_true = [] 
    for i in range(len(test_examples)):
        prediction = dt_predict(queries[i],tree)
        y_pred.append(prediction)
        y_true.append(test_examples.iloc[i,-1])
    
    return y_pred,y_true


        

In [6]:
# adaboost

def resample(examples,w):
    """
        n = |examples|
        resamples n value with replacement from examples
        with probability w
    """
    n = len(examples.index)
    # chooses randon indices based on a probability array
    indices = np.random.choice(range(n),n,p = w)
    return examples.iloc[indices]

        
def adaboost(examples,algo,n_rounds):
    n = len(examples.index)
    w = np.full(n, 1/n,dtype='float64')
    
    # initilize a k size array 
    h = [0 for _ in range(n_rounds)]
    z = [0 for _ in range(n_rounds)]
    for k in range(n_rounds):
        #resamples data based on weight
        data = resample(examples,w)
        
        #attributes are values of column apart from the last one
        attributes = list(examples.columns[:-1])
        h[k] = algo(data,attributes,data,1) # train stump->decsion tree with depth 1
        
        error = .0
        for j in range(n):
            query        = examples.iloc[j,:-1].to_dict()
            prediction   = dt_predict(query,h[k])
            actual_class = examples.iloc[j,-1]
            if prediction != actual_class:
                error += w[j]
        
        if error > .5:
            continue
            
        for j in range(n):
            query        = examples.iloc[j,:-1].to_dict()
            prediction   = dt_predict(query,h[k])
            actual_class = examples.iloc[j,-1]
            if prediction == actual_class:
                w[j] *= (error/(1 - error))
        
        # normalize w
        sum_w = sum(w)
        w = [ val/sum_w for val in w ]
        
        if error == .0:
            z[k] = float('inf')
        else:
            z[k] = np.log2((1 - error)/error)
    
    return (h,z)


def adaboost_predict(query,h,z):
    # stores the vote for each prediction as {p : v} dictionary
    pred_vote = dict()
    for i in range(len(h)):
        pred = dt_predict(query,h[i])
        if pred in pred_vote:
            pred_vote[pred] += z[i]
        else:
            pred_vote[pred]  = z[i]
        
    # return the key with maximum count
    # ie, prediction with maximum votes
    return max(pred_vote.keys(), key=(lambda k: pred_vote[k]))
        
            

In [7]:
def adaboost_get_pred_true(test_examples,h,z):
    # removes last column that contains class
    # converts each row as a dictionary of columns
    queries = test_examples.iloc[:,:-1].to_dict(orient='records')
    
    y_pred = [] 
    y_true = [] 
    for i in range(len(test_examples)):
        prediction = adaboost_predict(queries[i],h,z)
        y_pred.append(prediction)
        y_true.append(test_examples.iloc[i,-1])
    
    return y_pred,y_true

        

In [8]:
# performance measure
def confusion_matrix_2(y_pred,y_true,pos,neg):
    """
    inout:
        y_pred - prediction value
        y_true - true value
        pos    - positive attribute
        neg    - negative attribute
        
    output:
        a diftionary with TruePositive,FalsePositive,FalseNegative,TrueNegative
    """
    confusion_matrix = {'TP': 0 ,'FP' : 0,'FN': 0 ,'TN' : 0}
    
    for pred,true in zip(y_pred,y_true):
        if pred == pos and true == pos:
            confusion_matrix['TP'] += 1
        
        elif pred == pos and true == neg:
            confusion_matrix['FP'] += 1
            
        elif pred == neg and true == pos:
            confusion_matrix['FN'] += 1
            
        elif pred == neg and true == neg:
            confusion_matrix['TN'] += 1
            
    return confusion_matrix
        


In [9]:
# get stat
def get_stat(confusion_matrix):
    TP = confusion_matrix['TP']
    TN = confusion_matrix['TN']
    FP = confusion_matrix['FP']
    FN = confusion_matrix['FN']
    
    accuracy = (TP + TN)/(TP + TN + FP + FN)*100
    print(f'Accuracy\t\t{accuracy}%')
    
    sensitivity = TP/(TP + FN)*100
    print(f'Sensitivity\t\t{sensitivity}%')
    
    specificity = TN/(TN + FP)*100
    print(f'Specificity\t\t{specificity}%')
    
    precision = TP/(TP + FP)*100
    print(f'precision\t\t{precision}%')
    
    false_discovery_rate = FP/(FP + TP)*100
    print(f'False Discovery Rate\t{false_discovery_rate}%')
    
    f1_score = (2*TP)/(2*TP + FP + FN)*100
    print(f'F1 score\t\t{f1_score}%')
    
    
def show_dt_stat(examples,train_examples,test_examples_list,pos = 1,neg = 0):
    # Parse parameter from example
    attributes = list(examples.columns)[:-1]
    depth      = len(attributes)
    
    # Train decision tree 
    print('****Start Training****')
    tree = decision_tree(examples,attributes,train_examples,depth)
    print('****Training Done****')
    #pprint(tree)
    
    # for each specified test examples show stat
    for test_examples in test_examples_list:
        # Get prediction
        print('\n\n*******Testing*******')
        y_pred,y_true = dt_get_pred_true(test_examples,tree)
        
        # get confusion_matrix 
        print('*****Testing Done*****\n')
        print('*********Stat*********')
        get_stat(confusion_matrix_2(y_pred,y_true,pos,neg))
        print('\n')
    

def show_ab_stat(train_examples,algo,test_examples_list,n_round_list,pos = 1,neg = 0):
    # for each specified round generate stat
    for n_round in n_round_list:
        # Train adaboost
        print('******Training*******')
        h,z = adaboost(train_examples,algo,n_round)
        print('****Training Done****')
        #print(h,z)
        
        # for each specified test examples show stat
        for test_examples in test_examples_list:
            # Get prediction
            print('\n\n*******Testing*******')
            y_pred,y_true = adaboost_get_pred_true(test_examples,h,z)

            # get confusion_matrix 
            print('*****Testing Done*****\n')
            print('*********Stat*********')
            print(f'----K = {n_round}----')
            get_stat(confusion_matrix_2(y_pred,y_true,pos,neg))

    

In [10]:
# Load Example

# Dataset 1
#examples = pd.read_csv('data/churn_proc.csv')

# Dataset 2
#examples = pd.read_csv('data/adult_proc.csv')

# Dataset 2
examples = pd.read_csv('data/creditcard_proc.csv')

# Train-Test Split 
train_examples,test_examples = train_test_split(examples,test_size=0.2, random_state=1)

# show Dicsion Tree Result
show_dt_stat(examples,train_examples,[train_examples,test_examples])

# show Dicsion Tree with AdaBoost Result
#show_ab_stat(train_examples,decision_tree,[train_examples,test_examples],[10])


# play a bell sound at the and :3
PlaySound("bell.wav",False)

****Start Training****
****Training Done****


*******Testing*******
*****Testing Done*****

*********Stat*********
Accuracy		100.0%
Sensitivity		100.0%
Specificity		100.0%
precision		100.0%
False Discovery Rate	0.0%
F1 score		100.0%




*******Testing*******
*****Testing Done*****

*********Stat*********
Accuracy		100.0%
Sensitivity		100.0%
Specificity		100.0%
precision		100.0%
False Discovery Rate	0.0%
F1 score		100.0%


