## Import all libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

## Read data

In [2]:
# Load the feature table from a CSV file in the current working directory.
df = pd.read_csv('features.csv')

In [3]:
# df = df.rename(columns={'Apple':'label'})
# NOTE(review): `df.head()` is not the last expression of this cell, so its
# preview is not rendered; only the row count printed below is shown.
df.head()
# Number of rows in the dataset.
print(len(df))

521


In [4]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 521 entries, 0 to 520
Data columns (total 5 columns):
avgRGB         521 non-null float64
area           521 non-null float64
perimeter      521 non-null int64
no_of_edges    521 non-null int64
label          521 non-null object
dtypes: float64(2), int64(2), object(1)
memory usage: 20.5+ KB


### Train-test split

In [5]:
def train_test_split(df, test_size):
    """Randomly partition ``df`` into a training and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are selected by index label.
    test_size : float or int
        A float is read as the proportion of rows to hold out (rounded to a
        whole number of rows); any other value is used directly as the number
        of test rows.

    Returns
    -------
    (train_df, test_df)
        ``test_df`` holds the sampled rows in sampling order, ``train_df``
        the remaining rows.

    Notes
    -----
    Sampling uses the global ``random`` module state, so call
    ``random.seed(...)`` beforehand for a reproducible split.
    """
    if isinstance(test_size, float):
        n_test_rows = round(test_size * len(df))
    else:
        n_test_rows = test_size

    all_labels = df.index.tolist()
    held_out_labels = random.sample(population=all_labels, k=n_test_rows)

    held_out = df.loc[held_out_labels]
    remaining = df.drop(held_out_labels)
    return remaining, held_out

In [6]:
# Seed the `random` module so the sampled test rows are the same on every run,
# then hold out 40% of the rows as the test set.
random.seed(0)
train_df,test_df = train_test_split(df,.40)

In [7]:
# Preview the first training rows.
train_df.head()

Unnamed: 0,avgRGB,area,perimeter,no_of_edges,label
1,142.0,2784.0,205,9,Apple
2,128.0,2903.5,205,11,Apple
3,127.0,2940.0,208,12,Apple
4,126.0,2934.0,208,13,Apple
5,126.0,2908.5,207,12,Apple


In [8]:
# Preview the first test rows (kept in the order they were sampled).
test_df.head()

Unnamed: 0,avgRGB,area,perimeter,no_of_edges,label
394,131.0,2273.5,186,12,Avocado
430,143.0,2529.5,192,12,Mango
41,145.0,2530.0,205,14,Apple
265,133.0,2930.0,204,14,Pear
497,148.0,2442.5,194,14,Mango


## Helper Functions

### Data purity

In [9]:
# Work on the raw NumPy array: the helper functions below index it by column
# position and read the label from the last column.
data = train_df.values
data

array([[142.0, 2784.0, 205, 9, 'Apple'],
       [128.0, 2903.5, 205, 11, 'Apple'],
       [127.0, 2940.0, 208, 12, 'Apple'],
       ...,
       [138.0, 2594.5, 193, 12, 'Mango'],
       [143.0, 2537.5, 193, 12, 'Mango'],
       [138.0, 2734.0, 197, 13, 'Mango']], dtype=object)

In [10]:
def check_purity(data):
    """Return True when every row of ``data`` carries the same label.

    ``data`` is a 2-D array whose last column holds the class label.
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

### Classification

In [11]:
def classify(data):
    """Return the most frequent label in the last column of ``data``.

    Ties are resolved in favour of the label that sorts first, because
    ``np.unique`` returns sorted labels and ``argmax`` picks the first maximum.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    most_common_position = frequencies.argmax()
    return labels[most_common_position]

In [12]:
# classify(train_df[train_df.area>2784.0].values)

### Potential split

In [13]:
def get_potential_split(data):
    """Collect candidate split thresholds for every feature column.

    For each column except the last (the label), the candidates are the
    midpoints between consecutive sorted unique values of that column.

    Returns a dict mapping column index -> list of thresholds; a column with
    a single distinct value maps to an empty list.
    """
    n_feature_columns = data.shape[1] - 1
    thresholds_by_column = {}

    for column_index in range(n_feature_columns):
        distinct = np.unique(data[:, column_index])
        # Pair each distinct value with its predecessor and take the midpoint.
        thresholds_by_column[column_index] = [
            (upper + lower) / 2
            for lower, upper in zip(distinct[:-1], distinct[1:])
        ]

    return thresholds_by_column
                       

In [14]:
# potential_splits = get_potential_split(train_df.values)

In [15]:
# sns.lmplot(data=train_df,x='avgRGB',y='no_of_edges',hue='label',
#            fit_reg=False)

# # plt.vlines(x=potential_splits[2],ymin=1,ymax=17)

### Split data

In [16]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on one feature threshold.

    Returns ``(data_below, data_above)`` where ``data_below`` holds the rows
    whose value in ``split_column`` is ``<= split_value`` and ``data_above``
    the rows whose value is ``> split_value``.
    """
    feature_values = data[:, split_column]
    at_or_below = feature_values <= split_value
    strictly_above = feature_values > split_value
    return data[at_or_below], data[strictly_above]


In [17]:
# split_column = 3
# split_value = 0.5

In [18]:
# data_below,data_above = split_data(data,split_column,split_value)

In [19]:
# plotting_df = pd.DataFrame(data_above,columns=df.columns)

# sns.lmplot(data=plotting_df,x = 'no_of_edges',y='avgRGB',fit_reg=False)
# # plt.vlines(x=split_value,ymin=1,ymax=230)


### Lowest overall entropy

In [20]:
def calculate_entropy(data):
    """Shannon entropy (in bits) of the labels in the last column of ``data``."""
    _, class_counts = np.unique(data[:, -1], return_counts=True)
    class_probabilities = class_counts / class_counts.sum()
    # Information content of each class: -log2(p).
    surprisal = -np.log2(class_probabilities)
    return sum(class_probabilities * surprisal)


In [21]:
def get_overall_entropy(data_below,data_above):
    """Size-weighted average entropy of the two sides of a split.

    Each side's entropy is weighted by the fraction of all rows that fall on
    that side.
    """
    n_below, n_above = len(data_below), len(data_above)
    n_total = n_below + n_above

    weight_below = n_below / n_total
    weight_above = n_above / n_total

    weighted_below = weight_below * calculate_entropy(data_below)
    weighted_above = weight_above * calculate_entropy(data_above)
    return weighted_below + weighted_above

In [22]:
def determine_best_split(data,potential_splits):
    """Pick the (column, threshold) pair with the lowest weighted entropy.

    Parameters
    ----------
    data : 2-D array with the label in the last column.
    potential_splits : dict mapping column index -> list of candidate
        thresholds, as produced by ``get_potential_split``.

    Returns
    -------
    (best_split_column, best_split_value)

    Raises
    ------
    ValueError
        If ``potential_splits`` contains no candidate threshold at all
        (for example when every feature column is constant). Previously this
        surfaced as an ``UnboundLocalError`` on the return statement.
    """
    lowest_entropy = float("inf")
    best_split_column = None
    best_split_value = None

    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            data_below, data_above = split_data(
                data, split_column=column_index, split_value=value
            )
            current_entropy = get_overall_entropy(data_below, data_above)

            # `<=` (not `<`) so that, on ties, the last candidate examined wins.
            if current_entropy <= lowest_entropy:
                lowest_entropy = current_entropy
                best_split_column = column_index
                best_split_value = value

    if best_split_column is None:
        raise ValueError(
            "no candidate split available: potential_splits has no thresholds"
        )

    return best_split_column, best_split_value

### Decision tree algorithm 

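Each sub-tree is a dictionary of the form `sub_tree = {question: [yes_answer, no_answer]}`, where each answer is either another sub-tree or a class label (a leaf).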

In [37]:
def decision_tree(df, counter=0, min_samples=2, max_depth=5, column_names=None):
    """Recursively build a decision tree as nested dictionaries.

    Internal nodes have the form
    ``{"<column> <= <threshold>": [yes_answer, no_answer]}`` and leaves are
    class labels.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        A DataFrame on the initial call (``counter == 0``); the recursive
        calls pass the array of rows instead. The label is read from the
        last column.
    counter : int
        Current depth. Callers normally leave it at 0.
    min_samples : int
        A node with fewer rows than this becomes a leaf.
    max_depth : int
        Maximum number of nested questions.
    column_names : sequence of str, optional
        Names used to word the questions. On the initial call it defaults to
        ``df.columns``; if it is omitted on a call with ``counter != 0`` the
        notebook-level ``column_header`` variable is used, as before.

    Returns
    -------
    dict or label
        The (sub-)tree, or a bare class label when the node is a leaf.
    """
    if counter == 0:
        # Take the names from the frame itself so the function does not rely
        # on a global that is defined in a later cell.
        if column_names is None:
            column_names = df.columns
        data = df.values
    else:
        data = df
        if column_names is None:
            # Backward compatibility for direct calls that pass an array.
            column_names = column_header

    if check_purity(data) or len(data) < min_samples or counter == max_depth:
        return classify(data)

    counter += 1
    potential_splits = get_potential_split(data)

    # Rows with identical features but different labels cannot be separated
    # by any threshold, so fall back to a majority-vote leaf.
    if not any(potential_splits.values()):
        return classify(data)

    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    column_name = column_names[split_column]
    question = "{} <= {}".format(column_name, split_value)

    yes_answer = decision_tree(
        data_below, counter, min_samples, max_depth, column_names
    )
    no_answer = decision_tree(
        data_above, counter, min_samples, max_depth, column_names
    )

    # If both branches give the same answer the question is redundant, so
    # the node collapses to that answer.
    if yes_answer == no_answer:
        return yes_answer

    return {question: [yes_answer, no_answer]}

In [38]:
# Column names of the training frame, kept at notebook level for labelling
# the split questions.
column_header = train_df.columns
# Fit a tree of depth at most 5 on the training rows and pretty-print it.
tree = decision_tree(train_df,max_depth = 5)
pprint(tree)

{'avgRGB <= 155.5': [{'avgRGB <= 122.0': [{'area <= 2826.25': [{'avgRGB <= 115.5': ['Avocado',
                                                                                    'Pomegranate']},
                                                               {'avgRGB <= 99.5': [{'perimeter <= 211.0': ['Avocado',
                                                                                                           'Pomegranate']},
                                                                                   'Pomegranate']}]},
                                          {'area <= 2382.25': [{'perimeter <= 196.0': ['Avocado',
                                                                                       'Pomegranate']},
                                                               {'perimeter <= 201.5': [{'area <= 2708.0': ['Mango',
                                                                                                           'Apple']},
                        

## Classification

In [39]:
def classify_test_data(test_data,tree):
    """Predict the label of one example by walking ``tree``.

    Parameters
    ----------
    test_data : mapping-like (e.g. a pandas Series / DataFrame row)
        Must support ``test_data[feature_name]`` for every feature the tree
        asks about.
    tree : dict or label
        A tree in the ``{"<feature> <= <value>": [yes_answer, no_answer]}``
        format, or a bare label when the tree consists of a single leaf.

    Returns
    -------
    The class label stored in the leaf that the example reaches.
    """
    node = tree
    # A leaf is anything that is not a dict; this also covers a tree that is
    # a single bare label.
    while isinstance(node, dict):
        question = next(iter(node))
        # Split on the last " <= " so that feature names containing spaces
        # are still parsed correctly.
        feature, _, value = question.rpartition(" <= ")
        branches = node[question]

        if test_data[feature] <= float(value):
            node = branches[0]
        else:
            node = branches[1]

    return node
    

In [40]:
# Take the third test row (position 2) as a single example to classify.
# NOTE(review): the recorded output below includes `classification` and
# `correctly_classified`, columns that are only added by find_accuracy further
# down, so this cell was last executed out of order; re-run top to bottom.
example = test_df.iloc[2]
example

avgRGB                    145
area                     2530
perimeter                 205
no_of_edges                14
label                   Apple
classification          Apple
correctly_classified     True
Name: 41, dtype: object

In [41]:
# Predict the label of the single example with the tree built above.
classify_test_data(example,tree)

'Apple'

## Accuracy

In [42]:
def find_accuracy(df,tree):
    """Classify every row of ``df`` with ``tree`` and return the accuracy.

    Side effect: ``df`` is modified in place. A ``classification`` column with
    the predicted label and a boolean ``correctly_classified`` column (the
    prediction equals ``label``) are added or overwritten.

    Returns the fraction of rows whose prediction matches ``label``.
    """
    predicted = df.apply(classify_test_data, axis=1, args=(tree,))
    df["classification"] = predicted
    df["correctly_classified"] = df["classification"] == df["label"]
    return df["correctly_classified"].mean()

In [73]:

# Draw a new split with 20% of the rows held out, fit a deeper tree
# (max_depth=10) and report its accuracy on the held-out rows.
# NOTE(review): no seed is set in this cell, so the sampled rows depend on the
# state of the `random` generator left by earlier cells. This also rebinds
# train_df/test_df from the earlier 40% split, and find_accuracy adds the
# `classification` / `correctly_classified` columns to test_df in place.
train_df,test_df = train_test_split(df,.20)
tree = decision_tree(train_df,max_depth = 10)
find_accuracy(test_df,tree)


0.9615384615384616

In [74]:
# Inspect the test rows together with the prediction columns that
# find_accuracy added.
test_df


Unnamed: 0,avgRGB,area,perimeter,no_of_edges,label,classification,correctly_classified
226,173.0,2238.0,190,10,Pear,Pear,True
187,177.0,2327.5,192,13,Pear,Pear,True
465,141.0,2717.0,203,15,Mango,Mango,True
477,138.0,2861.5,201,12,Mango,Mango,True
265,133.0,2930.0,204,14,Pear,Apple,False
...,...,...,...,...,...,...,...
396,134.0,2205.0,183,14,Avocado,Avocado,True
297,175.0,2236.5,189,13,Pear,Pear,True
498,147.0,2461.0,194,13,Mango,Mango,True
410,140.0,2089.0,177,12,Avocado,Avocado,True
