# MATH2319/ MATH2387 Machine Learning
## Take-Home Assessment
### Galen Ralph Herten-Crabb 3955778 
### Question 2

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
import requests
import math
import statistics


pd.set_option('display.max_columns', None) 

In [2]:
df1 = pd.read_csv("THA_diamonds.csv")

### Part A
Discretize the depth and carat features separately as "category_1", "category_2", and "category_3" respectively using the equal-frequency binning technique.

To achieve this the qcut function was called, separating each variable into the categories specified. This process creates three evenly distributed categories or levels for further calculation.

In [3]:
df2 = df1.copy()
df2['depth'] = pd.qcut(df2['depth'], 
                              q=3, 
                              labels=['category_1', 'category_2', 'category_3'])

In [4]:
df2['carat'] = pd.qcut(df2['carat'], 
                              q=3, 
                              labels=['category_1', 'category_2', 'category_3'])

In [5]:
df2.head(10)

Unnamed: 0,cut,color,depth,price,carat
0,Good,D,category_2,low,category_1
1,Fair,F,category_3,low,category_1
2,Good,I,category_1,low,category_1
3,Good,F,category_1,low,category_1
4,Fair,F,category_3,low,category_1
5,Fair,F,category_3,low,category_1
6,Good,D,category_2,low,category_1
7,Good,D,category_2,low,category_1
8,Good,D,category_2,low,category_1
9,Fair,F,category_3,low,category_1


### Part B
Compute the impurity of the price feature

Value_counts function is called to determine the frequency of values within the price variable, these frequencies also amount to the probability of each level. The result is passed through the purity using entropy formula.

To make things easier later the 'compute_impurity' function from the course website is written in and tested to ensure it returns the same result. It does.

In [6]:
freq = df2['price'].value_counts(normalize=True)

In [7]:
purity_entropy = -1 * np.sum(np.log2(freq) * freq)
purity_entropy

1.7160130346557048

In [8]:
def compute_impurity(feature, impurity_criterion):
   
    probs = feature.value_counts(normalize=True)
    
    if impurity_criterion == 'entropy':
        impurity = -1 * np.sum(np.log2(probs) * probs)
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
    else:
        raise ValueError('Unknown impurity criterion')
        
    return(round(impurity, 3))

In [9]:
purity_entropy_f = compute_impurity(df2['price'], 'entropy')
purity_entropy_f

1.716

### Part C

Determining the root node for decision tree.

This code was written in from the course website and functions to provide an analysis of the levels in any given variable. In the example below the 'cut' variable is plugged in and the resulting tables return the data required to calculate the information gain of each variable, which is critical for determining the root node.

In [10]:
for level in df2['cut'].unique():
    print('level name:', level)
    df_feature_level = df2[df2['cut'] == level]
    print('corresponding data partition:')
    print(df_feature_level)
    print('partition target feature impurity:', compute_impurity(df_feature_level['price'], 'entropy'))
    print('partition weight:', str(len(df_feature_level)) + '/' + str(len(df2)))
    print('====================')

level name: Good
corresponding data partition:
      cut color       depth    price       carat
0    Good     D  category_2      low  category_1
2    Good     I  category_1      low  category_1
3    Good     F  category_1      low  category_1
6    Good     D  category_2      low  category_1
7    Good     D  category_2      low  category_1
..    ...   ...         ...      ...         ...
202  Good     I  category_1  premium  category_3
203  Good     I  category_1  premium  category_3
205  Good     I  category_1  premium  category_3
207  Good     F  category_2  premium  category_3
210  Good     I  category_1  premium  category_3

[152 rows x 5 columns]
partition target feature impurity: 1.68
partition weight: 152/212
level name: Fair
corresponding data partition:
      cut color       depth    price       carat
1    Fair     F  category_3      low  category_1
4    Fair     F  category_3      low  category_1
5    Fair     F  category_3      low  category_1
9    Fair     F  category_3     

This function returns the data needed to calculate the remainder, using the entropy split criterion, for the 'cut' feature.
This is done by multiplying the target feature impurity of each level by their respective weights.

1.68 x (152/212) + 1.78 x (60/212) = 1.708

Information gain is calculated by deducting this number from the purity of the target feature.

1.716 - 1.708 = 0.008

This result is very poor for the variable 'cut' and signals that it would not be an optimal root node for the decision tree.

Below is code (also from the course website) that creates a function that will apply these calculations to all the variables in our dataset and print out the relevant values, revealing the optimal node.

In [11]:
def comp_feature_information_gain(df, target, descriptive_feature, split_criterion):
   
    
    print('target feature:', target)
    print('descriptive_feature:', descriptive_feature)
    print('split criterion:', split_criterion)
            
    target_entropy = compute_impurity(df[target], split_criterion)

   
    entropy_list = list()
    weight_list = list()
    
   
    for level in df[descriptive_feature].unique():
        df_feature_level = df[df[descriptive_feature] == level]
        entropy_level = compute_impurity(df_feature_level[target], split_criterion)
        entropy_list.append(round(entropy_level, 3))
        weight_level = len(df_feature_level) / len(df)
        weight_list.append(round(weight_level, 3))

    print('impurity of partitions:', entropy_list)
    print('weights of partitions:', weight_list)

    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
    print('remaining impurity:', feature_remaining_impurity)
    
    information_gain = target_entropy - feature_remaining_impurity
    print('information gain:', information_gain)
    
    print('====================')

    return(information_gain)

In [12]:
split_criterion = 'entropy'
for feature in df2.drop(columns='price').columns:
    feature_info_gain = comp_feature_information_gain(df2, 'price', feature, split_criterion)

target feature: price
descriptive_feature: cut
split criterion: entropy
impurity of partitions: [1.68, 1.78]
weights of partitions: [0.717, 0.283]
remaining impurity: 1.7083
information gain: 0.00770000000000004
target feature: price
descriptive_feature: color
split criterion: entropy
impurity of partitions: [1.657, 1.445, 1.833]
weights of partitions: [0.269, 0.434, 0.297]
remaining impurity: 1.617264
information gain: 0.09873599999999993
target feature: price
descriptive_feature: depth
split criterion: entropy
impurity of partitions: [1.517, 1.749, 1.74]
weights of partitions: [0.349, 0.316, 0.335]
remaining impurity: 1.6650170000000002
information gain: 0.05098299999999978
target feature: price
descriptive_feature: carat
split criterion: entropy
impurity of partitions: [-0.0, 1.365, 1.529]
weights of partitions: [0.335, 0.373, 0.292]
remaining impurity: 0.9556129999999998
information gain: 0.7603870000000001


This table is populated with the results from above and indicates that 'carat' is the variable that splits into the more pure sets, or gains the most information. Therefore 'carat' should be the root node.

In [13]:
data = {'split': ['cut', 'color', 'depth', 'carat'],
        'remainder': [1.708, 1.617, 1.665,0.955],
        'info_gain': [0.007, 0.098, 0.050, 0.760],
        'is_optimal': ['False','False','False','True']}

df_splits = pd.DataFrame(data)

df_splits

Unnamed: 0,split,remainder,info_gain,is_optimal
0,cut,1.708,0.007,False
1,color,1.617,0.098,False
2,depth,1.665,0.05,False
3,carat,0.955,0.76,True


### Part D

It is admitted that there is likely a more elegant solution, however the below satisfies the requirements of the table. The leaf_prediction column is made up of the category of price with the higest probability. 

To calculate the probability of each outcome for each category of 'carat' the frequency of each outcome was calculated from a data frame containing the relevant coloumns using the .value_counts function.

These values were then added to the table.

In [14]:
#To discover what the probability is of a level, within the carat variable, having a low_price, medium_price etc...  

df_prob = df2[['carat', 'price']]

A = df_prob[df_prob["carat"].str.contains('category_1')].value_counts(normalize=True)
print(A)
print('======================')
B = df_prob[df_prob["carat"].str.contains('category_2')].value_counts(normalize=True)
print(B)
print('======================')
C = df_prob[df_prob["carat"].str.contains('category_3')].value_counts(normalize=True)
print(C)
print('======================')

carat       price
category_1  low      1.0
dtype: float64
carat       price  
category_2  medium     0.607595
            low        0.278481
            high       0.101266
            premium    0.012658
dtype: float64
carat       price  
category_3  medium     0.419355
            high       0.370968
            premium    0.209677
dtype: float64


In [15]:
data_b = {'leaf_condition': ['carat == category_1', 'carat == category_2', 'carat == category_3'],
        'low_price_prob': ['1.0', '0.278', '0.0'],
        'medium_price_prob': ['0.0', '0.607', '0.419'],
        'high_price_prob': ['0.0','0.101','0.370'],
        'premium_price_prob':['0.0', '0.012','0.209'],
        'leaf_prediction':['low_price', 'medium_price', 'medium_price']}

df_pred = pd.DataFrame(data_b)

df_pred

Unnamed: 0,leaf_condition,low_price_prob,medium_price_prob,high_price_prob,premium_price_prob,leaf_prediction
0,carat == category_1,1.0,0.0,0.0,0.0,low_price
1,carat == category_2,0.278,0.607,0.101,0.012,medium_price
2,carat == category_3,0.0,0.419,0.37,0.209,medium_price
