# CHAID

The EPC data contains several categorical variables with a lot of values. In order to find suitable features which will retain the most information, three feature sets are explored;
* data driven
* domain driven
* exhaustive

The first approach, termed data driven, uses statistical methods to reduce the number of variables. As the variables containing textual descriptions of the property have been created free-hand, many contain a large number of unique values. In some cases, only recorded for one property. The data driven approach uses a single level Chi-square Automatic Interaction Detector (CHAID) to group the levels within each categorical variable into a smaller number of groups. CHAID groups values with a similar response rate or in this context Energy Efficiency Rating (EER). 

This script run CHAID and stores the results in a dictionary

In [1]:
import numpy as np
import pandas as pd
import datetime
import os
import glob
import json
from CHAID import Tree
import re

In [18]:
# set variables from config file
config_path = os.path.abspath('..')

with open(config_path + '/config-example.json', 'r') as f:
    config = json.load(f)

processing_path = config['DEFAULT']['processing_path']
epc_train_fname = config['DEFAULT']['epc_train_fname']
epc_test_fname = config['DEFAULT']['epc_test_fname']
epc_train_clean_fname = config['DEFAULT']['epc_train_clean_fname']
epc_chaid_fname = config['DEFAULT']['epc_chaid_fname']
epc_fname_suffix = config['DEFAULT']['epc_fname_suffix']

In [19]:
dtype_dict = {'INSPECTION_DATE':'str'}

epc_train = pd.read_csv(os.path.join(processing_path,epc_train_clean_fname + epc_fname_suffix),
                        header = 0,
                        delimiter = ',',
                        dtype = dtype_dict,
                        parse_dates = ['INSPECTION_DATE'])

  epc_train = pd.read_csv(os.path.join(processing_path,epc_train_clean_fname + epc_fname_suffix),


In [20]:
#Get quantile boundafries
quantiles = epc_train['CURRENT_ENERGY_EFFICIENCY'].describe()
print(quantiles['25%'])
print(quantiles['75%'])

56.0
73.0


In [21]:
#Create a new new target field within the training data
min_eff = epc_train['CURRENT_ENERGY_EFFICIENCY'].min()
max_eff = epc_train['CURRENT_ENERGY_EFFICIENCY'].max()
epc_train['eff_flag'] = pd.cut(epc_train['CURRENT_ENERGY_EFFICIENCY'],
                               bins = [min_eff,56.0,73.0,max_eff],
                               labels = ['0','99','1'])

#Drop unwated '99' level of eff_flag and convert to integer
epc_train = epc_train[epc_train['eff_flag'].isin(['0','1'])]
epc_train['eff_flag'] = epc_train['eff_flag'].astype(int)

In [22]:
#Get the {0,1} sample size by taking the smallest class from the 0
#and 1 outcomes
sample_size = epc_train['eff_flag'].value_counts().min()


#Subsample positive and negative samples
neg_eff_flag = epc_train[epc_train['eff_flag'] == 0].sample(sample_size,random_state = 1234,axis = 0)

pos_eff_flag = epc_train[epc_train['eff_flag'] == 1].sample(sample_size,random_state = 1234,axis = 0)

#Concatenate
epc_chaid = pd.concat([neg_eff_flag, pos_eff_flag])
epc_chaid['eff_flag'].value_counts()

#Randomly shuffle epc_CHAID and reset the index
epc_chaid = epc_chaid.sample(frac = 1).reset_index(drop=True)

In [23]:
epc_chaid

Unnamed: 0,LMK_KEY,region,POSTCODE,BUILDING_REFERENCE_NUMBER,CURRENT_ENERGY_RATING,CURRENT_ENERGY_EFFICIENCY,PROPERTY_TYPE,BUILT_FORM,INSPECTION_DATE,COUNTY,...,FLOOR_HEIGHT,MECHANICAL_VENTILATION,inspection_year,floors_average_thermal_transmittance,SECONDHEAT_DESCRIPTION,LIGHTING_DESCRIPTION,low_energy_lighting_perc,roof_average_thermal_transmittance,walls_average_thermal_transmittance,eff_flag
0,488254419702010052013563976609318,Worthing,BN14 9AN,6790616768,C,74,Flat,Detached,2010-05-19,West Sussex,...,2.388,natural,2010,,"Room heaters, electric",low energy lighting 50% of fixed outlets,50.0,,,1
1,ce1110dda77f41dae9e242c9f40fcec6c1f55aba65f2fe...,Carlisle,CA8 9BF,10002920696,B,86,House,Semi-Detached,2022-05-27,Cumbria,...,2.420,,2022,0.2,,low energy lighting in all fixed outlets,,0.2,0.2,1
2,789057449002012052917521197829538,Gloucester,GL2 0SG,2038148968,D,56,Flat,Semi-Detached,2012-05-17,Gloucestershire,...,2.540,natural,2012,,"Room heaters, mains gas",low energy lighting 10% of fixed outlets,10.0,,,0
3,38046880062008110423224633248218,Preston,PR2 6HF,1368863568,B,84,House,Mid-Terrace,2008-11-04,Lancashire,...,2.400,,2008,,,,,,,1
4,1549129665332017060210543786278404,Colchester,CO1 2RS,6476412578,G,8,House,Semi-Detached,2017-05-26,Essex,...,,natural,2017,,,low energy lighting 30% of fixed outlets,30.0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912829,583719352712011012318005691290383,Cotswold,GL7 1RS,3377213868,B,82,House,Detached,2011-01-23,Gloucestershire,...,2.440,,2011,0.2,,low energy lighting 30% of fixed outlets,30.0,0.1,0.2,1
1912830,1461593799742016071308511244569528,Hertsmere,WD23 2FD,717795478,C,76,House,Semi-Detached,2016-07-12,Hertfordshire,...,2.380,natural,2016,,,low energy lighting 40% of fixed outlets,40.0,,,1
1912831,1535524411512019071908383892910057,Thanet,CT10 1HJ,754121578,E,45,House,End-Terrace,2019-07-18,Kent,...,,natural,2019,,,low energy lighting in all fixed outlets,,,,0
1912832,177337670302008110413403355380548,Breckland,IP24 3JQ,2317773568,C,74,House,End-Terrace,2008-11-04,Norfolk,...,2.270,natural,2008,,,no low energy lighting,,,,1


In [24]:
#Numeric
var_list_num = epc_chaid.select_dtypes(include= 'number').columns.tolist()
var_list_num.remove('CURRENT_ENERGY_EFFICIENCY')

#Categorical
var_list_cat = epc_chaid.select_dtypes(include= ['object','category']).columns.tolist()
var_list_cat.remove('LMK_KEY')
var_list_cat.remove('POSTCODE')
var_list_cat.remove('CURRENT_ENERGY_RATING')

### Creating a dictionary of CHAID scores

In [25]:
chaid_dict = {}
for var in var_list_cat:
    #Set the inputs and outputs
    #The imputs are given as a dictionary along with the type
    #The output must be of string type
    #I have assume all features are nominal, we can change the features dictionary to include the ordinal type
    features = {var:'nominal'}
    label = 'eff_flag'
    #Create the Tree
    chaid_dict[var] = {}
    tree = Tree.from_pandas_df(epc_chaid, i_variables = features, d_variable = label, alpha_merge = 0.0)
    #Loop through all the nodes and enter into a dictionary
    print(f'\n\n\nVariable: {var}')
    print(f'p-value: {tree.tree_store[0].split.p}')
    print(f'Chi2: {tree.tree_store[0].split.score}')
    for i in range(1, len(tree.tree_store)):
        count = tree.tree_store[i].members[0] + tree.tree_store[i].members[1]
        rate = tree.tree_store[i].members[1] / count
        print(f'\nNode {i}:\n\tCount = {count}\tRate = {rate}')
        print(f'\t{tree.tree_store[i].choices}')
        chaid_dict[var]['node' + str(i)] = tree.tree_store[i].choices




Variable: region
p-value: 0.0
Chi2: 65517.82722484344

Node 1:
	Count = 433464.0	Rate = 0.45513352896665005
	['Adur', 'Gedling', 'Bassetlaw', 'Cotswold', 'Teignbridge', 'Boston', 'Thanet', 'Hastings', 'Ashfield', 'Babergh', 'Lewes', 'Waverley', 'Breckland', 'Guildford', 'Worcester', 'Fylde', 'Wealden', 'Bolsover', 'Brentwood', 'Carlisle', 'Broadland', 'Rochford', 'Dover', 'Stroud', 'Chichester']

Node 2:
	Count = 301953.0	Rate = 0.37460796878984476
	['Allerdale', 'Rossendale', 'Craven', 'Broxtowe', 'Ryedale', 'Torridge', 'Copeland', 'Richmondshire', 'Erewash', 'Tendring', 'Worthing', 'Rother', 'Scarborough', 'Hambleton', 'Harrogate', 'Lancaster', 'Wyre', 'Melton', 'Sevenoaks', 'Maldon']

Node 3:
	Count = 364964.0	Rate = 0.5059348319286285
	['Arun', 'Blaby', 'Sedgemoor', 'Braintree', 'Swale', 'Havant', 'Stafford', 'Canterbury', 'Cheltenham', 'Preston', 'Bromsgrove', 'Spelthorne', 'Chesterfield', 'Fenland', 'Mansfield', 'Elmbridge', 'Mendip', 'Rushcliffe', 'Tandridge', 'Gravesham']

N

In [26]:
%store chaid_dict

Stored 'chaid_dict' (dict)
