# Verbose Features

This notebook builds dictionaries that map the raw field codes of the 2016 PUMF dataset to plain-language explanations and to improved, more readable feature names.

## Data Preprocessing

### Manual Feature Selection

For ease of loading, relevant features should be hand-selected and saved.

Broader analysis may be performed at a later date to identify all relevant features.

In [2]:
import pandas as pd

In [3]:
# Hand-selected raw column codes, kept in their original order.
fields = [
    "PPSORT", "WEIGHT", "ABOID", "BFNMEMB", "CFSTAT",
    "Citizen", "CMA", "DETH123", "HHTYPE", "MarStH",
    "PR", "PRIHM", "REGIND", "Sex", "SHELCO",
    "AGEGRP", "ATTSCH", "BedRm", "CFInc", "CFInc_AT",
    "CfSize", "CIP2011", "CIP2011_STEM_SUM", "CONDO", "DPGRSUM",
    "DTYPE", "EFDecile", "EfDIMBM", "EFInc", "EFInc_AT",
    "EfSize", "ETHDER", "GENSTAT", "HCORENEED_IND", "HDGREE",
    "HHInc", "HHInc_AT", "HHMRKINC", "HHSIZE", "HLANO",
    "IMMCAT5", "IMMSTAT", "LFACT", "LICO", "LICO_AT",
    "LOC_ST_RES", "LoLIMA", "LoLIMB", "LoMBM", "LSTWRK",
    "MOB1", "Mob5", "MrkInc", "MTNNO", "NOS",
    "PKID0_1", "PKID15_24", "PKID2_5", "PKID25", "PKID6_14",
    "PKIDS", "POB", "POBF", "POBM", "PR1",
    "PR5", "PresMortG", "REPAIR", "ROOMS", "SSGRAD",
    "Tenur", "TotInc", "TotInc_AT", "VALUE", "VisMin",
    "WRKACT",
]

In [4]:
# Load the field-name lookup table from a CSV addressed by relative path.
field_name_data = pd.read_csv(filepath_or_buffer='Data_Mining_Names.csv')

# Bare trailing expression so the notebook renders the loaded table.
field_name_data

Unnamed: 0,Mapping_Needed,Data_Type,Meaning,Definitive_Name,dataset_2016
0,,Discrete,Unique record (row) identifier,Row_ID_PPsort,PPSORT
1,,Continuous,No. of individuals represented by this row,Row_Weight,WEIGHT
2,Possibly,Categorical,Detailed aboriginal identity,Aboriginal_Identity,ABOID
3,,"Binary (0,1)",Membership in a First Nation or band,FN_Band_Membership,BFNMEMB
4,Possibly,Categorical,"Living arrangement (e.g. married, single)",Household_Living_Arrangements,CFSTAT
...,...,...,...,...,...
71,,Continuous,Total personal income from recurring sources,Total_Personal_Recurring_Income,TotInc
72,,Continuous,Total personal income from recurring sources a...,Total_Personal_Recurring_Income_After_Tax,TotInc_AT
73,,Continuous,Household property value (including land),Property_Value,VALUE
74,Likely,Categorical,Visible minority group,Visible_Minority_Group,VisMin


In [9]:
# Columns that supply the two halves of each description string.
data_types = field_name_data['Data_Type']
meanings = field_name_data['Meaning']

# One '<Data_Type>: <Meaning>' label per row, in row order.
descriptions = [
    kind + ': ' + explanation
    for kind, explanation in zip(data_types, meanings)
]

# Key each label by the matching value in the 'dataset_2016' column.
description_dict = dict(zip(field_name_data['dataset_2016'], descriptions))

description_dict

{'PPSORT': 'Discrete: Unique record (row) identifier',
 'WEIGHT': 'Continuous: No. of individuals represented by this row',
 'ABOID': 'Categorical: Detailed aboriginal identity',
 'BFNMEMB': 'Binary (0,1): Membership in a First Nation or band',
 'CFSTAT': 'Categorical: Living arrangement (e.g. married, single)',
 'Citizen': 'Categorical: Canadian citizenship status',
 'CMA': 'Categorical: Area of current residence',
 'DETH123': 'Binary (1,2): Single or multiple ethnic origin',
 'HHTYPE': 'Categorical: Household type (e.g. multiple family)',
 'MarStH': 'Categorical: Marital status',
 'PR': 'Categorical: Province or territory of current residence',
 'PRIHM': 'Binary (0,1): Person is primary household maintainer',
 'REGIND': 'Binary (0,1): Registered or treaty First Nation status',
 'Sex': 'Binary (1,2): Male or female sex',
 'SHELCO': 'Continuous: Average monthly cost of shelter',
 'AGEGRP': 'Categorical: Age bracket',
 'ATTSCH': 'Categorical: Highest education level ever attended',
 'Be

In [10]:
# Map each value in 'dataset_2016' to the 'Definitive_Name' on the same row.
name_dict = (
    field_name_data
    .set_index('dataset_2016')['Definitive_Name']
    .to_dict()
)

name_dict

{'PPSORT': 'Row_ID_PPsort',
 'WEIGHT': 'Row_Weight',
 'ABOID': 'Aboriginal_Identity',
 'BFNMEMB': 'FN_Band_Membership',
 'CFSTAT': 'Household_Living_Arrangements',
 'Citizen': 'Canadian_Citizenship',
 'CMA': 'Metro_Area_Current_Residence',
 'DETH123': 'Single_or_Multiple_Ethnic_Origin',
 'HHTYPE': 'Household_Family_Status',
 'MarStH': 'Marital_Status',
 'PR': 'Province_Current_Residence',
 'PRIHM': 'Is_Primary_Household_Maintainer',
 'REGIND': 'Registered_Treaty_Status',
 'Sex': 'Sex',
 'SHELCO': 'Shelter_Cost',
 'AGEGRP': 'Age_Bracket',
 'ATTSCH': 'Education_Level_Attended',
 'BedRm': 'Household_Bedroom_Count',
 'CFInc': 'Census_Family_Income_Bracket',
 'CFInc_AT': 'Census_Family_Income_After_Tax',
 'CfSize': 'Census_Family_Size',
 'CIP2011': 'Postsecondary_Field_of_Study',
 'CIP2011_STEM_SUM': 'Postsecondary_Field_STEM_or_BHASE',
 'CONDO': 'Condominium_Membership',
 'DPGRSUM': 'Population_Group',
 'DTYPE': 'Dwelling_Structure_Type',
 'EFDecile': 'Economic_Family_Income_Decile_After_T