# Feature Improvement

Various feature improvement.

## Verbose Features

Dictionaries for explanations and improved feature names of the 2016 PUMF dataset.

In [2]:
import pandas as pd

In [3]:
fields = ['PPSORT', 'WEIGHT', 'ABOID', 'BFNMEMB', 'CFSTAT', 'Citizen', 'CMA',
       'DETH123', 'HHTYPE',
       'MarStH', 'PR', 'PRIHM', 'REGIND', 'Sex', 'SHELCO',
       'AGEGRP', 'ATTSCH', 'BedRm', 'CFInc', 'CFInc_AT', 'CfSize',
       'CIP2011', 'CIP2011_STEM_SUM', 'CONDO', 'DPGRSUM', 'DTYPE',
       'EFDecile', 'EfDIMBM', 'EFInc', 'EFInc_AT', 'EfSize', 'ETHDER',
       'GENSTAT', 'HCORENEED_IND', 'HDGREE', 'HHInc', 'HHInc_AT',
       'HHMRKINC', 'HHSIZE', 'HLANO', 'IMMCAT5', 'IMMSTAT',
       'LFACT', 'LICO', 'LICO_AT', 'LOC_ST_RES', 'LoLIMA', 'LoLIMB',
       'LoMBM', 'LSTWRK', 'MOB1', 'Mob5', 'MrkInc',
       'MTNNO', 'NOS', 'PKID0_1', 'PKID15_24', 'PKID2_5', 'PKID25',
       'PKID6_14', 'PKIDS', 'POB', 'POBF', 'POBM', 'PR1', 'PR5',
       'PresMortG', 'REPAIR', 'ROOMS', 'SSGRAD', 'Tenur', 'TotInc',
       'TotInc_AT', 'VALUE', 'VisMin', 'WRKACT']

In [4]:
field_name_data = pd.read_csv('Data_Mining_Names.csv')

field_name_data

Unnamed: 0,Mapping_Needed,Data_Type,Meaning,Definitive_Name,dataset_2016
0,,Discrete,Unique record (row) identifier,Row_ID_PPsort,PPSORT
1,,Continuous,No. of individuals represented by this row,Row_Weight,WEIGHT
2,Possibly,Categorical,Detailed aboriginal identity,Aboriginal_Identity,ABOID
3,,"Binary (0,1)",Membership in a First Nation or band,FN_Band_Membership,BFNMEMB
4,Possibly,Categorical,"Living arrangement (e.g. married, single)",Household_Living_Arrangements,CFSTAT
...,...,...,...,...,...
71,,Continuous,Total personal income from recurring sources,Total_Personal_Recurring_Income,TotInc
72,,Continuous,Total personal income from recurring sources a...,Total_Personal_Recurring_Income_After_Tax,TotInc_AT
73,,Continuous,Household property value (including land),Property_Value,VALUE
74,Likely,Categorical,Visible minority group,Visible_Minority_Group,VisMin


In [9]:
data_types = field_name_data['Data_Type']
meanings = field_name_data['Meaning']

descriptions = []

for types, meaning in zip(data_types, meanings):
    descriptions.append(types + ': ' + meaning)

description_dict = dict(zip(field_name_data['dataset_2016'], descriptions))

description_dict

{'PPSORT': 'Discrete: Unique record (row) identifier',
 'WEIGHT': 'Continuous: No. of individuals represented by this row',
 'ABOID': 'Categorical: Detailed aboriginal identity',
 'BFNMEMB': 'Binary (0,1): Membership in a First Nation or band',
 'CFSTAT': 'Categorical: Living arrangement (e.g. married, single)',
 'Citizen': 'Categorical: Canadian citizenship status',
 'CMA': 'Categorical: Area of current residence',
 'DETH123': 'Binary (1,2): Single or multiple ethnic origin',
 'HHTYPE': 'Categorical: Household type (e.g. multiple family)',
 'MarStH': 'Categorical: Marital status',
 'PR': 'Categorical: Province or territory of current residence',
 'PRIHM': 'Binary (0,1): Person is primary household maintainer',
 'REGIND': 'Binary (0,1): Registered or treaty First Nation status',
 'Sex': 'Binary (1,2): Male or female sex',
 'SHELCO': 'Continuous: Average monthly cost of shelter',
 'AGEGRP': 'Categorical: Age bracket',
 'ATTSCH': 'Categorical: Highest education level ever attended',
 'Be

In [10]:
name_dict = dict(zip(field_name_data['dataset_2016'], field_name_data['Definitive_Name']))

name_dict

{'PPSORT': 'Row_ID_PPsort',
 'WEIGHT': 'Row_Weight',
 'ABOID': 'Aboriginal_Identity',
 'BFNMEMB': 'FN_Band_Membership',
 'CFSTAT': 'Household_Living_Arrangements',
 'Citizen': 'Canadian_Citizenship',
 'CMA': 'Metro_Area_Current_Residence',
 'DETH123': 'Single_or_Multiple_Ethnic_Origin',
 'HHTYPE': 'Household_Family_Status',
 'MarStH': 'Marital_Status',
 'PR': 'Province_Current_Residence',
 'PRIHM': 'Is_Primary_Household_Maintainer',
 'REGIND': 'Registered_Treaty_Status',
 'Sex': 'Sex',
 'SHELCO': 'Shelter_Cost',
 'AGEGRP': 'Age_Bracket',
 'ATTSCH': 'Education_Level_Attended',
 'BedRm': 'Household_Bedroom_Count',
 'CFInc': 'Census_Family_Income_Bracket',
 'CFInc_AT': 'Census_Family_Income_After_Tax',
 'CfSize': 'Census_Family_Size',
 'CIP2011': 'Postsecondary_Field_of_Study',
 'CIP2011_STEM_SUM': 'Postsecondary_Field_STEM_or_BHASE',
 'CONDO': 'Condominium_Membership',
 'DPGRSUM': 'Population_Group',
 'DTYPE': 'Dwelling_Structure_Type',
 'EFDecile': 'Economic_Family_Income_Decile_After_T

### Pipeline: Verbose Features

In [27]:
import pandas as pd

def get_verbose_feature_dicts():
    """Return verbose field name dict and verbose field description dict."""
    
    field_name_data = pd.read_csv('Data_Mining_Names.csv')
    
    data_types = field_name_data['Data_Type']
    meanings = field_name_data['Meaning']

    descriptions = []

    for types, meaning in zip(data_types, meanings):
        descriptions.append(types + ': ' + meaning)

    description_dict = dict(zip(field_name_data['dataset_2016'], descriptions))
    name_dict = dict(zip(field_name_data['dataset_2016'], field_name_data['Definitive_Name']))
    
    return description_dict, name_dict


get_verbose_feature_dicts()

({'PPSORT': 'Discrete: Unique record (row) identifier',
  'WEIGHT': 'Continuous: No. of individuals represented by this row',
  'ABOID': 'Categorical: Detailed aboriginal identity',
  'BFNMEMB': 'Binary (0,1): Membership in a First Nation or band',
  'CFSTAT': 'Categorical: Living arrangement (e.g. married, single)',
  'Citizen': 'Categorical: Canadian citizenship status',
  'CMA': 'Categorical: Area of current residence',
  'DETH123': 'Binary (1,2): Single or multiple ethnic origin',
  'HHTYPE': 'Categorical: Household type (e.g. multiple family)',
  'MarStH': 'Categorical: Marital status',
  'PR': 'Categorical: Province or territory of current residence',
  'PRIHM': 'Binary (0,1): Person is primary household maintainer',
  'REGIND': 'Binary (0,1): Registered or treaty First Nation status',
  'Sex': 'Binary (1,2): Male or female sex',
  'SHELCO': 'Continuous: Average monthly cost of shelter',
  'AGEGRP': 'Categorical: Age bracket',
  'ATTSCH': 'Categorical: Highest education level eve

---

## Discretization

Converting continuous and wide-domain discrete values into categorical deciles.

In [3]:
import pandas as pd

field_name_data = pd.read_csv('Data_Mining_Names.csv')

data_types = field_name_data['Data_Type']
dataset_names = field_name_data['dataset_2016']

to_categorize = ['AGEP', 'WKSWKP']

for types, names in zip(data_types, dataset_names):
    if types in ('Discrete', 'Continuous') and names not in ('PPSORT', 'WEIGHT'):
        # Omitting row identifier and row weight (should not be used)
        to_categorize.append(names)

dataset_names = field_name_data['dataset_1991']
        
for types, names in zip(data_types, dataset_names):
    if types in ('Discrete', 'Continuous') and names not in ('WEIGHTP'):
        # Omitting row weight (should not be used)
        if names != '-':
            to_categorize.append(names)

to_categorize

['AGEP',
 'WKSWKP',
 'SHELCO',
 'MrkInc',
 'TotInc_AT',
 'TotInc',
 'VALUE',
 'TOTINCP',
 'VALUEP']

In [5]:
dataset_2016 = pd.read_csv('pumf-98M0001-E-2016-individuals_F1.csv')

dataset_2016['SHELCO']

0         2500
1         2400
2         2500
3         1300
4          700
          ... 
930416     600
930417    1900
930418    3294
930419    1500
930420    1400
Name: SHELCO, Length: 930421, dtype: int64

In [4]:
pd.qcut(dataset_2016['SHELCO'], 10, False)

0         8
1         8
2         8
3         4
4         1
5         4
6         3
7         2
8         2
9         8
10        7
11        6
12        1
13        9
14        8
15        7
16        7
17        0
18        2
19        8
20        8
21        9
22        4
23        8
24        2
25        4
26        9
27        9
28        9
29        8
         ..
930391    5
930392    6
930393    5
930394    5
930395    5
930396    2
930397    0
930398    1
930399    5
930400    4
930401    4
930402    4
930403    2
930404    1
930405    7
930406    7
930407    5
930408    4
930409    4
930410    4
930411    5
930412    4
930413    5
930414    5
930415    5
930416    1
930417    7
930418    9
930419    5
930420    5
Name: SHELCO, Length: 930421, dtype: int64

### Pipeline: Discretization

In [6]:
import pandas as pd

def discretify(dataset, quantiles=10, target_fields=None,
               invalid_quantile = 88):
    """Convert target fields in dataset to quantile categories."""
    
    target_fields = target_fields if target_fields else ['AGEP', 'WKSWKP', 'SHELCO', 'MrkInc', 'TotInc_AT',
                                                         'TotInc', 'VALUE', 'TOTINCP', 'VALUEP']
    invalid_values = {
        'AGEP': [98],
        'WKSWKP': [99],
        'SHELCO': [],
        'MrkInc': [88888888, 99999999],
        'TotInc': [88888888, 99999999],
        'TotInc_AT': [88888888, 99999999],
        'VALUE': [88888888, 99999999],
        'VALUEP': [999999],
        'TOTINCP': [9999999]
    }
    
    dataset_copy = dataset.copy()

    for continuous in target_fields:
        if continuous in dataset_copy:
            print(continuous)
            values = []
            quantile_values = pd.qcut(dataset_copy[continuous].rank(method='first'), quantiles, False)

            for value, quantile in zip(dataset_copy[continuous], quantile_values):
                if value in invalid_values[continuous]:
                    values.append(invalid_quantile)
                else:
                    values.append(quantile)

            dataset_copy[continuous] = values
    
    return dataset_copy


discretify(dataset_2016)

SHELCO
MrkInc
TotInc_AT
TotInc
VALUE


Unnamed: 0,PPSORT,WEIGHT,WT1,WT2,WT3,WT4,WT5,WT6,WT7,WT8,...,Subsidy,Tenur,TotInc,TotInc_AT,VALUE,VisMin,Wages,WKSWRK,WRKACT,YRIMM
0,453141,37.037277,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,9,1,7,7,4,13,95000,6,11,9999
1,923226,37.037277,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,9,1,88,88,4,13,99999999,9,99,9999
2,385097,37.037277,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,9,1,88,88,4,13,99999999,9,99,9999
3,732612,37.037277,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,592.596438,...,9,1,4,5,6,13,19000,6,11,9999
4,143665,37.120914,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,9,1,3,3,0,13,29000,5,9,9999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
930416,700854,37.037277,0.0,0.0,0.000000,0.000000,0.0,592.596438,0.0,0.000000,...,9,1,7,7,6,13,130000,6,11,9999
930417,821443,37.037277,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,8,8,6,6,88,13,78000,6,11,9999
930418,116531,37.042280,0.0,0.0,592.676474,0.000000,0.0,0.000000,0.0,0.000000,...,9,1,6,6,5,13,81000,5,9,9999
930419,499993,37.037277,0.0,0.0,0.000000,592.596438,0.0,0.000000,0.0,0.000000,...,0,2,2,3,88,2,99999999,9,1,2


---

## Language Mapping

Mappings languages between the 2016 and 1991 datasets, and providing standardized language group names.

In [29]:
import pandas as pd

language_mappings = pd.read_csv('Language_Mappings.csv')

language_mappings

Unnamed: 0,HLANO_2016_ID,HLANO_(Home_Language),MTNNO_2016_ID,MTNNO_(Mother_Tongue),HLNP_1991_ID,HLNP_(Home_Language),MTNP_1991_ID,MTNP_(Mother_Tongue),Equivalency_Class
0,,,,,3.0,English and French,3.0,English and French,Official_Languages
1,,,,,1.0,English single response,1.0,English single response,Official_Languages
2,,,,,2.0,French single response,2.0,French single response,Official_Languages
3,2.0,Aboriginal languages,2.0,Aboriginal languages,4.0,NOL: Aboriginal languages,4.0,NOL: Aboriginal languages,Aboriginal
4,24.0,All other languages,32.0,All other single languages,15.0,NOL: Other,18.0,NOL: Other,All_Other_Languages
5,,,,,16.0,NOL: Other (Atl./Terr. only),19.0,NOL: Other (Atl./Terr. only),All_Other_Languages
6,11.0,Arabic,3.0,Arabic,12.0,NOL: Arabic,15.0,NOL: Arabic,Arabic
7,20.0,Austro-Asiatic languages,20.0,Austro-Asiatic languages,11.0,NOL: Austro-Asiatic languages,14.0,NOL: Austro-Asiatic languages,Austro-Asiatic Languages
8,17.0,Cantonese,5.0,Cantonese,,,,,Chinese_Languages
9,19.0,Chinese languages,6.0,Chinese languages,10.0,NOL: Chinese,13.0,NOL: Chinese,Chinese_Languages


In [36]:
from math import isnan

print(isnan(language_mappings['HLANO_2016_ID'][0]))
print(isnan(language_mappings['HLANO_2016_ID'][3]))

True
False


### Pipeline: Language Mapping

In [61]:
def get_language_mappings():
    """Get language mapping dicts, separated into subdicts by feature name."""

    hlano = {}
    mtnno = {}
    hlnp = {}
    mtnp = {}
    classes = set()

    for row in list(language_mappings.itertuples()):
        # print(row[1])
        # print(row[3])
        # print(row[5])
        # print(row[7])
        # print(row[9])

        classes.add(row[9])
        
        if not isnan(row[1]):
            hlano[int(row[1])] = row[9]
        if not isnan(row[3]):
            mtnno[int(row[3])] = row[9]
        if not isnan(row[5]):
            hlnp[int(row[5])] = row[9]
        if not isnan(row[7]):
            mtnp[int(row[7])] = row[9]

    return {
        'hlano': hlano,
        'mtnno': mtnno,
        'hlnp': hlnp,
        'mtnp': mtnp,
        'classes': list(classes)
    }


get_language_mappings()

{'hlano': {2: 'Aboriginal',
  24: 'All_Other_Languages',
  11: 'Arabic',
  20: 'Austro-Asiatic Languages',
  17: 'Chinese_Languages',
  19: 'Chinese_Languages',
  6: 'German',
  3: 'Italian',
  18: 'Chinese_Languages',
  1: 'Official_Languages',
  88: 'Not_Available',
  12: 'All_Other_Languages',
  22: 'All_Other_Languages',
  10: 'All_Other_Languages',
  16: 'Other_Indo_Iranian_Languages',
  9: 'All_Other_Languages',
  15: 'Other_Indo_Iranian_Languages',
  8: 'Polish',
  5: 'Portuguese',
  13: 'Punjabi',
  7: 'All_Other_Languages',
  4: 'Spanish',
  21: 'All_Other_Languages',
  23: 'All_Other_Languages',
  14: 'Other_Indo_Iranian_Languages'},
 'mtnno': {2: 'Aboriginal',
  32: 'All_Other_Languages',
  3: 'Arabic',
  20: 'Austro-Asiatic Languages',
  5: 'Chinese_Languages',
  6: 'Chinese_Languages',
  7: 'German',
  9: 'All_Other_Languages',
  13: 'Italian',
  28: 'All_Other_Languages',
  4: 'Chinese_Languages',
  31: 'All_Other_Languages',
  1: 'Official_Languages',
  88: 'Not_Availabl

---

## Unified Features

Conversion of and restriction to unified 1991-2016 fields.

In [1]:
import pandas as pd

field_name_data = pd.read_csv('Data_Mining_Names.csv')

field_name_data

Unnamed: 0,Mapping_Needed,Data_Type,Meaning,Definitive_Name,dataset_2016,dataset_1991,Notes
0,,Categorical,Census family income bracket after tax,Census_Family_Income_After_Tax,CFInc_AT,-,
1,,Categorical,Dwelling structure type (e.g. apartment),Dwelling_Structure_Type,DTYPE,-,
2,,Categorical,Economic family disposable income,Economic_Family_Disposable_Income,EfDIMBM,-,
3,,Categorical,Economic family income bracket,Economic_Family_Income_Bracket,EFInc,-,
4,,Categorical,Economic family income bracket after tax,Economic_Family_Income_Bracket_After_Tax,EFInc_AT,-,
...,...,...,...,...,...,...,...
71,,Continuous,Total personal income from recurring sources,Total_Personal_Recurring_Income,TotInc,TOTINCP,
72,,Discrete (1-7+),Number of persons in household,Household_Size,HHSIZE,UNITSP,
73,,Continuous,Household property value (including land),Property_Value,VALUE,VALUEP,
74,,Continuous,No. of individuals represented by this row,Row_Weight,WEIGHT,WEIGHTP,


### Pipelines: Unified Features

In [9]:
import pandas as pd

def get_unified_feature_dicts():
    """Return 1991-2016 unified verbose field name dict and 1991-2016 unified verbose field description dict."""
    
    field_name_data = pd.read_csv('Data_Mining_Names.csv')
    
    data_types = field_name_data['Data_Type']
    meanings = field_name_data['Meaning']

    descriptions = []

    for types, meaning in zip(data_types, meanings):
        descriptions.append(types + ': ' + meaning)

    description_dict = dict(zip(field_name_data['dataset_2016'], descriptions))
    name_dict = dict(zip(field_name_data['dataset_2016'], field_name_data['Definitive_Name']))
    
    description_dict.update(dict(zip(field_name_data['dataset_1991'], descriptions)))
    name_dict.update(dict(zip(field_name_data['dataset_1991'], field_name_data['Definitive_Name'])))
    
    del description_dict['-']
    del name_dict['-']
    
    return description_dict, name_dict


get_unified_feature_dicts()

({'CFInc_AT': 'Categorical: Census family income bracket after tax',
  'DTYPE': 'Categorical: Dwelling structure type (e.g. apartment)',
  'EfDIMBM': 'Categorical: Economic family disposable income',
  'EFInc': 'Categorical: Economic family income bracket',
  'EFInc_AT': 'Categorical: Economic family income bracket after tax',
  'EFDecile': 'Discrete (1-10): Total family income decile after tax',
  'LICO_AT': 'Binary (1,2): Non-low or low income cut-off economic family after tax',
  'PKIDS': 'Binary (0,1): Presence of children in any household',
  'PKID0_1': 'Binary (0,1): Presence of children in household aged 0-1',
  'PKID15_24': 'Binary (0,1): Presence of children in household aged 15-24',
  'PKID2_5': 'Binary (0,1): Presence of children in household aged 2-5',
  'PKID25': 'Binary (0,1): Presence of children in household aged 25+',
  'PKID6_14': 'Binary (0,1): Presence of children in household aged 6_14',
  'PresMortG': 'Binary (0,1): Presence of mortgage',
  'LOC_ST_RES': 'Categori

In [10]:
import pandas as pd

def get_usable_features():
    """Return 1991 fields and 2016 fields usable for cross-year comparison."""
    
    field_name_data = pd.read_csv('Data_Mining_Names.csv')
    
    features_1991 = []
    features_2016 = []

    for older, newer in zip(field_name_data['dataset_1991'], field_name_data['dataset_2016']):
        if older != '-':
            features_1991.append(older)
            features_2016.append(newer)
    
    return features_1991, features_2016


get_usable_features()

(['ABETHNCP',
  'AGEP',
  'BNFNMEMP',
  'CFINCP',
  'CFSIZEP',
  'CFSTATP',
  'CITIZENP',
  'CMAPUMFP',
  'CONDWELP',
  'DGMFSP',
  'DGREEP',
  'EFSIZEP',
  'ETHNICRP',
  'HHINCP',
  'HLNP',
  'HLOSP',
  'HTYPEP',
  'IMMPOPP',
  'INCSTP',
  'LFACTP',
  'LSTWKP',
  'MARSTLP',
  'MOB1P',
  'MOB5P',
  'MTNP',
  'POBP',
  'PRMAINP',
  'PROV1P',
  'PROV5P',
  'PROVP',
  'RCONDP',
  'REGINP',
  'ROOMP',
  'SECGRADP',
  'SEXP',
  'TENURP',
  'TOTINCP',
  'UNITSP',
  'VALUEP',
  'WEIGHTP',
  'WKSWKP'],
 ['ABOID',
  'AGEGRP',
  'BFNMEMB',
  'CFInc',
  'CfSize',
  'CFSTAT',
  'Citizen',
  'CMA',
  'REPAIR',
  'CIP2011',
  'HDGREE',
  'EfSize',
  'ETHDER',
  'HHInc',
  'HLANO',
  'ATTSCH',
  'HHTYPE',
  'IMMSTAT',
  'LICO',
  'LFACT',
  'LSTWRK',
  'MarStH',
  'MOB1',
  'Mob5',
  'MTNNO',
  'POB',
  'PRIHM',
  'PR1',
  'PR5',
  'PR',
  'CONDO',
  'REGIND',
  'ROOMS',
  'SSGRAD',
  'Sex',
  'Tenur',
  'TotInc',
  'HHSIZE',
  'VALUE',
  'WEIGHT',
  'WRKACT'])