In [52]:
import pandas as pd
import numpy as np

In [53]:
!pip install bokeh



In [54]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool

# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [55]:
# Load the full PUMF individuals file and preview its first rows.
pumf_csv_path = "pumf-98M0001-E-2016-individuals_F1.csv"
dataset = pd.read_csv(pumf_csv_path)
dataset.head()

Unnamed: 0,PPSORT,WEIGHT,WT1,WT2,WT3,WT4,WT5,WT6,WT7,WT8,...,Subsidy,Tenur,TotInc,TotInc_AT,VALUE,VisMin,Wages,WKSWRK,WRKACT,YRIMM
0,453141,37.037277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,1,97000,73000,450000,13,95000,6,11,9999
1,923226,37.037277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,1,99999999,99999999,440000,13,99999999,9,99,9999
2,385097,37.037277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,1,99999999,99999999,440000,13,99999999,9,99,9999
3,732612,37.037277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,592.596438,...,9,1,46000,41000,839779,13,19000,6,11,9999
4,143665,37.120914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,1,30000,26000,60000,13,29000,5,9,9999


In [56]:
# Show every column name of the raw frame as a plain Python list.
dataset.columns.tolist()

['PPSORT',
 'WEIGHT',
 'WT1',
 'WT2',
 'WT3',
 'WT4',
 'WT5',
 'WT6',
 'WT7',
 'WT8',
 'WT9',
 'WT10',
 'WT11',
 'WT12',
 'WT13',
 'WT14',
 'WT15',
 'WT16',
 'ABOID',
 'AGEGRP',
 'AGEIMM',
 'ATTSCH',
 'BedRm',
 'BFNMEMB',
 'CapGn',
 'CFInc',
 'CFInc_AT',
 'CfSize',
 'CFSTAT',
 'CHDBN',
 'ChldC',
 'CIP2011',
 'CIP2011_STEM_SUM',
 'Citizen',
 'CitOth',
 'CMA',
 'CONDO',
 'COW',
 'CQPPB',
 'DETH123',
 'DIST',
 'DPGRSUM',
 'DTYPE',
 'EFDecile',
 'EfDIMBM',
 'EFInc',
 'EFInc_AT',
 'EfSize',
 'EICBN',
 'EmpIn',
 'ETHDER',
 'FOL',
 'FPTWK',
 'GENSTAT',
 'GovtI',
 'GTRfs',
 'HCORENEED_IND',
 'HDGREE',
 'HHInc',
 'HHInc_AT',
 'HHMRKINC',
 'HHSIZE',
 'HHTYPE',
 'HLAEN',
 'HLAFR',
 'HLANO',
 'HLBEN',
 'HLBFR',
 'HLBNO',
 'IMMCAT5',
 'IMMSTAT',
 'IncTax',
 'Invst',
 'KOL',
 'LFACT',
 'LICO',
 'LICO_AT',
 'LOC_ST_RES',
 'LOCSTUD',
 'LoLIMA',
 'LoLIMB',
 'LoMBM',
 'LSTWRK',
 'LWAEN',
 'LWAFR',
 'LWANO',
 'LWBEN',
 'LWBFR',
 'LWBNO',
 'MarStH',
 'MOB1',
 'Mob5',
 'MODE',
 'MrkInc',
 'MTNEn',
 'MTNFr'

In [57]:
# Exclude rows where MTNEn == 1 (English as mother tongue) before proceeding.
datasetNoMTNEN = dataset[dataset['MTNEn'].ne(1)]
print("Number of samples when English removed as mother tongue:", len(datasetNoMTNEN))
# Then exclude rows where MTNEn == 8 (the code this notebook treats as unavailable).
datasetNoMTNEN = datasetNoMTNEN[datasetNoMTNEN['MTNEn'].ne(8)]
print("Number of samples when unavailable rows also removed:", len(datasetNoMTNEN))

Number of samples when English removed as mother tongue: 389571
Number of samples when unavailable rows also removed: 387817


In [58]:
# (rows, columns) of the frame after the MTNEn filtering above.
datasetNoMTNEN.shape

(387817, 141)

###  Notes from Mr. Fogel

MTNNO (Mother Tongue - first write-in component) and HLANO (Home Language - first write-in component): check whether these two values match, and use the resulting boolean as a classification target. Also classify MTNNO and HLANO separately.

In the Decision Tree file I pruned the following features out of the feature set; they all come from the language section:
- MTNEN (mother tongue english) - high correlation to classified feature in tree
- MTNFR (mother tongue french)   - ditto
- HLAFR (home language french)  - ditto
- HLAEN (home language english) - ditto
- HLBEN (home language part b english)
- HLBFR (home language part b french) 
- HLBNO 
- NOL (knowledge of non-official languages) - very high correlation to classified features
- FOL (first official language spoken) 
- KOL (knowledge of official languages) 

_----the below language features had very low influence in the decision tree----_

- LWAEN (language at work part a english)
- LWAFR (language at work part a french)
- LWANO (language at work part a first write-in component)
- LWBEN (language at work part b english)
- LWBFR (language at work part b french)
- LWBNO (language at work part b first write-in component)

In [59]:
# All selections below are positional (iloc), so they depend on the column order of the CSV.

# class label: home language part A - first language write in component (position 65)
homeLang = datasetNoMTNEN.iloc[:, 65]
# class label: mother tongue part A - first language write in component (position 96)
motherTongue = datasetNoMTNEN.iloc[:, 96]

# variables: the weight column (position 1) plus three column ranges that skip
# positions 0, 2-17 and the two class-label columns.
# NOTE(review): the -1 stop also leaves out the frame's last column -- confirm this is intended.
weights = datasetNoMTNEN.iloc[:, 1]
x1, x2, x3 = (
    datasetNoMTNEN.iloc[:, column_range]
    for column_range in (slice(18, 65), slice(66, 96), slice(97, -1))
)

# Feature columns first, the two class labels last.
x = pd.concat(
    [weights, x1, x2, x3, homeLang, motherTongue],
    axis=1,
    sort=False,
)

In [60]:
# Preview the last rows of the assembled frame (features followed by the two class labels).
x.tail()

Unnamed: 0,WEIGHT,ABOID,AGEGRP,AGEIMM,ATTSCH,BedRm,BFNMEMB,CapGn,CFInc,CFInc_AT,...,Tenur,TotInc,TotInc_AT,VALUE,VisMin,Wages,WKSWRK,WRKACT,HLANO,MTNNO
930407,37.04155,6,14,8,1,3,0,99999999,25,21,...,1,64000,49000,390000,4,64000,6,11,1,30
930410,37.04155,6,11,99,1,4,0,99999999,30,25,...,1,26000,24000,230000,13,22000,5,10,1,1
930411,37.04155,6,11,99,1,4,0,99999999,30,25,...,1,130000,83000,240000,13,120000,3,5,1,1
930412,37.04155,6,3,99,9,4,0,99999999,30,25,...,1,99999999,99999999,250000,13,99999999,9,99,1,1
930419,37.037277,6,21,7,1,3,0,99999999,13,13,...,2,22000,23000,99999999,2,99999999,9,1,17,5


When respondents with English as a mother tongue are included, 88% of the entries for age at immigration (`AGEIMM`) are unavailable or not applicable.

Once respondents with English as a mother tongue have been removed, this drops to about 62% (computed in the next cell).

In [61]:
# Fraction of the filtered rows whose AGEIMM is one of the codes 88 or 99.
len(datasetNoMTNEN[datasetNoMTNEN['AGEIMM'].isin([88, 99])]) / len(datasetNoMTNEN)

0.6161230683544042

`CitOth` (Citizenship: other country of citizenship) uses the value 99 to indicate no other country of citizenship, while 88 means 'not available'. For this column, only 88 is therefore listed as a missing-value code.

In [62]:
# Column name -> code(s) that mark a value as missing in that column.
# A value is either a single integer code or a list of integer codes.
missing_values_dict = {
    'AGEGRP': 88,
    'MOB1': [8, 9],
    'Mob5': 9,
    'PR1': [88, 99],
    'PR5': 99,
    'DPGRSUM': 88,
    'ETHDER': 88,
    'VisMin': 88,
    'HLANO': 88,  # This is one of the target columns
    'KOL': 8,
    'LWAEN': [8, 9],
    'LWAFR': [8, 9],
    'LWANO': 9,
    'LWBEN': [8, 9],
    'LWBFR': [8, 9],
    'LWBNO': 9,
    'MTNEn': 8,
    'MTNFr': 8,
    'MTNNO': 88,  # This is one of the target columns
    'NOL': 88,
    'AGEIMM': [88, 99],
    'CitOth': 88,
    'GENSTAT': 8,
    'IMMCAT5': 88,
    'IMMSTAT': 8,
    'POB': 88,
    'POBF': 8,
    'POBM': 8,
    'YRIMM': [8888, 9999],
    'ATTSCH': [8, 9],
    'CIP2011': [88, 99],
    'CIP2011_STEM_SUM': [88, 99],
    'HDGREE': [88, 99],
    'LOC_ST_RES': 9,
    'LOCSTUD': 99,
    'SSGRAD': [88, 99],
    'COW': 8,
    'FPTWK': [8, 9],
    'LFACT': 99,
    'LSTWRK': 9,
    'NAICS': [88, 99],
    'NOC16': [88, 99],
    'NOCS': [88, 99],
    'WKSWRK': 9,
    'WRKACT': [88, 99],
    'DIST': [8, 9],
    'MODE': 9,
    'POWST': [8, 9],
    'PWDUR': [8, 9],
    'PWLEAVE': [8, 9],
    'PWOCC': 9,
    'PWPR': [88, 99],
    'CapGn': [88888888, 99999999],
    'CFInc': 88,
    'CFInc_AT': 88,
    'CHDBN': [88888888, 99999999],
    'ChldC': [88888888, 99999999],
    'CQPPB': [88888888, 99999999],
    'EFDecile': 88,
    'EfDIMBM': 88,
    'EFInc': 88,
    'EFInc_AT': 88,
    'EICBN': [88888888, 99999999],
    'EmpIn': [88888888, 99999999],
    'GovtI': [88888888, 99999999],
    'GTRfs': [88888888, 99999999],
    'HHInc': 88,
    'HHInc_AT': 88,
    'HHMRKINC': 88,
    'IncTax': [88888888, 99999999],
    'Invst': [88888888, 99999999],
    'LICO': [8, 9],
    'LICO_AT': [8, 9],
    'LoLIMA': [8, 9],
    'LoLIMB': [8, 9],
    'LoMBM': [8, 9],
    'MrkInc': [88888888, 99999999],
    'OASGI': [88888888, 99999999],
    'OtInc': [88888888, 99999999],
    'Retir': [88888888, 99999999],
    'SempI': [88888888, 99999999],
    'TotInc': [88888888, 99999999],
    'TotInc_AT': [88888888, 99999999],
    'Wages': [88888888, 99999999],
    'CfSize': 8,
    'EfSize': 8,
    'PKID0_1': [8, 9],
    'PKID15_24': [8, 9],
    'PKID2_5': [8, 9],
    'PKID25': [8, 9],
    'PKID6_14': [8, 9],
    'PKIDS': 9,
    'HHSIZE': 8,
    'BedRm': 8,
    'CONDO': 8,
    'DTYPE': 8,
    'HCORENEED_IND': 888,
    'NOS': 8,
    'PresMortG': [8, 9],
    'REPAIR': 8,
    'ROOMS': 88,
    'Subsidy': [8, 9],
    'Tenur': 8,
    'VALUE': [88888888, 99999999],
}

In [63]:

def process_NA_rows(dataset, col_name, na_value):
    """Print and return the percentage of rows in a column that hold a missing-value code.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect.
    col_name : str
        Name of the column whose values are compared against the code(s).
    na_value : scalar or list/tuple/set of scalars
        Code, or collection of codes, that mark a value as missing.
        A collection may hold any number of codes.

    Returns
    -------
    float
        Percentage (0-100) of rows whose value equals any of the codes;
        0.0 when ``dataset`` has no rows.
    """
    if isinstance(na_value, (list, tuple, set, frozenset)):
        codes = list(na_value)
    else:
        codes = [na_value]

    total_rows = dataset.shape[0]
    # isin() counts every matching row exactly once, however many codes are listed,
    # so one-element lists work and lists longer than two are fully honoured.
    num_na = int(dataset[col_name].isin(codes).sum())
    # Guard the division so an empty frame reports 0% instead of raising.
    ratio = num_na / total_rows * 100 if total_rows else 0.0
    print(col_name, "% missing values =", ratio)
    return ratio

In [64]:
# Missing-value percentage per column of x.
# Columns without an entry in missing_values_dict are recorded as 0 without being inspected.
missing_val_summary = {}
for name in x.columns:
    if name not in missing_values_dict:
        missing_val_summary[name] = 0
        print(name, "% missing values =", 0)
        continue
    missing_val_summary[name] = process_NA_rows(x, name, missing_values_dict[name])
    

WEIGHT % missing values = 0
ABOID % missing values = 0
AGEGRP % missing values = 1.1484798242470031
AGEIMM % missing values = 61.61230683544042
ATTSCH % missing values = 14.244089351420902
BedRm % missing values = 0.9744286609405983
BFNMEMB % missing values = 0
CapGn % missing values = 91.65224835424955
CFInc % missing values = 0.7410711753223814
CFInc_AT % missing values = 0.7410711753223814
CfSize % missing values = 0.3537751052687221
CFSTAT % missing values = 0
CHDBN % missing values = 87.80017379330974
ChldC % missing values = 94.32206427258217
CIP2011 % missing values = 15.771356077737696
CIP2011_STEM_SUM % missing values = 15.770582517011889
Citizen % missing values = 0
CitOth % missing values = 2.3057266700531436
CMA % missing values = 0
CONDO % missing values = 0.6345776487363886
COW % missing values = 0.6278734557794011
CQPPB % missing values = 79.99700889852687
DETH123 % missing values = 0
DIST % missing values = 53.99170227194785
DPGRSUM % missing values = 2.9642847012895257

In [65]:
# Turn the per-column summary into a two-column frame: feature name and its missing percentage.
df_missing = pd.DataFrame({
    'Feature': list(missing_val_summary.keys()),
    'Ratio': list(missing_val_summary.values()),
})
df_missing

Unnamed: 0,Feature,Ratio
0,WEIGHT,0.000000
1,ABOID,0.000000
2,AGEGRP,1.148480
3,AGEIMM,61.612307
4,ATTSCH,14.244089
5,BedRm,0.974429
6,BFNMEMB,0.000000
7,CapGn,91.652248
8,CFInc,0.741071
9,CFInc_AT,0.741071


In [75]:
# Bar chart of every feature's missing-value percentage, highest first.
df_missing = df_missing.sort_values(by='Ratio', ascending=False)
source = ColumnDataSource(df_missing)

p = figure(
    title="Missing Ratios",
    x_range=df_missing['Feature'],
    x_axis_label='Feature',
    y_axis_label='Ratio',
    plot_width=2000,
    plot_height=500,
    toolbar_location=None,
    tools="",
)
p.vbar(x='Feature', top='Ratio', width=0.9, source=source)

# Axis cosmetics: bars start at zero, no vertical grid lines, feature labels tilted 45 degrees.
p.y_range.start = 0
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = np.pi / 4

# Hovering a bar shows its feature name and ratio.
p.add_tools(
    HoverTool(tooltips=[('Feature', '@Feature'), ('Ratio', '@Ratio')])
)

show(p)

In [66]:
from bokeh.models import Span

# The same plot as above but resized for paper
CUTOFF_RATIO = 16.0      # only features whose missing percentage exceeds this are drawn
PRUNE_THRESHOLD = 20.0   # height of the red horizontal line

# Derive the plotted subset from df_missing so this cell does not rely on
# df_missing_pruned already existing in the kernel (it is first assigned in a later cell).
df_missing_pruned = df_missing.sort_values(by=['Ratio'], ascending=False)
df_missing_pruned = df_missing_pruned[df_missing_pruned['Ratio'] > CUTOFF_RATIO]

source_pruned = ColumnDataSource(data=df_missing_pruned)

p2 = figure(x_range=df_missing_pruned['Feature'], plot_height=500, plot_width=800,
            x_axis_label='Feature', y_axis_label='Ratio')
p2.vbar(x='Feature', top='Ratio', width=0.9, source=source_pruned)

# Span is an annotation; add_layout is the documented way to attach it to a plot.
hline = Span(location=PRUNE_THRESHOLD, dimension='width', line_color='red', line_width=3)
p2.add_layout(hline)

p2.xaxis.major_label_orientation = np.pi/4
p2.xgrid.grid_line_color = None
p2.y_range.start = 0

p2.add_tools(HoverTool(tooltips=[('Feature', '@Feature'),
                                 ('Ratio', '@Ratio')]))

show(p2)

Looking at the chart above, a logical cut-off is to prune the features whose missing ratio is over 20%. Proceeding with this option.

In [67]:
# Features whose missing-value percentage is strictly greater than 20.
df_missing_pruned = df_missing.loc[df_missing['Ratio'].gt(20)]
df_missing_pruned

Unnamed: 0,Feature,Ratio
3,AGEIMM,61.612307
7,CapGn,91.652248
12,CHDBN,87.800174
13,ChldC,94.322064
21,CQPPB,79.997009
23,DIST,53.991702
31,EICBN,91.876065
32,EmpIn,41.915388
35,FPTWK,44.306206
37,GovtI,54.024708


In [68]:
# Number of features above the cut-off.
df_missing_pruned.shape[0]

39

Feature set needs to be further pruned...

In [69]:
# Keep the features whose missing-value percentage is at most 30.
# NOTE(review): the cut-off here is 30, while the pruned list above uses "> 20" --
# confirm which threshold is intended.
df_unpruned = df_missing.loc[df_missing['Ratio'].le(30)]
cols_with_most_data_present = list(df_unpruned['Feature'])
cols_with_most_data_present

['WEIGHT',
 'ABOID',
 'AGEGRP',
 'ATTSCH',
 'BedRm',
 'BFNMEMB',
 'CFInc',
 'CFInc_AT',
 'CfSize',
 'CFSTAT',
 'CIP2011',
 'CIP2011_STEM_SUM',
 'Citizen',
 'CitOth',
 'CMA',
 'CONDO',
 'COW',
 'DETH123',
 'DPGRSUM',
 'DTYPE',
 'EFDecile',
 'EfDIMBM',
 'EFInc',
 'EFInc_AT',
 'EfSize',
 'ETHDER',
 'FOL',
 'GENSTAT',
 'HCORENEED_IND',
 'HDGREE',
 'HHInc',
 'HHInc_AT',
 'HHMRKINC',
 'HHSIZE',
 'HHTYPE',
 'HLAEN',
 'HLAFR',
 'HLBEN',
 'HLBFR',
 'HLBNO',
 'IMMCAT5',
 'IMMSTAT',
 'KOL',
 'LFACT',
 'LICO',
 'LICO_AT',
 'LOC_ST_RES',
 'LoLIMA',
 'LoLIMB',
 'LoMBM',
 'LSTWRK',
 'MarStH',
 'MOB1',
 'Mob5',
 'MrkInc',
 'MTNEn',
 'MTNFr',
 'NOL',
 'NOS',
 'PKID0_1',
 'PKID15_24',
 'PKID2_5',
 'PKID25',
 'PKID6_14',
 'PKIDS',
 'POB',
 'POBF',
 'POBM',
 'PR',
 'PR1',
 'PR5',
 'PRIHM',
 'REGIND',
 'REPAIR',
 'ROOMS',
 'Sex',
 'SHELCO',
 'SSGRAD',
 'Tenur',
 'TotInc',
 'TotInc_AT',
 'VisMin',
 'WRKACT',
 'HLANO',
 'MTNNO']

In [70]:
# Number of features that were kept.
len(df_unpruned)

85

In [71]:
# Re-read only the retained columns from the source file, then drop the rows
# whose MTNEn is 1 or 8 (the same two codes filtered out earlier in the notebook).
df_selected = pd.read_csv(
    "pumf-98M0001-E-2016-individuals_F1.csv",
    usecols=cols_with_most_data_present,
)
df_selected = df_selected[~df_selected['MTNEn'].isin([1, 8])]
df_selected.head()

Unnamed: 0,WEIGHT,ABOID,AGEGRP,ATTSCH,BedRm,BFNMEMB,CFInc,CFInc_AT,CfSize,CFSTAT,...,REPAIR,ROOMS,Sex,SHELCO,SSGRAD,Tenur,TotInc,TotInc_AT,VisMin,WRKACT
4,37.120914,6,15,1,2,0,16,15,2,1,...,3,5,1,700,6,1,30000,26000,13,9
7,37.019784,6,18,1,3,0,13,12,1,6,...,1,5,1,900,6,1,41000,37000,13,1
8,37.04155,6,14,1,2,0,25,21,2,1,...,1,4,2,900,5,2,53000,43000,13,11
19,37.04228,6,11,1,3,0,28,25,4,2,...,1,8,2,2200,8,1,120000,98000,12,11
21,37.037277,6,12,1,4,0,29,25,4,2,...,1,10,1,2700,8,1,2000,2000,2,1


In [72]:
# (rows, columns) of the selected-feature frame after filtering.
df_selected.shape

(387817, 85)

### Writing data with pruned columns to new csv

In [73]:
# Write the selected columns to a new CSV; index=False leaves the row index out of the file.
df_selected.to_csv('pumf-2016-selected-features-No-MTEn-English.csv', index=False)

In [74]:
# Column names of the exported frame, as a NumPy array.
df_selected.columns.values

array(['WEIGHT', 'ABOID', 'AGEGRP', 'ATTSCH', 'BedRm', 'BFNMEMB', 'CFInc',
       'CFInc_AT', 'CfSize', 'CFSTAT', 'CIP2011', 'CIP2011_STEM_SUM',
       'Citizen', 'CitOth', 'CMA', 'CONDO', 'COW', 'DETH123', 'DPGRSUM',
       'DTYPE', 'EFDecile', 'EfDIMBM', 'EFInc', 'EFInc_AT', 'EfSize',
       'ETHDER', 'FOL', 'GENSTAT', 'HCORENEED_IND', 'HDGREE', 'HHInc',
       'HHInc_AT', 'HHMRKINC', 'HHSIZE', 'HHTYPE', 'HLAEN', 'HLAFR',
       'HLANO', 'HLBEN', 'HLBFR', 'HLBNO', 'IMMCAT5', 'IMMSTAT', 'KOL',
       'LFACT', 'LICO', 'LICO_AT', 'LOC_ST_RES', 'LoLIMA', 'LoLIMB',
       'LoMBM', 'LSTWRK', 'MarStH', 'MOB1', 'Mob5', 'MrkInc', 'MTNEn',
       'MTNFr', 'MTNNO', 'NOL', 'NOS', 'PKID0_1', 'PKID15_24', 'PKID2_5',
       'PKID25', 'PKID6_14', 'PKIDS', 'POB', 'POBF', 'POBM', 'PR', 'PR1',
       'PR5', 'PRIHM', 'REGIND', 'REPAIR', 'ROOMS', 'Sex', 'SHELCO',
       'SSGRAD', 'Tenur', 'TotInc', 'TotInc_AT', 'VisMin', 'WRKACT'],
      dtype=object)