In [145]:
import pandas as pd
import numpy as np

In [146]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool

output_notebook()

In [147]:
dataset = pd.read_csv("pumf-98M0001-E-2016-individuals_F1.csv")
dataset.head()

Unnamed: 0,PPSORT,WEIGHT,WT1,WT2,WT3,WT4,WT5,WT6,WT7,WT8,...,Subsidy,Tenur,TotInc,TotInc_AT,VALUE,VisMin,Wages,WKSWRK,WRKACT,YRIMM
0,453141,37.037277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,1,97000,73000,450000,13,95000,6,11,9999
1,923226,37.037277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,1,99999999,99999999,440000,13,99999999,9,99,9999
2,385097,37.037277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,1,99999999,99999999,440000,13,99999999,9,99,9999
3,732612,37.037277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,592.596438,...,9,1,46000,41000,839779,13,19000,6,11,9999
4,143665,37.120914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,1,30000,26000,60000,13,29000,5,9,9999


###  Notes from Mr. Fogel

MTNNO (Mother Tongue - first write-in component) and HLANO (Home Language - first write-in component) - see if these match, use boolean whether they match as classification. Also classify MTNNO and HLANO separately

In Decision Tree file I pruned out these values from feature set - these all come from the language section:
- MTNEN (mother tongue english) - high correlation to classified feature in tree
- MTNFR (mother tongue french)   - ditto
- HLAFR (home language french)  - ditto
- HLAEN (home language english) - ditto
- HLBEN (home language part b english)
- HLBFR (home language part b french) 
- HLBNO 
- NOL (knowledge of non-official languages) - very high correlation to classified features
- FOL (first official language spoken) 
- KOL (knowledge of official languages) 

_----the below language features had very low influence in the decision tree----_

- LWAEN (language at work part a english)
- LWAFR (language at work part a french)
- LWANO (language at work part a first write-in component)
- LWBEN (language at work part b english)
- LWBFR (language at work part b french)
- LWBNO (language at work part b first write-in component)

In [148]:
# class label: home language part A - first language write in component
homeLang = dataset.iloc[:,65]
# class label: mother tongue part A - first language write in component
motherTongue = dataset.iloc[:, 96]
# variables
x1, x2, x3 = dataset.iloc[:, 18:65], dataset.iloc[:, 66:96], dataset.iloc[:, 97:-1]
x = pd.concat([x1,x2,x3], axis=1, sort=False)
weights = dataset.iloc[:, 1]

In [149]:
x

Unnamed: 0,ABOID,AGEGRP,AGEIMM,ATTSCH,BedRm,BFNMEMB,CapGn,CFInc,CFInc_AT,CfSize,...,SSGRAD,Subsidy,Tenur,TotInc,TotInc_AT,VALUE,VisMin,Wages,WKSWRK,WRKACT
0,6,11,99,1,5,0,99999999,30,27,4,...,5,9,1,97000,73000,450000,13,95000,6,11
1,6,5,99,9,5,0,99999999,30,27,4,...,99,9,1,99999999,99999999,440000,13,99999999,9,99
2,6,2,99,9,5,0,99999999,30,27,4,...,99,9,1,99999999,99999999,440000,13,99999999,9,99
3,6,12,99,1,4,0,99999999,20,19,4,...,8,9,1,46000,41000,839779,13,19000,6,11
4,6,15,99,1,2,0,99999999,16,15,2,...,6,9,1,30000,26000,60000,13,29000,5,9
5,6,19,99,1,3,0,-3300,28,26,2,...,4,9,1,82000,69000,839779,13,99999999,9,1
6,6,18,99,1,3,0,-3400,28,26,2,...,6,9,1,51000,45000,839779,13,99999999,9,1
7,6,18,99,1,3,0,99999999,13,12,1,...,6,9,1,41000,37000,310000,13,99999999,9,1
8,6,14,99,1,2,0,99999999,25,21,2,...,5,0,2,53000,43000,99999999,13,52000,6,11
9,6,8,99,4,4,0,99999999,33,31,4,...,4,9,1,27000,26000,640000,13,27000,4,7


There are no null values in the dataset

In [150]:
x.isna().sum()

ABOID               0
AGEGRP              0
AGEIMM              0
ATTSCH              0
BedRm               0
BFNMEMB             0
CapGn               0
CFInc               0
CFInc_AT            0
CfSize              0
CFSTAT              0
CHDBN               0
ChldC               0
CIP2011             0
CIP2011_STEM_SUM    0
Citizen             0
CitOth              0
CMA                 0
CONDO               0
COW                 0
CQPPB               0
DETH123             0
DIST                0
DPGRSUM             0
DTYPE               0
EFDecile            0
EfDIMBM             0
EFInc               0
EFInc_AT            0
EfSize              0
                   ..
POB                 0
POBF                0
POBM                0
POWST               0
PR                  0
PR1                 0
PR5                 0
PresMortG           0
PRIHM               0
PWDUR               0
PWLEAVE             0
PWOCC               0
PWPR                0
REGIND              0
REPAIR    

Verifying that the metadata is accurate

In [151]:
x[(x['CHDBN'] == 99999999) | (x['CHDBN'] == 8888888)].shape[0]/x.shape[0]

0.8837547733767832

In [152]:
x[x['POB'] == 88].shape

(14478, 120)

Close enough... Taking invalid data seen in metadata and putting in a dictionary

In [153]:
missing_dict = dict(
    AGGRP=9139,
    MOB1=9884,
    Mob5=51091,
    PR1=188597,
    PR5=88597,
    DPGRSUM=21199,
    ETHDER=31391,
    VisMin=17496,
    HLANO=4081,
    KOL=1754,
    LWAEN=392592,
    LWAFR=392592,
    LWANO=391418,
    LWBEN=392592,
    LWBFR=392592,
    LWBNO=391418,
    MTNEn=1754,
    MTNFr=1754,
    MTNNO=13756,
    NOL=13756,
    AGEIMM=726540,
    CitOth=817447,
    GENSTAT=432,
    IMMCAT5=2412,
    IMMSTAT=2412,
    POB=14478,
    POBF=37281,
    POBM=37281,
    YRIMM=726538,
    ATTSCH=157104,
    CIP2011=157131,
    CIP2011_STEM_=157131,
    HDGREE=157131,
    LOC_ST_RES=157131,
    LOCSTUD=503155,
    SSGRAD=157131,
    COW=391418,
    FPTWK=407453,
    LFACT=157131,
    LSTWRK=157131,
    NAICS=391418,
    NOC16=391418,
    NOCS=391418,
    WKSWRK=391418,
    WRKACT=157131,
    DIST=502619,
    MODE=435003,
    POWST=391418,
    PWDUR=435003,
    PWLEAVE=435003,
    PWOCC=542244,
    PWPR=461131,
    CapGn=844416,
    CFInc=5542,
    CFInc_AT=5542,
    CHDBN=828655,
    ChldC=886332,
    CQPPB=752220,
    EFDecile=5542,
    EfDIMBM=5542,
    EFInc=5542,
    EFInc_AT=5542,
    EICBN=862277,
    EmpIn=384479,
    GovtI=564132,
    GTRfs=399242,
    HHInc=5542,
    HHInc_AT=5542,
    HHMRKINC=5542,
    IncTax=415816,
    Invst=701218,
    LICO=2758,
    LICO_AT=2758,
    LoLIMA=2758,
    LoLIMB=2758,
    LoMBM=2758,
    MrkInc=272659,
    OASGI=795379,
    OtInc=810831,
    Retir=817735,
    SempI=849594,
    TotInc=194147,
    TotInc_AT=193828,
    Wages=427992,
    CfSize=3118,
    EfSize=3118,
    PKID0_1=164769,
    PKID15_24=164769,
    PKID2_5=164769,
    PKID25=164769,
    PKID6_14=164769,
    PKIDS=164769,
    HHSIZE=3118,
    BedRm=8320,
    CONDO=5406,
    DTYPE=2764,
    HCORENEED_IN=45134,
    NOS=8320,
    PresMortG=253489,
    REPAIR=540,
    ROOMS=8320,
    Subsidy=672083,
    Tenur=4849,
    VALUE=258338
)

In [154]:
total_count = x.shape[0]

df_missing = pd.DataFrame(list(missing_dict.items()), columns=['Feature', 'Count'])
df_missing['Ratio'] = df_missing['Count'] / total_count
df_missing

Unnamed: 0,Feature,Count,Ratio
0,AGGRP,9139,0.009822
1,MOB1,9884,0.010623
2,Mob5,51091,0.054912
3,PR1,188597,0.202701
4,PR5,88597,0.095222
5,DPGRSUM,21199,0.022784
6,ETHDER,31391,0.033738
7,VisMin,17496,0.018804
8,HLANO,4081,0.004386
9,KOL,1754,0.001885


In [158]:
source = ColumnDataSource(data=df_missing.sort_values(by=['Ratio'], ascending=False))

p = figure(x_range=df_missing['Feature'], plot_height=500, plot_width=2000, title="Missing Ratios",
           toolbar_location=None, x_axis_label='Feature', y_axis_label='Ratio', tools="")
p.vbar(x='Feature', top='Ratio', width=0.9, source=source)

p.xaxis.major_label_orientation = np.pi/4
p.xgrid.grid_line_color = None
p.y_range.start = 0

p.add_tools(HoverTool(tooltips=[('Feature', '@Feature'),
                                ('Ratio', '@Ratio')]))

show(p)

####  Data Imputation

Need to impute the dummy values used in the data, e.g. the “99999999” seen in the Wages column. The PDF metadata tells you for each column what it used for its dummy values.

Simplest solution: Remove the rows which have these values.

Simple solution: take the average (mean/median for numerical, median for categorical) of the non-dummy values and replace dummy values by the average found.

Ultra fun solution: train a model (like our best friend, XGBoost) to predict the missing values.
