# Feature Analysis

In [59]:
import pandas as pd

In [60]:
# Read the three language-shift extracts in one pass; each name is bound to
# the DataFrame parsed from the CSV at the same position in the tuple.
shift_1991, shift_2016, shift_2016_all = (
    pd.read_csv(csv_path)
    for csv_path in (
        '1991_LanguageShift.csv',
        '2016_LanguageShift.csv',
        '2016_LanguageShift_All.csv',
    )
)

In [61]:
shift_1991

Unnamed: 0,WEIGHTP,LanguageShift,ABETHNCP,AGEP,BNFNMEMP,CFINCP,CFSIZEP,CFSTATP,CITIZENP,CMAPUMFP,...,RCONDP,REGINP,ROOMP,SECGRADP,SEXP,TENURP,TOTINCP,UNITSP,VALUEP,WKSWKP
0,33.33,False,5,5,2,17,4,4,1,999,...,2,2,10,2,1,1,3,4,0,0
1,33.33,False,5,0,2,17,4,7,1,999,...,2,2,10,9,1,1,88,4,0,88
2,33.33,False,5,6,2,21,5,1,1,999,...,2,2,9,5,2,1,5,5,0,0
3,33.33,False,5,1,2,21,5,7,1,999,...,2,2,9,9,1,1,88,5,0,88
4,33.33,False,5,8,2,8,2,1,1,999,...,2,2,8,1,2,1,2,4,0,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809649,33.33,False,2,0,2,99,1,11,1,999,...,2,2,5,9,1,1,88,7,1,88
809650,33.33,False,2,3,2,12,4,4,1,999,...,9,2,5,1,1,2,1,4,88,88
809651,33.33,False,2,0,2,12,4,7,1,999,...,9,2,5,9,2,2,88,4,88,88
809652,33.33,False,2,0,2,12,4,7,1,999,...,9,2,5,9,2,2,88,4,88,88


In [62]:
def analyze_feature_value(dataset, feature):
    """Return the weighted share of positive language shifts per feature value.

    For every distinct value of ``feature``, the weights of the rows whose
    ``LanguageShift`` is true are summed and divided by the summed weights of
    all rows sharing that feature value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``LanguageShift`` column, the ``feature`` column and a
        weight column named ``WEIGHTP`` or ``WEIGHT`` (``WEIGHTP`` is used
        when present).
    feature : str
        Name of the column whose values are compared.

    Returns
    -------
    pandas.DataFrame
        Columns ``[feature, 'LanguageShift', <weight column>]`` with one row
        per feature value that has at least one positive-shift row.  The
        weight column holds the share described above, not a raw weight sum.
        Row labels are those of the intermediate (feature, LanguageShift)
        grouping, so they are not necessarily consecutive.

    Notes
    -----
    * Side effect: sets ``pd.options.display.float_format`` so floats are
      rendered with two decimals for the rest of the session.
    * A feature value whose total weight is zero produces NaN/inf in the
      share column instead of raising.
    """

    weight_name = 'WEIGHTP' if 'WEIGHTP' in dataset else 'WEIGHT'

    # Kept inside the function for backward compatibility: existing cells rely
    # on this call to get two-decimal rendering of the returned frame.
    pd.options.display.float_format = '{:.2f}'.format

    # Total weight per feature value (the denominator of the share).
    totals = dataset.groupby(feature)[weight_name].sum()

    # Weight per (feature value, LanguageShift) pair (the numerator).
    analyzed = dataset.groupby([feature, 'LanguageShift'], as_index=False)[weight_name].sum()

    # Vectorised normalisation: look up each row's denominator by feature value.
    analyzed[weight_name] = analyzed[weight_name] / analyzed[feature].map(totals)

    # The explicit comparison is deliberate: it keeps the same row selection
    # as before even if LanguageShift is not stored with a boolean dtype.
    return analyzed.loc[analyzed['LanguageShift'] == True]


analyze_feature_value(shift_1991, 'SEXP')

Unnamed: 0,SEXP,LanguageShift,WEIGHTP
1,1,True,0.07
3,2,True,0.08


## 2016 Full Feature List

```
Via Katie:


in analysis:
a comparison of langShift== true/langShift==False and ages of respondents [ theory: more young respondents in our data, leaving less time for language shift to occur, leaving imbalance in data towards languageShift == False]
a comparison of genstat and langShift [ theory: more genstat ==1 (ie. first gen immigrants) in sample, means more langshift == false]


Analyze generation status and age of immigration and age in general

Language shift correlates with many fields which correlate with wealth!
* E.g. Highest_Education_Location_vs_Residence

Immigration status and Aboriginal status matter little, but immigrant generation, group, Aboriginal group, etc. matter much more; this may be due to the dropping of certain fields, which was done in the plain 2016 but not the comparisons

Mother's place of birth matters much more than father's
```

### Age and Generation Status

Language shift is highest for second generation immigrants with both parents born outside of Canada.  It is very low for third generation immigrants.

Language shifts are more common in immigrants 25-49, and again at 80 and up.

In [63]:
analyze_feature_value(shift_2016_all, 'GENSTAT')

Unnamed: 0,GENSTAT,LanguageShift,WEIGHT
1,1,True,0.32
3,2,True,0.48
5,3,True,0.22
7,4,True,0.02
9,8,True,0.27


In [64]:
analyze_feature_value(shift_2016_all, 'AGEGRP')

Unnamed: 0,AGEGRP,LanguageShift,WEIGHT
1,1,True,0.09
3,2,True,0.12
5,3,True,0.13
7,4,True,0.16
9,5,True,0.16
11,6,True,0.18
13,7,True,0.17
15,8,True,0.19
17,9,True,0.22
19,10,True,0.22


### Inequality Related Fields

* LICO: No insight
* HHInc: Higher with low income households and high income households
* WRKACT: Higher with part time work except for the highest week bracket
* REPAIR: No insight
* ROOMS: Highest for households with few rooms
* SHELCO: Highest for high shelter costs
* LOC_ST_RES: High for students not living in the same province as their school
* HHMRKINC: Highest for high market income households

In [65]:
analyze_feature_value(shift_2016_all, 'LICO')

Unnamed: 0,LICO,LanguageShift,WEIGHT
1,1,True,0.19
3,2,True,0.19
5,8,True,0.31
7,9,True,0.29


In [66]:
analyze_feature_value(shift_2016_all, 'HHInc')

Unnamed: 0,HHInc,LanguageShift,WEIGHT
1,1,True,0.2
3,2,True,0.21
5,3,True,0.22
7,4,True,0.18
9,5,True,0.18
11,6,True,0.16
13,7,True,0.18
15,8,True,0.15
17,9,True,0.17
19,10,True,0.17


In [67]:
analyze_feature_value(shift_2016_all, 'WRKACT')

Unnamed: 0,WRKACT,LanguageShift,WEIGHT
1,1,True,0.19
3,2,True,0.21
5,3,True,0.19
7,4,True,0.21
9,5,True,0.18
11,6,True,0.21
13,7,True,0.18
15,8,True,0.2
17,9,True,0.19
19,10,True,0.2


In [68]:
analyze_feature_value(shift_2016_all, 'REPAIR')

Unnamed: 0,REPAIR,LanguageShift,WEIGHT
1,1,True,0.2
3,2,True,0.18
5,3,True,0.19
7,8,True,0.33


In [69]:
analyze_feature_value(shift_2016_all, 'ROOMS')

Unnamed: 0,ROOMS,LanguageShift,WEIGHT
1,1,True,0.29
3,2,True,0.26
5,3,True,0.21
7,4,True,0.17
9,5,True,0.17
11,6,True,0.19
13,7,True,0.19
15,8,True,0.19
17,9,True,0.19
19,10,True,0.2


In [70]:
analyze_feature_value(shift_2016_all, 'SHELCO')

Unnamed: 0,SHELCO,LanguageShift,WEIGHT
1,0,True,0.14
3,1,True,0.17
5,2,True,0.18
7,3,True,0.17
9,4,True,0.16
11,5,True,0.17
13,6,True,0.19
15,7,True,0.22
17,8,True,0.24
19,9,True,0.27


In [71]:
analyze_feature_value(shift_2016_all, 'LOC_ST_RES')

Unnamed: 0,LOC_ST_RES,LanguageShift,WEIGHT
1,1,True,0.19
3,2,True,0.29
5,3,True,0.27
7,4,True,0.18
9,9,True,0.13


In [72]:
analyze_feature_value(shift_2016_all, 'HHMRKINC')

Unnamed: 0,HHMRKINC,LanguageShift,WEIGHT
1,1,True,0.15
3,2,True,0.19
5,3,True,0.18
7,4,True,0.19
9,5,True,0.19
11,6,True,0.18
13,7,True,0.19
15,8,True,0.17
17,9,True,0.17
19,10,True,0.17


### Binary vs Detailed Fields

Immigrants who landed before 1980 swing IMMCAT5 massively.

Métis (especially) and Inuit rarely shift.

In [73]:
analyze_feature_value(shift_2016_all, 'IMMCAT5')

Unnamed: 0,IMMCAT5,LanguageShift,WEIGHT
1,1,True,0.1
3,2,True,0.52
5,3,True,0.28
7,21,True,0.28
9,22,True,0.26
11,23,True,0.26
13,88,True,0.34


In [74]:
analyze_feature_value(shift_2016_all, 'IMMSTAT')

Unnamed: 0,IMMSTAT,LanguageShift,WEIGHT
1,1,True,0.1
3,2,True,0.32
5,3,True,0.28
7,8,True,0.34


In [75]:
analyze_feature_value(shift_2016_all, 'BFNMEMB')

Unnamed: 0,BFNMEMB,LanguageShift,WEIGHT
1,0,True,0.19
3,1,True,0.34


In [76]:
analyze_feature_value(shift_2016_all, 'ABOID')

Unnamed: 0,ABOID,LanguageShift,WEIGHT
1,1,True,0.31
3,2,True,0.03
5,3,True,0.18
7,4,True,0.06
9,5,True,0.1
11,6,True,0.19


### Sex and Parental Sex

No need to examine categories, but do note how the mother matters more.  Link the source related to mothers.

In [77]:
analyze_feature_value(shift_2016_all, 'Sex')

Unnamed: 0,Sex,LanguageShift,WEIGHT
1,1,True,0.19
3,2,True,0.19


In [78]:
analyze_feature_value(shift_2016_all, 'POBM')

Unnamed: 0,POBM,LanguageShift,WEIGHT
1,1,True,0.03
3,2,True,0.32
5,3,True,0.53
7,4,True,0.22
9,5,True,0.27
11,6,True,0.29
13,8,True,0.43


In [79]:
analyze_feature_value(shift_2016_all, 'POBF')

Unnamed: 0,POBF,LanguageShift,WEIGHT
1,1,True,0.03
3,2,True,0.31
5,3,True,0.52
7,4,True,0.22
9,5,True,0.27
11,6,True,0.29
13,8,True,0.43


---

## 1991 vs 2016 Comparisons

```
Canadian citizenship status matters much more in 1991

Property value matters much more in 2016

Highest degree and secondary degree matter more in 2016, while highest attended matters more in 1991

Field of study matters much more in 2016
```

### Citizenship

In [80]:
analyze_feature_value(shift_1991, 'CITIZENP')

Unnamed: 0,CITIZENP,LanguageShift,WEIGHTP
1,1,True,0.04
3,2,True,0.3
5,3,True,0.15
7,4,True,0.19
9,8,True,0.26


In [81]:
analyze_feature_value(shift_2016, 'Citizen')

Unnamed: 0,Citizen,LanguageShift,WEIGHT
1,1,True,0.04
3,2,True,0.28
5,3,True,0.21


### Property Value

In [82]:
analyze_feature_value(shift_1991, 'VALUEP')

Unnamed: 0,VALUEP,LanguageShift,WEIGHTP
1,0,True,0.04
3,1,True,0.05
5,2,True,0.05
7,3,True,0.07
9,4,True,0.09
11,5,True,0.14
13,6,True,0.12
15,88,True,0.06


In [83]:
analyze_feature_value(shift_2016, 'VALUE')

Unnamed: 0,VALUE,LanguageShift,WEIGHT
1,0,True,0.04
3,1,True,0.05
5,2,True,0.07
7,3,True,0.09
9,4,True,0.11
11,5,True,0.13
13,6,True,0.15
15,7,True,0.17
17,88,True,0.1


### Degrees vs Attendance

In [84]:
analyze_feature_value(shift_1991, 'DGREEP')

Unnamed: 0,DGREEP,LanguageShift,WEIGHTP
1,1,True,0.08
3,2,True,0.07
5,3,True,0.1
7,4,True,0.09
9,5,True,0.09
11,6,True,0.1
13,7,True,0.11
15,8,True,0.13
17,9,True,0.13
19,10,True,0.16


In [85]:
analyze_feature_value(shift_2016, 'HDGREE')

Unnamed: 0,HDGREE,LanguageShift,WEIGHT
1,1,True,0.09
3,2,True,0.08
5,3,True,0.07
7,4,True,0.08
9,5,True,0.08
11,6,True,0.08
13,7,True,0.1
15,8,True,0.12
17,9,True,0.13
19,10,True,0.14


In [86]:
analyze_feature_value(shift_1991, 'SECGRADP')

Unnamed: 0,SECGRADP,LanguageShift,WEIGHTP
1,1,True,0.08
3,2,True,0.11
5,3,True,0.09
7,4,True,0.07
9,5,True,0.1
11,9,True,0.03


In [87]:
analyze_feature_value(shift_2016, 'SSGRAD')

Unnamed: 0,SSGRAD,LanguageShift,WEIGHT
1,1,True,0.09
3,2,True,0.09
5,3,True,0.08
7,4,True,0.08
9,5,True,0.07
11,6,True,0.09
13,7,True,0.12
15,8,True,0.13
17,9,True,0.14
19,10,True,0.16


In [88]:
analyze_feature_value(shift_1991, 'HLOSP')

Unnamed: 0,HLOSP,LanguageShift,WEIGHTP
1,1,True,0.08
3,2,True,0.1
5,3,True,0.07
7,4,True,0.07
9,5,True,0.08
11,6,True,0.09
13,7,True,0.11
15,8,True,0.09
17,9,True,0.09
19,10,True,0.1


In [89]:
analyze_feature_value(shift_2016, 'ATTSCH')

Unnamed: 0,ATTSCH,LanguageShift,WEIGHT
1,1,True,0.1
3,2,True,0.08
5,3,True,0.1
7,4,True,0.13
9,5,True,0.13
11,8,True,0.13
13,9,True,0.07


### Field of Study

In [90]:
analyze_feature_value(shift_1991, 'DGMFSP')

Unnamed: 0,DGMFSP,LanguageShift,WEIGHTP
1,1,True,0.08
3,2,True,0.11
5,3,True,0.09
7,4,True,0.09
9,5,True,0.11
11,6,True,0.08
13,7,True,0.12
15,8,True,0.16
17,9,True,0.1
19,10,True,0.08


In [91]:
analyze_feature_value(shift_2016, 'CIP2011')

Unnamed: 0,CIP2011,LanguageShift,WEIGHT
1,1,True,0.09
3,2,True,0.1
5,3,True,0.11
7,4,True,0.12
9,5,True,0.12
11,6,True,0.14
13,7,True,0.15
15,8,True,0.1
17,9,True,0.06
19,10,True,0.1
