In [17]:
import re

import numpy as np
import pandas as pd
from sklearn.linear_model import MultiTaskLasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler

# `sklearn.externals.joblib` was removed in scikit-learn 0.23; prefer the
# standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Fixed seed, passed as `random_state` to the train/test split and to the
# estimator further down so that reruns are reproducible.
RSEED = 100

# Part 1: Data Processing
### Reading and processing the data

In [2]:
# Load the cleaned cancer/industry table; the path is relative to the
# notebook's working directory.
data = pd.read_csv("./data_clean/cancer_industry.csv")

In [3]:
# Show the column labels of the loaded frame.
data.columns

Index(['locale', 'fips', 'areatype', 'cancer', 'stateFIPS', 'state',
       'cancer_description', 'annual_count_avg', 'incidence rate_per_100000',
       'incidence rate_lower_95_confidence',
       'incidence rate_upper_95_confidence', 'ACID', 'ENRG', 'ETOX', 'EUTR',
       'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX', 'JOBS',
       'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN', 'SMOG',
       'VADD', 'WATR'],
      dtype='object')

In [4]:
# Labels of the 24 indicator columns, in the same order as they appear in
# `data.columns`.
_factor_labels = [
    'ACID', 'ENRG', 'ETOX', 'EUTR',
    'FOOD', 'GCC', 'HAPS', 'HAZW',
    'HC', 'HNC', 'HRSP', 'HTOX',
    'JOBS', 'LAND', 'METL', 'MINE',
    'MSW', 'NREN', 'OZON', 'PEST',
    'REN', 'SMOG', 'VADD', 'WATR',
]
factors = np.array(_factor_labels)

In [5]:
# Display the loaded frame (the rendered output is truncated by pandas).
data

Unnamed: 0,locale,fips,areatype,cancer,stateFIPS,state,cancer_description,annual_count_avg,incidence rate_per_100000,incidence rate_lower_95_confidence,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,0.000177,0.000000,3.332971,1558.288943,8.205763e-06,0.000069,1050.867678,9.764524,168.936475,27.485562
1,"Autauga County(6,10)",1001,county,3,1,alabama,Oral Cavity & Pharynx,8,13.5,9.6,...,0.000177,0.000000,3.332971,1558.288943,8.205763e-06,0.000069,1050.867678,9.764524,168.936475,27.485562
2,"Autauga County(6,10)",1001,county,17,1,alabama,Esophagus,0,0.0,0.0,...,0.000177,0.000000,3.332971,1558.288943,8.205763e-06,0.000069,1050.867678,9.764524,168.936475,27.485562
3,"Autauga County(6,10)",1001,county,18,1,alabama,Stomach,6,9.1,6.0,...,0.000177,0.000000,3.332971,1558.288943,8.205763e-06,0.000069,1050.867678,9.764524,168.936475,27.485562
4,"Autauga County(6,10)",1001,county,20,1,alabama,Colon & Rectum,32,52.2,44.2,...,0.000177,0.000000,3.332971,1558.288943,8.205763e-06,0.000069,1050.867678,9.764524,168.936475,27.485562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72054,"Weston County(6,10)",56045,county,86,56,wyoming,Non-Hodgkin Lymphoma,0,0.0,0.0,...,0.000002,31.285543,0.491881,1971.904410,8.367255e-07,0.000002,0.159930,3.642929,28.875893,0.122450
72055,"Weston County(6,10)",56045,county,90,56,wyoming,Leukemia,0,0.0,0.0,...,0.000002,31.285543,0.491881,1971.904410,8.367255e-07,0.000002,0.159930,3.642929,28.875893,0.122450
72056,"Weston County(6,10)",56045,county,400,56,wyoming,Breast (in situ) (Female),0,0.0,0.0,...,0.000002,31.285543,0.491881,1971.904410,8.367255e-07,0.000002,0.159930,3.642929,28.875893,0.122450
72057,"Weston County(6,10)",56045,county,515,56,wyoming,"Childhood (Ages <20, All Sites)",0,0.0,0.0,...,0.000002,31.285543,0.491881,1971.904410,8.367255e-07,0.000002,0.159930,3.642929,28.875893,0.122450


In [6]:
# Force the incidence-rate column to float64, overwriting it in place on `data`.
_rate_col = 'incidence rate_per_100000'
data[_rate_col] = data[_rate_col].astype("float64")

### Getting the various cancers and their description

In [7]:
# Distinct numeric cancer codes present in the "cancer" column.
cancerVals=data["cancer"].unique()
cancerVals

array([  1,   3,  17,  18,  20,  35,  40,  47,  53,  55,  57,  58,  61,
        66,  71,  72,  76,  80,  86,  90, 400, 515, 516])

In [8]:
# Distinct human-readable cancer names present in "cancer_description".
cancerDescr=data["cancer_description"].unique()
cancerDescr

array(['All Cancer Sites', 'Oral Cavity & Pharynx', 'Esophagus',
       'Stomach', 'Colon & Rectum', 'Liver & Bile Duct', 'Pancreas',
       'Lung & Bronchus', 'Melanoma of the Skin', 'Breast (Female)',
       'Cervix (Female)', 'Uterus (Corpus & Uterus, NOS) (Female)',
       'Ovary (Female)', 'Prostate (Male)', 'Bladder',
       'Kidney & Renal Pelvis', 'Brain & ONS', 'Thyroid',
       'Non-Hodgkin Lymphoma', 'Leukemia', 'Breast (in situ) (Female)',
       'Childhood (Ages <20, All Sites)',
       'Childhood (Ages <15, All Sites)'], dtype=object)

In [9]:
# Build the lookup table of (cancer code, description) pairs.
cancerDescription = data[["cancer", "cancer_description"]]
# De-duplicate directly instead of grouping on every column and calling
# .sum() on an empty set of value columns. dropna() + sort_values() mirror
# groupby's defaults (NaN keys dropped, groups sorted by key), so the
# resulting frame is the same.
cancerTypes = (
    cancerDescription
    .dropna()
    .drop_duplicates()
    .sort_values(["cancer", "cancer_description"])
    .reset_index(drop=True)
)
print(cancerTypes)

cancer                      cancer_description
0        1                        All Cancer Sites
1        3                   Oral Cavity & Pharynx
2       17                               Esophagus
3       18                                 Stomach
4       20                          Colon & Rectum
5       35                       Liver & Bile Duct
6       40                                Pancreas
7       47                         Lung & Bronchus
8       53                    Melanoma of the Skin
9       55                         Breast (Female)
10      57                         Cervix (Female)
11      58  Uterus (Corpus & Uterus, NOS) (Female)
12      61                          Ovary (Female)
13      66                         Prostate (Male)
14      71                                 Bladder
15      72                   Kidney & Renal Pelvis
16      76                             Brain & ONS
17      80                                 Thyroid
18      86                    Non-H

### Reading and integrating with the average count of various cancer types per county

In [10]:
# Load the per-county average counts. In the displayed output each row is
# one `fips` and the remaining column labels are the cancer codes.
data2 = pd.read_csv("./data_clean/avg_count_per_cancer_per_fips.csv")
data2

Unnamed: 0,fips,1,3,17,18,20,35,40,47,53,...,66,71,72,76,80,86,90,400,515,516
0,1001,304,8,0,6,32,6,8,45,18,...,32,11,13,4,7,9,8,8,3,0
1,1003,1189,40,14,13,108,22,34,199,81,...,121,56,44,16,14,44,32,36,11,7
2,1005,145,7,0,0,14,0,4,23,4,...,22,6,7,0,0,5,0,4,0,0
3,1007,128,4,0,0,12,0,4,21,3,...,16,4,7,0,4,4,3,3,0,0
4,1009,317,8,0,5,28,5,7,52,15,...,38,12,13,4,6,12,10,10,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3128,56037,162,6,0,0,13,0,5,18,9,...,25,8,6,4,5,7,5,4,0,0
3129,56039,98,0,0,0,6,0,0,4,12,...,17,5,0,0,3,6,4,4,0,0
3130,56041,78,0,0,0,6,0,0,7,6,...,8,4,3,0,4,4,0,0,0,0
3131,56043,44,0,0,0,5,0,0,5,0,...,5,0,0,0,0,0,0,0,0,0


In [11]:
# Re-display the column labels of `data` before selecting the feature columns.
data.columns

Index(['locale', 'fips', 'areatype', 'cancer', 'stateFIPS', 'state',
       'cancer_description', 'annual_count_avg', 'incidence rate_per_100000',
       'incidence rate_lower_95_confidence',
       'incidence rate_upper_95_confidence', 'ACID', 'ENRG', 'ETOX', 'EUTR',
       'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX', 'JOBS',
       'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN', 'SMOG',
       'VADD', 'WATR'],
      dtype='object')

In [12]:

# Select the rows for one cancer type and keep only the county id plus the
# indicator columns. Code 1 is "All Cancer Sites" in the code/description
# table printed earlier in this notebook.
ALL_SITES_CANCER_CODE = 1
cancerData = data["cancer"] == ALL_SITES_CANCER_CODE  # boolean row mask
# Reuse `factors` (defined in an earlier cell) instead of repeating the same
# 24 column names by hand, and select rows and columns in a single .loc call.
newData = data.loc[cancerData, ['fips'] + factors.tolist()]
newData

Unnamed: 0,fips,ACID,ENRG,ETOX,EUTR,FOOD,GCC,HAPS,HAZW,HC,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,1001,1.049800,2609.156621,3.932211,0.091967,1.070762,375.880086,0.028596,0.038196,1.392760e-08,...,0.000177,0.000000,3.332971,1558.288943,8.205763e-06,6.937696e-05,1050.867678,9.764524,168.936475,27.485562
23,1003,0.280475,15.561882,7.987962,0.290754,7.601684,121.748265,0.062129,0.932868,5.461040e-08,...,0.000130,44.123480,28.367361,12.236171,1.025729e-05,1.910217e-04,3.325711,9.430919,1620.234233,4.251408
46,1005,0.210885,0.010255,4.445820,0.027651,0.184201,41.021013,0.014290,0.006247,3.207004e-08,...,0.000007,0.000000,1.000043,0.000000,2.608045e-07,5.804013e-05,0.010255,7.502860,65.168132,0.701977
69,1007,0.059937,30.656456,2.418462,0.009070,0.112118,11.566748,0.004779,0.007600,9.819009e-09,...,0.000010,0.000000,0.520036,0.000000,8.704410e-07,3.238827e-05,30.656456,2.133152,24.050294,0.384009
92,1009,0.066937,0.017388,0.668729,0.008988,0.313152,13.374711,0.006907,0.014673,1.074744e-08,...,0.000009,0.000000,1.277899,0.000000,4.780418e-06,1.113430e-05,0.017388,2.409639,60.574602,0.160868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71944,56037,1.288242,6846.474113,1.671191,0.242657,1.548135,468.448494,0.094393,0.074211,2.360126e-07,...,0.000158,3197.779327,6.811835,6844.091086,6.462275e-06,1.788926e-05,2.383027,41.472524,654.507956,15.008485
71967,56039,0.083954,1.400183,17.679180,0.056704,3.572117,47.566459,0.090532,0.267253,3.504199e-08,...,0.000049,0.010181,12.942471,0.656189,5.078643e-06,4.713616e-04,0.743994,3.222444,721.506003,4.511225
71990,56041,0.192684,3509.922266,0.227323,0.027889,0.588789,62.957404,0.020252,0.015144,3.722919e-08,...,0.000022,730.743964,2.413328,3509.893777,1.139964e-06,1.078417e-06,0.028489,6.938709,202.008187,1.091574
72013,56043,0.084944,1652.835530,0.073477,0.012313,0.141232,17.695252,0.008530,0.004346,1.659289e-08,...,0.000001,0.000000,0.774102,1652.832375,3.911731e-07,5.269024e-10,0.003155,3.325439,51.721583,0.083898


In [13]:
# Inner-join the per-county feature rows with the per-county counts on `fips`;
# counties missing from either table are dropped by the inner join.
df_inner = newData.merge(data2,on='fips', how='inner')

In [14]:
# Display the merged frame (features followed by the per-cancer count columns).
df_inner

Unnamed: 0,fips,ACID,ENRG,ETOX,EUTR,FOOD,GCC,HAPS,HAZW,HC,...,66,71,72,76,80,86,90,400,515,516
0,1001,1.049800,2609.156621,3.932211,0.091967,1.070762,375.880086,0.028596,0.038196,1.392760e-08,...,32,11,13,4,7,9,8,8,3,0
1,1003,0.280475,15.561882,7.987962,0.290754,7.601684,121.748265,0.062129,0.932868,5.461040e-08,...,121,56,44,16,14,44,32,36,11,7
2,1005,0.210885,0.010255,4.445820,0.027651,0.184201,41.021013,0.014290,0.006247,3.207004e-08,...,22,6,7,0,0,5,0,4,0,0
3,1007,0.059937,30.656456,2.418462,0.009070,0.112118,11.566748,0.004779,0.007600,9.819009e-09,...,16,4,7,0,4,4,3,3,0,0
4,1009,0.066937,0.017388,0.668729,0.008988,0.313152,13.374711,0.006907,0.014673,1.074744e-08,...,38,12,13,4,6,12,10,10,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3128,56037,1.288242,6846.474113,1.671191,0.242657,1.548135,468.448494,0.094393,0.074211,2.360126e-07,...,25,8,6,4,5,7,5,4,0,0
3129,56039,0.083954,1.400183,17.679180,0.056704,3.572117,47.566459,0.090532,0.267253,3.504199e-08,...,17,5,0,0,3,6,4,4,0,0
3130,56041,0.192684,3509.922266,0.227323,0.027889,0.588789,62.957404,0.020252,0.015144,3.722919e-08,...,8,4,3,0,4,4,0,0,0,0
3131,56043,0.084944,1652.835530,0.073477,0.012313,0.141232,17.695252,0.008530,0.004346,1.659289e-08,...,5,0,0,0,0,0,0,0,0,0


In [15]:
# Column order matters here: the modelling cell slices features and targets
# by label range ('ACID':'WATR' and '1':'516').
df_inner.columns

Index(['fips', 'ACID', 'ENRG', 'ETOX', 'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW',
       'HC', 'HNC', 'HRSP', 'HTOX', 'JOBS', 'LAND', 'METL', 'MINE', 'MSW',
       'NREN', 'OZON', 'PEST', 'REN', 'SMOG', 'VADD', 'WATR', '1', '3', '17',
       '18', '20', '35', '40', '47', '53', '55', '57', '58', '61', '66', '71',
       '72', '76', '80', '86', '90', '400', '515', '516'],
      dtype='object')

# Final Model

### Multitask Lasso

In [16]:
# Split into train/test, standardise the features, fit the model and score it.
# Features are the indicator columns and targets are the per-cancer count
# columns; both are label slices, so they rely on the column order of df_inner.
x1 = df_inner.loc[:, 'ACID':'WATR']
y = df_inner.loc[:, '1':'516']

# Hold out 30% of the rows for evaluation; RSEED makes the split repeatable.
X_train1, X_test1, y_train, y_test = train_test_split(
    x1, y, test_size=0.3, random_state=RSEED
)

# Fit the scaler on the training rows only, so no test-set statistics leak
# into the features, then apply the same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train1)
X_train = scaler.transform(X_train1)
X_test = scaler.transform(X_test1)

# The estimator is a MultiTaskLasso fitted jointly on all target columns;
# the log line previously mislabelled it as an Elastic Net.
print("Creating MultiTaskLasso Model and Fitting the Data")
lasso = MultiTaskLasso(random_state=RSEED).fit(X_train, y_train)

# Evaluation: the estimator's score() on the held-out rows.
r2_score = lasso.score(X_test, y_test)
print("score", r2_score)


Creating Elastic Net Model and Fitting the Data
score 0.8959449771883731


In [21]:
# Persist the fitted model for the choropleth visualisation resources.
# NOTE(review): only `lasso` is saved. It was fitted on StandardScaler-
# transformed features, and the fitted `scaler` is not written anywhere in
# this notebook — confirm that whatever loads this file applies the same scaling.
joblib.dump(lasso, './data_viz/national_choropleth/resources/Model2.sav')

['./data_viz/national_choropleth/resources/Model2.sav']