In [43]:
import numpy as np
import pandas as pd
# Setting random seed to get reproducible runs
RSEED = 100

# Importing dataset and processing it

In [63]:
df = pd.read_csv("../data_clean/cancer_industry.csv")

In [64]:
df.head()

Unnamed: 0,locale,fips,areatype,cancer,stateFIPS,state,cancer_description,annual_count_avg,incidence rate_per_100000,incidence rate_lower_95_confidence,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,8.74938e-08,0.0,0.004263,0.0,5.00534e-10,1.61719e-05,0.000365,0.026608,0.869459,0.180875
1,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,0.000168707,0.0,0.084219,1558.288943,6.75546e-06,2.14853e-08,1050.804066,8.594629,42.953215,26.7619
4,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
df.columns

Index(['locale', 'fips', 'areatype', 'cancer', 'stateFIPS', 'state',
       'cancer_description', 'annual_count_avg', 'incidence rate_per_100000',
       'incidence rate_lower_95_confidence',
       'incidence rate_upper_95_confidence', 'county', 'name', 'industry_code',
       'industry_detail', 'relevant_naics', 'payann', 'total_compensation',
       'added_value ($)', 'local_tranformation_ind', 'estab', 'emp', 'ACID',
       'ENRG', 'ETOX', 'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC',
       'HRSP', 'HTOX', 'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON',
       'PEST', 'REN', 'SMOG', 'VADD', 'WATR'],
      dtype='object')

In [66]:
df.shape

(7030318, 46)

## Only keeping the values that we are interested in

### Keeping general statistics for all types of cancer

In [67]:
dataset = df[df['cancer'] == 1].copy()
dataset.shape

(305666, 46)

### Keeping the columns that we are interested in

In [68]:
# for now we drop the 'fips' column because we supose that the effect of having an idustry present in a certain
# area will be the same in any location
dataset.drop(['locale', 'fips','areatype', 'cancer', 'stateFIPS', 'state',
              'cancer_description', 'annual_count_avg', 'incidence rate_per_100000',
              'incidence rate_lower_95_confidence','incidence rate_upper_95_confidence',
              'industry_detail', 'relevant_naics','county', 'name'], axis=1, inplace=True)
dataset.head()

Unnamed: 0,industry_code,payann,total_compensation,added_value ($),local_tranformation_ind,estab,emp,ACID,ENRG,ETOX,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,113000,1.102,6363,8448,0.001463,6,30,0.001144,0.000365,1.186567,...,8.74938e-08,0.0,0.004263,0.0,5.00534e-10,1.61719e-05,0.000365,0.026608,0.869459,0.180875
1,212310,0.0,2385,8559,0.0,1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2123a0,0.0,3190,8450,0.0,1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,221100,16.657,54469,195047,0.059647,8,182,1.017563,2609.093009,0.732817,...,0.000168707,0.0,0.084219,1558.288943,6.75546e-06,2.14853e-08,1050.804066,8.594629,42.953215,26.7619
4,221300,0.0,3991,8132,0.0,1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Now we want to predict the 24 environemental factors from varialbles such as fips, pay_ann, total_compensation, added_value, #estab, #emp

## First we have to create one-hot vectors for idustry_codes and fips values

In [69]:
from sklearn.preprocessing import OneHotEncoder

# Creating One-Hot Encoder
enc = OneHotEncoder(handle_unknown='ignore')
enc_df = pd.DataFrame(enc.fit_transform(dataset[['industry_code']]).toarray())

In [70]:
# merge with main df dataset on key values
dataset = dataset.join(enc_df)
enc_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,325,326,327,328,329,330,331,332,333,334
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
len(dataset['industry_code'].unique())

335

In [73]:
dataset.columns = dataset.columns.astype(str)

In [74]:
dataset.loc[:, '0':]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,325,326,327,328,329,330,331,332,333,334
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7029059,,,,,,,,,,,...,,,,,,,,,,
7029060,,,,,,,,,,,...,,,,,,,,,,
7029061,,,,,,,,,,,...,,,,,,,,,,
7029062,,,,,,,,,,,...,,,,,,,,,,


In [77]:
x_bis = pd.concat([dataset.loc[:, :'emp'], dataset.loc[:, '0':]], axis=1)

In [76]:
x_bis.tail()

Unnamed: 0,industry_code,payann,total_compensation,added_value ($),local_tranformation_ind,estab,emp,0,1,2,...,325,326,327,328,329,330,331,332,333,334
7029059,812200,0.0,30961,46819,0.0,1,0,,,,...,,,,,,,,,,
7029060,812300,0.0,7516,11735,0.0,1,0,,,,...,,,,,,,,,,
7029061,813100,0.177,15995,34235,0.000379,8,15,,,,...,,,,,,,,,,
7029062,813a00,0.0,12255,21646,0.0,1,0,,,,...,,,,,,,,,,
7029063,813b00,0.0,22449,29800,0.0,2,0,,,,...,,,,,,,,,,


## Partitionning our dataset into train and test sets

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

x1 = dataset.loc[:, :'emp' + '0':]
y = dataset.loc[:, 'ACID':'WATR']

X_train1, X_test1, y_train, y_test = train_test_split(x1, y,
                                                    test_size=0.3,
                                                    random_state = RSEED)

scaler = StandardScaler()
scaler.fit(X_train1)

X_train = scaler.transform(X_train1)
X_test = scaler.transform(X_test1)

print("XTrain",X_train.shape)
print("XTest",X_test.shape)

ValueError: could not convert string to float: '531ore'

## Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Create the model with 100 trees
regressor = RandomForestRegressor(n_estimators = 100,
                                  random_state = RSEED)

# Fit on training data
regressor.fit(X_train, y_train)

