In [1]:
import numpy as np
import pandas as pd
# Setting random seed to get reproducible runs
RSEED = 100

# Importing dataset and processing it

In [2]:
df = pd.read_csv("../data_clean/cancer_industry.csv")

In [3]:
df.head()

Unnamed: 0,locale,fips,areatype,cancer,stateFIPS,state,cancer_description,annual_count_avg,incidence rate_per_100000,incidence rate_lower_95_confidence,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,8.74938e-08,0.0,0.004263,0.0,5.00534e-10,1.61719e-05,0.000365,0.026608,0.869459,0.180875
1,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,0.000168707,0.0,0.084219,1558.288943,6.75546e-06,2.14853e-08,1050.804066,8.594629,42.953215,26.7619
4,"Autauga County(6,10)",1001,county,1,1,alabama,All Cancer Sites,304,495.6,470.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df.columns

Index(['locale', 'fips', 'areatype', 'cancer', 'stateFIPS', 'state',
       'cancer_description', 'annual_count_avg', 'incidence rate_per_100000',
       'incidence rate_lower_95_confidence',
       'incidence rate_upper_95_confidence', 'county', 'name', 'industry_code',
       'industry_detail', 'relevant_naics', 'payann', 'total_compensation',
       'added_value ($)', 'local_tranformation_ind', 'estab', 'emp', 'ACID',
       'ENRG', 'ETOX', 'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC',
       'HRSP', 'HTOX', 'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON',
       'PEST', 'REN', 'SMOG', 'VADD', 'WATR'],
      dtype='object')

In [5]:
df.shape

(7030318, 46)

## Only keeping the values that we are interested in

### Keeping general statistics for all types of cancer

In [6]:
dataset = df[df['cancer'] == 1].copy()
dataset.shape

(305666, 46)

### Keeping the columns that we are interested in

In [7]:
# for now we drop the 'fips' column because we supose that the effect of having an idustry present in a certain
# area will be the same in any location
dataset.drop(['locale', 'fips','areatype', 'cancer', 'stateFIPS', 'state',
              'cancer_description', 'annual_count_avg', 'incidence rate_per_100000',
              'incidence rate_lower_95_confidence','incidence rate_upper_95_confidence',
              'industry_detail', 'relevant_naics','county', 'name', 'local_tranformation_ind'], axis=1, inplace=True)

In [8]:
dataset.tail()

Unnamed: 0,industry_code,payann,total_compensation,added_value ($),estab,emp,ACID,ENRG,ETOX,EUTR,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
7029059,812200,0.0,30961,46819,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7029060,812300,0.0,7516,11735,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7029061,813100,0.177,15995,34235,8,15,1e-06,0.0,1e-06,1.47257e-07,...,2.58983e-11,0.0,0.001015,0.0,0.0,0.0,0.0,4.9e-05,0.108083,0.000396
7029062,813a00,0.0,12255,21646,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7029063,813b00,0.0,22449,29800,2,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Now we want to predict the 24 environemental factors from varialbles such as fips, pay_ann, total_compensation, added_value, #estab, #emp

## First we have to create one-hot vectors for idustry_codes and fips values

In [9]:
#initializing 1-hot vectors to 0
for ind_code in dataset["industry_code"].unique():
    dataset[ind_code] = 0
    dataset[ind_code] = dataset[ind_code].astype(np.uint8)

In [10]:
dataset.tail()

Unnamed: 0,industry_code,payann,total_compensation,added_value ($),estab,emp,ACID,ENRG,ETOX,EUTR,...,322291,311230,332913,334300,335221,325413,333991,311221,335110,335224
7029059,812200,0.0,30961,46819,1,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7029060,812300,0.0,7516,11735,1,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7029061,813100,0.177,15995,34235,8,15,1e-06,0.0,1e-06,1.47257e-07,...,0,0,0,0,0,0,0,0,0,0
7029062,813a00,0.0,12255,21646,1,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7029063,813b00,0.0,22449,29800,2,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
def set_one_hot(row):
    row[row['industry_code']] = 1
    return row

In [12]:
dataset = dataset.apply(set_one_hot, axis=1)

In [13]:
len(dataset['industry_code'].unique())

335

In [14]:
dataset.tail()

Unnamed: 0,industry_code,payann,total_compensation,added_value ($),estab,emp,ACID,ENRG,ETOX,EUTR,...,322291,311230,332913,334300,335221,325413,333991,311221,335110,335224
7029059,812200,0.0,30961,46819,1,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7029060,812300,0.0,7516,11735,1,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7029061,813100,0.177,15995,34235,8,15,1e-06,0.0,1e-06,1.47257e-07,...,0,0,0,0,0,0,0,0,0,0
7029062,813a00,0.0,12255,21646,1,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7029063,813b00,0.0,22449,29800,2,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
#print(list(dataset.columns))

## Partitionning our dataset into train and test sets

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

x1 = pd.concat([dataset.loc[:, 'payann':'emp'], dataset.loc[:, '113000':]], axis=1)
y = dataset.loc[:, 'ACID':'WATR']

X_train1, X_test1, y_train, y_test = train_test_split(x1, y,
                                                    test_size=0.3,
                                                    random_state = RSEED)

scaler = StandardScaler()
scaler.fit(X_train1)

X_train = scaler.transform(X_train1)
X_test = scaler.transform(X_test1)

print("XTrain",X_train.shape)
print("XTest",X_test.shape)

XTrain (213966, 340)
XTest (91700, 340)


## Elastic Net

### Train on scaled data

In [21]:
from sklearn.linear_model import ElasticNet
# Create the model with 100 trees
eNet = ElasticNet(random_state=RSEED)


# Fit on training data
eNet.fit(X_train, y_train)



ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=100, selection='cyclic', tol=0.0001, warm_start=False)

In [22]:
from sklearn import metrics
train_rf_predictions = eNet.predict(X_train)
rf_predictions = eNet.predict(X_test)
MAE_train = metrics.mean_absolute_error(y_train, train_rf_predictions)
RMSE_train = np.sqrt(metrics.mean_squared_error(y_train, train_rf_predictions))
MAPE_train = 100 * np.mean(abs(train_rf_predictions - y_train)/abs(y_train))
accuracy_train = 100 - MAPE_train
r2_train = metrics.r2_score(y_train, train_rf_predictions)

MAE_test = metrics.mean_absolute_error(y_test, rf_predictions)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, rf_predictions))
MAPE_test = 100 * np.mean(abs(rf_predictions - y_test)/abs(y_test))
accuracy_test = 100 - MAPE_test
r2_test = metrics.r2_score(y_test, rf_predictions)
#print('Mean Absolute Error Train:', MAE_train)    
#print('Root Mean Squared Error Train:', RMSE_train)
#print('Mean Absolute Percentage Error Train:', MAPE_train)
print('r2_score Train:',r2_train)
print()

print('r2_score Test:', r2_test)

r2_score Train: 0.03916850896583704

r2_score Test: 0.046095248912788524


In [23]:
from sklearn.model_selection import cross_val_score
cross_val_score(eNet, scaler.transform(x1), y, cv=5, scoring='r2')

array([ 0.04583036,  0.04470354, -0.02553687,  0.04412511,  0.04308019])

In [21]:
n_nodes = []
max_depths = []

for ind_tree in regressor.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 83380
Average maximum depth 88


## Ridge

In [35]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)

ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [36]:
from sklearn import metrics
train_rf_predictions = ridge.predict(X_train)
rf_predictions = ridge.predict(X_test)
MAE_train = metrics.mean_absolute_error(y_train, train_rf_predictions)
RMSE_train = np.sqrt(metrics.mean_squared_error(y_train, train_rf_predictions))
MAPE_train = 100 * np.mean(abs(train_rf_predictions - y_train)/abs(y_train))
accuracy_train = 100 - MAPE_train
r2_train = metrics.r2_score(y_train, train_rf_predictions)

MAE_test = metrics.mean_absolute_error(y_test, rf_predictions)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, rf_predictions))
MAPE_test = 100 * np.mean(abs(rf_predictions - y_test)/abs(y_test))
accuracy_test = 100 - MAPE_test
r2_test = metrics.r2_score(y_test, rf_predictions)
#print('Mean Absolute Error Train:', MAE_train)    
#print('Root Mean Squared Error Train:', RMSE_train)
#print('Mean Absolute Percentage Error Train:', MAPE_train)
print('r2_score Train:',r2_train)
print()

print('r2_score Test:', r2_test)

r2_score Train: 0.11104523850868415

r2_score Test: 0.1115001221173852


In [32]:
from sklearn.model_selection import cross_val_score
cross_val_score(ridge, scaler.transform(x1), y, cv=5, scoring='r2')

array([ 0.11880062,  0.109211  , -0.45352566,  0.11607897,  0.11526357])

In [33]:
from sklearn.model_selection import GridSearchCV
parameters={'alpha':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]}
rf = Ridge(random_state = RSEED)
clf = GridSearchCV(rf, parameters)
clf.fit(X_train, y_train)









GridSearchCV(cv=None, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=100,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                                   1.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [34]:
print(clf.best_params_)

{'alpha': 1.0}


## Lasso

In [39]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.001, max_iter=10000)
lasso.fit(X_train, y_train)

  positive)
  positive)


Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=10000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [40]:
from sklearn import metrics
train_rf_predictions = lasso.predict(X_train)
rf_predictions = lasso.predict(X_test)
MAE_train = metrics.mean_absolute_error(y_train, train_rf_predictions)
RMSE_train = np.sqrt(metrics.mean_squared_error(y_train, train_rf_predictions))
MAPE_train = 100 * np.mean(abs(train_rf_predictions - y_train)/abs(y_train))
accuracy_train = 100 - MAPE_train
r2_train = metrics.r2_score(y_train, train_rf_predictions)

MAE_test = metrics.mean_absolute_error(y_test, rf_predictions)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, rf_predictions))
MAPE_test = 100 * np.mean(abs(rf_predictions - y_test)/abs(y_test))
accuracy_test = 100 - MAPE_test
r2_test = metrics.r2_score(y_test, rf_predictions)
#print('Mean Absolute Error Train:', MAE_train)    
#print('Root Mean Squared Error Train:', RMSE_train)
#print('Mean Absolute Percentage Error Train:', MAPE_train)
print('r2_score Train:',r2_train)
print()

print('r2_score Test:', r2_test)

r2_score Train: 0.09344076708019568

r2_score Test: 0.09904520298051202


In [41]:
from sklearn.model_selection import cross_val_score
cross_val_score(lasso, scaler.transform(x1), y, cv=5, scoring='r2')

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


array([ 0.09835822,  0.09254016, -0.41685174,  0.09535084,  0.10148581])

## Lasso Lars

In [42]:
from sklearn.linear_model import LassoLars

lasso_lars = LassoLars(alpha=0.01)
lasso_lars.fit(X_train, y_train)

LassoLars(alpha=0.01, copy_X=True, eps=2.220446049250313e-16,
          fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
          positive=False, precompute='auto', verbose=False)

In [43]:
from sklearn import metrics
train_rf_predictions = lasso_lars.predict(X_train)
rf_predictions = lasso_lars.predict(X_test)
MAE_train = metrics.mean_absolute_error(y_train, train_rf_predictions)
RMSE_train = np.sqrt(metrics.mean_squared_error(y_train, train_rf_predictions))
MAPE_train = 100 * np.mean(abs(train_rf_predictions - y_train)/abs(y_train))
accuracy_train = 100 - MAPE_train
r2_train = metrics.r2_score(y_train, train_rf_predictions)

MAE_test = metrics.mean_absolute_error(y_test, rf_predictions)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, rf_predictions))
MAPE_test = 100 * np.mean(abs(rf_predictions - y_test)/abs(y_test))
accuracy_test = 100 - MAPE_test
r2_test = metrics.r2_score(y_test, rf_predictions)
#print('Mean Absolute Error Train:', MAE_train)    
#print('Root Mean Squared Error Train:', RMSE_train)
#print('Mean Absolute Percentage Error Train:', MAPE_train)
print('r2_score Train:',r2_train)
print()

print('r2_score Test:', r2_test)

r2_score Train: 0.02423209511473272

r2_score Test: 0.027161435770373308


In [44]:
from sklearn.model_selection import cross_val_score
cross_val_score(lasso_lars, scaler.transform(x1), y, cv=5, scoring='r2')

array([ 0.02926215,  0.02717232, -0.40693986,  0.02588138,  0.02767572])

## MultiTaskElasticNet

In [45]:
from sklearn.linear_model import MultiTaskElasticNet

eNet_multi_task = MultiTaskElasticNet(random_state=RSEED)

eNet_multi_task.fit(X_train, y_train)

MultiTaskElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
                    max_iter=1000, normalize=False, random_state=100,
                    selection='cyclic', tol=0.0001, warm_start=False)

In [46]:
from sklearn import metrics
train_rf_predictions = eNet_multi_task.predict(X_train)
rf_predictions = eNet_multi_task.predict(X_test)
MAE_train = metrics.mean_absolute_error(y_train, train_rf_predictions)
RMSE_train = np.sqrt(metrics.mean_squared_error(y_train, train_rf_predictions))
MAPE_train = 100 * np.mean(abs(train_rf_predictions - y_train)/abs(y_train))
accuracy_train = 100 - MAPE_train
r2_train = metrics.r2_score(y_train, train_rf_predictions)

MAE_test = metrics.mean_absolute_error(y_test, rf_predictions)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, rf_predictions))
MAPE_test = 100 * np.mean(abs(rf_predictions - y_test)/abs(y_test))
accuracy_test = 100 - MAPE_test
r2_test = metrics.r2_score(y_test, rf_predictions)
#print('Mean Absolute Error Train:', MAE_train)    
#print('Root Mean Squared Error Train:', RMSE_train)
#print('Mean Absolute Percentage Error Train:', MAPE_train)
print('r2_score Train:',r2_train)
print()

print('r2_score Test:', r2_test)

r2_score Train: 0.09962787289397435

r2_score Test: 0.10849633609339421


In [47]:
from sklearn.model_selection import cross_val_score
cross_val_score(eNet_multi_task, scaler.transform(x1), y, cv=5, scoring='r2')

array([0.10611275, 0.11185924, 0.03144025, 0.1189362 , 0.10255662])

## MultiTaskLasso

In [48]:
from sklearn.linear_model import MultiTaskLasso

lasso_multi_task = MultiTaskLasso(random_state=RSEED)
lasso_multi_task.fit(X_train, y_train)


MultiTaskLasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
               normalize=False, random_state=100, selection='cyclic',
               tol=0.0001, warm_start=False)

In [49]:
from sklearn import metrics
train_rf_predictions = lasso_multi_task.predict(X_train)
rf_predictions = lasso_multi_task.predict(X_test)
MAE_train = metrics.mean_absolute_error(y_train, train_rf_predictions)
RMSE_train = np.sqrt(metrics.mean_squared_error(y_train, train_rf_predictions))
MAPE_train = 100 * np.mean(abs(train_rf_predictions - y_train)/abs(y_train))
accuracy_train = 100 - MAPE_train
r2_train = metrics.r2_score(y_train, train_rf_predictions)

MAE_test = metrics.mean_absolute_error(y_test, rf_predictions)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, rf_predictions))
MAPE_test = 100 * np.mean(abs(rf_predictions - y_test)/abs(y_test))
accuracy_test = 100 - MAPE_test
r2_test = metrics.r2_score(y_test, rf_predictions)
#print('Mean Absolute Error Train:', MAE_train)    
#print('Root Mean Squared Error Train:', RMSE_train)
#print('Mean Absolute Percentage Error Train:', MAPE_train)
print('r2_score Train:',r2_train)
print()

print('r2_score Test:', r2_test)

r2_score Train: 0.10257116700992519

r2_score Test: 0.10409636280893397


In [50]:
from sklearn.model_selection import cross_val_score
cross_val_score(lasso_multi_task, scaler.transform(x1), y, cv=5, scoring='r2')

array([ 0.10738479,  0.10470038, -0.44847703,  0.11549318,  0.10814868])