# Data preprocessing

In [1]:
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every Python warning
# (pandas, deprecations, ...) raised after this line is hidden from cell output.
# Consider narrowing it to specific categories once the notebook runs cleanly.
warnings.filterwarnings("ignore")
import pandas as pd
import os

In [2]:
# Load the merged dataset from the data-preparation folder (path is relative
# to the directory the notebook is run from).
path_file = os.path.join('..', 'data_preparation', 'data','full_dataset.csv')
full_dataset = pd.read_csv(path_file,sep=",")

# Keeping only the useful columns for the study
selected_columns = [
    # identifiers and explanatory variables
    'UHF42',
    'Natural Gas Use (kBtu)',
    'Total GHG Emissions (Metric Tons CO2e)',
    'number of restaurants',
    'parks superficy',
    'poverty',
    'traffic volume',
    'Borough',
    # yearly pollutant concentrations (PM2.5 and O3), 2015-2017
    'PM2.5 | Annual Average 2015',
    'PM2.5 | Annual Average 2016',
    'PM2.5 | Annual Average 2017',
    'O3 | Summer 2015',
    'O3 | Summer 2016',
    'O3 | Summer 2017',
    # PM2.5-related health outcomes, 2015-2017
    'PM2.5_AEDV | Estimated annual rate (under age 18) | 2015-2017',
    'PM2.5_AEDV | Estimated annual rate (age 18+) | 2015-2017',
    'PM2.5_CH | 2015-2017',
    'PM2.5_RH | 2015-2017',
    'PM2.5_D | 2015-2017',
    # O3-related health outcomes, 2015-2017
    'O3_AEDV | Estimated annual rate (under age 18) | 2015-2017',
    'O3_AEDV | Estimated annual rate (age 18+) | 2015-2017',
    'O3_AH | Estimated annual rate (age 18+) | 2015-2017',
    'O3_AH | Estimated annual rate (under age 18) | 2015-2017',
    'O3_CRD | 2015-2017'
]

# Take an explicit copy: `selected_df` is an independent frame, so adding,
# renaming or dropping columns on it never touches `full_dataset` and does not
# rely on pandas' ambiguous view-vs-copy behaviour (SettingWithCopyWarning).
# A missing column still fails loudly here with a KeyError naming it.
selected_df = full_dataset[selected_columns].copy()

In [3]:
# Build the final feature set: replace groups of raw columns by their row-wise
# mean, give the two "deaths" columns readable names, and drop the raw inputs.
#
# Mapping: derived column -> source columns it averages. Every derived column
# is the plain (unweighted) arithmetic mean of its sources; a missing value in
# any source yields a missing value in the result.
averaged_columns = {
    # mean O3 / PM2.5 concentration over the years 2015 until 2017
    'O3 Particles Concentration': [
        'O3 | Summer 2015',
        'O3 | Summer 2016',
        'O3 | Summer 2017',
    ],
    'PM2.5 Particles Concentration': [
        'PM2.5 | Annual Average 2015',
        'PM2.5 | Annual Average 2016',
        'PM2.5 | Annual Average 2017',
    ],
    # we won't make a distinction between diseases based on age, so the
    # under-18 and 18+ rates are averaged.
    # NOTE(review): this is an unweighted mean of two age-group rates, not a
    # population-weighted overall rate -- confirm that is what the study wants.
    'O3 Asthma Emergency Department Visits': [
        'O3_AEDV | Estimated annual rate (under age 18) | 2015-2017',
        'O3_AEDV | Estimated annual rate (age 18+) | 2015-2017',
    ],
    'O3 Attributable Hospitalizations': [
        'O3_AH | Estimated annual rate (age 18+) | 2015-2017',
        'O3_AH | Estimated annual rate (under age 18) | 2015-2017',
    ],
    'PM2.5 Asthma Emergency Department Visits': [
        'PM2.5_AEDV | Estimated annual rate (under age 18) | 2015-2017',
        'PM2.5_AEDV | Estimated annual rate (age 18+) | 2015-2017',
    ],
    # for simplification, hospitalizations are observed in general:
    # cardiovascular (CH) and respiratory (RH) hospitalizations are combined
    # by taking their mean (not their sum).
    'PM2.5 Attributable Hospitalizations': [
        'PM2.5_CH | 2015-2017',
        'PM2.5_RH | 2015-2017',
    ],
}

# renaming columns with more intuitive names
columns_rename = {
    'PM2.5_D | 2015-2017': "PM2.5 Attributable Deaths",
    'O3_CRD | 2015-2017': "O3 Attributable Deaths",
}

# Start from the untouched column selection instead of mutating `selected_df`
# in place: the cell then gives the same result however many times it is run
# (the in-place version raised KeyError on a second run because its inputs had
# already been dropped).
base_df = full_dataset[selected_columns]

selected_df = (
    base_df
    # derived columns are appended in the order declared in `averaged_columns`
    .assign(**{
        new_name: sum(base_df[col] for col in sources) / len(sources)
        for new_name, sources in averaged_columns.items()
    })
    .rename(columns=columns_rename)
    # dropping all transformed columns (exactly the sources averaged above)
    .drop(columns=[col for sources in averaged_columns.values() for col in sources])
)

In [4]:
# Write the preprocessed frame to the current working directory. The row index
# is kept as the first (unnamed) CSV column, which is the pandas default; it is
# spelled out here so readers of the file know to expect it.
selected_df.to_csv(path_or_buf='final-dataset.csv', index=True)