# Data preprocessing

In [58]:
import pandas as pd
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning;
# consider narrowing the filter to the specific categories that are known noise.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

In [59]:
# Read the complete dataset from the CSV file in the working directory.
# The file is comma-separated, which is already the delimiter pandas assumes.
full_dataset = pd.read_csv('full_dataset.csv')

We will try to explain:

Health outcomes related to PM2.5 between 2015 and 2017:
- Asthma Emergency Department Visits
- Respiratory and Cardiovascular Hospitalizations
- Deaths

Health outcomes related to O3 between 2015 and 2017:
- Asthma Emergency Department Visits
- Asthma Hospitalizations
- Deaths

We will also model the concentrations of these two pollutants, to understand which factors affect them.

We will focus on the 2015-2017 data, since those are the years for which the data is most complete.

In [None]:
# Show every column name of the raw dataset, one per line, so the subset used
# for the analysis can be picked by exact name.
columns = full_dataset.columns
print(*columns, sep="\n")

In [None]:
# Columns kept for the analysis, in the order they appear in the working frame.
selected_columns = [
    # Row key and candidate explanatory variables.
    'UHF42',
    'Largest Property Use Type - Gross Floor Area (ft²)',
    'Weather Normalized Site Natural Gas Intensity (therms/ft²)',
    'Natural Gas Use (kBtu)',
    'Weather Normalized Site Natural Gas Use (therms)',
    'Total GHG Emissions (Metric Tons CO2e)',
    'number of restaurants',
    'parks superficy',
    'poverty percentage',
    'traffic volume',
    'Borough',
    # Yearly PM2.5 annual averages, 2009-2017.
    'PM2.5 | Annual Average 2009',
    'PM2.5 | Annual Average 2010',
    'PM2.5 | Annual Average 2011',
    'PM2.5 | Annual Average 2012',
    'PM2.5 | Annual Average 2013',
    'PM2.5 | Annual Average 2014',
    'PM2.5 | Annual Average 2015',
    'PM2.5 | Annual Average 2016',
    'PM2.5 | Annual Average 2017',
    # Yearly summer O3 values, 2009-2017.
    'O3 | Summer 2009',
    'O3 | Summer 2010',
    'O3 | Summer 2011',
    'O3 | Summer 2012',
    'O3 | Summer 2013',
    'O3 | Summer 2014',
    'O3 | Summer 2015',
    'O3 | Summer 2016',
    'O3 | Summer 2017',
    # PM2.5-related health outcome columns for 2015-2017.
    'PM2.5_AEDV | Estimated annual rate (under age 18) | 2015-2017',
    'PM2.5_AEDV | Estimated annual rate (age 18+) | 2015-2017',
    'PM2.5_CH | 2015-2017',
    'PM2.5_RH | 2015-2017',
    'PM2.5_D | 2015-2017',
    # O3-related health outcome columns for 2015-2017.
    'O3_AEDV | Estimated annual rate (under age 18) | 2015-2017',
    'O3_AEDV | Estimated annual rate (age 18+) | 2015-2017',
    'O3_AH | Estimated annual rate (age 18+) | 2015-2017',
    'O3_AH | Estimated annual rate (under age 18) | 2015-2017',
    'O3_CRD | 2015-2017'
]

# Take an explicit copy: the following cells add, rename and drop columns on
# `selected_df`, and doing that on a bare selection of `full_dataset` is the
# pattern pandas flags with SettingWithCopyWarning. The copy makes
# `selected_df` an independent frame and leaves `full_dataset` untouched.
selected_df = full_dataset[selected_columns].copy()

In [ ]:
# Average the three summer O3 values (2015-2017) into one concentration per row.
# A missing value in any of the three years yields NaN for that row.
selected_df['O3 Particles Concentration'] = (
    selected_df['O3 | Summer 2017']
    .add(selected_df['O3 | Summer 2016'])
    .add(selected_df['O3 | Summer 2015'])
    .div(3)
)

In [ ]:
# Collapse the two age-group O3 AEDV rates (under 18, 18+) into one value per
# row. This is a plain, unweighted mean of the two rates.
selected_df['O3 Asthma Emergency Department Visits'] = (
    selected_df['O3_AEDV | Estimated annual rate (under age 18) | 2015-2017']
    .add(selected_df['O3_AEDV | Estimated annual rate (age 18+) | 2015-2017'])
    .div(2)
)

In [ ]:
# Collapse the two age-group O3 AH rates (18+, under 18) into one value per
# row. This is a plain, unweighted mean of the two rates.
selected_df['O3 Attributable Hospitalizations'] = (
    selected_df['O3_AH | Estimated annual rate (age 18+) | 2015-2017']
    .add(selected_df['O3_AH | Estimated annual rate (under age 18) | 2015-2017'])
    .div(2)
)

In [ ]:
# Average the three PM2.5 annual averages (2015-2017) into one concentration
# per row. A missing value in any of the three years yields NaN for that row.
selected_df['PM2.5 Particles Concentration'] = (
    selected_df['PM2.5 | Annual Average 2015']
    .add(selected_df['PM2.5 | Annual Average 2016'])
    .add(selected_df['PM2.5 | Annual Average 2017'])
    .div(3)
)

In [62]:
# Collapse the two age-group PM2.5 AEDV rates (under 18, 18+) into one value
# per row. This is a plain, unweighted mean of the two rates.
selected_df['PM2.5 Asthma Emergency Department Visits'] = (
    selected_df['PM2.5_AEDV | Estimated annual rate (under age 18) | 2015-2017']
    .add(selected_df['PM2.5_AEDV | Estimated annual rate (age 18+) | 2015-2017'])
    .div(2)
)

In [63]:
# Merge the two PM2.5 hospitalization columns (CH and RH) into one value per
# row by taking their plain mean.
# NOTE(review): CH / RH presumably stand for cardiovascular / respiratory
# hospitalizations; confirm against the dataset's data dictionary.
selected_df['PM2.5 Attributable Hospitalizations'] = (
    selected_df['PM2.5_CH | 2015-2017']
    .add(selected_df['PM2.5_RH | 2015-2017'])
    .div(2)
)

In [64]:
# Give the two deaths columns readable names in the same style as the other
# aggregated outcome columns.
D_rename = {
    'PM2.5_D | 2015-2017': "PM2.5 Attributable Deaths",
    'O3_CRD | 2015-2017': "O3 Attributable Deaths",
}
# Rebind to the renamed frame instead of using inplace=True: the result is the
# same, but no frame is mutated in place, so no copy/view warning is involved.
selected_df = selected_df.rename(columns=D_rename)

In [ ]:
# Per-year and per-age-group inputs that have already been folded into the
# aggregated 2015-2017 columns and are no longer needed on their own.
aggregated_source_columns = [
    'O3 | Summer 2017',
    'O3 | Summer 2016',
    'O3 | Summer 2015',
    'O3_AEDV | Estimated annual rate (under age 18) | 2015-2017',
    'O3_AEDV | Estimated annual rate (age 18+) | 2015-2017',
    'O3_AH | Estimated annual rate (age 18+) | 2015-2017',
    'O3_AH | Estimated annual rate (under age 18) | 2015-2017',
    'PM2.5 | Annual Average 2015',
    'PM2.5 | Annual Average 2016',
    'PM2.5 | Annual Average 2017',
    'PM2.5_AEDV | Estimated annual rate (under age 18) | 2015-2017',
    'PM2.5_AEDV | Estimated annual rate (age 18+) | 2015-2017',
    'PM2.5_CH | 2015-2017',
    'PM2.5_RH | 2015-2017',
]

# `columns=` already identifies the axis, so the extra `axis=1` is dropped.
# Rebind to the result instead of mutating the frame with inplace=True.
selected_df = selected_df.drop(columns=aggregated_source_columns)

In [ ]:
# Silhouette analysis: cluster the rows on the three PM2.5 health-outcome
# columns with KMeans for a range of k, and plot the mean silhouette score per
# k to help pick the number of clusters.
#
# NOTE(review): KMeans, silhouette_score and clear_output are not imported in
# any cell visible above. They presumably come from sklearn.cluster,
# sklearn.metrics and IPython.display; confirm and add them to the import
# cell, otherwise this cell fails on a fresh kernel.
tg_name = [
    'PM2.5 Asthma Emergency Department Visits',
    'PM2.5 Attributable Hospitalizations',
    'PM2.5 Attributable Deaths',
]
# The target columns are built on `selected_df` in the cells above; the
# previous reference to `df` pointed at a name this notebook never defines.
tss_arr = selected_df[tg_name].values

# Candidate cluster counts, defined once so the loop, the x-values and the
# tick marks cannot drift apart.
cluster_counts = range(2, 21)

silhouette_scores = []

# Silhouette scores for k values between 2 and 20
for n_clusters in cluster_counts:
    # Fixed random_state keeps the clustering reproducible across runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(tss_arr)
    silhouette_avg = silhouette_score(tss_arr, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Clear whatever the fitting loop printed so only the figure remains.
clear_output()
plt.figure(figsize=(12, 8))
plt.plot(cluster_counts, silhouette_scores, marker='o')
plt.xlabel('Number of clusters', fontsize=15)
plt.xticks(cluster_counts)
plt.ylabel('Silhouette score', fontsize=15)
plt.title('Silhouette score analysis for PM2.5 Health Outcomes', fontsize=15)
plt.show()