# Data Cleaning

In this notebook, I document and justify the omission of measurements from the analysis. First, let us import the packages and scripts that I used.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
from scipy import stats
from dropper import * #contains functions that I created to drop measurements

In [None]:
# Load the complete export; the first CSV column is used as the index and
# pandas is asked to parse it as dates.
df = pd.read_csv(
    r'/home/jovyan/complete_data.csv',
    index_col=0,
    parse_dates=True,
)
# The three particulate-matter columns (note the leading space in each header).
pm_col = [' PM10 (µg/m^3)', ' PM2.5 (µg/m^3)', ' PM1 (µg/m^3)']
pm = df.loc[:, pm_col]
# New Year window: rows from 2019-12-31 12h to 2020-01-01 06h
# (string slice on the index).
pm_newyear = pm.loc['2019-12-31 12':'2020-01-01 06']

In [None]:
# Summary statistics of the spacing, in minutes, between consecutive
# measurements inside the 2020 New Year window.
newyear_index = pm_newyear.index
newyear_gap_min = (newyear_index[1:] - newyear_index[:-1]) / np.timedelta64(1, 'm')
pd.DataFrame(newyear_gap_min).describe()

In [None]:
# LaTeX rendering of the summary statistics of the time gaps (in minutes)
# between consecutive measurements inside the 2020 New Year window.
newyear_index = pm_newyear.index
newyear_gap_min = (newyear_index[1:] - newyear_index[:-1]) / np.timedelta64(1, 'm')
pd.DataFrame(newyear_gap_min).describe().style.to_latex()

Let us look at the summary statistics before removing any measurements.

In [None]:
# Summary statistics of the three PM columns before any measurement is omitted
pm.describe()

In [None]:
# Render the same pre-omission summary statistics as a LaTeX table
pm.describe().style.to_latex()

In [None]:
# Number of non-missing measurements per PM column before discarding anything
pm.count()

In [None]:
# Build the PM10 forward-slope result in this cell so it runs on a fresh
# kernel; the cell previously displayed `coarse_pm` before anything had
# assigned it.
coarse_pm = get_forward_slopes(pm, ' PM10 (µg/m^3)')
coarse_pm

In [None]:
pm10_unfiltered

In [None]:
pm10_unfiltered[1:]

In [None]:
pm10_unfiltered[:-2].to_numpy()

In [None]:
pm10_unfiltered = pm[' PM10 (µg/m^3)'].dropna()

In [None]:
pm10_unfiltered

In [None]:
# Difference between each PM10 reading and the one before it.
np.diff(pm10_unfiltered.to_numpy())

In [None]:
# PM10 rate of change between consecutive readings: value difference divided
# by the time difference expressed in minutes.
pm10_values = pm10_unfiltered.to_numpy()
pm10_times = pm10_unfiltered.index
pm10_gap_min = (pm10_times[1:] - pm10_times[:-1]) / np.timedelta64(1, 'm')
pm10_roc = np.diff(pm10_values) / pm10_gap_min
# Extremes of the rate of change.
pm10_roc_max, pm10_roc_min = pm10_roc.max(), pm10_roc.min()

In [None]:
# Display the maximum PM10 rate of change
pm10_roc_max

In [None]:

# Longest allowed gap, in minutes, between neighbouring readings of a triplet.
max_gap_min = 10
one_minute = np.timedelta64(1, 'm')

data = []
timestamps = []

pm10_vals = pm10_unfiltered.to_numpy()
pm10_stamps = pm10_unfiltered.index

# Slide a window of three consecutive readings (and their timestamps) over the series.
for pt1, pt2, pt3, d1, d2, d3 in zip(pm10_vals[:-2], pm10_vals[1:], pm10_vals[2:],
                                     pm10_stamps[:-2], pm10_stamps[1:], pm10_stamps[2:]):
    # Skip triplets whose neighbouring readings are more than 10 minutes apart.
    if (d2 - d1) / one_minute > max_gap_min or (d3 - d2) / one_minute > max_gap_min:
        continue
    # NOTE(review): the original cell did not parse (`pm10_unfiltered.[1:]`,
    # `continue:`) and its `else:` branch was empty, so what should be stored
    # is unknown. Keeping the three values and the centre timestamp is an
    # assumption -- confirm or replace.
    data.append((pt1, pt2, pt3))
    timestamps.append(d2)

In [None]:
# Result of get_forward_slopes for the PM10 column (helper presumably comes
# from `dropper` -- it is not defined in this notebook).
coarse_pm = get_forward_slopes(pm, ' PM10 (µg/m^3)')

# Keep the rows where `forward_slope` and `fwd_slope_1` have opposite signs,
# i.e. their product is negative.
coarse_pm_spikes = coarse_pm[coarse_pm['forward_slope'].mul(coarse_pm['fwd_slope_1']).lt(0)]

In [None]:
# NOTE(review): `up_lim_roc` is never assigned in this notebook's cells;
# presumably it arrives through `from dropper import *` -- confirm.
up_lim_roc

In [None]:
# NOTE(review): likewise `down_lim_roc` has no assignment here -- confirm its source.
down_lim_roc

In [None]:
# Rows of coarse_pm whose two slope columns have a negative product
coarse_pm_spikes

In [None]:
# Repeat display of up_lim_roc
up_lim_roc

In [None]:
coarse_pm[coarse_pm.timediff_in_min>10].index

In [None]:
coarse_pm[coarse_pm.timediff_in_min>10]

In [None]:
coarse_pm.timediff_in_min.describe()

In [None]:
coarse_pm[coarse_pm.timediff_in_min<=10].timediff_in_min.describe()

In [None]:
pm10_clean = get_anomalies(pm, ' PM10 (µg/m^3)')
pm25_clean = get_anomalies(pm, ' PM2.5 (µg/m^3)')
pm1_clean = get_anomalies(pm, ' PM1 (µg/m^3)')


In [None]:
pm10_clean

In [None]:
# Call get_omitted_dates once per PM column (PM10, PM2.5, PM1), in that order.
pm10_omitted, pm25_omitted, pm1_omitted = (
    get_omitted_dates(pm, column)
    for column in (' PM10 (µg/m^3)', ' PM2.5 (µg/m^3)', ' PM1 (µg/m^3)')
)

In [None]:
# Summary statistics of the first column of the PM10 get_omitted_dates result
pm10_omitted.iloc[:,0].describe()

In [None]:
# Summary statistics of the first column of the PM2.5 get_omitted_dates result
pm25_omitted.iloc[:,0].describe()

In [None]:
# Summary statistics of the first column of the PM1 get_omitted_dates result
pm1_omitted.iloc[:,0].describe()

In [None]:
# Summary statistics of the PM10 get_anomalies result
pm10_clean.describe()

In [None]:
# Summary statistics of the PM2.5 get_anomalies result
pm25_clean.describe()

In [None]:
# Summary statistics of the PM1 get_anomalies result
pm1_clean.describe()

In [None]:
# Inspect the PM10 values (first column of `pm`) strictly above 213.1.
# NOTE(review): 213.1 is hard-coded; presumably the largest PM10 value that
# survives cleaning -- confirm against pm10_clean.
pm.iloc[:, 0].loc[lambda pm10: pm10 > 213.100000].dropna()

In [None]:
# Number of non-missing measurements left in each cleaned series.
for message, cleaned in (
    ("pm10 # measurements remaining after discarding: ", pm10_clean),
    ("pm2.5 # measurements remaining after discarding: ", pm25_clean),
    ("pm1 # measurements remaining after discarding: ", pm1_clean),
):
    print(message, cleaned.dropna().count())

In [None]:
# Percentage of each column's original non-missing measurements that remain
# after cleaning. The per-column counts are looked up by position with
# `.iloc`: integer keys passed to `[]` on a string-labelled Series are
# deprecated as positional access in recent pandas.
raw_counts = pm.count()
print("pm10 % remaining remaining after discarding: ",100*pm10_clean.dropna().count()/raw_counts.iloc[0])
print("pm2.5 % remaining remaining after discarding: ",100*pm25_clean.dropna().count()/raw_counts.iloc[1])
print("pm1 % remaining remaining after discarding: ",100*pm1_clean.dropna().count()/raw_counts.iloc[2])

In [None]:
# PM10 rate of change during 2020 New Year (2019 Dec 31 12 PM to 2020 Jan 1 6 AM):
# histogram of the `forward_slope` column returned by derivative_hist.
derivative_hist(pm_newyear, ' PM10 (µg/m^3)').forward_slope.hist(bins = np.linspace(-45,25,100), color = 'black')
plt.grid(False)
# Dashed vertical guides. NOTE(review): both x-positions are hard-coded;
# presumably rate-of-change limits computed elsewhere -- confirm their source.
plt.vlines(-41.400000000000006,0, 115, color = 'gray', linestyle = 'dashed')
plt.vlines(21.599999999999994, 0, 115, color = 'gray', linestyle = 'dashed')
plt.ylabel('count')
# Raw string: '\m' is not a valid Python escape sequence, so the non-raw
# literal triggered an invalid-escape warning. The rendered label is unchanged.
plt.xlabel(r'$PM_{10}$ rate of change ($\mu$g/$m^3$/min)')
plt.show()

In [None]:
# PM2.5 rate of change during 2020 New Year (2019 Dec 31 12 PM to 2020 Jan 1 6 AM):
# histogram of the `forward_slope` column returned by derivative_hist.
derivative_hist(pm_newyear, ' PM2.5 (µg/m^3)').forward_slope.hist(bins = np.linspace(-45,25,100), color = 'black')
plt.grid(False)
# Dashed vertical guides. NOTE(review): both x-positions are hard-coded;
# presumably rate-of-change limits computed elsewhere -- confirm their source.
plt.vlines(-42.50000000000001,0, 170, color = 'gray', linestyle = 'dashed')
plt.vlines(24.800000000000004, 0, 170, color = 'gray', linestyle = 'dashed')
plt.ylabel('count')
# Raw string: '\m' is not a valid Python escape sequence, so the non-raw
# literal triggered an invalid-escape warning. The rendered label is unchanged.
plt.xlabel(r'$PM_{2.5}$ rate of change ($\mu$g/$m^3$/min)')
plt.show()

In [None]:
# PM1 rate of change during 2020 New Year (2019 Dec 31 12 PM to 2020 Jan 1 6 AM):
# histogram of the `forward_slope` column returned by derivative_hist.
derivative_hist(pm_newyear, ' PM1 (µg/m^3)').forward_slope.hist(bins = np.linspace(-45,25,100), color = 'black')
plt.grid(False)
# Dashed vertical guides. NOTE(review): both x-positions are hard-coded;
# presumably rate-of-change limits computed elsewhere -- confirm their source.
plt.vlines(-41.10000000000001,0, 250, color = 'gray', linestyle = 'dashed')
plt.vlines(22.299999999999997, 0, 250, color = 'gray', linestyle = 'dashed')
plt.ylabel('count')
# Raw string: '\m' is not a valid Python escape sequence, so the non-raw
# literal triggered an invalid-escape warning. The rendered label is unchanged.
plt.xlabel(r'$PM_{1}$ rate of change ($\mu$g/$m^3$/min)')
plt.show()

In [None]:
# PM10 (first column) over the New Year window, drawn in black.
# Despite its name, `fig` holds the Axes object returned by Series.plot.
fig = pm_newyear.iloc[:, 0].plot(color='black')
fig.set(
    ylabel=' $PM_{10}$ (µg/m^3)',
    xlabel='data and time (mm-dd hh)',
    yticks=np.arange(0, 180, 20),
)
plt.show()

In [None]:
# PM2.5 (second column) over the New Year window, drawn in black with
# y ticks every 20 units from 0 to 160.
pm_newyear.iloc[:, 1].plot(color='black').set(
    ylabel=' $PM_{2.5}$ (µg/m^3)',
    xlabel='data and time (mm-dd hh)',
    yticks=np.arange(0, 180, 20),
)
plt.show()

In [None]:
# PM1 (third column) over the New Year window, drawn in black with
# y ticks every 20 units from 0 to 160.
pm_newyear.iloc[:, 2].plot(color='black').set(
    ylabel=' $PM_{1}$ (µg/m^3)',
    xlabel='data and time (mm-dd hh)',
    yticks=np.arange(0, 180, 20),
)
plt.show()

In [None]:
# Global maximum PM10 value: plot the PM10 readings between 17:50:11 and
# 17:59:11 on 2020-01-21 with point markers, on a logarithmic y axis.
# Rows and column are selected in one `.loc` call rather than chained indexing.
df.loc['2020-01-21 17:50:11':'2020-01-21 17:59:11', ' PM10 (µg/m^3)'].plot(marker = 'o', color = 'black')
plt.yscale('log')
# Raw string: '\m' is not a valid Python escape sequence, so the non-raw
# literal triggered an invalid-escape warning. The rendered label is unchanged.
plt.ylabel(r'$PM_{10}$ ($\mu$g/m^3)')
plt.xlabel(r'day and time (dd hh:mm)')
# Render the figure explicitly, as the other plotting cells do.
plt.show()

In [None]:
# Set SAVE_CLEANED to True to write pm10, pm2.5, and pm1 to CSV files in the
# working directory. It defaults to False, so running the cell writes nothing.
SAVE_CLEANED = False
if SAVE_CLEANED:
    pm10_clean.to_csv('pm10.csv')
    pm25_clean.to_csv('pm25.csv')
    pm1_clean.to_csv('pm1.csv')

In [None]:
# Read the three exported CSV files back; the first column is the index and
# pandas is asked to parse it as dates.
pm10, pm25, pm1 = (
    pd.read_csv(csv_name, index_col=0, parse_dates=True)
    for csv_name in ('pm10.csv', 'pm25.csv', 'pm1.csv')
)

In [None]:
#checking whether exported csv has expected values
np.allclose(pm10_clean.to_numpy(),pm10.iloc[:,0].to_numpy())

In [None]:
#checking whether exported csv has expected values
np.allclose(pm25_clean.to_numpy(),pm25.iloc[:,0].to_numpy())

In [None]:
#checking whether exported csv has expected values
np.allclose(pm1_clean.to_numpy(),pm1.iloc[:,0].to_numpy())