In [1]:
import pandas as pd
import numpy as np
from pandas import datetime
from matplotlib import pyplot as plt

"""
Load AirQualityUCI Data
"""

def parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

input_file = '/data/AirQualityUCI_refined.csv'

df = pd.read_csv(input_file,
                 index_col=[0],
                 parse_dates=[0],
                 date_parser=parser)

df.head()


  from pandas import datetime


Unnamed: 0_level_0,CO(GT),PT08.S1(CO),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH,C6H6(GT)
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2004-03-10 18:00:00,2.6,1360.0,1046.0,166.0,1056.0,113.0,1692.0,1268.0,48.9,0.7578,11.9
2004-03-10 19:00:00,2.0,1292.0,955.0,103.0,1174.0,92.0,1559.0,972.0,47.7,0.7255,9.4
2004-03-10 20:00:00,2.2,1402.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,54.0,0.7502,9.0
2004-03-10 21:00:00,2.2,1376.0,948.0,172.0,1092.0,122.0,1584.0,1203.0,60.0,0.7867,9.2
2004-03-10 22:00:00,1.6,1272.0,836.0,131.0,1205.0,116.0,1490.0,1110.0,59.6,0.7888,6.5


In [2]:
# Visualization setup
%matplotlib
from matplotlib import pyplot as plt
import seaborn; seaborn.set()  # set plot styles
%config InlineBackend.figure_format = 'svg'
plt.rcParams['figure.figsize'] = [10, 5]
plt.ion() # enable the interactive mode

Using matplotlib backend: <object object at 0x000001D1F31FD730>


<matplotlib.pyplot._IonContext at 0x1d1f3fffc70>

In [23]:
# Visualize the 'CO(GT)' variable
df['PT08.S3(NOx)'].plot()

<AxesSubplot:title={'center':'Detecting outliers using Boxplt'}, xlabel='Datetime', ylabel='Density'>

In [4]:
# Linear interpolation
NOx = df['PT08.S3(NOx)'].interpolate().copy()

In [5]:
# Visualize original and imputed data
plt.plot(df['PT08.S3(NOx)'], label='original', zorder=2)
plt.plot(NOx, label='linear interpolation', zorder=1)
plt.legend(loc='best')

<matplotlib.legend.Legend at 0x1d1f79d7cd0>

In [6]:
# Detecting outliers using Boxplot
plt.boxplot(NOx)
plt.title("Detecting outliers using Boxplt")
plt.xlabel('PT08.S3(NOx)')

Text(0.5, 0, 'PT08.S3(NOx)')

In [7]:
# Calculate correlations between variables
corr_matrix = df.corr()
corr_matrix

Unnamed: 0,CO(GT),PT08.S1(CO),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH,C6H6(GT)
CO(GT),1.0,0.877203,0.914973,0.792557,-0.701703,0.679262,0.63947,0.851403,0.040218,0.065809,0.845144
PT08.S1(CO),0.877203,1.0,0.892964,0.713654,-0.771938,0.641529,0.682881,0.899324,0.114606,0.135324,0.883795
PT08.S2(NMHC),0.914973,0.892964,1.0,0.704435,-0.796703,0.646245,0.777254,0.880578,-0.09038,0.186933,0.98195
NOx(GT),0.792557,0.713654,0.704435,1.0,-0.655707,0.763111,0.233731,0.787046,0.221032,-0.149323,0.626638
PT08.S3(NOx),-0.701703,-0.771938,-0.796703,-0.655707,1.0,-0.652083,-0.538468,-0.796569,-0.05674,-0.232017,-0.735744
NO2(GT),0.679262,0.641529,0.646245,0.763111,-0.652083,1.0,0.15736,0.708128,-0.091759,-0.335022,0.544039
PT08.S4(NO2),0.63947,0.682881,0.777254,0.233731,-0.538468,0.15736,1.0,0.591144,-0.032188,0.629641,0.765731
PT08.S5(O3),0.851403,0.899324,0.880578,0.787046,-0.796569,0.708128,0.591144,1.0,0.124956,0.070751,0.865689
RH,0.040218,0.114606,-0.09038,0.221032,-0.05674,-0.091759,-0.032188,0.124956,1.0,0.167971,-0.061681
AH,0.065809,0.135324,0.186933,-0.149323,-0.232017,-0.335022,0.629641,0.070751,0.167971,1.0,0.167972


In [8]:
# Choose the least correlated variable
nmhc = df['PT08.S3(NOx)'].copy().interpolate()

In [25]:
# Visualize a scatter plot(CO, RH)
plt.scatter(NOx, nmhc, s=12, c='black')
plt.xlabel('PT08.S3(NOx)')
plt.ylabel('NMHC')
plt.title("Detecting outliers using Boxplot")

Text(0.5, 1.0, 'Detecting outliers using Boxplot')

In [26]:
# Choose the most correlated variable
nmhc = df['PT08.S2(NMHC)'].copy().interpolate() # NMHC: Non-metanic Hydrocarbon

In [27]:
"""
IQR-based Outlier Detection
"""

# Q1, Q2(median), Q3
q1 = NOx.quantile(0.25) #upper 25%
median = NOx.quantile(0.5)
q3 = NOx.quantile(0.75)
print(q1, median, q3)

654.0 804.0 968.0


In [28]:
# IQR, upper_fence, lower_fence
iqr = q3-q1
upper_fence = q3 + 1.5*iqr
lower_fence = q1 - 1.5*iqr
print(iqr,upper_fence, lower_fence)



314.0 1439.0 183.0


In [29]:
# Filtering the outliers
outliers = NOx.loc[(NOx > upper_fence) | (NOx < 0)]
outliers

Datetime
2004-03-11 00:00:00    1462.0
2004-03-11 01:00:00    1453.0
2004-03-11 02:00:00    1579.0
2004-03-11 03:00:00    1705.0
2004-03-11 04:00:00    1818.0
                        ...  
2005-03-01 03:00:00    1791.0
2005-03-01 04:00:00    1804.0
2005-03-01 05:00:00    1727.0
2005-03-01 06:00:00    1677.0
2005-03-07 04:00:00    1490.0
Name: PT08.S3(NOx), Length: 239, dtype: float64

In [30]:
# Mask for outliers
mask = NOx.index.isin(outliers.index)

In [31]:
mask

array([False, False, False, ..., False, False, False])

In [32]:
# Visualize the normal data and outliers ~: not
plt.plot(NOx[~mask], label='normal', color='blue',
    marker='o', markersize=3, linestyle='None')
plt.plot(outliers, label='outliers', color='red',
    marker='x', markersize=3, linestyle='None')
plt.legend(loc='best')

<matplotlib.legend.Legend at 0x1d1fc02faf0>

In [19]:
# Removing the outliers
NOx_refined = NOx.copy()
NOx_refined[mask] = np.nan
print(NOx_refined[mask])

Datetime
2004-03-11 00:00:00   NaN
2004-03-11 01:00:00   NaN
2004-03-11 02:00:00   NaN
2004-03-11 03:00:00   NaN
2004-03-11 04:00:00   NaN
                       ..
2005-03-01 03:00:00   NaN
2005-03-01 04:00:00   NaN
2005-03-01 05:00:00   NaN
2005-03-01 06:00:00   NaN
2005-03-07 04:00:00   NaN
Name: PT08.S3(NOx), Length: 239, dtype: float64


In [20]:
# Linear interpolation for reconstructing outliers removed.
NOx_refined.interpolate(inplace = True)
NOx_refined.plot()

<AxesSubplot:title={'center':'Detecting outliers using Boxplt'}, xlabel='Datetime'>

In [21]:
"""
Detecting Outliers with Z-Scores
"""
import seaborn as sns

sns.distplot(NOx)



<AxesSubplot:title={'center':'Detecting outliers using Boxplt'}, xlabel='PT08.S3(NOx)', ylabel='Density'>

In [22]:
# Mean, Standard deviation
mean = np.mean(NOx) ##평균
std = np.std(NOx) ##표준편차

print(mean, std)

832.7588970823982 255.69616900657363


In [35]:
# Calculate Z-scores for each data points
outliers = []
thres = 3

for i in co :
    z_score = (i - mean)/ std
    if np.abs(z_score) > thres :
        print(z_score)
        outliers.append(i)

-3.2466614588232483
-3.249007993784375
-3.248225815463999
-3.248225815463999
-3.250572350425126
-3.2521367070658767
-3.2521367070658767
-3.252918885386252
-3.2533099745464398
-3.254483242027003
-3.2542876974469093
-3.254092152866815
-3.254092152866815
-3.2525277962260644
-3.249007993784375
-3.248225815463999
-3.2501812612649377
-3.2509634395853135
-3.250572350425126
-3.2493990829445627
-3.245488191342685
-3.248225815463999
-3.248225815463999
-3.245488191342685
-3.2380574972991187
-3.2298446249351764
-3.232973338216678
-3.241577299740808
-3.2509634395853135
-3.252918885386252
-3.2501812612649377
-3.2493990829445627
-3.2513545287455012
-3.253701063706628
-3.254092152866815
-3.254483242027003
-3.253701063706628
-3.2513545287455012
-3.2396218539398696
-3.2421639334810894
-3.2447060130223098
-3.2462703696630606
-3.248616904624187
-3.247052547983436
-3.2462703696630606
-3.245488191342685
-3.2458792805028733
-3.247443637143624
-3.241577299740808
-3.242359478061183
-3.231017892415739
-3.239621

-3.2548743311871906
-3.254092152866815
-3.2435327455417466
-3.234537694857429
-3.244314923862122
-3.2462703696630606
-3.248616904624187
-3.2458792805028733
-3.247052547983436
-3.248616904624187
-3.247052547983436
-3.2462703696630606
-3.242359478061183
-3.242359478061183
-3.2388396756194937
-3.2466614588232483
-3.247052547983436
-3.247834726303812
-3.248225815463999
-3.247052547983436
-3.2533099745464398
-3.253701063706628
-3.254092152866815
-3.2548743311871906
-3.254092152866815
-3.2533099745464398
-3.248225815463999
-3.2458792805028733
-3.245488191342685
-3.245488191342685
-3.2450971021824975
-3.247052547983436
-3.247834726303812
-3.248616904624187
-3.2493990829445627
-3.247834726303812
-3.2404040322602445
-3.237666408138931
-3.237666408138931
-3.243141656381559
-3.247052547983436
-3.247052547983436
-3.2466614588232483
-3.247834726303812
-3.2509634395853135
-3.2533099745464398
-3.254483242027003
-3.254483242027003
-3.253701063706628
-3.2533099745464398
-3.247443637143624
-3.2415772997

-3.248616904624187
-3.2497901721047504
-3.250572350425126
-3.2513545287455012
-3.253701063706628
-3.2548743311871906
-3.2548743311871906
-3.2548743311871906
-3.2533099745464398
-3.244314923862122
-3.2458792805028733
-3.2462703696630606
-3.247443637143624
-3.2462703696630606
-3.2458792805028733
-3.245488191342685
-3.2466614588232483
-3.2466614588232483
-3.244314923862122
-3.2388396756194937
-3.237666408138931
-3.238448586459306
-3.2439238347019344
-3.247834726303812
-3.248616904624187
-3.2501812612649377
-3.252918885386252
-3.2533099745464398
-3.254092152866815
-3.2552654203473788
-3.2552654203473788
-3.2548743311871906
-3.251745617905689
-3.243141656381559
-3.2353198731778043
-3.2349287840176166
-3.2435327455417466
-3.244314923862122
-3.242359478061183
-3.2388396756194937
-3.2404040322602445
-3.2439238347019344
-3.2462703696630606
-3.248225815463999
-3.2458792805028733
-3.245488191342685
-3.249007993784375
-3.254092152866815
-3.254483242027003
-3.2521367070658767
-3.2521367070658767
-3

-3.2501812612649377
-3.2493990829445627
-3.249007993784375
-3.249007993784375
-3.2501812612649377
-3.2497901721047504
-3.248225815463999
-3.242359478061183
-3.2380574972991187
-3.2380574972991187
-3.2364931406583675
-3.2337555165370535
-3.2337555165370535
-3.2447060130223098
-3.2466614588232483
-3.2458792805028733
-3.2458792805028733
-3.2458792805028733
-3.2447060130223098
-3.2439238347019344
-3.2462703696630606
-3.247834726303812
-3.250572350425126
-3.250572350425126
-3.2501812612649377
-3.250572350425126
-3.2513545287455012
-3.2523322516459707
-3.2533099745464398
-3.253701063706628
-3.254092152866815
-3.2525277962260644
-3.2509634395853135
-3.2509634395853135
-3.250572350425126
-3.250572350425126
-3.249007993784375
-3.2501812612649377
-3.2509634395853135
-3.249007993784375
-3.2493990829445627
-3.248616904624187
-3.2458792805028733
-3.2462703696630606
-3.247443637143624
-3.250572350425126
-3.2501812612649377
-3.251745617905689
-3.2513545287455012
-3.2521367070658767
-3.252918885386252

In [36]:
# Simplified version of filtering outliers
outliers = NOx.loc[np.abs((NOx-mean) / std) > 3]

In [37]:
# Comparison of distributions before/after outlier removal
sns.distplot(NOx, axlabel='PT08.S3(NOx)', label='original')
sns.distplot(NOx[~mask], label='outliers removed')
plt.legend(loc='best')

# [exer] Adjust thres



<matplotlib.legend.Legend at 0x18f2bef2fd0>

In [38]:
# Flooring and Capping
floor = NOx.quantile(0.1)
cap = NOx.quantile(0.9)
NOx.loc[NOx < floor] = floor
NOx.loc[NOx > cap] = cap

In [39]:
# Visualize the result
NOx.plot()

<AxesSubplot:title={'center':'Detecting outliers using Boxplt'}, xlabel='Datetime', ylabel='Density'>