In [1]:
import numpy as np
import pandas as pd
from pandas import datetime
from matplotlib import pyplot as plt

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# Visualization setup
%matplotlib
from matplotlib import pyplot as plt
import seaborn; seaborn.set()  # set plot styles
%config InlineBackend.figure_format = 'svg'
plt.rcParams['figure.figsize'] = [10, 5] 
plt.ion() # enable the interactive mode

import seaborn as sns
sns.set() #set plot styles

Using matplotlib backend: Qt5Agg


In [4]:
"""
Load the Data
"""
def parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

input_file = './data/AirQualityUCI_refined_예제코드.csv'

df = pd.read_csv(input_file, 
                 index_col=[0],
                parse_dates=[0],
                date_parser=parser)

In [10]:
# Interpolate the CO(GT) column
co = df['CO(GT)'].copy() #import CO data
co.interpolate(inplace=True) #linear interpolate

In [12]:
"""
Binning
"""
max_val = co.max()
min_val = co.min()
# print(max_val, min_val)

11.9 0.0


In [14]:
# make interval values
bins = np.linspace(min_val,max_val,6)
bins

array([ 0.  ,  2.38,  4.76,  7.14,  9.52, 11.9 ])

In [16]:
# labels for each bins
labels = ['0<=x<2.38', '2.38<=x<4.76', '4.76<=x<7.14', '7.14<=x<9.52', '9.52<=x<11.9' ]

In [18]:
df['bins'] = pd.cut(co,bins=bins, labels=labels, include_lowest=True) #df에 bins column추가

In [20]:
# print bins
df['bins']

Datetime
2004-03-10 18:00:00    2.38<=x<4.76
2004-03-10 19:00:00       0<=x<2.38
2004-03-10 20:00:00       0<=x<2.38
2004-03-10 21:00:00       0<=x<2.38
2004-03-10 22:00:00       0<=x<2.38
                           ...     
2005-04-04 10:00:00    2.38<=x<4.76
2005-04-04 11:00:00    2.38<=x<4.76
2005-04-04 12:00:00    2.38<=x<4.76
2005-04-04 13:00:00       0<=x<2.38
2005-04-04 14:00:00       0<=x<2.38
Name: bins, Length: 9357, dtype: category
Categories (5, object): [0<=x<2.38 < 2.38<=x<4.76 < 4.76<=x<7.14 < 7.14<=x<9.52 < 9.52<=x<11.9]

In [22]:
# visualize binning
plt.hist(df['bins'], bins = 5)
plt.show()


In [27]:
"""
Log transform
"""
# df.min() # df의 각 컬럼의 min값 볼 수 있음

# original data
sns.distplot(df['PT08.S3(NOx)'])

# Calculate natural log 
df['log'] = np.log10(df['PT08.S3(NOx)'])

In [30]:
df['log'] #데이터 프레임에 '로그' 컬럼 추가
df.min()

CO(GT)                   0
PT08.S1(CO)            647
PT08.S2(NMHC)          383
NOx(GT)                  2
PT08.S3(NOx)           322
NO2(GT)                  2
PT08.S4(NO2)           551
PT08.S5(O3)            221
RH                     9.2
AH                  0.1847
C6H6(GT)                 0
bins             0<=x<2.38
log                2.50786
dtype: object

In [31]:
# visualize log transform
sns.distplot(df['log'])
plt.xlabel('PT08.S3(NOx)')

Text(0.5, 0, 'PT08.S3(NOx)')

In [41]:
"""
One-hot Encoding
"""
# make df

emp_id = pd.Series([1,2,3,4,5])
gender = pd.Series(['Male','Female','Female','Male','Female'])
remark = pd.Series(['Nice','Good','Great','Great','Nice'])

df_emp = pd.DataFrame()
df_emp['id'] = emp_id
df_emp['gender'] = gender
df_emp['remark'] = remark

df_emp

Unnamed: 0,id,gender,remark
0,1,Male,Nice
1,2,Female,Good
2,3,Female,Great
3,4,Male,Great
4,5,Female,Nice


In [42]:
print(df_emp['gender'].unique())
print(df_emp['remark'].unique())

['Male' 'Female']
['Nice' 'Good' 'Great']


In [45]:
df_emp_encoded = pd.get_dummies(df_emp,columns=['gender','remark'])
df_emp_encoded

Unnamed: 0,id,gender_Female,gender_Male,remark_Good,remark_Great,remark_Nice
0,1,0,1,0,0,1
1,2,1,0,1,0,0
2,3,1,0,0,1,0
3,4,0,1,0,1,0
4,5,1,0,0,0,1


In [47]:
"""
Notmalization
"""

# visulaized original data
plt.plot(df['CO(GT)'],label = 'CO')
plt.plot(df['PT08.S2(NMHC)'],label = 'NMHC')
plt.legend(loc='best')


<matplotlib.legend.Legend at 0x1b8ed6abf48>

In [52]:
# Interpolate the NMHC column
nmhc = df['PT08.S2(NMHC)'].copy() #import CO data
nmhc.interpolate(inplace=True) #linear interpolate

In [50]:
# Min-Max Normalization: CO
co_max = co.max()
co_min = co.min()

df['CO_Norm'] = (co - co_min) / (co_max - co_min)
df['CO_Norm']

Datetime
2004-03-10 18:00:00    0.218487
2004-03-10 19:00:00    0.168067
2004-03-10 20:00:00    0.184874
2004-03-10 21:00:00    0.184874
2004-03-10 22:00:00    0.134454
                         ...   
2005-04-04 10:00:00    0.260504
2005-04-04 11:00:00    0.201681
2005-04-04 12:00:00    0.201681
2005-04-04 13:00:00    0.176471
2005-04-04 14:00:00    0.184874
Name: CO_Norm, Length: 9357, dtype: float64

In [53]:
# Min-Max Normalization: NMHC
nmhc_max = nmhc.max()
nmhc_min = nmhc.min()

df['NMHC_Norm'] = (nmhc - nmhc_min) / (nmhc_max - nmhc_min)
df['NMHC_Norm']

Datetime
2004-03-10 18:00:00    0.362097
2004-03-10 19:00:00    0.312398
2004-03-10 20:00:00    0.303659
2004-03-10 21:00:00    0.308575
2004-03-10 22:00:00    0.247406
                         ...   
2005-04-04 10:00:00    0.392135
2005-04-04 11:00:00    0.351720
2005-04-04 12:00:00    0.371382
2005-04-04 13:00:00    0.315674
2005-04-04 14:00:00    0.362643
Name: NMHC_Norm, Length: 9357, dtype: float64

In [54]:
# visulaize normalization data
plt.plot(df['CO_Norm'], label = 'CO(normalized)')
plt.plot(df['NMHC_Norm'], label = 'NMHC(normalized)')

[<matplotlib.lines.Line2D at 0x1b8ef11cd08>]