In [None]:
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dask import dataframe as dd
from scipy.stats import boxcox
from sklearn import preprocessing

%matplotlib inline

path = '../data/'

# dask expands each glob (it may match several files) and .compute()
# materialises the result as a single in-memory pandas DataFrame.
df_sds011, df_dht22 = (
    dd.read_csv(path + pattern).compute()
    for pattern in ('processed*sds011*', 'processed*dht22*')
)

# Parse the ISO-like timestamp strings into real datetimes (in place).
for frame in (df_sds011, df_dht22):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], format='%Y-%m-%dT%H:%M:%S')


#Problems when remove index category because of multi index
#df_sds011['location'] = df_sds011['location'].astype('category')
#df_dht22['location'] = df_dht22['location'].astype('category')



In [None]:
# Index both frames by (location, timestamp), sort that index and keep only
# the measurement columns used in the rest of the notebook.
index_levels = ['location', 'timestamp']

df_sds011 = df_sds011.set_index(index_levels).sort_index()[['P1', 'P2']]
df_dht22 = df_dht22.set_index(index_levels).sort_index()[['temperature', 'humidity']]

In [None]:
# Number of missing values per column, SDS011 frame first, then DHT22.
for frame in (df_sds011, df_dht22):
    print(frame.isnull().sum())

In [None]:
# Summary statistics of the P1/P2 columns before any outlier handling.
df_sds011.describe()

In [None]:
# Summary statistics of temperature/humidity before any outlier handling.
df_dht22.describe()

In [None]:
# Box plots of every column, grouped by the 'location' index level.
# Assigning to `_` keeps the returned axes objects out of the cell output.
_ = df_dht22.boxplot(by='location')
_ = df_sds011.boxplot(by='location')

In [None]:
# Kernel density estimate of P1/P2, plotted separately for each value of
# index level 0 (location).
df_sds011.groupby(level=0).plot(kind='kde')

In [None]:
# Kernel density estimate of temperature/humidity, plotted separately for
# each value of index level 0 (location).
df_dht22.groupby(level=0).plot( kind='kde')

# Outliers

In [None]:
def remove_outlier(df):
    """Blank out values lying more than three standard deviations from the mean.

    Mean and standard deviation are computed per column over the whole frame
    (not per location). Cells further than ``3 * std`` from their column mean
    become NaN; no rows are dropped, so the result keeps the shape and index
    of ``df``.

    The non-null counts per first index level are printed before and after
    masking so the effect can be compared.

    Arguments:
        df: DataFrame with numeric columns and at least one index level.
    Returns:
        A new DataFrame in which outlier cells are replaced by NaN.
    """
    print(df.groupby(level=0).count())

    deviation = (df - df.mean()).abs()
    limit = 3 * df.std()
    within_limit = deviation <= limit
    # Selecting with a boolean frame keeps the shape and turns failing cells into NaN.
    masked = df.where(within_limit)

    print(masked.groupby(level=0).count())
    return masked


# Replace both frames with their outlier-masked versions; remove_outlier
# prints the per-location non-null counts before and after masking.
df_sds011 = remove_outlier(df_sds011)
df_dht22 = remove_outlier(df_dht22)

In [None]:
# Summary statistics of P1/P2 after outlier masking.
df_sds011.describe()

In [None]:
# Summary statistics of temperature/humidity after outlier masking.
df_dht22.describe()

In [None]:
#_ = df_dht22.boxplot(by='location')
#_ = df_sds011.boxplot(by='location')

In [None]:
# Per-location kernel density estimate of P1/P2 after outlier masking.
df_sds011.groupby(level=0).plot(kind='kde')

In [None]:
# Per-location kernel density estimate of temperature/humidity after
# outlier masking.
df_dht22.groupby(level=0).plot(kind='kde')

In [None]:
# Summary statistics of P1/P2 broken down by location (index level 0).
df_sds011.groupby(level=0).describe()

In [None]:
# Summary statistics of temperature/humidity broken down by location
# (index level 0).
df_dht22.groupby(level=0).describe()

In [None]:
# Location ids removed from both frames before resampling.
# NOTE(review): the reason for excluding these two ids is not recorded here —
# presumably it follows from the per-location summaries; please confirm.
excluded_locations = [3123, 10574]

df_sds011 = df_sds011.drop(excluded_locations, level=0)
df_dht22 = df_dht22.drop(excluded_locations, level=0)

# Resample and Merge

In [None]:
def resample(df, freq='60min'):
    """Average the readings of each group over fixed-width time bins.

    Arguments:
        df: DataFrame whose index level 0 identifies the group (location)
            and whose index level 1 is datetime-like.
        freq: Bin width as a pandas offset alias.
    Returns:
        DataFrame of per-bin means indexed by (level 0, time bin); rows
        that contain any NaN are dropped.
    """
    group_key = df.index.get_level_values(0)
    time_bins = pd.Grouper(freq=freq, level=1)
    averaged = df.groupby([group_key, time_bins]).mean()
    return averaged.dropna()

In [None]:

# Replace both frames with their per-location means over resample()'s
# default 60-minute bins.
df_sds011 = resample(df_sds011)
df_dht22 = resample(df_dht22)

In [None]:
# Row/column counts of both resampled frames, to compare before merging.
print(df_sds011.shape, df_dht22.shape)

In [None]:
# Join both sensors on their shared (location, timestamp) index, keeping only
# bins present in both frames.
merged = df_sds011.merge(df_dht22, how='inner', left_index=True, right_index=True)

# Keep rows with humidity of at most 70.
# NOTE(review): presumably readings taken at higher humidity are considered
# unreliable — confirm the rationale for this threshold.
data = merged.loc[merged['humidity'] <= 70.0]
print(data.shape)

# Normalize and Standardize

In [None]:
# Through normalization, correlations can be observed more easily.
# Each column is first divided by its largest absolute value (norm='max'
# along axis 0) and then standardized with StandardScaler.
# NOTE(review): `data` was merged before this cell, so only df_sds011 and
# df_dht22 are rescaled here; `data` keeps the original units.
for frame, columns in (
    (df_sds011, ['P1', 'P2']),
    (df_dht22, ['temperature', 'humidity']),
):
    frame[columns] = preprocessing.normalize(frame[columns], axis=0, norm='max')
    # A fresh scaler per frame; after the loop `scaler` is the one fitted on df_dht22.
    scaler = preprocessing.StandardScaler()
    frame[columns] = scaler.fit_transform(frame[columns])

In [None]:
# Summary statistics of both frames after scaling, SDS011 first.
for frame in (df_sds011, df_dht22):
    print(frame.describe())

In [None]:
import matplotlib.pyplot as plt

# Column names of the merged frame; they label both axes of the matrix.
alpha = list(data.columns.values)

fig = plt.figure()

ax = fig.add_subplot(111)
cax = ax.matshow(data.corr(), interpolation='nearest')
fig.colorbar(cax)

# Pin exactly one tick per column before labelling. Labelling the automatic
# ticks (with a leading '' as padding) depends on where the locator happens
# to place them, so labels can end up shifted or truncated.
tick_positions = list(range(len(alpha)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(alpha)
ax.set_yticklabels(alpha)

plt.show()



In [None]:
# Histogram of temperature per location (50 bins). df_dht22 has already been
# rescaled by the normalization cell, so the x-axis is in scaled units.
df_dht22[['temperature']].hist(bins=50, by='location')

In [None]:
# Histogram of humidity per location (50 bins). df_dht22 has already been
# rescaled by the normalization cell, so the x-axis is in scaled units.
df_dht22[['humidity']].hist(bins=50, by='location')

In [None]:
# One bin per five integer units of the P1 value range.
# df_sds011 has already been standardized by the normalization cell, so this
# range can be shorter than five units; clamp to one bin so hist() never
# receives bins=0 (which raises).
# NOTE(review): with standardized data this yields very few bins — consider a
# fixed bin count such as the 50 used by the neighbouring histograms.
p1_span = int(df_sds011['P1'].max()) - int(df_sds011['P1'].min())
bins = max(1, p1_span // 5)
df_sds011[['P1']].hist(bins=bins, by='location')

In [None]:
bins = 50

f, axes = plt.subplots(2, 2, figsize=(10, 7), sharex=True)

# (location, sub-frame) pairs, one per value of index level 0.
groups = list(df_sds011.groupby(level=0))

# P2 distribution of the first three locations, filling the grid row by row;
# the bottom-right panel is left empty.
for slot, grid_pos in enumerate([(0, 0), (0, 1), (1, 0)]):
    location, readings = groups[slot]
    panel = axes[grid_pos]
    sns.distplot(readings[['P2']], hist=True, kde=True,
                 bins=bins, color='darkblue',
                 hist_kws={'edgecolor': 'black'},
                 kde_kws={'linewidth': 4}, ax=panel)
    panel.title.set_text(location)

plt.tight_layout()
    


In [None]:
bins = 15

f, axes = plt.subplots(3, 2, figsize=(10, 7), sharex=True)
# NOTE(review): sharex=True forces one x-range on quantities with different
# scales (P1/P2, their log transform, temperature, humidity) — confirm intended.

# `data` pools the hourly means of every location; no per-location split here.
d = data

# One entry per panel in row-major order: (values, title, draw histogram?).
# boxcox(..., lmbda=0) is the natural log. scipy.stats.boxcox (imported in the
# first cell) needs 1-D, strictly positive input, so it gets the column as a
# flat array instead of a one-column frame.
panels = [
    (d[['P1']], 'P1', True),
    (d[['P2']], 'P2', True),
    (boxcox(d['P1'].to_numpy(), lmbda=0), 'P1 boxcox', False),
    (boxcox(d['P2'].to_numpy(), lmbda=0), 'P2 boxcox', False),
    (d[['temperature']], 'Temperature', True),
    (d[['humidity']], 'Humidity', True),
]

for ax, (values, title, with_hist) in zip(axes.flat, panels):
    sns.distplot(values, hist=with_hist, kde=True,
                 bins=bins if with_hist else None, color='darkblue',
                 hist_kws={'edgecolor': 'black'},
                 kde_kws={'linewidth': 4}, ax=ax)
    ax.title.set_text(title)

plt.tight_layout()
    

In [None]:
# Build sliding windows: each feature row holds the 60 values of column 0 that
# precede position `end`, and the label is the value at `end` itself.
# FIXME(review): `apple_training_scaled` is not defined anywhere in the visible
# notebook, so this cell fails on a fresh kernel — define it or remove the cell.
window = 60
features_set, labels = [], []
for end in range(window, 1260):
    features_set.append(apple_training_scaled[end - window:end, 0])
    labels.append(apple_training_scaled[end, 0])

In [None]:
from pandas import DataFrame
# Toy example of lag/lead columns: 't-1' is the previous row's value and
# 't+1' the next row's value; the ends are filled with NaN.
df = DataFrame({'t': list(range(10))})
for offset, label in ((1, 't-1'), (-1, 't+1')):
    df[label] = df['t'].shift(offset)

print(df)


In [None]:
from pandas import DataFrame
from pandas import concat
 
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Arguments:
        data: Sequence of observations as a list, NumPy array, Series or
            DataFrame. One-dimensional input is treated as a single variable;
            two-dimensional input has one variable per column.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning, with
        columns named ``var<k>(t-<i>)``, ``var<k>(t)`` and ``var<k>(t+<i>)``.
    """
    df = DataFrame(data)
    # Count the variables on the constructed frame rather than on the raw
    # input: 1-D arrays and Series have no shape[1], and a nested list holds
    # more than one variable.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [None]:
# Smoke test on a three-element list: one lag column plus the columns for
# t and t+1, keeping the NaN rows created by shifting.
series_to_supervised([1,2,3], n_in=1, n_out=2, dropnan=False)

In [310]:
# Average all locations per timestamp (index level 1).
# Derived from `data` (which `d` aliases in the plotting cell) instead of from
# `d` itself: re-assigning `d` from its own collapsed, single-level result
# would make `level=1` fail when the cell is run a second time.
d = data.groupby(level=1).mean().sort_index()[['P1', 'P2', 'temperature', 'humidity']]

((2854, 6), (6016, 7), (8870, 6))

In [328]:
# Frame the first three rows of `d` with two lag steps; NaN rows are kept so
# the shifting of the four variables is visible.
series_to_supervised(d[0:3],n_in=2, n_out=1, dropnan=False)

Unnamed: 0_level_0,var1(t-2),var2(t-2),var3(t-2),var4(t-2),var1(t-1),var2(t-1),var3(t-1),var4(t-1),var1(t),var2(t),var3(t),var4(t)
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-11-20 09:00:00,,,,,,,,,11.602083,4.926667,8.091667,68.0875
2017-11-20 10:00:00,,,,,11.602083,4.926667,8.091667,68.0875,11.8016,4.7364,9.316,63.924
2017-11-20 11:00:00,11.602083,4.926667,8.091667,68.0875,11.8016,4.7364,9.316,63.924,11.67,4.79375,10.0375,60.404167


In [326]:
# First two rows of `d`, for comparison with the framed output.
d[0:2]

Unnamed: 0_level_0,P1,P2,temperature,humidity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-11-20 09:00:00,11.602083,4.926667,8.091667,68.0875
2017-11-20 10:00:00,11.8016,4.7364,9.316,63.924


In [None]:
#Question: How many timesteps?