In [22]:
%matplotlib inline

import glob
import inspect
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import mixture, gaussian_process, linear_model, svm, naive_bayes
from sklearn import preprocessing

# Work-around for numpy compatibility issues (per the original note): the
# statsmodels import is attempted up to five times and the first success wins.
_sm_import_error = None
for retry in range(5):
    try:
        import statsmodels.api as sm
        break
    except ImportError as exc:
        # Keep the failure so it can be reported instead of silently dropped.
        _sm_import_error = exc
else:
    # No attempt succeeded, so `sm` is unbound. Report it here, next to the
    # cause, rather than leaving a bare NameError to surface in a later cell.
    warnings.warn(
        "statsmodels could not be imported after 5 attempts: {0!r}".format(
            _sm_import_error
        ),
        RuntimeWarning,
    )

# project import
import common

In [2]:
# Load the data set through the project helper (the arguments look like a
# directory and a filename glob -- confirm against common.load_df).
df = common.load_df("../data/insead", "*21*.csv")
# load_df exposes cache statistics; printing them shows hits vs. misses.
print(common.load_df.cache_info())

# Cleaning passes, applied in this order. Each one receives the frame returned
# by the previous pass and its return value is fed to the next.
for cleaning_step in (
    common.remove_negatives,
    common.identify_ct_shutdown,
    common.replace_missing,
):
    df = cleaning_step(df)

# Final pass through the project helper with scale=(0.1, 1).
# NOTE(review): this step was previously described as "taking the first
# derivative to make the time series stationary", but the helper's name
# suggests normalisation -- confirm what common.get_normalized_df does.
df = common.get_normalized_df(df, scale=(0.1, 1))

CacheInfo(hits=0, misses=1, maxsize=None, currsize=1)


In [3]:
# Per-column medians. Not read again in this cell; kept because other cells
# may refer to it.
thresh = df.median()

# Number of rows in the rolling window (1 * 60 * 10 = 600).
ROLLING_WINDOW = 1 * 60 * 10

# Rolling median over that window; min_periods=1 means the leading rows, where
# the window is not yet full, still produce a value.
rolling_median = df.rolling(ROLLING_WINDOW, min_periods=1).median()

# True wherever a value is at (or below) the minimum of its own column.
at_column_minimum = df <= df.min()

# Overwrite, in place, each flagged entry with the rolling median found at the
# same row and column.
df[at_column_minimum] = rolling_median

In [5]:
# Restrict the frame to the columns of interest and drop every row that has a
# missing value in any of them.
data = df[ ["cwshdr", "loadsys", "drybulb", "ct1kw", "ct2akw", "rh", "cwrhdr", "systotpower", "cwsfhdr"] ].dropna()

# Seasonal period handed to the decomposition: 1 * 60 * 24 = 1440 samples
# (one day if the data is sampled once per minute -- TODO confirm the rate).
DECOMPOSE_PERIOD = 1 * 60 * 24

# statsmodels renamed seasonal_decompose's `freq` keyword to `period`; pick the
# name the installed version accepts so the cell runs on old and new releases.
_decompose_params = inspect.signature(sm.tsa.seasonal_decompose).parameters
_period_keyword = "period" if "period" in _decompose_params else "freq"


def _first_difference(frame, start, stop):
    """Return the row-to-row change of ``frame`` between two index labels.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to slice; ``start`` and ``stop`` are used as a label slice on
        its index (date strings here, so a datetime-like index is assumed).
    start, stop : str
        Bounds of the label slice ``frame[start:stop]``.

    Returns
    -------
    pandas.DataFrame
        ``frame[start:stop]`` with each row replaced by its difference from
        the previous row. The first row of the slice has no predecessor, so
        it is dropped. The input frame is not modified.
    """
    return frame[start:stop].diff().iloc[1:]


# Split every column into its seasonal, trend and residual components, stored
# as "<column>_seasonal", "<column>_trend" and "<column>_resid".
df_decomposed = pd.DataFrame()
for col in data.columns:
    resp = sm.tsa.seasonal_decompose(data[col], **{_period_keyword: DECOMPOSE_PERIOD})
    for part in ("seasonal", "trend", "resid"):
        col_name = "{0}_{1}".format(col, part)
        df_decomposed[col_name] = getattr(resp, part)

# The first and last rows of the decomposition contain nulls; drop them.
df_decomposed = df_decomposed.dropna()

# Training set: label slice "2017-01-01" .. "2017-01", first-differenced.
train_df = _first_difference(df_decomposed, "2017-01-01", "2017-01")

# Validation set: label slice "2017-02-01" .. "2017-02-01", first-differenced.
validation_df = _first_difference(df_decomposed, "2017-02-01", "2017-02-01")

In [26]:
# Input columns and the column to predict. Note that the target column is
# also listed among the inputs.
features = ["cwrhdr_resid", "cwshdr_resid"]
target = ["cwshdr_resid"]
lookback = 10
batch_size = 1

# Build model inputs/targets with the project helper (array layout is decided
# by common.prepare_keras_data -- confirm there).
train_x, train_y = common.prepare_keras_data(train_df, features, target, lookback, batch_size)
validation_x, validation_y = common.prepare_keras_data(validation_df, features, target, lookback, batch_size)


def reshape_x(x):
    """Reshape ``x`` to 2-D, keeping its first and third axis lengths.

    For a 3-D input this only succeeds when the middle axis has length 1.
    """
    return x.reshape((x.shape[0], x.shape[2]))


def reshape_y(y):
    """Flatten ``y`` to 1-D with one entry per element of its first axis."""
    return y.reshape((y.shape[0],))


# Reshape once and reuse the results for fitting, prediction and scoring.
train_inputs = reshape_x(train_x)
train_targets = reshape_y(train_y)
validation_inputs = reshape_x(validation_x)
validation_targets = reshape_y(validation_y)

model = linear_model.LinearRegression()
model.fit(train_inputs, train_targets)
predicted_y = model.predict(validation_inputs)

# Report the model's score on the validation data and the project's MAPE metric.
print("Score: ", model.score(validation_inputs, validation_targets))
print("MAPE: ", common.mean_absolute_percent_error(validation_targets, predicted_y))

Score:  0.36124110204
MAPE:  171.704502435


In [38]:
# Input columns and the column to predict. Note that the target column is
# also listed among the inputs.
features = ["loadsys_resid", "rh_resid", "drybulb_resid", "cwshdr_resid"]
target = ["cwshdr_resid"]
lookback = 10
batch_size = 1

# Build model inputs/targets with the project helper (array layout is decided
# by common.prepare_keras_data -- confirm there).
train_x, train_y = common.prepare_keras_data(train_df, features, target, lookback, batch_size)
validation_x, validation_y = common.prepare_keras_data(validation_df, features, target, lookback, batch_size)


def reshape_x(x):
    """Reshape ``x`` to 2-D, keeping its first and third axis lengths.

    For a 3-D input this only succeeds when the middle axis has length 1.
    """
    return x.reshape((x.shape[0], x.shape[2]))


def reshape_y(y):
    """Flatten ``y`` to 1-D with one entry per element of its first axis."""
    return y.reshape((y.shape[0],))


# Degree-2 polynomial expansion: fitted on the training inputs only, then the
# same fitted transformer is applied to the validation inputs.
poly1 = preprocessing.PolynomialFeatures(2)
train_poly = poly1.fit_transform(reshape_x(train_x))
validation_poly = poly1.transform(reshape_x(validation_x))
validation_targets = reshape_y(validation_y)

model = linear_model.LinearRegression()
model.fit(train_poly, reshape_y(train_y))
predicted_y = model.predict(validation_poly)

# Report the model's score on the validation data and the project's MAPE metric.
print("Score: ", model.score(validation_poly, validation_targets))
print("MAPE: ", common.mean_absolute_percent_error(validation_targets, predicted_y))

Score:  -1.34588443783
MAPE:  263.694760895


In [30]:
# Side-by-side table of the flattened validation targets ("t") and the
# predictions currently held in predicted_y ("y"); the content therefore
# reflects whichever modelling cell was executed most recently.
pd.DataFrame(dict(t=reshape_y(validation_y), y=predicted_y))

Unnamed: 0,t,y
0,-0.026425,0
1,-0.028459,0
2,-0.028904,0
3,-0.029933,0
4,-0.031424,0
5,-0.031708,0
6,-0.033559,0
7,-0.034288,0
8,-0.034670,0
9,-0.034889,0
