In [10]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy import stats
from tqdm import tqdm
import os
import sys
import pickle

import warnings
warnings.filterwarnings('ignore')

In [5]:
# --- Paths ------------------------------------------------------------------
# Raw JSON dumps (per-sensor files plus weather.json) are read from the source
# directory; resampled CSV files are written to the extracted directory.
path_data_source = 'smart_heating_dataset/'
path_data_extracted = "extracted_dataset/"
path_data_raw = "current_dataset/"
data_files = os.listdir(path_data_source)

# Create each output directory on its own. With both os.mkdir calls inside a
# single try block, an already existing first directory raised FileExistsError
# and the second directory was never created.
for out_dir in (path_data_extracted, path_data_raw):
    os.makedirs(out_dir, exist_ok=True)

# --- Sensor file inventory --------------------------------------------------
# Every file except weather.json is split by position: the first six
# characters identify the tube, the rest (minus the last five characters,
# presumably the ".json" suffix) names the measured parameter.
sensor_files = [name for name in data_files if name != "weather.json"]
nums = sorted({name[:6] for name in sensor_files})
params = sorted({name[6:-5] for name in sensor_files})

# --- Weather ----------------------------------------------------------------
# The 'time' column holds epoch milliseconds; it becomes the index and is then
# removed from the columns.
weather = pd.read_json(path_data_source+"weather.json", typ='frame', date_unit='s')
weather = weather.set_index(pd.to_datetime(weather['time'], unit='ms')).drop(columns=['time'])

# --- Common time grid -------------------------------------------------------
# One grid point per minute between t1 and t2 (inclusive), shifted by three
# hours. NOTE(review): the shift presumably converts to a UTC+3 local time --
# confirm against the data source.
t1, t2 = pd.Timestamp('2017-11-30 21:00:00').timestamp(), pd.Timestamp('2018-03-15 03:00:00').timestamp()
# Timestamp.timestamp() returns floats, so the point count must be converted
# explicitly: np.linspace rejects a non-integer `num`.
n_grid_points = int((t2 - t1) // 60) + 1
times = pd.to_datetime(np.linspace(t1+3*60*60, t2+3*60*60, n_grid_points), unit='s')



In [11]:
# Resample every sensor series of every tube onto the common grid `times` and
# write one CSV per tube. Tubes whose CSV already exists are skipped, so the
# cell can be re-run after an interruption.
already_extracted = set(os.listdir(path_data_extracted))  # listed once, not per tube
for num in [i for i in nums if i[:-1] + '.csv' not in already_extracted]:
    print(num)
    tube = pd.DataFrame()
    for param in params:
        p = pd.read_json(path_data_source+num+param+".json", typ='series')
        # reindex() yields NaN for grid points without a reading; plain
        # p[times] raises KeyError for missing labels on current pandas.
        pf = p.reindex(times)
        p = pf.dropna()
        if p.empty:
            # Nothing to interpolate from: keep the all-NaN column instead of
            # failing with an IndexError on p.iloc[0].
            print('no {} samples for {} inside the time window; column left empty'.format(param, num))
            tube[param] = pf.iloc[180:-180]
            continue

        first_id, last_id = p.index[0], p.index[-1]
        # Pad both ends of the grid with the nearest known reading.
        pf.loc[:first_id] = p.iloc[0]
        pf.loc[last_id:] = p.iloc[-1]

        # Linear interpolation between each pair of consecutive readings.
        # Label slices are inclusive on both ends, so each segment rewrites
        # its two end points with their (rounded) measured values.
        prev = first_id
        for curr in tqdm(p.index[1:]):
            n_points = len(pf.loc[prev:curr])
            pf.loc[prev:curr] = np.round(np.linspace(p.loc[prev], p.loc[curr], n_points), 4)
            prev = curr
        # Drop the first and last 180 grid points (positional trim).
        tube[param] = pf.iloc[180:-180]
    tube.to_csv(path_data_extracted+num[:-1]+'.csv', index_label='timestamp')

In [12]:
# Resample every weather column onto the common grid `times` and store the
# result as weather.csv in the extracted-data directory.
weather_new = pd.DataFrame()
for param in weather.columns:
    # reindex() yields NaN for grid points without an observation; plain
    # p[times] raises KeyError for missing labels on current pandas.
    pf = weather[param].reindex(times)
    p = pf.dropna()
    if p.empty:
        # Nothing to interpolate from: keep the all-NaN column instead of
        # failing with an IndexError on p.iloc[0].
        print('no {} samples inside the time window; column left empty'.format(param))
        weather_new[param] = pf.iloc[180:-180]
        continue

    first_id, last_id = p.index[0], p.index[-1]
    # Pad both ends of the grid with the nearest known observation. The
    # trailing pad was missing here (last_id was computed but never used),
    # which left NaN after the final observation, unlike the per-tube cell.
    pf.loc[:first_id] = p.iloc[0]
    pf.loc[last_id:] = p.iloc[-1]

    # Linear interpolation between each pair of consecutive observations.
    # NOTE(review): values are rounded to whole numbers here, whereas the
    # per-tube cell keeps four decimals -- confirm this is intended.
    prev = first_id
    for curr in tqdm(p.index[1:]):
        n_points = len(pf.loc[prev:curr])
        pf.loc[prev:curr] = np.round(np.linspace(p.loc[prev], p.loc[curr], n_points))
        prev = curr
    # Drop the first and last 180 grid points (positional trim).
    weather_new[param] = pf.iloc[180:-180]
weather_new.to_csv(path_data_extracted+'weather.csv', index_label='timestamp')

100%|██████████| 2456/2456 [00:00<00:00, 2907.34it/s]
100%|██████████| 2456/2456 [00:00<00:00, 2962.42it/s]
100%|██████████| 2456/2456 [00:00<00:00, 2925.72it/s]


In [7]:
class Predictor:
    """Scores incoming sensor readings of each tube against its own history.

    For every tube CSV found in the data directory the constructor stores the
    `describe()` table (including the 5/16/50/84/95 % percentiles) of the most
    recent rows. `get_request` then grades one reading per tube and parameter
    and returns the per-tube sum of the grades.

    Attributes:
        counter: Integer initialised to 0; not modified by this class.
        columns: Column index of the tube CSV files (the parameters).
        tube_names: Tube CSV file names without the ".csv" extension.
        stat_data: dict mapping tube name -> DataFrame returned by
            `describe()` (rows are statistics, columns are parameters).
            This replaces the former pd.Panel, which pandas no longer ships.
        no_response: DataFrame (tube names x parameters) counting consecutive
            requests in which a reading was NaN.
        temp_corr: Table read from 'correlation.csv' (current working
            directory), indexed by its 'temperature' column.
    """

    # File in the data directory that is not a tube (written by the weather
    # extraction cell of this notebook).
    _WEATHER_FILE = 'weather.csv'
    # Number of trailing rows of each tube CSV used as history.
    # NOTE(review): 30*24*60 is presumably 30 days of 1-minute samples.
    _HISTORY_ROWS = 30*24*60
    # A sensor is graded 1 once it has been silent for more than this many
    # consecutive requests.
    _MAX_SILENT_REQUESTS = 60
    # Temperature parameters and the temp_corr column looked up for each.
    _TEMP_REFERENCE = {"temp_supply": "temp_inside_1", "temp_return": "temp_outside_3"}

    def __init__(self, path_to_data):
        """Load per-tube history statistics.

        Args:
            path_to_data: Directory holding one CSV per tube (with a
                'timestamp' column) and optionally weather.csv. A trailing
                path separator is accepted but not required.

        Raises:
            ValueError: If the directory contains no tube CSV file.
        """
        self.counter = 0
        # Select tube files by name rather than dropping "whatever sorts
        # last", which only excluded weather.csv by coincidence of ordering.
        extracted = sorted(
            name for name in os.listdir(path_to_data)
            if name.endswith('.csv') and name != self._WEATHER_FILE
        )
        if not extracted:
            raise ValueError('no tube CSV files found in {!r}'.format(path_to_data))

        # Strip the extension; the previous slice i[-4:] kept only ".csv".
        self.tube_names = [os.path.splitext(name)[0] for name in extracted]

        self.stat_data = {}
        for tube_name, file_name in zip(self.tube_names, extracted):
            tube = pd.read_csv(os.path.join(path_to_data, file_name), index_col='timestamp')
            tube = tube.iloc[-self._HISTORY_ROWS:]
            # Statistics must be computed from the frame that was just read
            # (they were previously requested before `tube` was assigned).
            self.stat_data[tube_name] = tube.describe(percentiles=[.05, .16, 0.5, 0.84, 0.95])

        # As before, the parameter list is taken from the last tube read.
        self.columns = tube.columns
        # Indexed by tube name so that it lines up with the lookups performed
        # in get_request.
        self.no_response = pd.DataFrame(
            np.zeros((len(self.tube_names), len(self.columns))),
            index=self.tube_names, columns=self.columns,
        )
        self.temp_corr = pd.read_csv('correlation.csv', index_col='temperature')

    def _score_reading(self, tube_name, param, item, environment):
        """Grade one non-NaN reading: 0 (normal), 1 (unusual) or 2 (extreme)."""
        if param in self._TEMP_REFERENCE:
            # The reference value is looked up (raising KeyError when
            # int(environment) has no row in temp_corr) but is not compared
            # with `item`, so these parameters always grade 0.
            # TODO(review): looks unfinished -- confirm the intended check.
            _expected = self.temp_corr.loc[int(environment), self._TEMP_REFERENCE[param]]
            return 0

        limits = self.stat_data[tube_name][param]
        # Inclusive bounds: with strict comparisons a reading exactly equal
        # to the 5 % or 95 % percentile matched no branch at all.
        if limits.loc["16%"] <= item <= limits.loc["84%"]:
            return 0
        if limits.loc["5%"] <= item <= limits.loc["95%"]:
            return 1
        return 2

    def get_request(self, input_data, environment):
        """Grade one set of readings and return the per-tube total.

        Args:
            input_data: DataFrame indexed by tube name (see `tube_names`)
                with one column per parameter in `columns`; NaN marks a
                missing reading.
            environment: Value converted with int() and used as a row label
                of `temp_corr` -- presumably the outdoor temperature.

        Returns:
            1-D numpy array, one entry per tube in `tube_names` order: the
            sum of that tube's grades. A missing reading grades 1 once it
            has been missing for more than 60 consecutive requests, else 0.

        Side effects:
            Updates `no_response`: incremented for NaN readings, reset to 0
            for every reading that is present.
        """
        params = self.columns
        tube_names = self.tube_names
        return_data = pd.DataFrame(np.zeros((len(tube_names), len(params))), index=tube_names, columns=params)
        for tube_name in tube_names:
            for param in params:
                item = input_data.loc[tube_name, param]
                if np.isnan(item):
                    # .loc addresses a single cell; plain [] with a tuple is
                    # a column lookup and raised KeyError.
                    self.no_response.loc[tube_name, param] += 1
                    silent_too_long = self.no_response.loc[tube_name, param] > self._MAX_SILENT_REQUESTS
                    return_data.loc[tube_name, param] = 1 if silent_too_long else 0
                else:
                    return_data.loc[tube_name, param] = self._score_reading(tube_name, param, item, environment)
                    self.no_response.loc[tube_name, param] = 0
        return np.array(return_data).sum(axis=1)

In [14]:
# Serialise the Predictor class object into predictor.pickle.
# NOTE(review): pickle stores a class by reference (module and qualified
# name), not its code or any fitted state, so loading this file requires
# Predictor to be importable -- confirm an instance was not intended.
with open('predictor.pickle', mode='wb') as f:
    f.write(pickle.dumps(Predictor))