In [None]:
import numpy as np
import os, sys
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
import pandas as pd
from matplotlib import pyplot as plt
plt.rcParams['axes.facecolor']='w'
plt.rcParams['savefig.facecolor']='w'
%matplotlib inline 
#from my_functions import *
from modules.preprocessing import *
#from modules.statistics import *
from modules.learning import *
from scipy import stats
from ast import literal_eval
from modules.experiments import *

import matplotlib.dates as mdates
import time
import pickle

from timeit import default_timer as timer

from glob import glob
import os


# Test the performance of the soiling regression method

## Import Dask

In [None]:
from dask.distributed import Client
from dask.distributed import wait

## Start Dask

In [None]:
# Start the Dask client used by the parallel benchmarks below:
# 30 workers, one thread each.
client = Client(
    threads_per_worker=1,
    n_workers=30,
)

## Load Training Data

In [None]:
# Start / stop timestamps of five one-day intervals in 2013
# (presumably panel-washing events, judging by the names -- confirm).
dates_wash_start = pd.to_datetime(pd.Series([
    '2013-03-11 00:00:00',
    '2013-07-10 00:00:00',
    '2013-08-14 00:00:00',
    '2013-08-21 00:00:00',
    '2013-08-26 00:00:00',
]))
dates_wash_stop = pd.to_datetime(pd.Series([
    '2013-03-12 00:00:00',
    '2013-07-11 00:00:00',
    '2013-08-15 00:00:00',
    '2013-08-22 00:00:00',
    '2013-08-27 00:00:00',
]))

In [None]:
# Collect every training CSV and order the paths alphabetically so the
# series are always processed in the same order.
filenamesTraining = glob(os.path.join('/path', 'folder', 'training_*.csv'))
filenamesTraining.sort()

In [None]:
%%time
# Load every training file into a min-max scaled DataFrame indexed by time.
# The single scaler object is re-fitted on each file by fit_transform, so every
# series is scaled with its own per-column min and max.
scaler = MinMaxScaler()
fileArraysTraining = []
for fn in filenamesTraining:
    # Index by the 'timestamp' column and discard rows with any missing value.
    df = pd.read_csv(fn).set_index('timestamp').dropna()
    df.index = pd.to_datetime(df.index)
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df),
        columns=df.columns,
        index=df.index,
    )
    fileArraysTraining.append(df_scaled)

## Extract Rains

In [None]:
%%time
# For every training series, locate the timestamps right before a rain begins
# and right after it ends. Results are kept per series, in file order.
precipitation = []
dates_rain_start = []
dates_rain_stop = []
one_second = pd.Timedelta('1s')
for i, df in enumerate(fileArraysTraining):
    # Pad the precipitation series with a zero reading one second before the
    # first and after the last sample, so that rain present at either end of
    # the data still has a dry neighbour to be detected against.
    leading_zero = pd.Series({min(df.index) - one_second: 0})
    trailing_zero = pd.Series({max(df.index) + one_second: 0})
    rain = pd.concat([leading_zero, df.precipitation, trailing_zero])
    rain.index = pd.to_datetime(rain.index)
    precipitation.append(rain)

    is_dry = rain == 0
    df_dates = pd.DataFrame(index=rain.index)
    # Dry sample whose next sample is wet -> a rain starts here.
    df_dates["rain_start"] = rain[is_dry & (rain.shift(-1) > 0)]
    # Dry sample whose previous sample was wet -> a rain has just stopped.
    df_dates["rain_stop"] = rain[is_dry & (rain.shift(1) > 0)]

    # Rows that did not match stay NaN after alignment; keep the matching timestamps.
    dates_rain_start.append(pd.Series(df_dates.index[df_dates["rain_start"].notna()]))
    dates_rain_stop.append(pd.Series(df_dates.index[df_dates["rain_stop"].notna()]))


## Find change points

In [None]:
# --- Parameters of the change-point search -------------------------------
n = len(filenamesTraining)  # number of training series

# Window lengths, in days, around each rain event
w1 = 10  # training window (before the rain)
w2 = 5   # validation window (before the rain)
w3 = 10  # test window (after the rain)

# Meaning of each column in the error arrays
error_names = {
    0: "r_squared",
    1: "MAE",
    2: "ME (true-pred)",
    3: "MAPE",
    4: "MPE (true-pred)",
    5: "Median error",
}
# Error columns used to score a rain: before-rain and after-rain respectively
error_br_column = 5
error_ar_column = 5

thrsh = 1     # a rain is kept as a change point when its score exceeds this
w_train = 30  # passed to train_on_reference_points (presumably a window in days -- confirm)

# Model inputs and output
feats = ['irradiance', 'mod_temp']
target = 'power'

In [None]:
%%time
# Select, for every training series, the rain events that count as change points.
#
# errors_at_rains2 returns two 2-D arrays with one row per rain and one column
# per metric (see `error_names`): errors before the rain (errors_br) and after
# it (errors_ar). A rain's score is the relative increase of the chosen error
# after the rain, (ar - br) / |br|. Rains whose before-rain MAPE exceeds 5% are
# disqualified. The rains with score > thrsh are kept, best score first.
#
# Fixes over the previous version:
#   * `scores` was re-created on every iteration as an uninitialised
#     np.empty((n, n_rains)) matrix of which only row i was filled, and the
#     number of selected rains was counted over the whole matrix, so it
#     depended on garbage memory. Scores are now one 1-D array per series and
#     the count uses that series only.
#   * the denominator indexed errors_br with the after-rain column; it now uses
#     error_br_column (same result while both columns are equal).
#   * dead pre-allocations of errors_br / errors_ar were removed.
changepoint_ids = []
scores = []  # scores[i] holds the per-rain scores of series i

# Names of the selected metrics; loop-invariant, kept for reporting.
error_name_br = error_names[error_br_column]
error_name_ar = error_names[error_ar_column]

for i, df in enumerate(fileArraysTraining):
    p_changepoints_start = pd.Series(dates_rain_start[i]).sort_values()
    p_changepoints_stop = pd.Series(dates_rain_stop[i]).sort_values()

    # Compute errors using one model per rain.
    errors_br, errors_ar = errors_at_rains2(
        df, p_changepoints_start, p_changepoints_stop, target, feats, w1, w2, w3
    )

    # Keep only rains whose before-rain MAPE (column 3) is at most 5%.
    mask1 = errors_br[:, 3] <= 0.05

    series_scores = np.asarray(
        -(errors_br[:, error_br_column] - errors_ar[:, error_ar_column])
        / np.abs(errors_br[:, error_br_column]),
        dtype=float,
    )
    # Disqualified rains get the lowest possible score so they sort last.
    series_scores[~mask1] = np.finfo('d').min
    scores.append(series_scores)

    # Indices of the rains above the threshold, highest score first.
    n_selected = int((series_scores > thrsh).sum())
    indices = np.argsort(-series_scores)[:n_selected]
    changepoint_ids.append(indices)

## Train models after changepoints

In [None]:
%%time
# Fit one model per training series, using the rain-stop timestamps of the
# selected change points as reference points.
models = []
for df, rain_stops, picked in zip(fileArraysTraining, dates_rain_stop, changepoint_ids):
    ref_points = pd.Index(rain_stops[picked])
    # Only the model is kept; the two errors are overwritten on every pass.
    model, training_error, validation_error = train_on_reference_points(
        df, w_train, ref_points, feats, target
    )
    models.append(model)


## Read Testing Data

In [None]:
# Collect every testing CSV and order the paths alphabetically so the
# series are always processed in the same order.
filenamesTesting = glob(os.path.join('/path', 'folder', 'testing_*.csv'))
filenamesTesting.sort()

In [None]:
%%time
# Load every testing file into a min-max scaled DataFrame indexed by time.
# A fresh scaler is created here and re-fitted on each test file by
# fit_transform, i.e. the test series are scaled with their own min and max.
scaler = MinMaxScaler()
fileArraysTesting = []
for fn in filenamesTesting:
    # Index by the 'timestamp' column and discard rows with any missing value.
    df = pd.read_csv(fn).set_index('timestamp').dropna()
    df.index = pd.to_datetime(df.index)
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df),
        columns=df.columns,
        index=df.index,
    )
    fileArraysTesting.append(df_scaled)

# Test models in new time series

## Sequential Code

In [None]:
def trainModelsNewTimeSeries(df, models, feats, target ):
    """Return the predictions of ``models`` for ``df``.

    This function only forwards its arguments, unchanged and in the same
    order, to ``predict`` and returns whatever ``predict`` returns.
    """
    return predict(df, models, feats, target)

In [None]:
%%time
# Sequential baseline: step through the test data in consecutive 60-row
# windows and, at each position, run the first trained model on that window
# of every test series, one after the other. Each position is timed.
window = 60
i = 0

running_time = []          # seconds spent on each window position
total_running_time = 0.0

# The number of window positions is derived from the first test series.
n_rows = len(fileArraysTesting[0])
while i < n_rows - window:
    start = time.time()
    for df in fileArraysTesting:
        # Predictions are computed only for timing; each one overwrites the last.
        y_pred = predict(df.iloc[i:i + window], models[0], feats, target)
    running_time_temp = time.time() - start

    running_time.append(running_time_temp)
    total_running_time += running_time_temp
    i += window

print ("total_running_time = ", total_running_time )

## Parallel Batch Code

In [None]:
def trainModelsNewTimeSeriesBatch(batch_data, models, feats, target ):
    """Run ``predict`` on every element of a batch.

    Each element of ``batch_data`` is passed to ``predict`` together with
    ``models``, ``feats`` and ``target``. The predictions are returned as a
    list in the same order as the batch.
    """
    return [predict(batch_temp, models, feats, target) for batch_temp in batch_data]

In [None]:
def parallel_batch_processing(batch_data, models, feats, target):
    """Submit one Dask task per batch and time how long they all take.

    Every element of ``batch_data`` is sent as a whole to
    ``trainModelsNewTimeSeriesBatch`` through the module-level ``client``;
    ``models``, ``feats`` and ``target`` are forwarded unchanged.

    Returns the wall-clock seconds from just before the first submission
    until all tasks have completed. The task results are not gathered.
    """
    started = time.time()
    futures = [
        client.submit(trainModelsNewTimeSeriesBatch, batch, models, feats, target )
        for batch in batch_data
    ]
    # Block until every submitted task has finished.
    wait(futures, return_when="ALL_COMPLETED")
    return time.time() - started

In [None]:
%%time
# Batched parallel benchmark: at each 60-row window position, take that window
# from every test series, group the windows into batches of at most
# `batch_data_size`, and process the batches as Dask tasks with the first
# trained model. parallel_batch_processing returns the time spent per position.
#
# Fix over the previous version: batches were built with a running `counter`
# that was not reset when the last partial batch of a position was flushed.
# From the second position on, batch boundaries therefore drifted, giving
# uneven batches, extra tasks and possibly an empty batch. The windows are now
# sliced into fixed-size chunks, so every position is batched identically.
window = 60
i = 0

batch_data_size = 3334     # maximum number of series windows per Dask task
batch_data = []
batch_data_all = []
running_time = []          # seconds spent on each window position
total_running_time = 0.0

while i < len(fileArraysTesting[0]) - window:
    # Current window of every test series, in file order.
    batch_data = [df.iloc[i:i + window] for df in fileArraysTesting]

    # Consecutive chunks of at most batch_data_size windows; only the last
    # chunk may be shorter.
    batch_data_all = [
        batch_data[k:k + batch_data_size]
        for k in range(0, len(batch_data), batch_data_size)
    ]

    running_time_temp = parallel_batch_processing(batch_data_all, models[0], feats, target)
    running_time.append(running_time_temp)
    total_running_time = total_running_time + running_time_temp

    # Drop the references to this position's slices before moving on.
    batch_data = []
    batch_data_all = []

    i = i + window

print("total_running_time  = ", total_running_time )

In [None]:
# Display the per-window-position timings (seconds) collected by the batched run above.
running_time

## Parallel Code without Batch

In [None]:
def trainModelsNewTimeSeriesParallel(df, models, feats, target ):
    """Return the predictions of ``models`` for ``df``.

    Used as the task function submitted to Dask in the unbatched parallel
    run. It only forwards its arguments, unchanged and in the same order,
    to ``predict``.
    """
    return predict(df, models, feats, target)

In [None]:
%%time
# Unbatched parallel benchmark: at each 60-row window position, submit one
# Dask task per test series (always with the first trained model), wait for
# all of them and record the elapsed wall-clock time.
window = 60
i = 0

futures = []
running_time = []          # seconds spent on each window position
total_running_time = 0.0

# The number of window positions is derived from the first test series.
n_rows = len(fileArraysTesting[0])
while i < n_rows - window:
    start = time.time()
    futures = [
        client.submit(trainModelsNewTimeSeriesParallel, df.iloc[i:i + window], models[0], feats, target)
        for df in fileArraysTesting
    ]
    # Block until every task of this position has finished.
    wait(futures, return_when="ALL_COMPLETED")
    running_time_temp = time.time() - start

    running_time.append(running_time_temp)
    total_running_time += running_time_temp
    # Drop the references to the finished futures before the next position.
    futures = []
    i += window

print("total_running_time = " , total_running_time)  

# Close Dask

In [None]:
# Shut down the Dask client created at the top of the notebook.
client.close()