In [1]:
import numpy as np
import os, sys
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
import pandas as pd
from matplotlib import pyplot as plt
plt.rcParams['axes.facecolor']='w'
plt.rcParams['savefig.facecolor']='w'
%matplotlib inline 
#from my_functions import *
from modules.preprocessing import *
#from modules.statistics import *
from modules.learning import *
from scipy import stats
from ast import literal_eval
from modules.learning import *
from modules.io import *
from modules.statistics import *

import matplotlib.dates as mdates
import time
import pickle

from timeit import default_timer as timer

from glob import glob
import os


# Test the performance of the soiling regression method

## Import Dask

In [2]:
from dask.distributed import Client
from dask.distributed import wait

## Start Dask

In [3]:
client = Client(n_workers=30,threads_per_worker=1)

## Load Training Data

In [4]:
dates_wash_start = pd.to_datetime(pd.Series(['2013-03-11 00:00:00', '2013-07-10 00:00:00', '2013-08-14 00:00:00', '2013-08-21 00:00:00', '2013-08-26 00:00:00']))
dates_wash_stop = pd.to_datetime(pd.Series(['2013-03-12 00:00:00', '2013-07-11 00:00:00', '2013-08-15 00:00:00', '2013-08-22 00:00:00','2013-08-27 00:00:00']))

In [5]:
filenamesTraining = sorted(glob(os.path.join('/data/data1/synthetic_soiling_data2','training_*.csv')))

In [6]:
%%time
fileArraysTraining = []
scaler = MinMaxScaler()
for fn in filenamesTraining:
    df = pd.read_csv(fn)
    df = df.set_index('timestamp')
    df = df.dropna()
    df.index = pd.to_datetime(df.index)
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
    fileArraysTraining.append(df_scaled)

CPU times: user 17min 17s, sys: 3min 2s, total: 20min 19s
Wall time: 20min 39s


## Extract Rains

In [7]:
%%time
precipitation = []
dates_rain_start = []
dates_rain_stop = []
for i, df in enumerate(fileArraysTraining):
    precipitation.append(pd.concat([pd.Series({min(df.index)-pd.Timedelta('1s'): 0}),df.precipitation, pd.Series({max(df.index)+pd.Timedelta('1s'): 0})]))
    precipitation[i].index = pd.to_datetime(precipitation[i].index)
    df_dates = pd.DataFrame(index = precipitation[i].index)
    df_dates["rain_start"] = precipitation[i][(precipitation[i].shift(-1) > 0) & (precipitation[i] == 0)] # compare current to next
    df_dates["rain_stop"] = precipitation[i][(precipitation[i].shift(1) > 0) & (precipitation[i] == 0)] # compare current to prev
    dates_rain_start.append(pd.Series(df_dates.rain_start.index[df_dates.rain_start.notna()]))
    dates_rain_stop.append(pd.Series(df_dates.rain_stop.index[df_dates.rain_stop.notna()]))




CPU times: user 41min 31s, sys: 1min 39s, total: 43min 11s
Wall time: 41min 48s


## Find change points

In [8]:
n = len(filenamesTraining)
w1 = 10  # window of days to train (before the rain)
w2 = 5 # window of days to validate (before the rain)
w3 = 10 # window of days to test (after the rain)
error_br_column = 5 #0=r_squared, 1=mae, 2=me, 3=mape, 4=mpe, 5=median error
error_ar_column = 5
thrsh = 1
w_train = 30
feats = ['irradiance', 'mod_temp']
target = 'power'
#indices = np.empty(len(scores), dtype=int)
error_names = {0: "r_squared", 1: "MAE", 2: "ME (true-pred)", 3: "MAPE", 4: "MPE (true-pred)", 5: "Median error"}

In [9]:
%%time
changepoint_ids = []

for i, df in enumerate(fileArraysTraining):
    p_changepoints_start = (pd.Series(dates_rain_start[i]).sort_values())
    p_changepoints_stop = (pd.Series(dates_rain_stop[i]).sort_values())
    error_name_br = error_names[error_br_column] 
    error_name_ar = error_names[error_ar_column]
    errors_br = np.empty((len(dates_rain_start[i]), 6))
    errors_ar = np.empty((len(dates_rain_start[i]), 6))
    scores = np.empty((n, len(dates_rain_start[i])))
    
    #compute errors using one model per rain
    errors_br, errors_ar = calc_changepoints_many_models(df, p_changepoints_start, p_changepoints_stop, target, feats, w1, w2, w3 )
    #set threshold on MAPE error before rain
    mask1 = (errors_br[:,3]<= 0.05)
    #compute scores for the remaining
    scores[i] = -(errors_br[:, error_br_column]-errors_ar[:, error_ar_column])/np.abs(errors_br[:, error_ar_column])
    scores[i][(~mask1)] = np.finfo('d').min
    
    #compute indices to the best no_events rains
    indices = np.argsort(-scores[i])[:(scores>thrsh).sum()] 
    changepoint_ids.append(indices) 



CPU times: user 5d 6h 44min 52s, sys: 19d 15h 20min 5s, total: 24d 22h 4min 57s
Wall time: 9h 23min 29s


## Train models after changepoints

In [10]:
%%time
models = []
for i, df in enumerate(fileArraysTraining): 
    ref_points = pd.Index(dates_rain_stop[i][changepoint_ids[i]])
    model, training_error, validation_error = train_on_reference_points(df, w_train, ref_points, feats, target)
    models.append(model)


Training Metrics:
MAE:0.010 
ME(true-pred):-0.000 
MAPE:0.026 
R2: 0.997

Validation Metrics:
MAE:0.010 
ME(true-pred):0.000 
MAPE:0.027 
MPE:0.001 
R2: 0.997

Training Metrics:
MAE:0.010 
ME(true-pred):0.000 
MAPE:0.027 
R2: 0.997

Validation Metrics:
MAE:0.010 
ME(true-pred):0.000 
MAPE:0.027 
MPE:0.000 
R2: 0.997

Training Metrics:
MAE:0.010 
ME(true-pred):0.000 
MAPE:0.027 
R2: 0.997

Validation Metrics:
MAE:0.010 
ME(true-pred):0.000 
MAPE:0.028 
MPE:0.001 
R2: 0.997

Training Metrics:
MAE:0.010 
ME(true-pred):-0.000 
MAPE:0.027 
R2: 0.997

Validation Metrics:
MAE:0.010 
ME(true-pred):0.000 
MAPE:0.027 
MPE:0.000 
R2: 0.997

Training Metrics:
MAE:0.010 
ME(true-pred):0.000 
MAPE:0.026 
R2: 0.997

Validation Metrics:
MAE:0.010 
ME(true-pred):0.000 
MAPE:0.027 
MPE:0.000 
R2: 0.997

Training Metrics:
MAE:0.010 
ME(true-pred):-0.000 
MAPE:0.026 
R2: 0.997

Validation Metrics:
MAE:0.010 
ME(true-pred):0.000 
MAPE:0.027 
MPE:0.001 
R2: 0.997

Training Metrics:
MAE:0.010 
ME(true-pred):

KeyError: "None of [Index(['irradiance', 'mod_temp'], dtype='object')] are in the [columns]"

## Read Testing Data

In [11]:
filenamesTesting = sorted(glob(os.path.join('/data/data1/synthetic_soiling_data2','testing_*.csv')))

In [12]:
%%time
fileArraysTesting = []
scaler = MinMaxScaler()
for fn in filenamesTesting:
    df = pd.read_csv(fn)
    df = df.set_index('timestamp')
    df = df.dropna()
    df.index = pd.to_datetime(df.index)
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
    fileArraysTesting.append(df_scaled)

CPU times: user 14min 28s, sys: 3min 44s, total: 18min 12s
Wall time: 18min 43s


# Test models in new time series

## Sequential Code

In [13]:
def trainModelsNewTimeSeries(df, models, feats, target ):
    y_pred = predict(df, models, feats, target)
    return y_pred

In [14]:
%%time
window = 60
i = 0

total_running_time = 0.0;
running_time = []

while i< len(fileArraysTesting[0])-window:
    start = time.time()
    for j, df in enumerate(fileArraysTesting):
        y_pred = predict(df.iloc[i:i+window], models[0], feats, target)
        
    end = time.time()
    running_time_temp = end - start
    running_time.append(running_time_temp)   
    total_running_time = total_running_time + running_time_temp  
    i = i+window
    
print ("total_running_time = ", total_running_time )

total_running_time =  5478.218723297119
CPU times: user 1h 27min 12s, sys: 9min 19s, total: 1h 36min 32s
Wall time: 1h 31min 18s


## Parallel Batch Code

In [15]:
def trainModelsNewTimeSeriesBatch(batch_data, models, feats, target ):
    results = []
    for batch_temp in batch_data:
        y_pred = predict(batch_temp, models, feats, target)
        results.append(y_pred)
    return results

In [16]:
def parallel_batch_processing(batch_data, models, feats, target):
    running_time = 0.0
    futures = []
    
    start = time.time()
    for batch in batch_data:
        future = client.submit(trainModelsNewTimeSeriesBatch, batch, models, feats, target )
        futures.append(future)
    
    wait(futures, return_when="ALL_COMPLETED") 
    end = time.time()
    futures = []
    
    running_time = end- start
    
    return running_time

In [17]:
%%time
window = 60
i = 0

batch_data_size = 3334
batch_data = []
batch_data_all =[]
counter = 0;
running_time = []
total_running_time = 0.0 

while i< len(fileArraysTesting[0])-window:
    for j, df in enumerate(fileArraysTesting):
        if (counter < batch_data_size):
            batch_data.append(df.iloc[i:i+window])
            counter = counter +1 
                                                                                                    
        else:
            counter = 0
            batch_data_all.append(batch_data)
            batch_data = []
            batch_data.append( df.iloc[i:i+window])
            counter = counter +1
    
    batch_data_all.append(batch_data)
    batch_data =[]
    
    running_time_temp = parallel_batch_processing(batch_data_all, models[0], feats, target)  
    running_time.append(running_time_temp)   
    total_running_time = total_running_time + running_time_temp
    batch_data_all = []

    i = i+window
    
print("total_running_time  = ", total_running_time )

  ([                     irradiance  mod_temp     po ... emp'], 'power')
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


total_running_time  =  1881.9890089035034
CPU times: user 35min 31s, sys: 2min 32s, total: 38min 4s
Wall time: 41min 15s


In [18]:
running_time

[21.22065782546997,
 10.36053466796875,
 16.836101531982422,
 10.411165714263916,
 9.924630403518677,
 9.958894491195679,
 16.688936233520508,
 16.861043691635132,
 10.055057287216187,
 10.10726284980774,
 16.865914583206177,
 16.665753841400146,
 9.962754964828491,
 10.004858255386353,
 16.71683406829834,
 16.89554452896118,
 10.189363241195679,
 9.834361553192139,
 9.943933725357056,
 16.81853699684143,
 16.87835431098938,
 10.039936304092407,
 9.978958368301392,
 16.780816555023193,
 16.89661192893982,
 10.041898727416992,
 10.127279043197632,
 16.63619589805603,
 16.81503200531006,
 10.224900722503662,
 10.031450510025024,
 16.900747776031494,
 17.024035930633545,
 10.226491451263428,
 9.95476746559143,
 16.89436411857605,
 16.88947820663452,
 10.243098974227905,
 10.147677183151245,
 16.86095690727234,
 17.007615566253662,
 10.115966796875,
 9.888191938400269,
 10.135563373565674,
 16.74196767807007,
 16.733318328857422,
 10.122098207473755,
 10.03947639465332,
 16.95120620727539,

# Close Dask

In [19]:
client.close()