In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns
import tensorflow as tf
import math

from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras import initializers

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
import xgboost as xgb
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import import_ipynb
from DataProcessing import Data
from xgboost_impl import Xgboost


#physical_devices = tf.config.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [4]:
# Target tables for locations A, B and C.
train_a, train_b, train_c = (
    pd.read_parquet('dataset/A/train_targets.parquet'),
    pd.read_parquet('dataset/B/train_targets.parquet'),
    pd.read_parquet('dataset/C/train_targets.parquet'),
)

# Training features from the X_train_observed files.
X_train_observed_a, X_train_observed_b, X_train_observed_c = (
    pd.read_parquet('dataset/A/X_train_observed.parquet'),
    pd.read_parquet('dataset/B/X_train_observed.parquet'),
    pd.read_parquet('dataset/C/X_train_observed.parquet'),
)

# Training features from the X_train_estimated files.
X_train_estimated_a, X_train_estimated_b, X_train_estimated_c = (
    pd.read_parquet('dataset/A/X_train_estimated.parquet'),
    pd.read_parquet('dataset/B/X_train_estimated.parquet'),
    pd.read_parquet('dataset/C/X_train_estimated.parquet'),
)

# Test features from the X_test_estimated files.
X_test_estimated_a, X_test_estimated_b, X_test_estimated_c = (
    pd.read_parquet('dataset/A/X_test_estimated.parquet'),
    pd.read_parquet('dataset/B/X_test_estimated.parquet'),
    pd.read_parquet('dataset/C/X_test_estimated.parquet'),
)

In [5]:
# Bundle the raw frames into the project's Data helper. Later cells read the
# per-location splits from it (e.g. data.A.train_x, data.A.val_y, data.A.test_x).
data = Data(
    train_a, train_b, train_c,
    X_train_observed_a, X_train_observed_b, X_train_observed_c,
    X_train_estimated_a, X_train_estimated_b, X_train_estimated_c,
    X_test_estimated_a, X_test_estimated_b, X_test_estimated_c,
)

In [None]:
# CatBoost regressor for location A, trained on the GPU with MAE as the
# evaluation metric and a fixed seed.
# NOTE(review): CatBoostRegressor is not imported anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel unless the name is
# provided elsewhere (e.g. `from catboost import CatBoostRegressor`).
# NOTE(review): `model_a` is rebound to an xgb.XGBRegressor in a later cell.
model_a = CatBoostRegressor(iterations=1000,
                           depth=12,
                           task_type="GPU",
                           devices='0:1',
                           eval_metric="MAE",
                           random_seed=42)

# Fit against the validation split; training stops after 50 rounds without
# improvement and the best iteration is kept (use_best_model=True).
model_a.fit(data.A.train_x, data.A.train_y,
            eval_set=(data.A.val_x, data.A.val_y),
            early_stopping_rounds=50,
            use_best_model=True)

In [None]:
# Hyperopt search space for the CatBoost objective: the objective reads the
# 'depth' and 'learning_rate' keys.
# NOTE(review): `objective` is defined in the cell BELOW this one, and a second
# `objective` (for XGBoost) is defined further down. On a fresh kernel this
# cell fails with NameError; after running the whole notebook it picks up
# whichever definition was executed last.
space={
        'depth': hp.quniform("depth", 3, 12, 1),
        'learning_rate': hp.uniform ('learning_rate', 1e-3, 0.1),
    }


# Trials collects the result of every evaluation performed by fmin.
trials = Trials()

# Minimise the objective's 'loss' with TPE over 10 evaluations.
best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 10,
                        trials = trials)

print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
def objective(space):
    """Hyperopt objective: validation MAE of a CatBoost model on location C.

    Parameters
    ----------
    space : dict
        Sampled hyperparameters with keys ``'depth'`` and ``'learning_rate'``.
        ``'depth'`` may be a float (``hp.quniform`` yields e.g. ``6.0``); it is
        converted to an int before being handed to CatBoost.

    Returns
    -------
    dict
        ``{'loss': <validation MAE>, 'status': STATUS_OK}`` as expected by
        ``hyperopt.fmin``.

    Notes
    -----
    Reads the notebook-level ``data`` object and is hard-wired to ``data.C``.
    A later cell defines another function with the same name for XGBoost;
    whichever definition ran last is the one ``fmin`` will call.
    NOTE(review): ``CatBoostRegressor`` is not imported in the visible notebook.
    """
    building = data.C

    model = CatBoostRegressor(iterations=1000,
                              # quniform samples floats; tree depth must be integral
                              # (mirrors the round() used by the XGBoost objective).
                              depth=int(space['depth']),
                              learning_rate=space['learning_rate'],
                              task_type="GPU",
                              devices='0:1',
                              eval_metric="MAE",
                              random_seed=42,
                              silent=True)

    # Early stopping on the validation split; keep the best iteration.
    model.fit(building.train_x, building.train_y,
              eval_set=(building.val_x, building.val_y),
              early_stopping_rounds=50,
              use_best_model=True)

    val_pred = model.predict(building.val_x)

    # scikit-learn convention is (y_true, y_pred); MAE is symmetric so the
    # value is unchanged.
    mae_val = mean_absolute_error(building.val_y, val_pred)

    return {'loss': mae_val, 'status': STATUS_OK}

In [None]:
# Build the project's Xgboost wrapper around `data` and display its `model_a`
# attribute.
# NOTE(review): Xgboost comes from the local xgboost_impl notebook; what it
# does with `data` is not visible here.
xg = Xgboost(data)

xg.model_a

In [None]:
def modelfit(alg, xs, ys, val_xs, val_ys, useTrainCV=True, cv_folds=5, early_stopping_rounds=10):
    """Fit an XGBoost sklearn-style estimator, print its MAE and plot feature importances.

    Parameters
    ----------
    alg : estimator exposing get_xgb_params / get_params / set_params / fit /
        predict / get_booster (e.g. ``xgb.XGBRegressor``).
    xs, ys : training features and targets.
    val_xs, val_ys : validation features and targets.
    useTrainCV : when True, run ``xgb.cv`` first and set ``n_estimators`` to the
        number of rows in the CV result.
    cv_folds : number of folds passed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv`` only;
        it is not forwarded to ``alg.fit``.

    Returns
    -------
    None. Side effects: mutates ``alg`` (fit, possibly ``n_estimators``), prints
    ``train_mae val_mae`` and draws a bar chart on the current matplotlib figure.
    """
    if useTrainCV:
        cv_history = xgb.cv(
            alg.get_xgb_params(),
            xgb.DMatrix(xs, label=ys),
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=True,
        )
        # One row per boosting round kept by the cross-validation run.
        alg.set_params(n_estimators=cv_history.shape[0])

    # Track both splits during training so the log shows train and validation metrics.
    alg.fit(xs, ys, eval_set=[(xs, ys), (val_xs, val_ys)], verbose=True)

    train_mae = mean_absolute_error(alg.predict(xs), ys)
    val_mae = mean_absolute_error(alg.predict(val_xs), val_ys)
    print(train_mae, val_mae)

    importance = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    importance.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [None]:
# Force a garbage-collection pass (presumably to release memory before the
# XGBoost fits in the following cells).
import gc
gc.collect()

In [None]:
# XGBoost regressor for location A with fixed hyperparameters.
# NOTE: this rebinds `model_a`, replacing the CatBoost model from the earlier cell.
model_a = xgb.XGBRegressor(
    device='cuda',
    learning_rate=0.05,
    n_estimators=1693,
    max_depth=16,
    min_child_weight=9,
    gamma=4.709926652039647,
    subsample=0.5746022561573897,
    colsample_bytree=0.925119931399705,
    seed=42,
    eval_metric='mae',
    booster='gbtree',
    reg_alpha=77.7952642777926,
    reg_lambda=102.6220459955603,
)

# Fit directly (no xgb.cv round selection); modelfit prints train/validation MAE
# and plots feature importances.
modelfit(
    model_a,
    data.A.train_x, data.A.train_y,
    data.A.val_x, data.A.val_y,
    useTrainCV=False,
)

In [None]:
# XGBoost regressor for location B with fixed hyperparameters.
model_b = xgb.XGBRegressor(
    device='cuda',
    learning_rate=0.05,
    n_estimators=2880,
    max_depth=10,
    min_child_weight=2,
    gamma=6.9462927163070525,
    subsample=0.5425452253269976,
    colsample_bytree=0.8615770908405836,
    seed=42,
    eval_metric='mae',
    booster='gbtree',
    reg_alpha=39.56391755892025,
    reg_lambda=165.13746485969003,
)

# Fit directly (no xgb.cv round selection); modelfit prints train/validation MAE
# and plots feature importances.
modelfit(
    model_b,
    data.B.train_x, data.B.train_y,
    data.B.val_x, data.B.val_y,
    useTrainCV=False,
)

In [None]:
# XGBoost regressor for location C with fixed hyperparameters.
model_c = xgb.XGBRegressor(
    device='cuda',
    learning_rate=0.05,
    n_estimators=2049,
    max_depth=11,
    min_child_weight=6,
    gamma=2.700424640722136,
    subsample=0.7625820679319437,
    colsample_bytree=0.6696305568496206,
    seed=42,
    eval_metric='mae',
    booster='gbtree',
    reg_alpha=146.95411105137276,
    reg_lambda=125.49465203052867,
)

# Fit directly (no xgb.cv round selection); modelfit prints train/validation MAE
# and plots feature importances.
modelfit(
    model_c,
    data.C.train_x, data.C.train_y,
    data.C.val_x, data.C.val_y,
    useTrainCV=False,
)

In [None]:
# Predictions of the fitted XGBoost models on the same splits they were
# trained/validated on (data.<loc>.train_x / val_x) plus the test features.
# The previous version referenced train_data_* / val_data_* (never defined) and
# test_* / train_*_y / val_*_y, which a later cell binds to 3-D LSTM windows.
# np.absolute folds negative predictions to positive values.
# NOTE(review): clipping at zero may be the more natural choice; left unchanged
# to keep the submitted values identical.
preds_a_train = np.absolute(model_a.predict(data.A.train_x))
preds_a_val = np.absolute(model_a.predict(data.A.val_x))
preds_a_test = np.absolute(model_a.predict(data.A.test_x))

preds_b_train = np.absolute(model_b.predict(data.B.train_x))
preds_b_val = np.absolute(model_b.predict(data.B.val_x))
preds_b_test = np.absolute(model_b.predict(data.B.test_x))

preds_c_train = np.absolute(model_c.predict(data.C.train_x))
preds_c_val = np.absolute(model_c.predict(data.C.val_x))
preds_c_test = np.absolute(model_c.predict(data.C.test_x))

fig, axs = plt.subplots(7, figsize=(10, 15))

# One panel per (location, split): actual targets in blue, predictions in red.
xgb_panels = [
    ("A - train", data.A.train_y, preds_a_train),
    ("A - validation", data.A.val_y, preds_a_val),
    ("B - train", data.B.train_y, preds_b_train),
    ("B - validation", data.B.val_y, preds_b_val),
    ("C - train", data.C.train_y, preds_c_train),
    ("C - validation", data.C.val_y, preds_c_val),
]
for ax, (panel_title, actual, predicted) in zip(axs, xgb_panels):
    # np.asarray drops any pandas index so both curves share a positional x-axis.
    ax.plot(np.asarray(actual), color="blue", label="actual")
    ax.plot(predicted, color="red", alpha=0.5, label="predicted")
    ax.set_title(panel_title)
    ax.legend(loc="upper right")

# Last panel: test-set predictions of all three locations.
axs[6].plot(preds_a_test, color="blue", label="A")
axs[6].plot(preds_b_test, color="green", label="B")
axs[6].plot(preds_c_test, color="red", label="C")
axs[6].set_title("Test predictions")
axs[6].legend(loc="upper right")

fig.tight_layout()


In [None]:
def objective(space):
    """Hyperopt objective: validation MAE of an XGBoost regressor on location B.

    ``space`` supplies 'max_depth' (rounded to an int), 'min_child_weight',
    'gamma', 'subsample', 'colsample_bytree', 'reg_lambda' and 'reg_alpha'.
    The remaining settings are fixed (learning rate 0.2, 500 estimators, early
    stopping after 10 rounds).

    Returns ``{'loss': <validation MAE>, 'status': STATUS_OK}``.

    Note: this definition shadows the CatBoost ``objective`` defined in an
    earlier cell, and it reads the notebook-level ``data`` object (``data.B``).
    """
    building = data.B

    sampled_params = {
        'max_depth': round(space['max_depth']),
        'min_child_weight': space['min_child_weight'],
        'gamma': space['gamma'],
        'subsample': space['subsample'],
        'colsample_bytree': space['colsample_bytree'],
        'reg_lambda': space['reg_lambda'],
        'reg_alpha': space['reg_alpha'],
    }

    alg = xgb.XGBRegressor(
        device='cuda',
        learning_rate=0.2,
        n_estimators=500,
        nthread=4,
        seed=42,
        eval_metric='mae',
        booster='gbtree',
        early_stopping_rounds=10,
        **sampled_params,
    )

    # Evaluate on both splits while fitting.
    eval_sets = [
        (building.train_x, building.train_y),
        (building.val_x, building.val_y),
    ]
    alg.fit(building.train_x, building.train_y, eval_set=eval_sets, verbose=False)

    val_mae = mean_absolute_error(alg.predict(building.val_x), building.val_y)
    return {'loss': val_mae, 'status': STATUS_OK}

In [None]:
# Hyperopt search space for the XGBoost objective defined in the previous cell;
# the keys match the entries that objective reads from `space`.
# The quniform entries are sampled as floats (the objective rounds max_depth).
space={
        'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 0,9),
        'reg_alpha' : hp.uniform('reg_alpha', 0,180),
        'reg_lambda' : hp.uniform('reg_lambda', 0,180),
        'subsample' : hp.uniform('subsample', 0.5,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
    }


# Trials collects the result of every evaluation performed by fmin.
trials = Trials()

# Minimise the objective's 'loss' with TPE; 1000 evaluations, each of which
# fits a model, so this cell is expensive to re-run.
best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 1000,
                        trials = trials)

print("The best hyperparameters are : ","\n")
print(best_hyperparams)

# Result recorded from an earlier run:
#{'colsample_bytree': 0.8563576286836063, 'gamma': 2.3530939377340916, 'max_depth': 17.0, 'min_child_weight': 9.0, 'reg_alpha': 5.362099435153887, 'reg_lambda': 64.39374187968356, 'subsample': 0.6745835850987192}

In [6]:
# Upper bound on training epochs; early stopping usually ends training sooner.
MAX_EPOCHS = 30

def compile_and_fit(model, train_x, train_y, val_x, val_y, patience=3):
    """Compile a Keras model (MSE loss, Adam, MAE metric) and fit it with early stopping.

    Parameters
    ----------
    model : tf.keras model to compile and train (modified in place).
    train_x, train_y : training inputs and targets.
    val_x, val_y : validation inputs and targets, used for ``val_loss``.
    patience : int, default 3
        Number of epochs without ``val_loss`` improvement before training
        stops. (Previously this argument was accepted but ignored.)

    Returns
    -------
    The ``History`` object returned by ``model.fit``.
    """
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=patience,
                                                      mode='min',
                                                      # roll back to the best epoch when stopping
                                                      restore_best_weights=True)

    model.compile(loss=tf.keras.losses.MeanSquaredError(),
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=[tf.keras.metrics.MeanAbsoluteError()])

    history = model.fit(x=train_x, y=train_y, epochs=MAX_EPOCHS,
                        validation_data=(val_x, val_y),
                        callbacks=[early_stopping])
    return history

In [7]:
def split_and_batch(df, batch_size=30):
    """Separate the ``pv_measurement`` label from ``df`` and cut both into fixed-size windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pv_measurement`` column (the label). Every other
        column is treated as a feature and must be castable to float32.
        The frame is left untouched (the previous version popped the label
        column from the caller's frame).
    batch_size : int, default 30
        Number of consecutive rows per window.

    Returns
    -------
    (data_3d, label_3d) : tuple of float32 ndarrays
        Shapes ``(num_batches, batch_size, n_features)`` and
        ``(num_batches, batch_size, 1)``. Trailing rows that do not fill a
        complete window are dropped.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    KeyError
        If ``df`` has no ``pv_measurement`` column.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    label_arr = np.array(df["pv_measurement"], dtype=np.float32)
    feature_arr = np.array(df.drop(columns="pv_measurement"), dtype=np.float32)

    num_batches = feature_arr.shape[0] // batch_size
    usable_rows = num_batches * batch_size

    # Explicit trailing dimensions (instead of -1) so that zero complete
    # windows yields an empty array rather than a reshape error.
    data_3d = feature_arr[:usable_rows].reshape(num_batches, batch_size, feature_arr.shape[1])
    label_3d = label_arr[:usable_rows].reshape(num_batches, batch_size, 1)

    return data_3d, label_3d

In [8]:
def batch(df, batch_size=30):
    """Cut array-like data into consecutive fixed-size windows.

    Parameters
    ----------
    df : array-like (e.g. DataFrame, Series or ndarray)
        Converted to a float32 array; the first axis is the row axis.
    batch_size : int, default 30
        Number of consecutive rows per window.

    Returns
    -------
    np.ndarray (float32)
        Shape ``(num_batches, batch_size, n_features)``; 1-D input gives
        ``n_features == 1``. Trailing rows that do not fill a complete window
        are dropped, so the output can hold fewer rows than the input.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    df_arr = np.array(df, dtype=np.float32)

    num_batches = df_arr.shape[0] // batch_size
    usable_rows = num_batches * batch_size

    # Width of one row after flattening any trailing axes (1 for 1-D input).
    # Stating it explicitly instead of -1 keeps reshape valid when there are
    # zero complete windows.
    n_features = int(np.prod(df_arr.shape[1:]))

    return df_arr[:usable_rows].reshape(num_batches, batch_size, n_features)
    

In [9]:
# Window every split of every location for the LSTM models.
# batch() drops trailing rows that do not fill a complete window.

# Location A
train_a_x, train_a_y = batch(data.A.train_x), batch(data.A.train_y)
val_a_x, val_a_y = batch(data.A.val_x), batch(data.A.val_y)

# Location B
train_b_x, train_b_y = batch(data.B.train_x), batch(data.B.train_y)
val_b_x, val_b_y = batch(data.B.val_x), batch(data.B.val_y)

# Location C
train_c_x, train_c_y = batch(data.C.train_x), batch(data.C.train_y)
val_c_x, val_c_y = batch(data.C.val_x), batch(data.C.val_y)

# Test features (no targets available)
test_a, test_b, test_c = (
    batch(data.A.test_x),
    batch(data.B.test_x),
    batch(data.C.test_x),
)

In [10]:
# Sequence-to-sequence regressor for location A:
#   BatchNormalization -> three bidirectional LSTMs (32, 48 and 64 units, all
#   returning full sequences) -> Dense(1, relu) applied at every timestep.
# All layers and initializers come from tf.keras; the previous version mixed
# standalone `keras.initializers` into tf.keras layers.
lstm_model_a = tf.keras.models.Sequential([
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(32, return_sequences=True,
                             kernel_initializer=tf.keras.initializers.GlorotNormal(),
                             bias_initializer=tf.keras.initializers.Constant(0.1))
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(48, return_sequences=True, dropout=0.5,
                             bias_initializer=tf.keras.initializers.Constant(0.1))
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True, dropout=0.5,
                             bias_initializer=tf.keras.initializers.Constant(0.1))
    ),

    # relu keeps the per-timestep output non-negative.
    tf.keras.layers.Dense(units=1, activation="relu",
                          bias_initializer=tf.keras.initializers.Constant(0.1))
])

# clone_model rebuilds the same architecture with newly created layers, so
# locations B and C get their own independent weights.
lstm_model_b = tf.keras.models.clone_model(lstm_model_a)
lstm_model_c = tf.keras.models.clone_model(lstm_model_b)



In [None]:
# Train the location-A LSTM, then print the result of evaluate() on its
# validation windows.
history_a = compile_and_fit(lstm_model_a,train_a_x, train_a_y, val_a_x, val_a_y)
print(f'measure a: {lstm_model_a.evaluate(val_a_x, val_a_y)}')

# Outputs recorded from earlier runs (presumably [loss, mean absolute error]):
#[0.0038667283952236176, 0.024362364783883095]
#[0.004080631770193577, 0.023939160630106926]

Epoch 1/30
Epoch 2/30

In [None]:
# Train the location-B LSTM, then print the result of evaluate() on its
# validation windows.
history_b = compile_and_fit(lstm_model_b,train_b_x, train_b_y, val_b_x, val_b_y)
print(f'measure b: {lstm_model_b.evaluate(val_b_x, val_b_y)}')

# Output recorded from an earlier run (presumably [loss, mean absolute error]):
# [0.003871380351483822, 0.023782264441251755]

In [None]:
# Train the location-C LSTM, then print the result of evaluate() on its
# validation windows.
history_c = compile_and_fit(lstm_model_c,train_c_x, train_c_y, val_c_x, val_c_y)
print(f'measure c: {lstm_model_c.evaluate(val_c_x, val_c_y)}')

# Output recorded from an earlier run (presumably [loss, mean absolute error]):
#[0.0031701738480478525, 0.023126540705561638]

In [None]:
#lstm_model_a.get_weight_paths()

In [None]:
# LSTM predictions per location, on the windowed test features ...
pred_a, pred_b, pred_c = (
    lstm_model_a.predict(test_a),
    lstm_model_b.predict(test_b),
    lstm_model_c.predict(test_c),
)

# ... on the validation windows ...
pred_a_val, pred_b_val, pred_c_val = (
    lstm_model_a.predict(val_a_x),
    lstm_model_b.predict(val_b_x),
    lstm_model_c.predict(val_c_x),
)

# ... and on the training windows.
pred_a_train, pred_b_train, pred_c_train = (
    lstm_model_a.predict(train_a_x),
    lstm_model_b.predict(train_b_x),
    lstm_model_c.predict(train_c_x),
)

In [None]:
# Collapse every prediction array to 1-D so it can be plotted as a single series.
pred_a, pred_b, pred_c = (
    p.flatten() for p in (pred_a, pred_b, pred_c)
)

pred_a_val, pred_b_val, pred_c_val = (
    p.flatten() for p in (pred_a_val, pred_b_val, pred_c_val)
)

pred_a_train, pred_b_train, pred_c_train = (
    p.flatten() for p in (pred_a_train, pred_b_train, pred_c_train)
)

In [None]:
#lstm_model_a.save("./a_2.keras")
#lstm_model_b.save("./b_2.keras")
#lstm_model_c.save("./c_2.keras")

In [None]:
# LSTM test-set predictions for the three locations on one axis, with a legend
# so the colours can be told apart.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(pred_a, color="blue", label="A")
ax.plot(pred_b, color="red", label="B")
ax.plot(pred_c, color="green", label="C")
ax.set_title("LSTM test predictions per location")
ax.set_xlabel("sample index")
ax.set_ylabel("prediction")
ax.legend(loc="upper right")
plt.show()

In [None]:

# Training targets (blue) against LSTM predictions (orange), one panel per location.
fig, axs = plt.subplots(3)

for ax, location, actual, predicted in zip(
    axs,
    ("A", "B", "C"),
    (train_a_y, train_b_y, train_c_y),
    (pred_a_train, pred_b_train, pred_c_train),
):
    # The targets are windowed arrays; flatten them back into one series.
    ax.plot(actual.flatten(), color="blue", label="actual")
    ax.plot(predicted, color="orange", label="predicted")
    ax.set_title(f"Location {location} - training set")
    ax.legend(loc="upper right")

fig.tight_layout()
plt.show()

In [None]:
# Validation targets (blue) against LSTM predictions (orange), one panel per location.
fig, axs = plt.subplots(3)

for ax, location, actual, predicted in zip(
    axs,
    ("A", "B", "C"),
    (val_a_y, val_b_y, val_c_y),
    (pred_a_val, pred_b_val, pred_c_val),
):
    # The targets are windowed arrays; flatten them back into one series.
    ax.plot(actual.flatten(), color="blue", label="actual")
    ax.plot(predicted, color="orange", label="predicted")
    ax.set_title(f"Location {location} - validation set")
    ax.legend(loc="upper right")

fig.tight_layout()
plt.show()

In [None]:
# Load test.csv; the submission cell reads its 'id' column and adds a
# 'prediction' column to it.
test = pd.read_csv('test.csv')

# Display the frame (the number of rows shown is capped by display.max_rows).
test

In [None]:
# Load the submission template; only its 'id' column is used when the
# submission file is written.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission

In [None]:
# Stack the XGBoost test predictions in location order A, B, C.
# (These are the preds_*_test arrays from the XGBoost cell, not the LSTM pred_* arrays.)
submit = np.concatenate([preds_a_test, preds_b_test, preds_c_test])

In [None]:
# Quick visual check of the concatenated predictions before writing the file.
plt.plot(submit)

In [None]:
from datetime import datetime

# NOTE(review): `now` is not used below; presumably intended for time-stamping
# the output file name.
now = datetime.now() # current date and time

# Attach the concatenated model predictions to the test rows. This is a
# positional assignment, so `submit` must have exactly one value per row of
# `test`, in the same order.
test['prediction'] = submit

# Keep the template's id order and look up each id's prediction.
sample_submission = sample_submission[['id']].merge(test[['id', 'prediction']], on='id', how='left')

# A left merge silently produces NaN for ids that are missing from `test`;
# fail loudly instead of writing an incomplete submission.
n_missing = int(sample_submission['prediction'].isna().sum())
assert n_missing == 0, f"{n_missing} submission ids have no prediction"

sample_submission.to_csv("xgboost_4.csv", index=False)