## 1. Random prediction

For the first prediction we experimented with creating a submission of random values to serve as a baseline for future attempts.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [None]:
def list_directory_tree_with_os_walk(starting_directory):
    """Print every directory below ``starting_directory`` followed by its files.

    Each directory yielded by ``os.walk`` is reported on a ``Directory:`` line,
    and every file directly inside it on an indented ``File:`` line.
    """
    for current_dir, _subdirs, filenames in os.walk(starting_directory):
        report = [f"Directory: {current_dir}"]
        report.extend(f"  File: {filename}" for filename in filenames)
        print("\n".join(report))

list_directory_tree_with_os_walk('.')

In [None]:
# Training targets, one frame per location (A, B, C).
train_a = pd.read_parquet('data/A/train_targets.parquet')
train_b = pd.read_parquet('data/B/train_targets.parquet')
train_c = pd.read_parquet('data/C/train_targets.parquet')

In [None]:
# Training features from the X_train_estimated files, one frame per location.
X_train_estimated_a = pd.read_parquet('data/A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('data/B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('data/C/X_train_estimated.parquet')


In [None]:
# Training features from the X_train_observed files, one frame per location.
X_train_observed_a = pd.read_parquet('data/A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('data/B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('data/C/X_train_observed.parquet')


In [None]:
# Test features from the X_test_estimated files, one frame per location.
X_test_estimated_a = pd.read_parquet('data/A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('data/B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('data/C/X_test_estimated.parquet')


In [None]:
# Plotting a single feature.
# Each location gets its own axis; observed training data (red), estimated
# training data (blue) and estimated test data (green) are drawn on that axis
# so the three sources can be compared on a shared time line.
fig, axs = plt.subplots(3, 1, figsize=(20, 10), sharex=True)
feature_name = 'absolute_humidity_2m:gm3'
frames_by_location = {
    'A': (X_train_observed_a, X_train_estimated_a, X_test_estimated_a),
    'B': (X_train_observed_b, X_train_estimated_b, X_test_estimated_b),
    'C': (X_train_observed_c, X_train_estimated_c, X_test_estimated_c),
}
source_colors = ('red', 'blue', 'green')

for ax, (location, frames) in zip(axs, frames_by_location.items()):
    for frame, color in zip(frames, source_colors):
        # One title format for every panel (the copy-pasted calls mixed single and double spaces).
        frame[['date_forecast', feature_name]].set_index('date_forecast').plot(
            ax=ax, title=f'Train/Test {location}', color=color
        )

In [None]:
test = pd.read_csv('data/test.csv')
test


In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission

In [None]:
# Example, let the predictions be random values.
# A fixed seed keeps the baseline submission reproducible across kernel restarts.
rng = np.random.default_rng(42)
test['prediction'] = rng.random(len(test))
# Keep only the ids of the sample submission and attach the predictions by id.
sample_submission = sample_submission[['id']].merge(test[['id', 'prediction']], on='id', how='left')
sample_submission.to_csv('my_first_submission.csv', index=False)

## 2. Decision Tree

The second submission was based on the previous code but now using a basic Decision Tree regressor from SKLearn. 

#### Making combined dataframes

In [None]:
# Remove all non-hourly values: index the frame by date_forecast and keep only
# the rows whose timestamp falls exactly on the hour (minute == 0).
# NOTE(review): X_train_all_a is not created in any cell shown in this section;
# it presumably comes from a cell that was not copied over — confirm.
X_train_all_a.set_index('date_forecast', inplace=True)
mask = X_train_all_a.index.minute == 0
X_train_all_a = X_train_all_a[mask]

#### Making the Model 

In [None]:
# Features are every column of `data` except the target and date_calc.
# NOTE(review): `data`, `X_test_a` and the DecisionTreeRegressor import are not
# defined in the cells shown in this section — confirm where they come from.
X = data.drop(["pv_measurement",'date_calc'], axis=1)
y = data.pv_measurement

# Fit an unconstrained decision tree (fixed seed) and predict for the test features.
model = DecisionTreeRegressor(random_state=1)
model.fit(X, y)
predictions = model.predict(X_test_a.drop(['date_forecast', 'date_calc'], axis=1))
print(predictions)

In [None]:
# Start from random placeholder predictions for every test row, then overwrite
# the leading rows with the decision-tree output before writing the submission.
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')
test['prediction'] = np.random.rand(len(test))
# The previous `test.prediction[0..predictions.size]` parsed as attribute access on
# the float literal `0.` and raised AttributeError. Select the first
# `predictions.size` rows explicitly; .loc also avoids chained assignment.
# A ValueError is raised if there are more predictions than test rows.
test.loc[test.index[:predictions.size], 'prediction'] = predictions
sample_submission = sample_submission[['id']].merge(test[['id', 'prediction']], on='id', how='left')
sample_submission.to_csv('my_first_submission.csv', index=False)

## 3. Exploratory Data Analysis

This is the initial Exploratory Data Analysis we made. This notebook contains also a Decision Tree regressor but now has better logic for creating the submission file.

# Project task - Solar Energy Production
This notebook will contain the exploratory data analysis, all manipulation and cleaning of the given datasets and finally the creation of predictions using the different ML models

In [None]:
#importing necessary datasets
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

#Setting max display options to avoid local crashes
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

## Reading in the datasets

In [None]:
# Training targets, one frame per location (A, B, C).
Y_train_a = pd.read_parquet('data/A/train_targets.parquet')
Y_train_b = pd.read_parquet('data/B/train_targets.parquet')
Y_train_c = pd.read_parquet('data/C/train_targets.parquet')

In [None]:
# Training features from the X_train_estimated files, one frame per location.
X_train_estimated_a = pd.read_parquet('data/A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('data/B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('data/C/X_train_estimated.parquet')

In [None]:
# Training features from the X_train_observed files, one frame per location.
X_train_observed_a = pd.read_parquet('data/A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('data/B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('data/C/X_train_observed.parquet')

In [None]:
# Test features from the X_test_estimated files, one frame per location.
X_test_estimated_a = pd.read_parquet('data/A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('data/B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('data/C/X_test_estimated.parquet')

### Combining the datasets
A separate combined dataset will be made for each of the A, B and C locations.\
Each location will have an X dataset (containing only feature values) and a complete training dataset (X_train combined with the Y_train values) which can be used to check for feature correlation. 


*Note: The resolution is different between the X and Y datasets: each row in Y covers one hour, while each row in X covers 15 minutes. To begin with, the X dataset will be naively reduced to an hourly resolution as well. However, we will later explore feature engineering techniques to retain more data from X by artificially increasing the resolution of the Y data.*

#### Location A

In [None]:
# Create the X dataset for location A.
# `estimated` flags which source a row came from: 1.0 for estimated, 0.0 for observed.
X_train_estimated_a["estimated"] = 1.0
X_test_estimated_a["estimated"] = 1.0
X_train_observed_a["estimated"] = 0.0
X_train_observed_a["date_calc"] = X_train_observed_a["date_forecast"].copy() # date_calc column added to observed data so that observed and estimated have the same number of columns

# `train/test` records which split each row belongs to once the frames are stacked.
X_train_estimated_a["train/test"] = "train"
X_train_observed_a["train/test"] = "train"
X_test_estimated_a["train/test"] = "test"

# Stack observed train, estimated train and estimated test, indexed by forecast time.
X_all_a= pd.concat([X_train_observed_a, X_train_estimated_a, X_test_estimated_a], ignore_index=True)
X_all_a.set_index('date_forecast', inplace=True)

*Create the complete training set for A*

In [None]:
# Create the complete train dataset for location A: observed + estimated
# training features (without the train/test marker), indexed by forecast time.
train_all_a = pd.concat([X_train_observed_a.drop("train/test", axis=1), X_train_estimated_a.drop("train/test", axis=1)], ignore_index=True)
train_all_a.set_index('date_forecast', inplace=True)

# Remove all non-hourly values (keep rows whose timestamp has minute == 0).
train_all_a = train_all_a[train_all_a.index.minute == 0]

# Index the targets by their `time` column so they align with the features.
# This mutates Y_train_a in place, so re-running the cell without reloading
# the data fails because the `time` column is gone.
Y_train_a.set_index('time', inplace=True)

# Concatenate with the y data column-wise, aligned on the timestamp index.
train_all_a = pd.concat([train_all_a, Y_train_a], axis=1)

In [None]:
train_all_a.head()

#### Location B

In [None]:
# Combine all data from B.
# `estimated` flags which source a row came from: 1.0 for estimated, 0.0 for observed.
X_train_estimated_b["estimated"] = 1.0
X_test_estimated_b["estimated"] = 1.0
X_train_observed_b["estimated"] = 0.0
# Give the observed frame a date_calc column (a copy of date_forecast).
X_train_observed_b["date_calc"] = X_train_observed_b["date_forecast"].copy()

# `train/test` records which split each row belongs to once the frames are stacked.
X_train_estimated_b["train/test"] = "train"
X_train_observed_b["train/test"] = "train"
X_test_estimated_b["train/test"] = "test"

# NOTE(review): unlike the location A cell, date_forecast is left as a column here
# (no set_index), so the A and B frames end up with different indexes — confirm intent.
X_all_b= pd.concat([X_train_observed_b, X_train_estimated_b, X_test_estimated_b], ignore_index=True)

*Create the complete training set*

In [None]:
# Create the complete train dataset for location B: observed + estimated
# training features (without the train/test marker), indexed by forecast time.
train_all_b = pd.concat([X_train_observed_b.drop("train/test", axis=1), X_train_estimated_b.drop("train/test", axis=1)], ignore_index=True)
train_all_b.set_index('date_forecast', inplace=True)
# Remove all non-hourly values (keep rows whose timestamp has minute == 0).
train_all_b = train_all_b[train_all_b.index.minute == 0]

# Index the targets by their `time` column so they align with the features.
# This mutates Y_train_b in place, so the cell cannot be re-run without reloading.
Y_train_b.set_index('time', inplace=True)

# Concatenate with the y data column-wise, aligned on the timestamp index.
train_all_b = pd.concat([train_all_b, Y_train_b], axis=1)

In [None]:
train_all_b.head()

In [None]:
# Drop the first row, as it is not in the X data.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps the cell
# idempotent: re-running it after the row is gone no longer raises KeyError.
train_all_b = train_all_b.drop("2018-12-31 23:00:00", errors="ignore")

#### Location C

In [None]:
# Combine all data from C.
# `estimated` flags which source a row came from: 1.0 for estimated, 0.0 for observed.
X_train_estimated_c["estimated"] = 1.0
X_test_estimated_c["estimated"] = 1.0
X_train_observed_c["estimated"] = 0.0
# Give the observed frame a date_calc column (a copy of date_forecast).
X_train_observed_c["date_calc"] = X_train_observed_c["date_forecast"].copy()
# `train/test` records which split each row belongs to once the frames are stacked.
X_train_estimated_c["train/test"] = "train"
X_train_observed_c["train/test"] = "train"
X_test_estimated_c["train/test"] = "test"

# NOTE(review): unlike the location A cell, date_forecast is left as a column here
# (no set_index), so the A and C frames end up with different indexes — confirm intent.
X_all_c= pd.concat([X_train_observed_c, X_train_estimated_c, X_test_estimated_c], ignore_index=True)

*Create Complete training data for location C*

In [None]:
# Create the complete train dataset for location C: observed + estimated
# training features (without the train/test marker), indexed by forecast time.
train_all_c = pd.concat([X_train_observed_c.drop("train/test", axis=1), X_train_estimated_c.drop("train/test", axis=1)], ignore_index=True)
train_all_c.set_index('date_forecast', inplace=True)
# Remove all non-hourly values (keep rows whose timestamp has minute == 0).
train_all_c = train_all_c[train_all_c.index.minute == 0]

# Index the targets by their `time` column so they align with the features.
# This mutates Y_train_c in place, so the cell cannot be re-run without reloading.
Y_train_c.set_index('time', inplace=True)

# Concatenate with the y data column-wise, aligned on the timestamp index.
train_all_c = pd.concat([train_all_c, Y_train_c], axis=1)

In [None]:
# Drop the 2018-12-31 23:00 row (same clean-up as for location B).
# Rebinding with errors="ignore" keeps the cell idempotent on re-run.
train_all_c = train_all_c.drop("2018-12-31 23:00:00", errors="ignore")

#### Lets make a combined dataset for all locations
This is in order to quickly check correlations and get a brief overview about all locations

In [None]:
# Tag a copy of each location's feature frame with its location label,
# then stack the three tagged copies into one frame.
copy_a = X_all_a.assign(location="A")
copy_b = X_all_b.assign(location="B")
copy_c = X_all_c.assign(location="C")
X_all = pd.concat([copy_a, copy_b, copy_c])

In [None]:
# Tag a copy of each location's complete training frame with its location
# label, then stack the three tagged copies into one frame.
copy_a_train = train_all_a.assign(location="A")
copy_b_train = train_all_b.assign(location="B")
copy_c_train = train_all_c.assign(location="C")

train_all = pd.concat([copy_a_train, copy_b_train, copy_c_train])

In [None]:
train_all.head()

## Exploratory Data Analysis

Let's start by making some quick graphs of all the data combined

In [None]:
# One histogram per column of the combined feature frame (date_calc excluded).
# The grid height is derived from the number of columns, so the loop can never
# index past the last axis when columns are added or removed.
hist_columns = X_all.drop("date_calc", axis=1).columns
n_grid_cols = 3
n_grid_rows = max(1, -(-len(hist_columns) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, figsize=(30, 6 * n_grid_rows), squeeze=False)
axes = axes.flatten()

for i, column in enumerate(hist_columns):
    sns.histplot(data=X_all[column], kde=False, ax=axes[i])
    axes[i].set_xlabel(column, fontsize = 20)

# Hide the axes left over in the last grid row.
for unused_ax in axes[len(hist_columns):]:
    unused_ax.set_visible(False)


*Observations*
- Based on this it seems like snow density is completely filled with NaN values
- We have a random dip in the date forecast at the end of 2022
- Snow Density has VERY low count compared to the other features
- All the radian features have a strange distribution

#### Lets start to look at the correlation between the data and the target y values (PV measurement)

In [None]:
# Full correlation matrix of the training columns.
# Non-numeric columns (e.g. the date_calc timestamps) are excluded explicitly:
# DataFrame.corr no longer drops them by default since pandas 2.0.
plt.figure(figsize = (50, 60), dpi=200)
sns.heatmap(train_all.drop(["location"], axis=1).select_dtypes(include=["number", "bool"]).corr(), annot = True);

In [None]:
# Correlation of every numeric column with the target only.
# Non-numeric columns are excluded explicitly because DataFrame.corr no longer
# drops them by default since pandas 2.0.
plt.figure(figsize = (50,90), dpi=200)
sns.heatmap(train_all.drop(["location"], axis=1).select_dtypes(include=["number", "bool"]).corr()[["pv_measurement"]], annot = True);

In [None]:
# Same target correlations as a table; non-numeric columns are excluded explicitly
# because DataFrame.corr no longer drops them by default since pandas 2.0.
train_all.drop(["location"], axis=1).select_dtypes(include=["number", "bool"]).corr()[["pv_measurement"]]

*Notes*
- wind_speed_w_1000hPa:ms seems to not be very important
- All snow values seem to not be very important (except maybe snow depth)
- dew_or_rime:idx is very useless

## Data Cleanup

In [None]:
# Per-location modelling frames: drop every row that contains at least one NaN,
# then remove the date_calc and estimated helper columns.
# NOTE(review): the EDA observations in this notebook report that snow density is
# (almost) entirely NaN; if so, dropna(axis=0) discards nearly every row —
# check the resulting row counts.
data_a = train_all_a.dropna(axis=0).drop(['date_calc', 'estimated'], axis=1)
data_b = train_all_b.dropna(axis=0).drop(['date_calc', 'estimated'], axis=1)
data_c = train_all_c.dropna(axis=0).drop(['date_calc', 'estimated'], axis=1)

## Decision Tree Test

In [None]:
from sklearn.tree import DecisionTreeRegressor 

# Train a decision tree on the cleaned location A data (all columns except the target).
X_t = data_a.drop('pv_measurement',axis=1)
y_t = data_a.pv_measurement
model = DecisionTreeRegressor(random_state=1)
model.fit(X_t, y_t)
# Test features: drop the timestamp and helper columns that were removed from (or never part of) the training features.
X_p = X_test_estimated_a.drop(['date_forecast', 'date_calc', 'estimated', 'train/test'], axis = 1)
predictions = model.predict(X_p)
# Pair each prediction with its forecast timestamp (concat aligns on the row index),
# rename to the submission column names and tag the rows as location A.
out_pd = pd.concat([X_test_estimated_a.date_forecast, pd.DataFrame(predictions)], axis=1)
out_pd=out_pd.rename(columns = {0:'prediction','date_forecast':'time'})
out_pd['location'] = 'A'
out_pd.set_index('time',inplace=True)

## Create submission

In [None]:
# Load the test rows and index them by their parsed timestamp.
test = pd.read_csv('data/test.csv')
test.time = pd.to_datetime(test.time)
sample_submission = pd.read_csv('data/sample_submission.csv')
# test['prediction'] = np.random.rand(len(test))
test.set_index('time',inplace=True)
df1 = test
df2 = out_pd

# Left-join the model output onto the test rows by (time, location); columns present
# in both frames get the `_original` (test) and `_new` (model) suffixes.
# NOTE(review): the steps below need test.csv to contain a `prediction` column,
# otherwise no suffixed columns are created — confirm.
merged_df = df1.reset_index().merge(df2.reset_index(), on=['time', 'location'], how='left', suffixes=('_original', '_new'))

# Use combine_first to replace NaN values in 'prediction_new' with the original 'prediction' values
merged_df['prediction_new'] = merged_df['prediction_new'].combine_first(merged_df['prediction_original'])

# Drop the original 'prediction' column
merged_df.drop('prediction_original', axis=1, inplace=True)

# Rename 'prediction_new' to 'prediction'
merged_df.rename(columns={'prediction_new': 'prediction'}, inplace=True)

# Attach the predictions to the sample submission ids and write the file.
sample_submission = sample_submission[['id']].merge(merged_df[['id', 'prediction']], on='id', how='left')
sample_submission.to_csv('my_first_submission.csv', index=False)

Submit

In [None]:
# kaggle competitions submit -c solar-energy-production-forecasting -f submission.csv -m "Message"

## Autosklearn Classifier

We then tried to improve the code from above by using AutoSKLearn, in this case it is mistakenly a classifier. Most of the code remains the same but using a different model.

In [None]:
import autosklearn.classification

# Same pipeline as the decision tree cell, with an auto-sklearn model instead.
# As the surrounding text says, a classifier was used here by mistake for the
# pv_measurement target.
X_t = data_a.drop('pv_measurement',axis=1)
y_t = data_a.pv_measurement
model = autosklearn.classification.AutoSklearnClassifier(random_state=1)
model.fit(X_t, y_t)
# Test features: drop the timestamp and helper columns before predicting.
X_p = X_test_estimated_a.drop(['date_forecast', 'date_calc', 'estimated', 'train/test'], axis = 1)
predictions = model.predict(X_p)
# Pair each prediction with its forecast timestamp and tag the rows as location A.
out_pd = pd.concat([X_test_estimated_a.date_forecast, pd.DataFrame(predictions)], axis=1)
out_pd=out_pd.rename(columns = {0:'prediction','date_forecast':'time'})
out_pd['location'] = 'A'
out_pd.set_index('time',inplace=True)

## 5. Python script

For the next few attempts, we moved to a Python script to try different models, this initial script now uses the Decision Tree regressor that is more suitable for our scenario. It also contains extra helper functions and better formatting with the help from ChatGPT. Here we started to also do some model evaluation with included cross-validation functions.

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import subprocess
import argparse

def execute_cmd(command):
    """
    Run ``command`` through the shell and return what it printed.

    Parameters:
    - command (str): The command line to execute.

    Returns:
    - str: Combined stdout/stderr of the command with surrounding whitespace
      removed. When the command exits with a non-zero status the captured
      output is returned as well instead of raising.
    """
    try:
        captured = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, text=True)
    except subprocess.CalledProcessError as failure:
        # Non-zero exit status: fall back to whatever the command printed.
        captured = failure.output
    return captured.strip()

def one_hot_to_categorical(df, col1, col2, default='A'):
    """
    Convert two one-hot encoded columns to a single categorical column.

    Parameters:
    - df: DataFrame containing the one-hot encoded columns.
    - col1: Name of the first one-hot encoded column.
    - col2: Name of the second one-hot encoded column.
    - default: Category assigned to rows where neither column equals 1
      (the category whose one-hot column was dropped). Defaults to 'A'.

    Returns:
    - A single-column DataFrame ('Category') with a fresh 0..n-1 index holding
      ``col1`` where ``df[col1] == 1``, ``col2`` where ``df[col2] == 1`` and
      ``default`` otherwise. When both columns are 1, ``col1`` wins.
    """
    conditions = [
        (df[col1] == 1),
        (df[col2] == 1)
    ]
    choices = [col1, col2]
    # np.select picks the first matching condition per row.
    result_df = pd.DataFrame({
        'Category': np.select(conditions, choices, default=default)
    })
    return result_df

def load_datasets(data_dir='data/prepared_datasets/no_Nan_hotone_encoding'):
    """
    Load the prepared train/test parquet files from ``data_dir``.

    Parameters:
    - data_dir (str): Directory containing X_test.parquet, X_train.parquet and
      Y_train.parquet. Defaults to the directory this script used originally,
      so existing ``load_datasets()`` calls behave as before.

    Returns:
    - tuple: (X_train, Y_train, X_test)
    """
    X_test  = pd.read_parquet(f'{data_dir}/X_test.parquet')
    X_train = pd.read_parquet(f'{data_dir}/X_train.parquet')
    Y_train = pd.read_parquet(f'{data_dir}/Y_train.parquet')
    return X_train, Y_train, X_test

def train_and_predict(X_train, Y_train, X_test):
    """
    Cross-validate a decision tree regressor, then fit it on all training data
    and predict for ``X_test``.

    The five cross-validation scores and their mean are printed.

    Returns:
    - The predictions produced by the fitted tree for ``X_test``.
    """
    tree = DecisionTreeRegressor(random_state=1)
    cv_scores = cross_val_score(tree, X_train, Y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Average cross-validation score:", cv_scores.mean())
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    return tree.fit(X_train, Y_train).predict(X_test)

def prepare_submission(predictions, X_test):
    """
    Build the prediction frame used for the submission.

    The index of ``X_test`` supplies the timestamps (its 'date_forecast' column
    is renamed to 'time'), ``predictions`` becomes the 'prediction' column and
    the location is recovered from the one-hot 'B'/'C' columns of ``X_test``.

    Returns:
    - DataFrame indexed by 'time' with 'prediction' and 'location' columns.
    """
    timestamps = X_test.index.to_frame().reset_index(drop=True)
    predicted = pd.DataFrame(predictions).reset_index(drop=True)
    submission = pd.concat([timestamps, predicted], axis=1).rename(
        columns={0: 'prediction', 'date_forecast': 'time'}
    )
    submission['location'] = one_hot_to_categorical(X_test, 'B', 'C')
    return submission.set_index('time')

def merge_with_sample(out_pd):
    """
    Map the model output onto the ids of the sample submission.

    Reads 'data/test.csv' and 'data/sample_submission.csv'. Test rows are
    matched with ``out_pd`` on (time, location); where the model produced no
    value, the prediction already present in test.csv is kept.

    Returns:
    - DataFrame with the sample submission's 'id' column and a 'prediction' column.
    """
    test_rows = pd.read_csv('data/test.csv')
    test_rows['time'] = pd.to_datetime(test_rows['time'])
    submission_ids = pd.read_csv('data/sample_submission.csv')[['id']]

    combined = pd.merge(
        test_rows,
        out_pd.reset_index(),
        on=['time', 'location'],
        how='left',
        suffixes=('_original', '_new'),
    )
    # Prefer the model's value; fall back to the value that came with test.csv.
    combined['prediction'] = combined['prediction_new'].combine_first(combined['prediction_original'])
    return submission_ids.merge(combined[['id', 'prediction']], on='id', how='left')

def main():
    """Run the pipeline: load data, train and predict, write 'submission.csv'."""
    for display_option in ('display.max_rows', 'display.max_columns'):
        pd.set_option(display_option, 200)

    X_train, Y_train, X_test = load_datasets()
    out_pd = prepare_submission(train_and_predict(X_train, Y_train, X_test), X_test)
    merge_with_sample(out_pd).to_csv('submission.csv', index=False)

if __name__ == "__main__":
    import shlex

    # Build submission.csv; upload it to Kaggle only when a message is given.
    parser = argparse.ArgumentParser(description="Prints the provided string.")
    parser.add_argument("-m", "--message", type=str, help="Kaggle submission message.")
    args = parser.parse_args()
    main()
    if args.message:
        # execute_cmd runs the command through the shell, so the user-supplied
        # message is quoted: quotes, `$` or `;` in it can no longer break or
        # extend the command line.
        submit_output = execute_cmd(
            'kaggle competitions submit -c solar-energy-production-forecasting '
            f'-f submission.csv -m {shlex.quote(args.message)}'
        )
        # Show the CLI response; it was previously discarded, hiding upload errors.
        print(submit_output)
        


## AutoML

Now we added the AutoML we tried a few attempts ago into the script to compare. Here AutoML tried for a long time to obtain the best model possible.

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import subprocess
import argparse
from sklearn.metrics import mean_squared_error
import autosklearn.regression

def train_and_predict(X_train, y_train, X_test, model_type="regressor"):
    """
    Cross-validate a model, fit it on all training data and predict.

    Parameters:
    - X_train: Training features
    - y_train: Training labels/targets
    - X_test: Test features
    - model_type (str): Either "regressor" for Decision Tree or "automl" for auto-sklearn.

    Returns:
    - DataFrame: Predictions for X_test

    Raises:
    - ValueError: If model_type is not one of the supported values.
    """
    if model_type == "regressor":
        model = DecisionTreeRegressor(random_state=1)
    elif model_type == "automl":
        model = autosklearn.regression.AutoSklearnRegressor(
            time_left_for_this_task=600,
            per_run_time_limit=60,
            n_jobs=-1,
            tmp_folder="/tmp/autosklearn_classification_example_tmp",
        )
    else:
        # The message now lists the values this function actually accepts.
        raise ValueError(f"Invalid model_type: {model_type}. Expected 'regressor' or 'automl'.")

    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("Cross-validation scores:", scores)
    print("Average cross-validation score:", scores.mean())

    model.fit(X_train, y_train)
    # `predictions` was never assigned before, so the return raised NameError.
    predictions = model.predict(X_test)
    return pd.DataFrame(predictions)

def validate(predicted_df, target_df):
    """
    Compute the RMSE between predictions and targets.

    Parameters:
    - predicted_df: DataFrame of predictions, one row per target row. A 'time'
      column, if present, is ignored.
    - target_df: DataFrame with a 'time' column plus the target column(s).
      It is not modified.

    Returns:
    - float: Root mean squared error (averaged over columns if there are several).

    Raises:
    - ValueError: If the two frames differ in number of rows (or in shape).
    """
    # Check if the number of samples in both frames is the same
    if len(predicted_df) != len(target_df):
        raise ValueError(f"Validate: Inconsistent number of samples: predicted_df has {len(predicted_df)} samples while target_df has {len(target_df)} samples.")

    # Work on a copy so the caller's frame keeps its 'time' column.
    targets = target_df.copy()
    targets["time"] = pd.to_datetime(targets["time"])
    targets = targets.set_index("time")

    # Rows are compared by position; timestamps take no part in the error.
    y_true = targets.to_numpy(dtype=float)
    y_pred = predicted_df.drop(columns="time", errors="ignore").to_numpy(dtype=float).reshape(y_true.shape)

    # RMSE computed with numpy: the `squared=False` argument of
    # sklearn.metrics.mean_squared_error is not available in recent releases.
    per_column_rmse = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
    return float(np.mean(per_column_rmse))

## 7. Initial Catboost & XGBoost

This is our first attempt to explore the possibility to use CatBoost and XGBoost. We added the two tools to the script and did a couple submissions to see their potential.

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import subprocess
import argparse
from sklearn.metrics import mean_squared_error
import autosklearn.regression
from catboost import Pool, CatBoostRegressor
from xgboost import XGBRegressor

def train_and_predict(X_train, y_train, X_test, model_type="regressor"):
    """
    Cross-validate a model, fit it on all training data and predict.

    Parameters:
    - X_train: Training features
    - y_train: Training labels/targets
    - X_test: Test features
    - model_type (str): "regressor" for Decision Tree, "automl" for auto-sklearn,
      "catboost" for CatBoost or "xgboost" for XGBoost.

    Returns:
    - DataFrame: Predictions for X_test

    Raises:
    - ValueError: If model_type is not one of the supported values.
    """
    if model_type == "regressor":
        model = DecisionTreeRegressor(random_state=1)
    elif model_type == "automl":
        model = autosklearn.regression.AutoSklearnRegressor(
            time_left_for_this_task=600,
            per_run_time_limit=60,
            n_jobs=-1,
            tmp_folder="/tmp/autosklearn_classification_example_tmp"
        )
    elif model_type == "catboost":
        model = CatBoostRegressor()
    elif model_type == 'xgboost':
        model = XGBRegressor()
    else:
        # The message now lists the values this function actually accepts.
        raise ValueError(
            f"Invalid model_type: {model_type}. "
            "Expected 'regressor', 'automl', 'catboost' or 'xgboost'."
        )

    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("Cross-validation scores:", scores)
    print("Average cross-validation score:", scores.mean())

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    if model_type == "automl":
        # These diagnostics are only attempted for the auto-sklearn model. They stay
        # best-effort, but failures are reported instead of being swallowed by a
        # bare `except:`.
        try:
            print(model.show_models())
            print(model.leaderboard())
            print(model.get_configuration_space(X_train, y_train))
        except Exception as exc:
            print(f"Could not print auto-sklearn diagnostics: {exc}")
    return pd.DataFrame(predictions)

## 8. Catboost Update

Catboost produced good results so we tried to perform some data engineering to improve it and update some of its parameters.

In [None]:
def load_datasets(data_dir='data/prepared_datasets/non_cleaned'):
    """
    Load the prepared train/test parquet files from ``data_dir``.

    Parameters:
    - data_dir (str): Directory containing X_test.parquet, X_train.parquet and
      Y_train.parquet. Defaults to the directory this script used originally,
      so existing ``load_datasets()`` calls behave as before.

    Returns:
    - tuple: (X_train, y_train, X_test)
    """
    X_test  = pd.read_parquet(f'{data_dir}/X_test.parquet')
    X_train = pd.read_parquet(f'{data_dir}/X_train.parquet')
    y_train = pd.read_parquet(f'{data_dir}/Y_train.parquet')
    return X_train, y_train, X_test

def train_and_predict(X_train, y_train, X_test, model_type="regressor"):
    """
    Train and predict.

    NOTE(review): this cell is an excerpt — the visible body stops after the
    model is constructed; the training/prediction steps are presumably the same
    as in the previous version of this function. Confirm against the script.

    Parameters:
    - X_train: Training features
    - y_train: Training labels/targets
    - X_test: Test features
    - model_type (str): "regressor" for Decision Tree, "automl" for auto-sklearn
      or "catboost" for CatBoost.

    Returns:
    - DataFrame: Predictions
    """
    if model_type == "regressor":
        model = DecisionTreeRegressor(random_state=1)
    elif model_type == "automl":
        model = autosklearn.regression.AutoSklearnRegressor(
            time_left_for_this_task=600,
            per_run_time_limit=60,
            n_jobs=-1,
            tmp_folder="/tmp/autosklearn_classification_example_tmp"
        )
    elif model_type == "catboost":
        # The update in this version: 'location' is passed to CatBoost as a
        # categorical feature, and progress is logged every 100 iterations.
        cat_features = ['location']
        model = CatBoostRegressor(
            cat_features=cat_features,
            verbose=100
        )

### Stacked Catboost

This attempt we checked the possibility of stacking the catboost models fitted to each location separately.

In [None]:
# Per-location splits, filled in by load_datasets().
X_test_A = None
X_test_B = None
X_test_C = None
X_train_A = None
X_train_B = None
X_train_C = None
y_train_A = None
y_train_B = None
y_train_C = None

def load_datasets():
    """
    Load the prepared datasets and split them by location.

    The per-location frames are stored in the module-level variables
    X_test_A/B/C, X_train_A/B/C and y_train_A/B/C. Without the ``global``
    declaration the assignments only created local variables, leaving the
    module-level names as None.

    Returns:
    - tuple: (X_train, y_train, X_test), the unsplit frames, matching the
      return value of the earlier load_datasets versions.
    """
    global X_test_A, X_test_B, X_test_C
    global X_train_A, X_train_B, X_train_C
    global y_train_A, y_train_B, y_train_C

    X_test  = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_test.parquet')
    X_test_A = X_test[X_test['location'] == 'A']
    X_test_B = X_test[X_test['location'] == 'B']
    X_test_C = X_test[X_test['location'] == 'C']
    X_train = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_train.parquet')
    X_train_A = X_train[X_train['location'] == 'A']
    X_train_B = X_train[X_train['location'] == 'B']
    X_train_C = X_train[X_train['location'] == 'C']
    y_train = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train.parquet')
    y_train_A = y_train[y_train['location'] == 'A']
    y_train_B = y_train[y_train['location'] == 'B']
    y_train_C = y_train[y_train['location'] == 'C']
    return X_train, y_train, X_test

In [None]:
    elif model_type == "catboost":
        cat_features = ['location']
        
        model = CatBoostRegressor(
            cat_features=cat_features,
            verbose=100
        )
        model_B = CatBoostRegressor(
            cat_features=cat_features,
            verbose=100
        )
        
        model_C = CatBoostRegressor(
            cat_features=cat_features,
            verbose=100
        )

In [None]:

    # scores = cross_val_score(model, X_train, y_train, cv=5)
    # print("Cross-validation scores:", scores)
    # print("Average cross-validation score:", scores.mean())
    
    # model.fit(X_train, y_train)
    # predictions = model.predict(X_test)
    
    model.fit(X_train_A, y_train_A)
    predictions1 = model.predict(X_test_A)
    
    ts = TimeSeriesSplit(n_splits = 10)
    cross_val_score(model, X_train_A, y_train_A, cv=ts, scoring='neg_mean_absolute_error')
    
    model_B.fit(X_train_B, y_train_B)
    predictions2 = model_B.predict(X_test_B)
        
    model_C.fit(X_train_C, y_train_C)
    predictions3 = model_C.predict(X_test_C)
        
    predictions = np.concatenate([predictions1, predictions2, predictions3])

## Stacked A & B + C Catboost

We also tried stacking a model for location A with one trained on the merged B and C locations, since A differed much more from the B and C datasets.

In [None]:
    model_B.fit(pd.concat([X_train_B,X_train_C], axis=0), pd.concat([y_train_B, y_train_C],axis=0))
    predictions2 = model_B.predict(pd.concat([X_test_B, X_test_C],axis=0))
        
    predictions = np.concatenate([predictions1, predictions2])

## Submission Catboost: run average data in 1 large dataset but with loss_function=MAE
Same model as before, but the average dataset was used.
Hyperparameters were left at their defaults except for depth, which was chosen to be 7 as it is a good average of the Optuna hyperparameters.
Submission are done as before, but we now cap the negative values.



In [None]:
#avg given
def resample_df_hourly_keep_categorical(df, hourly_df, categorical_col_list):
    """
    Resample ``df`` to hourly means while keeping categorical columns categorical.

    Parameters:
    - df: DataFrame with a datetime index at sub-hourly resolution.
    - hourly_df: DataFrame whose index holds the hourly timestamps to keep
      (only its index is used).
    - categorical_col_list: Columns that must not be averaged. For each hour
      they take the most frequent value, or NaN when every value in that hour
      is missing.

    Returns:
    - DataFrame of hourly values restricted to the timestamps in ``hourly_df.index``.
    """
    indices_to_keep = hourly_df.index

    # Non-numeric categorical columns cannot be averaged (mean() raises on them
    # since pandas 2.0), so leave them out of the mean; they are added below.
    non_numeric_categoricals = [
        col for col in categorical_col_list
        if not pd.api.types.is_numeric_dtype(df[col])
    ]
    # '1h' replaces the deprecated '1H' alias.
    resampled_df = df.drop(columns=non_numeric_categoricals).resample('1h').mean()

    hourly_groups = df.resample('1h')
    for col in categorical_col_list:
        resampled_df[col] = hourly_groups[col].agg(lambda x: x.mode()[0] if not x.isna().all() else np.nan)

    resampled_df = resampled_df[resampled_df.index.isin(indices_to_keep)]
    return resampled_df

In [None]:
# Create CatBoost pools for training and test data; 'location' is declared categorical.
train_pool = Pool(X_train, y_train, cat_features=["location"])
test_pool = Pool(X_test, cat_features=["location"]) 

# NOTE(review): cat_feature is not used anywhere in this cell.
cat_feature = ["location"]

# Init the model and fit it.
# NOTE(review): the text above says depth 7 was chosen, but the code uses depth=9 — confirm which is intended.
catboost_model = CatBoostRegressor(iterations=1000, depth=9, loss_function="MAE")
catboost_model.fit(train_pool)

In [None]:
catboost_model.get_feature_importance() #Gives a clear indication of which features the model decides to split on

In [None]:
# Make predictions for the test pool.
predictions = pd.DataFrame(catboost_model.predict(test_pool))
print(predictions)
# Cap negative predictions at 0.
predictions = predictions.clip(lower=0)

In [None]:
#When using non-averaged dataset the model 
catboost_model.get_feature_importance()

array([7.14454410e-01, 2.33463122e-01, 3.75397728e-01, 3.78254372e+00,
       6.35562704e+00, 6.48771233e-01, 1.91280388e-04, 5.86727196e-01,
       6.00286286e-01, 1.28453266e+00, 1.39580817e+01, 2.78000924e+00,
       2.20047230e+00, 4.79723248e+01, 2.58961845e-02, 3.21505005e-05,
       1.52380537e-01, 6.11855239e-06, 4.26232953e-03, 2.48798006e-02,
       6.49045335e-03, 4.82516561e-01, 1.26288040e-01, 2.37755133e-01,
       3.20094971e-01, 3.11507508e-01, 2.63411360e-04, 1.01815617e-02,
       9.72448560e-01, 2.07196407e-01, 3.77712014e-02, 0.00000000e+00,
       2.06891077e-03, 2.01484585e-01, 5.53600765e+00, 3.78416731e+00,
       3.62264096e-01, 1.28156413e+00, 2.02655676e+00, 3.06863983e-01,
       3.35129932e-01, 9.41018340e-01, 3.63968603e-01, 9.18614039e-04,
       1.67854391e-01, 2.77279026e-01])

## Catboost - using location per location, but without / with an avg of the X_features
Simply running with the non-averaged dataset per location. The only changed code is below.

In [None]:
#Load inn datasets
X_test  = pd.read_parquet('data/prepared_datasets/avg/only_y_cleaned/X_test.parquet')
X_train = pd.read_parquet('data/prepared_datasets/avg/only_y_cleaned/X_train.parquet')
y_train = pd.read_parquet('data/prepared_datasets/avg/only_y_cleaned/Y_train.parquet')
y_train_a = pd.read_parquet('data/prepared_datasets/avg/only_y_cleaned/Y_train_a.parquet')
y_train_b = pd.read_parquet('data/prepared_datasets/avg/only_y_cleaned/Y_train_b.parquet')
y_train_c = pd.read_parquet('data/prepared_datasets/avg/only_y_cleaned/Y_train_c.parquet')

In [None]:
# Final training and predictions: one CatBoost model per location.
# Split the features by location and drop the (now constant) location column.
X_train_a = X_train[X_train["location"] == "A"].drop("location", axis=1)
X_train_b = X_train[X_train["location"] == "B"].drop("location", axis=1)
X_train_c = X_train[X_train["location"] == "C"].drop("location", axis=1)

X_test_a = X_test[X_test["location"] == "A"].drop("location", axis=1)
X_test_b = X_test[X_test["location"] == "B"].drop("location", axis=1)
X_test_c = X_test[X_test["location"] == "C"].drop("location", axis=1)
          
# Training pools pair each location's features with its own target file.
train_pool_a = Pool(X_train_a, y_train_a)
train_pool_b = Pool(X_train_b, y_train_b)
train_pool_c = Pool(X_train_c, y_train_c)


test_pool_a = Pool(X_test_a) 
test_pool_b = Pool(X_test_b) 
test_pool_c = Pool(X_test_c) 

# Same hyperparameters for every location: 1000 iterations, MAE loss.
catboost_model_a = CatBoostRegressor(iterations=1000, loss_function="MAE")
catboost_model_b = CatBoostRegressor(iterations=1000, loss_function="MAE")
catboost_model_c = CatBoostRegressor(iterations=1000, loss_function="MAE")

catboost_model_a.fit(train_pool_a)
catboost_model_b.fit(train_pool_b)
catboost_model_c.fit(train_pool_c)


pred_a = pd.DataFrame(catboost_model_a.predict(test_pool_a))
pred_b = pd.DataFrame(catboost_model_b.predict(test_pool_b))
pred_c = pd.DataFrame(catboost_model_c.predict(test_pool_c))

# Stack the predictions in A, B, C order. Each part keeps its own 0-based index,
# so the combined index contains duplicates.
# NOTE(review): negative predictions are not clipped in this cell — confirm this is done downstream.
predictions = pd.DataFrame(pd.concat([pred_a, pred_b, pred_c]))

In [None]:
# Gives a clear indication of which features each per-location model decides to split on.
# A notebook cell only displays the value of its last expression, so each array is printed
# explicitly; otherwise the importances for A and B would be computed and silently dropped.
print(catboost_model_a.get_feature_importance())
print(catboost_model_b.get_feature_importance())
print(catboost_model_c.get_feature_importance())

Which gives:

[5.62286267e-01 5.10924360e-01 6.87543964e-01 7.09947835e+00
 1.74363313e+01 9.85944986e-01 2.20760153e-04 5.58126247e-01
 4.49025269e+00 8.30970146e-01 2.18527336e+01 9.53879922e+00
 8.92806168e+00 0.00000000e+00 4.22781187e-02 3.11757422e-04
 1.56871203e-01 1.95304107e-03 2.49194089e-02 1.57744874e-02
 2.63958073e-02 3.28201853e-01 6.36442997e-01 2.39626670e-01
 3.18970195e-01 8.77317441e-01 1.17008386e-03 1.39479027e-01
 1.20973054e+00 1.72680735e-01 2.71240488e-02 0.00000000e+00
 4.68738664e-03 2.42363183e-01 4.78004233e+00 8.46048186e+00
 2.24121290e-02 9.89359658e-01 2.69676091e+00 2.51194813e+00
 5.19759356e-01 1.55562505e+00 4.77258531e-01 4.11353283e-06
 3.83763961e-02]
[1.88022554e+00 1.15419262e+00 1.00922562e+00 7.48111890e+00
 1.12733488e+01 2.72871859e+00 1.39014285e-02 1.88626231e+00
 2.97976929e+00 1.34600324e+00 1.82486388e+01 7.59204454e+00
 4.27088547e+00 0.00000000e+00 1.19532079e-02 1.10894864e-04
 2.77594143e-01 5.77725208e-03 7.84931755e-03 1.27768791e-02
 1.99097121e-02 1.06270592e+00 4.29635354e-01 5.97065686e-02
 1.14551067e+00 9.21013435e-01 1.77340617e-05 2.49663816e-02
 2.00966070e+00 9.08311203e-01 6.96748329e-02 0.00000000e+00
 7.38318777e-04 3.68993942e-01 3.11252224e+00 1.83882500e+01
 6.19693328e-02 3.92594191e+00 2.02225386e+00 4.13964400e-01
 4.39549987e-01 1.26118441e+00 9.49111657e-01 0.00000000e+00
 2.24010622e-01]
[1.05183303e+00 5.47707872e-01 4.65832167e-01 9.62827924e+00
 1.61346601e+01 5.59971031e-01 3.32030535e-06 1.83163516e+00
 1.02105691e+00 9.94807280e-01 1.55418440e+01 1.42847250e+01
 2.46223005e+00 0.00000000e+00 2.78951279e-01 2.64209261e-04
 1.41965549e+00 8.09852309e-04 3.12673556e-02 8.59573696e-03
 1.80568128e+00 8.51329761e-01 5.67418183e-01 1.13536266e-01
 1.10745070e+00 5.78633083e-01 7.94598073e-17 2.64155735e-01
 8.67329333e-01 9.04708781e-01 3.28500374e-01 0.00000000e+00
 1.06570958e-01 2.69129368e-01 1.18569463e+00 1.38882300e+01
 2.71306048e-01 6.33875372e+00 3.17699212e-01 7.24903457e-01
 7.82524613e-01 1.10959525e+00 7.16590555e-01 1.79745716e-02
 6.18155048e-01]

## Catboost but with logCosh loss function
The LogCosh loss function behaves like the MAE but is differentiable everywhere and stays convex; hopefully this gives us better scores.
All the code is the same; the non-averaged dataset is used, and the combined (all locations) dataset is also used.
No changes in the EDA.

In [None]:
# Load the prepared non-averaged datasets (only_y_cleaned folder)
X_test  = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_test.parquet')
X_train = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_train.parquet')
y_train = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train.parquet')

# "location" is the only categorical feature
cat_feature = ["location"]

# Create the CatBoost pools
train_pool = Pool(X_train, y_train, cat_features=cat_feature)
test_pool = Pool(X_test, cat_features=cat_feature)

# Init the LogCosh model and fit it
catboost_model = CatBoostRegressor(iterations=1000, depth=9, loss_function="LogCosh")
catboost_model.fit(train_pool)

# Fixed typo: the original line read `catboost:model.get_feature_importance()`, which Python parses
# as an annotation on the name `catboost` and fails with NameError on the undefined name `model`.
catboost_model.get_feature_importance()

array([4.72592899e-01, 4.41204367e-01, 3.84270690e-01, 2.10027644e+00,
       6.59232391e+00, 5.12073235e-01, 4.20970309e-04, 6.54575329e-01,
       1.23954700e+00, 7.62895522e-01, 1.26428425e+01, 5.33669134e+00,
       2.00509473e+00, 4.65390852e+01, 9.04155075e-03, 0.00000000e+00,
       2.35266493e-01, 9.80361685e-05, 8.22728932e-03, 3.84898460e-02,
       9.08180785e-02, 5.32835508e-01, 1.89160001e-01, 2.02757084e-01,
       4.05391975e-01, 3.17490715e-01, 1.42871927e-06, 2.04189961e-01,
       1.34250717e+00, 2.04729946e-01, 5.97507981e-02, 0.00000000e+00,
       2.12119598e-03, 1.43597432e-01, 5.77122856e+00, 3.95295454e+00,
       4.82876996e-02, 1.23904967e+00, 1.58323040e+00, 1.14037688e+00,
       3.30905708e-01, 1.17824748e+00, 4.07264867e-01, 1.95389108e-03,
       1.79484175e-01, 4.96647498e-01])

## Improved EDA by removing duplicates 
The changed code in the EDA is below
- We tried both with only removing non-zero values and removing all duplicate values over another threshold.
- The feature importances remained virtually unchanged between tries

In [None]:
def remove_duplicates_over_threshold(df, threshold, col):
    """Trim long runs of identical consecutive non-zero values in ``col``.

    A run is a stretch of consecutive rows with the same value. For every run
    only the first ``threshold - 1`` rows are kept (the first row of a run is
    always kept); rows whose value is exactly 0 are never removed.

    The ``threshold`` argument is now honoured. Previously it was overwritten
    by a hard-coded 4 on the first line of the body, so every caller got the
    same filtering whatever value it passed.

    Parameters:
    df (pandas.DataFrame): Frame to filter; row order defines the runs.
    threshold (int): Run length from which rows start being dropped.
    col (str): Name of the column whose consecutive duplicates are inspected.

    Returns:
    pandas.DataFrame: The rows of ``df`` that are kept.
    """
    # Give each run of identical consecutive values its own id
    run_id = df[col].ne(df[col].shift()).cumsum()

    # 1-based position of every row inside its run (all values in a run are
    # equal, so method='first' ranks them in row order)
    rank_within_run = df.groupby(run_id)[col].rank(method='first')

    # Keep zeros, the first threshold-1 rows of a run, and always the first row of a run
    keep_mask = (df[col] == 0) | (rank_within_run <= threshold - 1) | (run_id != run_id.shift())

    return df[keep_mask]

# Filter once and compare the entry counts before/after.
# Series.info() prints its summary itself and returns None, so it is not wrapped in print()
# (that only added a stray "None" line to the output).
train_all_filtered = remove_duplicates_over_threshold(train_all, 5, "pv_measurement")
train_all["pv_measurement"].info()
train_all_filtered["pv_measurement"].info()

# Continue with the filtered data; the original repeated the identical call here.
train_all = train_all_filtered

In [None]:
def remove_duplicates_over_threshold_under_val(df, col, threshold, val):
    """Drop every run of at least ``threshold`` consecutive rows whose ``col`` value is below ``val``.

    Rows with ``df[col] < val`` are grouped into runs of consecutive rows; a
    run with ``threshold`` or more rows is removed entirely, shorter runs and
    all rows at or above ``val`` are kept.

    The drop mask is built with ``isin`` so it is always a real boolean mask.
    The previous ``map(...).fillna(False)`` produced an object column and
    relied on pandas silently downcasting it before ``~`` was applied.

    Parameters:
    df (pandas.DataFrame): Frame to filter; row order defines the runs.
    col (str): Column to inspect.
    threshold (int): Minimum run length that gets removed.
    val (number): Values strictly below this count as "low".

    Returns:
    pandas.DataFrame: ``df`` without the long low-value runs.
    """
    # True where the value is below the limit
    is_low = df[col] < val

    # A new run id starts every time is_low flips
    run_id = (is_low != is_low.shift()).cumsum()

    # Length of every low-value run, and the ids of the runs that are long enough to drop
    run_lengths = run_id[is_low].value_counts()
    long_runs = run_lengths.index[run_lengths >= threshold]

    return df[~run_id.isin(long_runs)]

# Preview the effect with a run-length threshold of 48.
# Series.info() prints its summary itself and returns None, so it is not wrapped in print().
train_all_filtered = remove_duplicates_over_threshold_under_val(train_all, "pv_measurement", 48, 1)
train_all["pv_measurement"].info()
train_all_filtered["pv_measurement"].info()

# NOTE(review): the filter that is actually applied uses threshold 40, not the 48 previewed above — confirm which is intended.
train_all = remove_duplicates_over_threshold_under_val(train_all, "pv_measurement", 40, 1)

In [None]:
# Show the feature importances of the current catboost_model
# (presumably refit on the de-duplicated data; the refit is not shown in this section — confirm)
catboost_model.get_feature_importance()

array([5.71567256e-01, 4.18943761e-01, 4.79040534e-01, 2.26873274e+00,
       6.71437767e+00, 6.56502672e-01, 1.51028125e-03, 6.30578336e-01,
       1.17645691e+00, 9.28116951e-01, 1.03661404e+01, 3.86780450e+00,
       4.41558314e+00, 2.05461327e+01, 3.07241613e-02, 2.25237295e-07,
       2.03511521e-01, 0.00000000e+00, 5.76017922e-03, 2.70136885e-02,
       3.53194734e-03, 6.25759006e-01, 1.94652305e-01, 1.73253656e-01,
       2.71579813e-01, 1.61276690e-01, 1.82465864e-04, 6.76767197e-02,
       1.15612868e+00, 2.38809134e-01, 2.74421868e-02, 0.00000000e+00,
       2.76030891e-03, 3.29892964e-01, 5.81940730e+00, 6.44946627e+00,
       5.78456790e-02, 7.20871918e-01, 1.75597218e+00, 1.10972078e+00,
       5.19280417e-01, 1.11229712e+00, 4.89601711e-01, 7.72833327e-04,
       1.72535890e-01, 2.52307844e+01])

The model now puts more weight on the snow water feature.

## Catboost - taking the avg and removing duplicates with a threshold of 9
- No changed EDA or Model, only a change in data

In [None]:
# Load the prepared averaged datasets with duplicates removed (avg/no_duplicates folder)
X_test  = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/X_test.parquet')
X_train = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/X_train.parquet')
y_train = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train.parquet')
y_train_a = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_a.parquet')
y_train_b = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_b.parquet')
y_train_c = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_c.parquet')

# Create the CatBoost pools; "location" is passed as a categorical feature
train_pool = Pool(X_train, y_train, cat_features=["location"])
test_pool = Pool(X_test, cat_features=["location"]) 

cat_feature = ["location"]

# Init the LogCosh model and fit it; the categorical feature is given to both the pools and the model
catboost_model = CatBoostRegressor(iterations=1000, depth=9, loss_function="LogCosh", cat_features=cat_feature)
catboost_model.fit(train_pool)
# presumably here so the fitted model returned by fit() is not displayed as the cell output — confirm
print("")

In [None]:
# Show the feature importances of the catboost_model that is currently in memory
catboost_model.get_feature_importance()

array([7.32350783e-01, 2.85228912e-01, 4.64423211e-01, 2.65772002e+00,
       6.79382843e+00, 5.63235254e-01, 1.58971861e-04, 7.14547511e-01,
       1.54153096e+00, 1.61590896e+00, 9.87036516e+00, 5.90169639e+00,
       3.48208012e+00, 2.43886989e+01, 2.96466324e-02, 1.70394143e-05,
       1.84520658e-01, 2.86706848e-05, 6.65691294e-03, 2.53587352e-02,
       3.65127523e-02, 3.81672809e-01, 3.91912356e-01, 2.56655871e-01,
       6.73457813e-01, 1.88787672e-01, 5.97604128e-05, 2.95753886e-02,
       3.69660461e-01, 2.83380598e-01, 4.04856707e-02, 0.00000000e+00,
       3.54154150e-03, 4.53837486e-01, 2.75040585e+00, 5.29590786e+00,
       3.31961327e-01, 1.20555840e+00, 1.59547574e+00, 1.09898773e+00,
       5.04095718e-01, 1.31024818e+00, 4.68696145e-01, 6.25010978e-05,
       1.17533127e-01, 2.29535251e+01])

## Attempted to add time features 
Added time feature for day, month, year - this resulted in a much worse score

In [None]:
# Load the prepared averaged datasets that include the time features (avg/time_features folder):
# test features, training features, the combined target, and one target file per location.
X_test  = pd.read_parquet('data/prepared_datasets/avg/time_features/X_test.parquet')
X_train = pd.read_parquet('data/prepared_datasets/avg/time_features/X_train.parquet')
y_train = pd.read_parquet('data/prepared_datasets/avg/time_features/Y_train.parquet')
y_train_a = pd.read_parquet('data/prepared_datasets/avg/time_features/Y_train_a.parquet')
y_train_b = pd.read_parquet('data/prepared_datasets/avg/time_features/Y_train_b.parquet')
y_train_c = pd.read_parquet('data/prepared_datasets/avg/time_features/Y_train_c.parquet')

In [None]:
# Make the non-cyclical time features from the datetime index.
# The same four columns must mean the same thing in train and test: the original filled
# X_test's "day", "month" and "year" with index.hour, so the test features did not match
# what the model was trained on.
train_all["hour"] = train_all.index.hour
train_all["day"] = train_all.index.day
train_all["month"] = train_all.index.month
train_all["year"] = train_all.index.year

X_test["hour"] = X_test.index.hour
X_test["day"] = X_test.index.day
X_test["month"] = X_test.index.month
X_test["year"] = X_test.index.year

In [None]:
# "location" is the only categorical feature
cat_feature = ["location"]

# Create the CatBoost pools
train_pool = Pool(X_train, y_train, cat_features=cat_feature)
test_pool = Pool(X_test, cat_features=cat_feature)

# Init the LogCosh model and fit it
catboost_model = CatBoostRegressor(iterations=1000, depth=9, loss_function="LogCosh")
catboost_model.fit(train_pool)

# Fixed typo: the original line read `catboost:model.get_feature_importance()`, which Python parses
# as an annotation on the name `catboost` and fails with NameError on the undefined name `model`.
catboost_model.get_feature_importance()

In [None]:
array([5.84361316e-01, 4.17091513e-01, 2.96880145e-01, 2.43437044e+00,
       4.32688271e+00, 5.62287811e-01, 1.06781780e-04, 6.55233426e-01,
       1.77960798e+00, 5.68483469e-01, 1.25757987e+01, 3.35238284e+00,
       4.24354725e+00, 2.70343617e+01, 5.76731375e-03, 1.55130847e-05,
       2.43668225e-01, 2.87369588e-04, 9.47068259e-03, 1.70520459e-02,
       1.12954553e-01, 4.31647511e-01, 2.51294488e-01, 4.38010946e-01,
       2.62406292e-01, 2.90149046e-01, 3.17302370e-04, 2.71996969e-02,
       1.73608779e+00, 2.56813887e-01, 2.91341776e-02, 0.00000000e+00,
       3.76342692e-03, 2.19281392e-01, 1.29732337e+00, 6.03619154e+00,
       2.32270574e-01, 9.61655907e-01, 9.59912491e-01, 6.93837093e-01,
       5.34365778e-01, 7.48165307e-01, 5.07625357e-01, 6.63314403e-04,
       1.24189301e-01, 2.03899539e+01, 1.55471300e+00, 6.08523329e-01,
       1.06854582e+00, 1.11534606e+00])

## Stacking attempts of the best catboosts from here on!

Stacked the catboost predictions of the avg non-duplicates dataset (threshold 10) and the non-avg dataset, both from a simple LogCosh catboost
- No changes in the EDA except that the threshold function changed
- New catboost models below

In [None]:
# Load the averaged, de-duplicated datasets (avg/no_duplicates folder) under *_avg names.
# X_test is a shared name: it holds this folder's test features until another load reassigns it.
X_test  = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/X_test.parquet')
X_train_avg = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/X_train.parquet')
y_train_avg = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train.parquet')
y_train_a_avg = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_a.parquet')
y_train_b_avg = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_b.parquet')
y_train_c_avg = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_c.parquet')

In [None]:
# Load the non-averaged datasets (only_y_cleaned folder) under *_non_avg names.
# NOTE: this reassigns X_test, replacing whatever test features were loaded before.
X_test  = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_test.parquet')
X_train_non_avg = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_train.parquet')
y_train_non_avg = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train.parquet')
y_train_a_non_avg = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train_a.parquet')
y_train_b_non_avg = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train_b.parquet')
y_train_c_non_avg = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train_c.parquet')

In [None]:
# Create the pools for the model trained on the averaged data.
# NOTE(review): both test pools are built from the single X_test variable, i.e. whichever
# test set was loaded last — confirm the averaged model is meant to predict on it.
train_pool_avg = Pool(X_train_avg, y_train_avg, cat_features=["location"])
test_pool_avg = Pool(X_test, cat_features=["location"]) 

# Pools for the model trained on the non-averaged data
train_pool_non_avg = Pool(X_train_non_avg, y_train_non_avg, cat_features=["location"])
test_pool_non_avg = Pool(X_test, cat_features=["location"]) 

# Init both LogCosh models (same settings) and fit each on its own training pool
catboost_model_avg = CatBoostRegressor(iterations=1000, depth=9, loss_function="LogCosh", verbose=100)
catboost_model_avg.fit(train_pool_avg)

catboost_model_non_avg = CatBoostRegressor(iterations=1000, depth=9, loss_function="LogCosh", verbose=100)
catboost_model_non_avg.fit(train_pool_non_avg)

In [None]:
# Predict with both models and average them with equal weight
pred_avg = catboost_model_avg.predict(test_pool_avg)
pred_non_avg = catboost_model_non_avg.predict(test_pool_non_avg)

pred_stacked = pd.DataFrame((pred_avg + pred_non_avg) / 2)

def replace_under_0_2(x):
    """Return 0 for a value below 0.2, otherwise return the value unchanged."""
    if x < 0.2:
        return 0
    else:
        return x

# Clip negatives to 0, then set everything below 0.2 to 0.
# The original assigned the clipped frame and immediately overwrote it with a result computed
# from the unclipped frame; the thresholding is now applied to the clipped frame, and done
# vectorised with mask() instead of the deprecated element-wise applymap().
predictions = pred_stacked.clip(lower=0)
predictions = predictions.mask(predictions < 0.2, 0)

In [None]:
def remove_duplicates_over_threshold(df, threshold, col):
    """Trim long runs of identical consecutive non-zero values in ``col``.

    For every run of equal consecutive values only the first ``threshold - 1``
    rows survive (a run's first row always survives); rows equal to 0 are
    never dropped.

    Parameters:
    df (pandas.DataFrame): Frame to filter; row order defines the runs.
    threshold (int): Run length from which rows start being dropped.
    col (str): Column whose consecutive duplicates are inspected.

    Returns:
    pandas.DataFrame: The rows of ``df`` that are kept.
    """
    values = df[col]

    # Consecutive equal values share one run id
    run_id = (values != values.shift()).cumsum()

    # 1-based position of each row inside its run
    position_in_run = values.groupby(run_id).rank(method='first')

    is_zero = values.eq(0)
    within_limit = position_in_run.le(threshold - 1)
    starts_run = run_id.ne(run_id.shift())

    return df.loc[is_zero | within_limit | starts_run]

# Preview with threshold 9 (kept for reference, disabled):
#train_all_filtered = remove_duplicates_over_threshold(train_all, 9, "pv_measurement")
#print(train_all["pv_measurement"].info())
#print(train_all_filtered["pv_measurement"].info())


# Apply the de-duplication with threshold 10
train_all = remove_duplicates_over_threshold(train_all, 10, "pv_measurement")

# We then saved the new avg data to the data folder with the same data cleaning as before

#### Attempting to stack with only summer data
-all code the same except for added summer data for avg /non-avg case

In [None]:
# The remaining data is the same as before.
# This was done to the averaged dataset and the non-averaged dataset.
# Keep only the rows from May through July.
# NOTE(review): the day filter (1..30) also drops the 31st of May and of July — confirm that is intended.
train_all_summer = train_all[(train_all.index.month >= 5) & (train_all.index.month <= 7) & (train_all.index.day >= 1) & (train_all.index.day <= 30)]

In [None]:
# Load the averaged, de-duplicated datasets (avg/no_duplicates folder) under *_avg names.
# X_test is a shared name: it holds this folder's test features until another load reassigns it.
X_test  = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/X_test.parquet')
X_train_avg = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/X_train.parquet')
y_train_avg = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train.parquet')
y_train_a_avg = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_a.parquet')
y_train_b_avg = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_b.parquet')
y_train_c_avg = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_c.parquet')

In [None]:
# Load the non-averaged datasets (only_y_cleaned folder) under *_non_avg names.
# NOTE: this reassigns X_test, replacing whatever test features were loaded before.
X_test  = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_test.parquet')
X_train_non_avg = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_train.parquet')
y_train_non_avg = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train.parquet')
y_train_a_non_avg = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train_a.parquet')
y_train_b_non_avg = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train_b.parquet')
y_train_c_non_avg = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train_c.parquet')

In [None]:
# Load the averaged summer-only datasets (avg/summer folder) under *_avg_summer names.
# NOTE: this reassigns X_test with the test features stored in the avg/summer folder.
X_test  = pd.read_parquet('data/prepared_datasets/avg/summer/X_test.parquet')
X_train_avg_summer = pd.read_parquet('data/prepared_datasets/avg/summer/X_train.parquet')
y_train_avg_summer = pd.read_parquet('data/prepared_datasets/avg/summer/Y_train.parquet')
y_train_a_avg_summer = pd.read_parquet('data/prepared_datasets/avg/summer/Y_train_a.parquet')
y_train_b_avg_summer = pd.read_parquet('data/prepared_datasets/avg/summer/Y_train_b.parquet')
y_train_c_avg_summer = pd.read_parquet('data/prepared_datasets/avg/summer/Y_train_c.parquet')

In [None]:
# Load the non-averaged summer-only training data (summer folder) under *_non_avg_summer names.
# NOTE: X_test is reassigned from the only_y_cleaned folder here, not from the summer folder.
X_test  = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_test.parquet')
X_train_non_avg_summer = pd.read_parquet('data/prepared_datasets/summer/X_train.parquet')
y_train_non_avg_summer = pd.read_parquet('data/prepared_datasets/summer/Y_train.parquet')
y_train_a_non_avg_summer = pd.read_parquet('data/prepared_datasets/summer/Y_train_a.parquet')
y_train_b_non_avg_summer = pd.read_parquet('data/prepared_datasets/summer/Y_train_b.parquet')
y_train_c_non_avg_summer = pd.read_parquet('data/prepared_datasets/summer/Y_train_c.parquet')

In [None]:
# Create the pools for the model trained on the averaged (non-summer-restricted) data.
# NOTE(review): both test pools are built from the single X_test variable, i.e. whichever
# test set was loaded last — confirm the averaged model is meant to predict on it.
train_pool_avg = Pool(X_train_avg, y_train_avg, cat_features=["location"])
test_pool_avg = Pool(X_test, cat_features=["location"]) 

# Pools for the model trained on the non-averaged data
train_pool_non_avg = Pool(X_train_non_avg, y_train_non_avg, cat_features=["location"])
test_pool_non_avg = Pool(X_test, cat_features=["location"]) 

# Init both LogCosh models (same settings) and fit each on its own training pool
catboost_model_avg = CatBoostRegressor(iterations=1000, depth=9, loss_function="LogCosh", verbose=100)
catboost_model_avg.fit(train_pool_avg)

catboost_model_non_avg = CatBoostRegressor(iterations=1000, depth=9, loss_function="LogCosh", verbose=100)
catboost_model_non_avg.fit(train_pool_non_avg)

In [None]:
# Create the pools for the model trained on the averaged summer-only data.
# NOTE(review): the summer test pools are built from the same X_test as the full-year models,
# so the summer models predict every test row, not only summer rows — confirm.
train_pool_avg_summer = Pool(X_train_avg_summer, y_train_avg_summer, cat_features=["location"])
test_pool_avg_summer = Pool(X_test, cat_features=["location"]) 

# Pools for the model trained on the non-averaged summer-only data
train_pool_non_avg_summer = Pool(X_train_non_avg_summer, y_train_non_avg_summer, cat_features=["location"])
test_pool_non_avg_summer = Pool(X_test, cat_features=["location"]) 

# Init both LogCosh models (same settings as the full-year models) and fit them
catboost_model_avg_summer = CatBoostRegressor(iterations=1000, depth=9, loss_function="LogCosh", verbose=100)
catboost_model_avg_summer.fit(train_pool_avg_summer)

catboost_model_non_avg_summer = CatBoostRegressor(iterations=1000, depth=9, loss_function="LogCosh", verbose=100)
catboost_model_non_avg_summer.fit(train_pool_non_avg_summer)

In [None]:
# Make predictions with the two models trained on summer-only data
pred_avg_summer = catboost_model_avg_summer.predict(test_pool_avg_summer)
pred_non_avg_summer = catboost_model_non_avg_summer.predict(test_pool_non_avg_summer)

# Equal-weight average of the two summer models
pred_stacked_summer = (pred_avg_summer + pred_non_avg_summer) / 2

In [None]:
# Make predictions with the two models that were not restricted to summer data
pred_avg = catboost_model_avg.predict(test_pool_avg)
pred_non_avg = catboost_model_non_avg.predict(test_pool_non_avg)

# Equal-weight average of the two models
pred_stacked = (pred_avg + pred_non_avg) / 2

In [None]:
# Weighted blend: 65 % full-year stack, 35 % summer-only stack
pred_tot_stacked = pd.DataFrame(0.65*pred_stacked + 0.35*pred_stacked_summer)

def replace_under_0_2(x):
    """Return 0 for a value below 0.2, otherwise return the value unchanged."""
    if x < 0.2:
        return 0
    else:
        return x

# Clip negatives to 0, then set everything below 0.2 to 0.
# The original assigned the clipped frame and immediately overwrote it with a result computed
# from the unclipped frame; the thresholding is now applied to the clipped frame, and done
# vectorised with mask() instead of the deprecated element-wise applymap().
predictions = pred_tot_stacked.clip(lower=0)
predictions = predictions.mask(predictions < 0.2, 0)

#### Attempting to submit only data for summer
- All code is the same as before; however, only the stack containing the summer data was submitted. This got a score of 150, which indicates we maybe should stack them more similarly

## adding validation of different summer to the stack
Simply adding avg non duplicates data with optuna tuning for different summers

In [None]:
def test_train_split(df):
    dates_2 = (df.index >= '2023-04-01') & (df.index <= '2023-04-15')
    dates_1 = (df.index >= '2020-05-01') & (df.index <= '2020-08-01')

    test_set = df[dates_1 | dates_2]

    training_set = df[~(dates_1 | dates_2)]

    X_train = training_set.drop("pv_measurement", axis=1)
    y_train = training_set['pv_measurement']

    X_test = test_set.drop("pv_measurement", axis=1)
    y_test = test_set['pv_measurement'] 

    
    
    return X_train, X_test, y_train, y_test

# Per-location splits with the "location" column dropped from the features.
# NOTE(review): the first target is spelled X_trainnew_a (no underscore), unlike X_train_new_b / X_train_new_c — confirm whether that is intended.
X_trainnew_a, X_test_new_a, y_train_new_a, y_test_a = test_train_split(pd.concat([X_train[X_train["location"] == "A"].drop("location", axis=1), y_train_a], axis=1))
X_train_new_b, X_test_new_b, y_train_new_b, y_test_b = test_train_split(pd.concat([X_train[X_train["location"] == "B"].drop("location", axis=1), y_train_b], axis=1))
X_train_new_c, X_test_new_c, y_train_new_c, y_test_c = test_train_split(pd.concat([X_train[X_train["location"] == "C"].drop("location", axis=1), y_train_c], axis=1))

# Per-location splits that keep the "location" column; these reassign y_test_a/b/c from the lines above.
X_train_loc_a, X_test_loc_a, y_train_loc_a, y_test_a = test_train_split(pd.concat([X_train[X_train["location"] == "A"], y_train_a], axis=1))
X_train_loc_b, X_test_loc_b, y_train_loc_b, y_test_b = test_train_split(pd.concat([X_train[X_train["location"] == "B"], y_train_b], axis=1))
X_train_loc_c, X_test_loc_c, y_train_loc_c, y_test_c = test_train_split(pd.concat([X_train[X_train["location"] == "C"], y_train_c], axis=1))


# Stack the three locations back together (A, B, C order), "location" column included
X_train_new = pd.concat([X_train_loc_a, X_train_loc_b, X_train_loc_c])
X_test_new = pd.concat([X_test_loc_a, X_test_loc_b, X_test_loc_c])
y_train_new = pd.concat([y_train_loc_a, y_train_loc_b, y_train_loc_c])
y_test = pd.concat([y_test_a, y_test_b, y_test_c])

In [None]:
def test_train_split(df):
    dates_2 = (df.index >= '2023-04-01') & (df.index <= '2023-04-15')
    dates_1 = (df.index >= '2021-05-01') & (df.index <= '2021-08-01')

    test_set = df[dates_1 | dates_2]

    training_set = df[~(dates_1 | dates_2)]

    X_train = training_set.drop("pv_measurement", axis=1)
    y_train = training_set['pv_measurement']

    X_test = test_set.drop("pv_measurement", axis=1)
    y_test = test_set['pv_measurement'] 

    
    
    return X_train, X_test, y_train, y_test

# Per-location splits with the "location" column dropped from the features.
# NOTE(review): the first target is spelled X_trainnew_a (no underscore), unlike X_train_new_b / X_train_new_c — confirm whether that is intended.
X_trainnew_a, X_test_new_a, y_train_new_a, y_test_a = test_train_split(pd.concat([X_train[X_train["location"] == "A"].drop("location", axis=1), y_train_a], axis=1))
X_train_new_b, X_test_new_b, y_train_new_b, y_test_b = test_train_split(pd.concat([X_train[X_train["location"] == "B"].drop("location", axis=1), y_train_b], axis=1))
X_train_new_c, X_test_new_c, y_train_new_c, y_test_c = test_train_split(pd.concat([X_train[X_train["location"] == "C"].drop("location", axis=1), y_train_c], axis=1))

# Per-location splits that keep the "location" column; these reassign y_test_a/b/c from the lines above.
X_train_loc_a, X_test_loc_a, y_train_loc_a, y_test_a = test_train_split(pd.concat([X_train[X_train["location"] == "A"], y_train_a], axis=1))
X_train_loc_b, X_test_loc_b, y_train_loc_b, y_test_b = test_train_split(pd.concat([X_train[X_train["location"] == "B"], y_train_b], axis=1))
X_train_loc_c, X_test_loc_c, y_train_loc_c, y_test_c = test_train_split(pd.concat([X_train[X_train["location"] == "C"], y_train_c], axis=1))


# Stack the three locations back together (A, B, C order), "location" column included
X_train_new = pd.concat([X_train_loc_a, X_train_loc_b, X_train_loc_c])
X_test_new = pd.concat([X_test_loc_a, X_test_loc_b, X_test_loc_c])
y_train_new = pd.concat([y_train_loc_a, y_train_loc_b, y_train_loc_c])
y_test = pd.concat([y_test_a, y_test_b, y_test_c])

In [None]:
def objective(trial, X_train, y_train, X_val=None, y_val=None):
    """Optuna objective: fit a LogCosh CatBoost model and return its MAE on the hold-out set.

    The model is now fitted on the ``X_train`` / ``y_train`` that are passed
    in and scored on the hold-out split. Previously both arguments were
    ignored: the function fitted the global ``train_pool`` and predicted the
    global ``test_pool``, so the tuning did not use the validation split that
    the caller prepared.

    Parameters:
    trial (optuna.trial.Trial): Trial used to sample the hyperparameters.
    X_train (pandas.DataFrame): Training features.
    y_train: Training target.
    X_val (pandas.DataFrame, optional): Hold-out features; defaults to the global ``X_test_new``.
    y_val (optional): Hold-out target; defaults to the global ``y_test``.

    Returns:
    float: Mean absolute error on the hold-out set (lower is better).
    """
    params = {
        "iterations": trial.suggest_int("iterations", 300, 4000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 13),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.3, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.3, 1.0),
        "border_count": trial.suggest_int("border_count", 1, 1000),
        "rsm": trial.suggest_float("rsm", 0.05, 1),
        "loss_function": "LogCosh"
    }

    # Fall back to the hold-out split built with test_train_split in the notebook
    if X_val is None:
        X_val = X_test_new
    if y_val is None:
        y_val = y_test

    # "location" is categorical when present (the per-location frames have it dropped)
    cat_cols = [name for name in ("location",) if name in X_train.columns]
    fit_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_val, cat_features=cat_cols)

    catboost_model_val = CatBoostRegressor(**params, verbose=200)
    catboost_model_val.fit(fit_pool)
    pred = pd.DataFrame(catboost_model_val.predict(val_pool))
    MAE = mean_absolute_error(y_val, pred)

    return MAE
    
# Minimise the objective (MAE) over 70 Optuna trials on the combined training split
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, X_train_new, y_train_new), n_trials=70)

# Both validation windows were tuned and their results averaged, then added to the previous stack; the best result was found by not including the summer data.
We then tested adding a training of location per location to the stack

In [None]:
# Train one model per location on the non-averaged data, using the previously found
# hyperparameters (tuned with the 2020 summer as validation window)
params_a = {'iterations': 2511, 'learning_rate': 0.013387708538234228, 'depth': 9, 'min_data_in_leaf': 37, 'l2_leaf_reg': 9, 'bagging_temperature': 0.6037942407543951, 'random_strength': 0.7203046909719719, 'border_count': 792, 'rsm': 0.5142593972884377, "loss_function": "LogCosh"}
params_b = {'iterations': 1647, 'learning_rate': 0.008285667949530987, 'depth': 2, 'min_data_in_leaf': 7, 'l2_leaf_reg': 6, 'bagging_temperature': 0.638344903820208, 'random_strength': 0.9388444992830671, 'border_count': 434, 'rsm': 0.8347735297026142, "loss_function": "LogCosh"}
params_c = {'iterations': 2216, 'learning_rate': 0.02932152249119453, 'depth': 10, 'min_data_in_leaf': 21, 'l2_leaf_reg': 2, 'bagging_temperature': 0.8784102456931138, 'random_strength': 0.6901129919784297, 'border_count': 119, 'rsm': 0.8571604856673344, "loss_function": "LogCosh"}

# Split the non-averaged training features per location; "location" is constant within each split, so it is dropped
X_train_a = X_train_non_avg[X_train_non_avg["location"] == "A"].drop("location", axis=1)
X_train_b = X_train_non_avg[X_train_non_avg["location"] == "B"].drop("location", axis=1)
X_train_c = X_train_non_avg[X_train_non_avg["location"] == "C"].drop("location", axis=1)

# Same split for the test features (taken from whichever X_test was loaded last)
X_test_a = X_test[X_test["location"] == "A"].drop("location", axis=1)
X_test_b = X_test[X_test["location"] == "B"].drop("location", axis=1)
X_test_c = X_test[X_test["location"] == "C"].drop("location", axis=1)
          
train_pool_a = Pool(X_train_a, y_train_a_non_avg)
train_pool_b = Pool(X_train_b, y_train_b_non_avg)
train_pool_c = Pool(X_train_c, y_train_c_non_avg)


test_pool_a = Pool(X_test_a) 
test_pool_b = Pool(X_test_b) 
test_pool_c = Pool(X_test_c) 

# One model per location, each with its own tuned parameters
catboost_model_a = CatBoostRegressor(**params_a)
catboost_model_b = CatBoostRegressor(**params_b)
catboost_model_c = CatBoostRegressor(**params_c)

catboost_model_a.fit(train_pool_a)
catboost_model_b.fit(train_pool_b)
catboost_model_c.fit(train_pool_c)


pred_a = catboost_model_a.predict(test_pool_a)
pred_b = catboost_model_b.predict(test_pool_b)
pred_c = catboost_model_c.predict(test_pool_c)

In [None]:
# Predict each location with its own model
pred_a = catboost_model_a.predict(test_pool_a)
pred_b = catboost_model_b.predict(test_pool_b)
pred_c = catboost_model_c.predict(test_pool_c)

# Join the predictions in A, B, C order and replace negative values with 0
pred_non_avg_per_location = np.concatenate([pred_a, pred_b, pred_c], axis=0)
pred_non_avg_per_location = np.where(pred_non_avg_per_location < 0, 0, pred_non_avg_per_location)

In [None]:
# Equal-weight average of the combined stack and the per-location models
pred_tot_stacked = pd.DataFrame((pred_stacked + pred_non_avg_per_location)/2)

def replace_under_0_2(x):
    """Return 0 for a value below 0.2, otherwise return the value unchanged."""
    if x < 0.2:
        return 0
    else:
        return x

# Clip negatives to 0, then set everything below 0.2 to 0.
# The original assigned the clipped frame and immediately overwrote it with a result computed
# from the unclipped frame; the thresholding is now applied to the clipped frame, and done
# vectorised with mask() instead of the deprecated element-wise applymap().
predictions = pred_tot_stacked.clip(lower=0)
predictions = predictions.mask(predictions < 0.2, 0)

#### We also tested using location per location with the averaged (avg) dataset

In [None]:
# Hyperparameters for the per-location models trained on the averaged data
params_avg_a = {'iterations': 2511, 'learning_rate': 0.013387708538234228, 'depth': 9, 'min_data_in_leaf': 37, 'l2_leaf_reg': 9, 'bagging_temperature': 0.6037942407543951, 'random_strength': 0.7203046909719719, 'border_count': 792, 'rsm': 0.5142593972884377, "loss_function": "LogCosh"}
params_avg_b = {'iterations': 1647, 'learning_rate': 0.008285667949530987, 'depth': 2, 'min_data_in_leaf': 7, 'l2_leaf_reg': 6, 'bagging_temperature': 0.638344903820208, 'random_strength': 0.9388444992830671, 'border_count': 434, 'rsm': 0.8347735297026142, "loss_function": "LogCosh"}
params_avg_c = {'iterations': 2216, 'learning_rate': 0.02932152249119453, 'depth': 10, 'min_data_in_leaf': 21, 'l2_leaf_reg': 2, 'bagging_temperature': 0.8784102456931138, 'random_strength': 0.6901129919784297, 'border_count': 119, 'rsm': 0.8571604856673344, "loss_function": "LogCosh"}

# Split the averaged training features per location; "location" is constant within each split, so it is dropped
X_train_avg_a = X_train_avg[X_train_avg["location"] == "A"].drop("location", axis=1)
X_train_avg_b = X_train_avg[X_train_avg["location"] == "B"].drop("location", axis=1)
X_train_avg_c = X_train_avg[X_train_avg["location"] == "C"].drop("location", axis=1)

# Same split for the test features (taken from whichever X_test was loaded last)
X_test_a = X_test[X_test["location"] == "A"].drop("location", axis=1)
X_test_b = X_test[X_test["location"] == "B"].drop("location", axis=1)
X_test_c = X_test[X_test["location"] == "C"].drop("location", axis=1)

train_pool_avg_a = Pool(X_train_avg_a, y_train_a_avg)
train_pool_avg_b = Pool(X_train_avg_b, y_train_b_avg)
train_pool_avg_c = Pool(X_train_avg_c, y_train_c_avg)

test_pool_a = Pool(X_test_a)
test_pool_b = Pool(X_test_b)
test_pool_c = Pool(X_test_c)

catboost_model_avg_a = CatBoostRegressor(**params_avg_a, verbose=400)
catboost_model_avg_b = CatBoostRegressor(**params_avg_b, verbose=400)
catboost_model_avg_c = CatBoostRegressor(**params_avg_c, verbose=400)

# Fit on the averaged pools. The original fitted on train_pool_a/b/c (the non-averaged pools
# from the earlier cell), so the train_pool_avg_* pools built above were never used and the
# "avg" models were just copies of the non-averaged ones.
catboost_model_avg_a.fit(train_pool_avg_a)
catboost_model_avg_b.fit(train_pool_avg_b)
catboost_model_avg_c.fit(train_pool_avg_c)

pred_avg_a = catboost_model_avg_a.predict(test_pool_a)
pred_avg_b = catboost_model_avg_b.predict(test_pool_b)
pred_avg_c = catboost_model_avg_c.predict(test_pool_c)

# Join the predictions in A, B, C order and replace negative values with 0
pred_avg_per_location = np.concatenate((pred_avg_a, pred_avg_b, pred_avg_c))

pred_avg_per_location[pred_avg_per_location < 0] = 0

# Equal-weight average of the combined stack and both per-location model sets
pred_tot_stacked = pd.DataFrame((pred_stacked + pred_non_avg_per_location + pred_avg_per_location)/3)

def replace_under_0_2(x):
    """Return 0 for a value below 0.2, otherwise return the value unchanged."""
    if x < 0.2:
        return 0
    else:
        return x

# Clip negatives to 0, then set everything below 0.2 to 0 (applied to the clipped frame;
# the original overwrote the clipped result with one computed from the unclipped frame).
predictions = pred_tot_stacked.clip(lower=0)
predictions = predictions.mask(predictions < 0.2, 0)

We then added a new avg dataset, where the long runs of low duplicated values (typically where there is snow covering the sensor) were removed.
This was then fitted on a catboost model and added to the stack. The code to create the new data is given below.

In [None]:

def remove_duplicates_over_threshold_under_val(df, col, threshold, val):
    """Drop every run of at least ``threshold`` consecutive rows whose ``col`` value is exactly 0.

    Rows with ``df[col] == 0`` are grouped into runs of consecutive rows; a
    run with ``threshold`` or more rows is removed entirely, shorter runs and
    all non-zero rows are kept.

    The drop mask is built with ``isin`` so it is always a real boolean mask.
    The previous ``map(...).fillna(False)`` produced an object column and
    relied on pandas silently downcasting it before ``~`` was applied.

    Parameters:
    df (pandas.DataFrame): Frame to filter; row order defines the runs.
    col (str): Column to inspect.
    threshold (int): Minimum run length that gets removed.
    val: Accepted for compatibility with existing callers but NOT used; this
        version of the function only matches exact zeros.

    Returns:
    pandas.DataFrame: ``df`` without the long zero runs.
    """
    # True where the value is exactly zero
    is_zero = df[col] == 0

    # A new run id starts every time is_zero flips
    run_id = (is_zero != is_zero.shift()).cumsum()

    # Length of every zero run, and the ids of the runs that are long enough to drop
    run_lengths = run_id[is_zero].value_counts()
    long_runs = run_lengths.index[run_lengths >= threshold]

    return df[~run_id.isin(long_runs)]

# Preview the effect with a run-length threshold of 48.
# Series.info() prints its summary itself and returns None, so it is not wrapped in print().
train_all_filtered = remove_duplicates_over_threshold_under_val(train_all, "pv_measurement", 48, 1)
train_all["pv_measurement"].info()
train_all_filtered["pv_measurement"].info()

# NOTE(review): the filter that is actually applied uses threshold 40, not the 48 previewed above — confirm which is intended.
train_all = remove_duplicates_over_threshold_under_val(train_all, "pv_measurement", 40, 1)

# The new training data is then saved to a prepared data folder


In [None]:
#New catboost model with the removed lows
train_pool_avg_no_lows = Pool(X_train_avg_no_lows, y_train_avg_no_lows, cat_features=["location"])
test_pool = Pool(X_test, cat_features=["location"]) 

catboost_model_avg_no_lows = CatBoostRegressor(iterations=1000, depth=9, loss_function="LogCosh", verbose=100)
catboost_model_avg_no_lows.fit(train_pool_avg_no_lows)

In [None]:
# Equal-weight average of the four model outputs
pred_tot_stacked = pd.DataFrame((pred_avg + pred_non_avg + pred_non_avg_per_location + pred_avg_no_lows)/4)

def replace_under_0_2(x):
    """Return 0 for a value below 0.2, otherwise return the value unchanged."""
    if x < 0.2:
        return 0
    else:
        return x

# Clip negatives to 0, then set everything below 0.2 to 0.
# The original assigned the clipped frame and immediately overwrote it with a result computed
# from the unclipped frame; the thresholding is now applied to the clipped frame, and done
# vectorised with mask() instead of the deprecated element-wise applymap().
predictions = pred_tot_stacked.clip(lower=0)
predictions = predictions.mask(predictions < 0.2, 0)

We then tried a new stack combining the summer data with the stack above; this resulted in worse scores.
After that we tried the same stack as above, but with default CatBoost hyperparameters for the models trained location per location; this also resulted in worse scores.

## Autogluon

This is one Autogluon notebook we attempted alongside Catboost; it produced promising results, but the results were not consistent.

In [None]:
import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from IPython.display import display
from functools import reduce
import os
import glob

In [None]:
def merge_df(dataframes, how='left'):
    """
    Merges multiple pandas DataFrames on their index.

    The first frame is the starting point; every following frame is joined
    onto the running result with ``DataFrame.join``, in list order.

    Parameters:
    dataframes (list of pandas.DataFrame): The list of DataFrames to merge.
        Must contain at least one frame.
    how (str): Type of merge to perform (defaults to 'left'):
        - 'left': use only keys from left frame (SQL: left outer join)
        - 'right': use only keys from right frame (SQL: right outer join)
        - 'outer': use union of keys from both frames (SQL: full outer join)
        - 'inner': use intersection of keys from both frames (SQL: inner join)

    Returns:
    pandas.DataFrame: The merged DataFrame. When the list holds a single
        frame, that frame itself is returned.

    Raises:
    IndexError: If ``dataframes`` is empty.
    """
    # Start with the first DataFrame in the list
    merged_df = dataframes[0]

    # Iteratively join each remaining DataFrame onto the running result
    for df in dataframes[1:]:
        merged_df = merged_df.join(df, how=how)

    return merged_df


In [None]:
def concat_df(dataframes, axis='index'):
    """Concatenate DataFrames after restricting each one to the columns they all share.

    Parameters:
    dataframes (list of pandas.DataFrame): Frames to concatenate.
    axis: Passed straight through to ``pandas.concat``.

    Returns:
    pandas.DataFrame: The concatenation of the column-restricted frames.
    """
    # Narrow the column set down frame by frame
    shared = dataframes[0].columns
    for frame in dataframes:
        shared = shared.intersection(frame.columns)

    # Keep only the shared columns of every frame
    trimmed = []
    for frame in dataframes:
        trimmed.append(frame[shared])

    return pd.concat(trimmed, axis=axis)

In [None]:
def read_parquet(filepath):
    """Load a parquet file and index it by (Data_Type, Location, Time).

    The Time level is the first of the columns 'date_forecast' / 'time' found
    in the file; if neither exists a message is printed and the frame's
    existing index is used instead. Location is the name of the directory
    that contains the file, and Data_Type is the file name up to '.parquet'.

    Any exception is reported with a printed message and an empty DataFrame
    is returned in its place.
    """
    try:
        frame = pd.read_parquet(filepath)

        # Pick the first known timestamp column, if any, as the index
        time_column = next(
            (name for name in ('date_forecast', 'time') if name in frame.columns),
            None,
        )
        if time_column is None:
            print("Datetime column not found")
        else:
            frame = frame.set_index(time_column)

        # Derive both labels from the path: .../<location>/<data_type>.parquet
        folder, filename = os.path.split(filepath)
        location = os.path.basename(folder)
        data_type = filename.rsplit('.parquet')[0]

        # Prefix the existing index with the two constant labels
        frame.index = pd.MultiIndex.from_product(
            [[data_type], [location], frame.index],
            names=['Data_Type', 'Location', 'Time'],
        )
        return frame

    except Exception as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()

In [None]:
def get_min_time_delta(df):
    """Return the smallest non-zero gap between index timestamps as a frequency string.

    The index is sorted, consecutive differences are taken, and zero
    differences (duplicate timestamps) are ignored. The smallest remaining gap
    is floored to whole units and formatted as:
        - "<n>S" when it is shorter than one minute,
        - "<n>T" when it is shorter than one hour,
        - "<n>H" otherwise.

    Parameters:
    df (pandas.DataFrame): Frame whose index holds the timestamps.

    Returns:
    str: The frequency string, e.g. "15T" or "1H".

    Raises:
    ValueError: If the index has no two distinct timestamps, so no gap exists.
    """
    data_sorted = df.sort_index()
    time_deltas = data_sorted.index.to_series().diff().dropna()
    non_zero_deltas = time_deltas[time_deltas != pd.Timedelta(0)]

    # Without this guard min() would be NaT and the result would be "nanH"
    if non_zero_deltas.empty:
        raise ValueError("Cannot infer a time delta: the index needs at least two distinct timestamps")

    min_time_delta = non_zero_deltas.min().total_seconds()

    if min_time_delta < 60:
        # Sub-minute spacing: whole minutes would be 0, so report seconds
        return f"{min_time_delta:.0f}S"
    if min_time_delta < 3600:
        # Convert to minutes
        return f"{min_time_delta // 60:.0f}T"
    # Convert to hours
    return f"{min_time_delta // 3600:.0f}H"


# Start

In [None]:

# Collect one frame per parquet file found at any depth below `directory`
dataframes = []
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere without editing it
directory = '/home/andres/ml/data/'
filepaths = glob.glob(os.path.join(directory, '**', '*.parquet'), recursive=True)

for filepath in filepaths:
    df = read_parquet(filepath)
    dataframes.append(df)
# Stack all loaded frames row-wise into a single frame
data = pd.concat(dataframes)

In [None]:
# Slice the combined frame by the label held in its first index level
X_train_observed = data.xs('X_train_observed')
X_train_estimated = data.xs('X_train_estimated')
# Training features: observed rows followed by estimated rows
X_train = pd.concat([X_train_observed, X_train_estimated])
Y_train = data.xs('train_targets')

In [None]:
def interpolate(data):
    """Fill missing values column by column, modifying ``data`` in place.

    Each column is interpolated with the method listed for it in the table
    below ('linear' for columns that are not listed). Values that are still
    missing afterwards are replaced with 0, and a message naming the column is
    printed when that happens.

    Parameters:
    data (pandas.DataFrame): Frame to fill. Its columns are overwritten.

    Returns:
    pandas.DataFrame: The same ``data`` object that was passed in.
    """

    interpolation_methods = {
        'absolute_humidity_2m:gm3': 'index',
        'air_density_2m:kgm3': 'index',
        'ceiling_height_agl:m': 'index',
        'clear_sky_energy_1h:J': 'cubic',
        'clear_sky_rad:W': 'cubic',
        'cloud_base_agl:m': 'pchip',
        'dew_or_rime:idx': 'nearest',
        'dew_point_2m:K': 'linear',
        'diffuse_rad:W': 'cubic',
        'diffuse_rad_1h:J': 'cubic',
        'direct_rad:W': 'cubic',
        'direct_rad_1h:J': 'cubic',
        'effective_cloud_cover:p': 'index',
        'elevation:m': 'pad',
        'fresh_snow_12h:cm': 'zero',
        'fresh_snow_1h:cm': 'zero',
        'fresh_snow_24h:cm': 'zero',
        'fresh_snow_3h:cm': 'zero',
        'fresh_snow_6h:cm': 'zero',
        'is_day:idx': 'pad',
        'is_in_shadow:idx': 'pad',
        'msl_pressure:hPa': 'time',
        'precip_5min:mm': 'index',
        'precip_type_5min:idx': 'nearest',
        'pressure_100m:hPa': 'index',
        'pressure_50m:hPa': 'index',
        'prob_rime:p': 'index',
        'rain_water:kgm2': 'index',
        'relative_humidity_1000hPa:p': 'index',
        'sfc_pressure:hPa': 'time',
        'snow_density:kgm3': 'zero',
        'snow_depth:cm': 'nearest',
        'snow_drift:idx': 'pad',
        'snow_melt_10min:mm': 'index',
        'snow_water:kgm2': 'index',
        'sun_azimuth:d': 'cubic',
        'sun_elevation:d': 'cubic',
        'super_cooled_liquid_water:kgm2': 'index',
        't_1000hPa:K': 'index',
        'total_cloud_cover:p': 'index',
        'visibility:m': 'index',
        'wind_speed_10m:ms': 'index',
        'wind_speed_u_10m:ms': 'index',
        'wind_speed_v_10m:ms': 'index',
        'wind_speed_w_1000hPa:ms': 'index',
        'pv_measurement':'index'
    }
    for column in data.columns:
        method = interpolation_methods.get(column, 'linear')
        if method == 'pad':
            # Forward fill; 'pad' is deprecated as an interpolate() method
            filled = data[column].ffill()
        else:
            filled = data[column].interpolate(method=method)
        if filled.isna().any():
            print(column + ' has NaN, replacing NaN with 0.')
            filled = filled.fillna(0)
        # Assign back explicitly: an in-place call on `data[column]` is a
        # chained operation that may act on a temporary instead of `data`.
        data[column] = filled
    return data

In [None]:
# Build one training frame per location and stack them into `df`
sub_dfs = []
for index_combination, sub_df in X_train.groupby(level='Location'):
    # Features of this location: move Location from the index to a column,
    # resample to a 15-minute grid and drop the two listed columns
    sub_df = sub_df.reset_index(level='Location').resample('15T').asfreq(fill_value=None).drop(['pv_measurement','date_calc'], axis=1)
    # NOTE(review): Y_train is rebound on every pass and is never filtered by
    # the current location — confirm this target processing is meant to run
    # inside the loop.
    Y_train = Y_train.pv_measurement.reset_index(level='Location')
    # Fills Y_train in place; the return value is not used
    interpolate(Y_train)
    # Restore Location as an index level and put it first
    Y_train.set_index('Location', inplace=True, append=True)
    Y_train = Y_train.swaplevel()
    # Left-join the targets onto the interpolated features
    # NOTE(review): sub_df has a single-level index here while Y_train has two levels — verify the join aligns rows as intended
    train_data = merge_df([interpolate(sub_df),Y_train])
    # Label the rows with the group key and move that label into the index
    train_data['Location'] = index_combination
    train_data.set_index('Location', inplace=True, append=True)
    train_data = train_data.swaplevel()
    sub_dfs.append(train_data)
df = pd.concat(sub_dfs)

In [None]:
# Turn the index levels into columns and cast the Time column to datetime64[ns]
df = df.reset_index().astype({'Time': 'datetime64[ns]'})

# Wrap the flat frame, keyed by the Location and Time columns
train_data = TimeSeriesDataFrame.from_data_frame(df, id_column="Location", timestamp_column="Time")

In [None]:
# Read the test file, indexed by (location, time)
test = pd.read_csv(
    '/home/andres/ml/data/test.csv',
    index_col=['location', 'time'],
    parse_dates=['time'],
)
# Keep only the rows of location 'A'
sub_df = test.loc['A']
test = sub_df
# One prediction per remaining test row
num_predictions = len(test)


In [None]:
# Split the training data, using num_predictions as the prediction length of the split
(train_data_s, test_data_s) = train_data.train_test_split(prediction_length=num_predictions)

In [None]:
# Load the location-A test features indexed by their forecast timestamp.
# The file is read directly because this notebook's read_parquet helper takes
# only a file path and returns a three-level index, while the steps below
# need a plain timestamp index.
X_pred = pd.read_parquet('/home/andres/ml/data/A/X_test_estimated.parquet').set_index('date_forecast')
# Resample to the smallest spacing found in the index, then fill the gaps in place
freq = get_min_time_delta(X_pred)
X_pred = X_pred.resample(freq).asfreq(fill_value=None)
# NOTE(review): the training features drop 'date_calc' before interpolation; confirm whether it should be dropped here as well
interpolate(X_pred)
X_pred['timestamp'] = X_pred.index.astype('datetime64[ns]')
print(X_pred['timestamp'])
X_pred['location'] = 'A'
X_pred = TimeSeriesDataFrame.from_data_frame(
    X_pred,
    id_column="location",
    timestamp_column="timestamp",
)


# Model Selection & Training

In [None]:
# Show the value that is passed to the predictor as its prediction length
print(num_predictions)

In [None]:
# Predictor for the pv_measurement column, evaluated with MSE; the prediction
# length is num_predictions and "autogluon" is given as its path
predictor = TimeSeriesPredictor(
    prediction_length=num_predictions,
    path="autogluon",
    target="pv_measurement",
    eval_metric="MSE",
)

# Fit on the training part of the split with the "fast_training" preset and a time limit of 600
predictor.fit(
    train_data_s,
    presets="fast_training",
    time_limit=600,
)

# Model Evaluation

In [None]:
# Show the predictor's leaderboard computed on the test part of the split
display(predictor.leaderboard(test_data_s, silent=True))

In [None]:
# Evaluate the fitted predictor on the test part of the split
predictor.evaluate(test_data_s)

# Testing

In [None]:
# NOTE(review): the predictor is constructed without any known-covariate
# configuration in this notebook — confirm that `known_covariates` has an effect here.
predictions = predictor.predict(train_data, known_covariates=X_pred)
# info() prints its summary itself and returns None, so it is called directly
# rather than wrapped in print() (which would emit an extra "None" line).
predictions.info()