# H2O
This Jupyter notebook contains all the steps necessary to reproduce the project group's Kaggle submission. Note that you need the parquet files in the same folder structure as the original data files.

In [None]:
# Data Processing Tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Machine Learning Models
import catboost as cb

# Machine Learning Tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score


In [None]:
def data_length_matching(train: pd.DataFrame, obs: pd.DataFrame)-> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Restrict both frames to the timestamps they have in common and sort them.

    Rows of `train` are kept when their 'time' value occurs in
    obs['date_forecast']; rows of `obs` are kept when their 'date_forecast'
    value occurs in train['time']. Each result is sorted by its own
    timestamp column and keeps its original index labels.

    Returns the tuple (train_subset, obs_subset).
    """
    # Membership masks: which rows of each frame have a partner in the other.
    shared_in_train = train['time'].isin(obs['date_forecast'])
    shared_in_obs = obs['date_forecast'].isin(train['time'])

    # Sorted by date, so that the two frames line up row by row.
    train_matched = train.loc[shared_in_train].sort_values(by='time')
    obs_matched = obs.loc[shared_in_obs].sort_values(by='date_forecast')

    return train_matched, obs_matched



In [None]:
def square_df(df: pd.DataFrame, timeStamps: str, measurements: list[str]):
    """
    Return a new frame holding the element-wise square of each measurement.

    df: Frame containing the measurement columns.
    timeStamps: Accepted for signature symmetry with the other feature
        builders; it is not used here.
    measurements: Names of the columns to square.

    Each output column is named 'squared_<measurement>_2'.
    """
    result = pd.DataFrame()
    source = df.copy()
    for name in measurements:
        result['squared_' + name + '_2'] = source[name].pow(2)
    return result

def upscale_(df: pd.DataFrame, feature: str, upscale: int) -> pd.DataFrame:
    """
    Return a one-column frame with `feature` multiplied by `upscale`.

    The output column is named 'uscale_<feature>'; the input frame is left
    untouched.
    """
    source = df.copy()
    scaled = pd.DataFrame()
    scaled["uscale_" + feature] = source[feature].mul(upscale)
    return scaled

def dot_df(df: pd.DataFrame, dot_feature: str, features: list[str]) -> pd.DataFrame:
    """
    Return the element-wise product of `dot_feature` with each listed feature.

    Each output column is named '<dot_feature>_dot_<feature>'.
    """
    source = df.copy()
    products = pd.DataFrame()

    for name in features:
        column_name = dot_feature + '_dot_' + name
        products[column_name] = source[dot_feature].mul(source[name])

    return products

def log_df(df: pd.DataFrame, features: list[str]):
    """
    Return log(|x| + 1) for each listed feature as a new frame.

    Taking the absolute value and adding one keeps the argument of the
    logarithm at or above 1. Each output column is named 'log_<feature>'.
    The input frame is not modified.
    """
    source = df.copy()
    logged = pd.DataFrame()

    for name in features:
        shifted_magnitude = source[name].abs() + 1
        logged['log_' + name] = np.log(shifted_magnitude)

    return logged


def difference_df(df: pd.DataFrame, timeStamps: str, measurements: list[str]) -> pd.DataFrame:
    """
    Build first-difference features that can be used to create time dependency.

    The frame is sorted by `timeStamps` and, for every measurement, the
    difference between each row and the preceding row is computed. The
    differences are NOT divided by the elapsed time.

    df: Frame containing the timestamp and measurement columns (not modified).
    timeStamps: Name of the column to sort by.
    measurements: Names of the columns to difference.

    Returns a new frame, in timestamp order and carrying the index labels of
    the sorted input, with one column 'derivative_<measurement>_d' per
    measurement. An empty `measurements` list gives an empty frame.
    """
    der_df = pd.DataFrame()
    if len(measurements) == 0:
        # Nothing to compute; skip the fill steps, which are meaningless on a column-less frame.
        return der_df

    # sort_values returns a new frame, so the caller's data is never touched.
    ordered = df.sort_values(timeStamps)

    for measurement in measurements:
        der_df['derivative_' + measurement + '_d'] = ordered[measurement].diff()

    # Fill gaps linearly; the first row has no predecessor and is still NaN
    # afterwards, so it is back-filled from the second row.
    der_df = der_df.interpolate(method='linear')
    der_df = der_df.bfill()

    return der_df

def double_derivative_from_df(df: pd.DataFrame, timeStamps: str, measurements: list[str]) -> pd.DataFrame:
    """
    Build features equal to the first difference divided by the squared time step.

    The frame is sorted by `timeStamps`. For every measurement the row-to-row
    difference is divided by (whole minutes elapsed since the previous row)**2.

    df: Frame containing the timestamp and measurement columns (not modified).
    timeStamps: Name of a datetime column to sort by.
    measurements: Names of the columns to process.

    Returns a new frame, in timestamp order, with one column
    'double_derivative_<measurement>_dd' per measurement. An empty
    `measurements` list gives an empty frame.

    NOTE(review): despite the name this is a first difference scaled by
    1/dt**2, not a second difference. Rows less than one minute after their
    predecessor give a zero denominator (inf/NaN in the output).
    """
    dder_df = pd.DataFrame()
    if len(measurements) == 0:
        # Nothing to compute; skip the fill steps, which are meaningless on a column-less frame.
        return dder_df

    # sort_values returns a new frame, so the caller's data is never touched.
    ordered = df.sort_values(timeStamps)

    # Whole minutes between consecutive rows, squared. This is the same for
    # every measurement, so it is computed once outside the loop.
    elapsed_seconds = ordered[timeStamps].diff().dt.total_seconds()
    squared_minutes = (elapsed_seconds // 60) ** 2

    for measurement in measurements:
        dder_df['double_derivative_' + measurement + '_dd'] = ordered[measurement].diff() / squared_minutes

    # Fill gaps linearly; the first row has no predecessor and is still NaN
    # afterwards, so it is back-filled from the second row.
    dder_df = dder_df.interpolate(method='linear')
    dder_df = dder_df.bfill()

    return dder_df

def daily_accumulated_val_df(df: pd.DataFrame, timeStamps: str, measurements: list[str]) -> pd.DataFrame:
    """
    Return the running within-day sum of each measurement.

    Rows are ordered by `timeStamps` and grouped by calendar date; within
    each date the cumulative sum restarts. Each output column is named
    'integral_<measurement>_integral'. The input frame is not modified.
    """
    ordered = df.copy().sort_values(timeStamps)
    # Calendar date of every row, used as the grouping key.
    ordered = ordered.assign(date=ordered[timeStamps].dt.date)
    per_day = ordered.groupby('date')

    running_sums = pd.DataFrame()
    for name in measurements:
        running_sums['integral_' + name + '_integral'] = per_day[name].cumsum()

    return running_sums

def daily_accumulated_val_squared_df(df: pd.DataFrame, timeStamps: str, measurements: list[str]) -> pd.DataFrame:
    """
    Return the square of the running within-day sum of each measurement.

    Rows are ordered by `timeStamps` and grouped by calendar date; within
    each date the cumulative sum restarts and is then squared. Each output
    column is named 'double_integral_<measurement>_dintegral'. The input
    frame is not modified.
    """
    ordered = df.copy().sort_values(timeStamps)
    # Calendar date of every row, used as the grouping key.
    ordered = ordered.assign(date=ordered[timeStamps].dt.date)
    per_day = ordered.groupby('date')

    squared_sums = pd.DataFrame()
    for name in measurements:
        running_sum = per_day[name].cumsum()
        squared_sums['double_integral_' + name + '_dintegral'] = running_sum ** 2

    return squared_sums

def time_data_from_df(df: pd.DataFrame, timestamps: str) -> pd.DataFrame: 
    """
    Extract calendar components from the datetime column `timestamps`.

    Returns a new frame with the columns 'day_of_year:day', 'month:month'
    and 'hour:hour'. The year is deliberately not extracted.
    """
    stamps = df.copy()[timestamps].dt
    return pd.DataFrame({
        'day_of_year:day': stamps.dayofyear,
        'month:month': stamps.month,
        'hour:hour': stamps.hour,
    })


# Should modify this
def n_largest_freq(df: pd.DataFrame, measurements: list[str], n_largest: int):
    """
    Generate features from the strongest frequency components of each measurement.

    For every measurement the discrete Fourier transform of the whole column
    is taken and the `n_largest` bins with the greatest magnitude are
    selected. Each selected bin is transformed back on its own (all other
    bins set to zero) and the real part is stored in the column
    'filtered_freq_<rank>_<measurement>', where rank 0 is the strongest bin.
    """
    source = df.copy()
    components = pd.DataFrame()

    for name in measurements:
        spectrum = np.fft.fft(source[name].values)

        # Bin indices ordered from largest to smallest magnitude.
        strongest_bins = np.argsort(np.abs(spectrum))[::-1][:n_largest]

        for rank, bin_index in enumerate(strongest_bins):
            # Keep a single bin and zero out the rest of the spectrum.
            isolated = np.zeros_like(spectrum)
            isolated[bin_index] = spectrum[bin_index]

            column_name = "filtered_freq_" + str(rank) + "_" + name
            components[column_name] = np.fft.ifft(isolated).real

    return components

def freq_comb(df: pd.DataFrame, features: list[str]) -> np.array:
    """
    Add several signals together in the frequency domain and transform back.

    Each feature column is min-max scaled, Fourier transformed and added to a
    running total; the real part of the inverse transform of that total is
    returned.

    features: The columns you would like to combine.
    df: Chosen dataframe containing the feature columns.

    The columns are used as plain sample sequences; timestamps (and therefore
    any irregular sampling) are not taken into account.
    """
    spectra = (np.fft.fft(min_max_scale(df[name].values)) for name in features)

    # sum() starts from 0, matching an accumulator initialised to 0.
    combined_spectrum = sum(spectra)

    return np.fft.ifft(combined_spectrum).real

def min_max_scale(signal: np.array) -> np.array:
    """
    Linearly rescale `signal` so that its minimum maps to 0 and its maximum to 1.

    A constant signal has no range to scale by; it is mapped to all zeros
    instead of dividing by zero (which would produce NaN for every element).
    """
    min_val = np.min(signal)
    value_range = np.max(signal) - min_val

    if value_range == 0:
        return np.zeros_like(signal, dtype=float)

    return (signal - min_val) / value_range

def shifted_values_24_h(y: pd.DataFrame, measurement: str)->pd.DataFrame:
    """
    Return 24 lagged copies of y[measurement].

    Column '<measurement>n-<k>' (k = 1..24) holds the series shifted down by
    k rows, so the first k entries of that column are NaN. The lag is counted
    in rows; it equals hours only if the rows are hourly.
    """
    series = y[measurement]
    lagged = pd.DataFrame()
    for lag in range(1, 25):
        lagged[f"{measurement}n-{lag}"] = series.shift(lag)

    return lagged

def merge_features(df: pd.DataFrame):
    """
    Sum together all columns that share the same name before the first ':'.

    Every column label is reduced to the part preceding its first ':' and
    columns with the same reduced label are added row by row. The input
    frame is left unchanged, including its column labels.

    Returns a new frame with one column per distinct reduced label, in
    sorted label order.
    """
    # Part of each column label before ":".
    base_names = df.columns.str.split(':').str[0]

    # Grouping the transposed frame row-wise is the supported replacement for
    # the deprecated column-wise groupby(axis=1).
    grouped_df = df.T.groupby(base_names).sum().T

    return grouped_df

def duplicates(df: pd.DataFrame)->None:
    """
    Print how often each repeated 'pv_measurement' value occurs.

    Only values that appear more than once in the column are printed,
    together with their counts. Nothing is returned.
    """
    occurrences = df.copy()['pv_measurement'].value_counts()
    repeated = occurrences[occurrences > 1]

    print(repeated)

def remove_double_entries(y: pd.DataFrame) -> pd.DataFrame:
    """
    Drop rows whose 'pv_measurement' repeats the previous row's non-zero value.

    A row is removed only when its value equals the value of the row directly
    before it AND that value is not 0; runs of zeros are kept in full. The
    input frame is not modified and the surviving rows keep their index
    labels.
    """
    frame = y.copy()
    pv = frame['pv_measurement']

    # A repeat is "same as previous row and non-zero"; everything else stays.
    is_nonzero_repeat = (pv == pv.shift()) & (pv != 0)

    return frame[~is_nonzero_repeat]


In [None]:
"""
A couple of functions to generate the appropriate features for both training and prediction data.
"""

def train_data_processing(X: pd.DataFrame, y: pd.DataFrame, filter_list: list[str] = None, months: list[int] = None, feedback: bool = False):
    """
    Generate the training features and the matching target series.

    X: Weather data with a 'date_forecast' datetime column; every other
        column is treated as a measurement.
    y: Target data with the columns 'time' and 'pv_measurement'.
    filter_list: If non-empty, only these feature columns are kept.
    months: If non-empty, only rows whose 'date_forecast' month is in this
        list are kept.
    feedback: If True, the 24 lagged 'pv_measurement' columns are appended to
        the features.

    Returns (X, y): the feature frame without the timestamp column and with a
    fresh 0..n-1 index, and the 'pv_measurement' series restricted to the
    timestamps also present in X (keeping its original index labels).

    Side effect: prints the repeated target values via duplicates().
    """
    # None instead of list defaults, so no list object is shared between calls.
    filter_list = [] if filter_list is None else filter_list
    months = [] if months is None else months

    # Removing NaN values. Missing values at the start/end are filled from the
    # nearest valid value (limit_direction="both"). NOTE: this may be a poor
    # choice for long gaps in the training data.
    X = X.interpolate(method='linear', limit_direction = "both")

    # Extract necessary values for feature generation.
    timestamps = "date_forecast"
    measurements = list(X.columns.values)
    measurements.remove(timestamps)

    # Candidate features.
    squared_df = square_df(X, timestamps, measurements)
    der_df = difference_df(X, timestamps, measurements)
    dder_df = double_derivative_from_df(X, timestamps, measurements)
    int_df = daily_accumulated_val_df(X, timestamps, measurements)
    dint_df = daily_accumulated_val_squared_df(X, timestamps, measurements)
    l_df = log_df(X, measurements)
    dotted_df = dot_df(X, 'direct_rad:W', measurements)
    n_largest_freq_df =  n_largest_freq(X, measurements, n_largest = 5)
    time_df = time_data_from_df(X, timestamps)

    X = pd.concat([X, squared_df, der_df, dder_df, dint_df, int_df, l_df, dotted_df, n_largest_freq_df, time_df], axis = "columns")

    if len(months) > 0:
        X =  X[X['date_forecast'].dt.month.isin(months)]

    if len(filter_list) > 0:
        X = X[filter_list + ["date_forecast"]]

    # Target clean-up and lag features.
    duplicates(y)
    y = remove_double_entries(y)
    y_shifted =  shifted_values_24_h(y, "pv_measurement")

    # Put the lag columns NEXT TO the targets. y_shifted is derived from y, so
    # both share the same index and align row by row. (Stacking them
    # vertically, as a plain pd.concat does, leaves every lag column NaN on
    # the rows that carry a timestamp.)
    y_BIG = pd.concat([y, y_shifted], axis = 1)

    # Making sure that the two dataframes cover the same timestamps.
    y_BIG, X = data_length_matching(y_BIG, X)

    # Get our desired output
    y = y_BIG["pv_measurement"]

    if feedback:
        # Keep only the lag columns and align them with X by position.
        y_features = y_BIG.drop('pv_measurement', axis=1)
        y_features = y_features.drop('time', axis=1)
        y_features = y_features.reset_index(drop = True)

        X = X.reset_index(drop = True)
        X = pd.concat([X, y_features], axis = 1)

    # Removing datetime object column
    X = X.drop(timestamps, axis=1)

    X = X.reset_index(drop = True)

    return X, y

def pred_data_processing(X_pred: pd.DataFrame) -> pd.DataFrame:
    """
    Generate the feature columns for the prediction data.

    X_pred must contain the columns 'date_forecast' and 'date_calc'; every
    other column is treated as a measurement. Missing values are filled by
    linear interpolation followed by a back-fill, the same derived feature
    frames as for the training data are appended, and 'date_calc' is dropped.
    The 'date_forecast' column is kept in the result.
    """
    timestamps = "date_forecast"

    # Removing NaN values: interpolate inside the data, back-fill what is left at the start.
    X_pred = X_pred.interpolate(method = 'linear').bfill()

    # Every column except the two date-time columns is a measurement.
    measurements = list(X_pred.columns.values)
    for non_feature in (timestamps, "date_calc"):
        measurements.remove(non_feature)

    # Derived feature frames, listed in the column order of the result.
    derived_frames = [
        square_df(X_pred, timestamps, measurements),
        difference_df(X_pred, timestamps, measurements),
        double_derivative_from_df(X_pred, timestamps, measurements),
        daily_accumulated_val_squared_df(X_pred, timestamps, measurements),
        daily_accumulated_val_df(X_pred, timestamps, measurements),
        log_df(X_pred, measurements),
        dot_df(X_pred, 'direct_rad:W', measurements),
        n_largest_freq(X_pred, measurements, n_largest = 5),
        time_data_from_df(X_pred, timestamps),
    ]

    combined = pd.concat([X_pred, *derived_frames], axis = "columns")

    return combined.drop("date_calc", axis = 1)


In [None]:
def create_training_and_pred_data(file_paths: list[str], months: list[int] = None):
    """
    Build and save the feature files for every building.

    file_paths: One entry per building, in the order A, B, C. Each entry is
        indexed as [targets, X_estimated, X_observed, X_pred] and every
        element is the path of a parquet file.
    months: Optional list of month numbers passed on to
        train_data_processing; None or an empty list disables the filter.

    For each building the files '<building>/X.csv', '<building>/y.csv' and
    '<building>/X_pred.csv' are written (tab separated).

    Returns a list with one [X_path, y_path, X_pred_path] entry per building.
    """
    # None instead of a list default, so no list object is shared between calls.
    months = [] if months is None else months
    buildings = ['A', 'B', 'C']
    paths = []

    for i, path in enumerate(file_paths):
        # Retrieve data
        y = pd.read_parquet(path[0])
        X_estimated = pd.read_parquet(path[1])
        X_observed = pd.read_parquet(path[2])
        X_pred = pd.read_parquet(path[3])

        # Processing and cleaning data. 'date_calc' is dropped from the
        # estimated data before it is stacked under the observed data.
        y = y.dropna()
        X_estimated = X_estimated.drop("date_calc", axis = 1)
        X = pd.concat([X_observed, X_estimated], axis = 0, ignore_index=True)

        X, y = train_data_processing(X, y, months=months)

        X_pred = pred_data_processing(X_pred)

        X_path = buildings[i] + "/" + "X.csv"
        X.to_csv(path_or_buf = X_path, sep='\t')

        y_path = buildings[i] + "/" + "y.csv"
        y.to_csv(path_or_buf = y_path, sep='\t')

        X_pred_path = buildings[i] + "/" + "X_pred.csv"
        X_pred.to_csv(path_or_buf = X_pred_path, sep='\t')

        paths.append([X_path, y_path, X_pred_path])

    return paths



In [None]:
# Input parquet files per building, each entry ordered as
# [targets, estimated training data, observed training data, estimated test data].
file_paths = [
    [
        building + '/train_targets.parquet',
        building + '/X_train_estimated.parquet',
        building + '/X_train_observed.parquet',
        building + '/X_test_estimated.parquet',
    ]
    for building in ('A', 'B', 'C')
]

# Generate the feature files; from here on file_paths holds the paths
# returned by the call instead of the parquet inputs.
file_paths = create_training_and_pred_data(file_paths = file_paths)

In [None]:
"""
A class to organize the different steps in the machine learning pipeline. The class contains some helper functions
that help the user gain insight into which features the model finds the most useful.
"""


class learner:
    """
    Organizes the steps of the machine learning pipeline.

    Two workflows are supported:
      * multi model: one model per building (the *_multi_model methods, with
        CatBoost or H2O AutoML as backend);
      * single model: one CatBoost model for all buildings, with the building
        as a categorical feature (create_training_data, fit_model,
        get_performance, predict).

    Predictions are written to 'tests/<save_folder>/model_pred.csv'.
    """

    def __init__(self, file_paths: list[list[str]], features: list[str] = None, save_folder: str = "", feedback: bool = False, categorical_features: list[str] = None) -> None:
        """
        file_paths: One entry per building, in the order of self.buildings.
            Each entry is indexed as [X_csv, y_csv, X_pred_csv]; the files are
            read as tab separated CSV.
        features: Names of the feature columns to load (default: none).
        save_folder: Sub folder of 'tests/' that results are written to.
        feedback: If True, predict_multi_model feeds each prediction back in
            as a lag feature for the next row.
        categorical_features: Feature columns that are cast to int and passed
            to CatBoost as categorical (default: none).
        """
        self.file_paths = file_paths
        # None instead of list defaults in the signature: a list default would
        # be one shared object for every instance created without the argument.
        self.features = [] if features is None else list(features)
        self.save_folder = save_folder + "/"
        self.buildings = ["A", "B", "C"]
        self.feedback = feedback
        self.categorical_features = [] if categorical_features is None else list(categorical_features)

    def create_training_data_multi_model(self):
        """
        Load the data of every building and split it into train and test sets.

        Fills the lists X_train_sets, X_test_sets, y_train_sets, y_test_sets,
        X_pred_sets and y_sets with one element per building. 15 % of the rows
        are held out for testing; the split is shuffled and not seeded.
        """
        self.X_train_sets = []
        self.X_test_sets = []
        self.y_train_sets = []
        self.y_test_sets = []
        self.X_pred_sets = []
        self.y_sets = []

        for path in self.file_paths:

            X = pd.read_csv(path[0], sep='\t')[self.features]

            y = pd.read_csv(path[1], sep='\t')['pv_measurement']

            # The prediction set keeps its timestamps; they are needed to
            # match predictions with the submission ids later on.
            X_pred = pd.read_csv(path[2], sep='\t')[self.features + ["date_forecast"]]
            X_pred["date_forecast"] = pd.to_datetime(X_pred["date_forecast"])

            for cat_feat in self.categorical_features:
                X_pred[cat_feat] = X_pred[cat_feat].astype(int)
                X[cat_feat] = X[cat_feat].astype(int)

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True)

            # ================= SAVING ALL SETS ======================
            self.X_train_sets.append(X_train)
            self.X_test_sets.append(X_test)
            self.y_train_sets.append(y_train)
            self.y_test_sets.append(y_test)
            self.X_pred_sets.append(X_pred)
            self.y_sets.append(y)

    def create_training_data(self):
        """
        Load the data of all buildings into one combined data set.

        A 'building' column holding the building letter is added to the
        features. Sets X_train, X_test, y_train, y_test and X_pred. 15 % of
        the rows are held out for testing; the split is shuffled and not
        seeded. The target series of every building is plotted while loading.
        """
        list_y = []
        list_X = []
        list_X_pred = []

        for i, path in enumerate(self.file_paths):

            X = pd.read_csv(path[0], sep='\t')[self.features]
            y = pd.read_csv(path[1], sep='\t')['pv_measurement']
            X_pred = pd.read_csv(path[2], sep='\t')[self.features + ["date_forecast"]]
            X_pred["date_forecast"] = pd.to_datetime(X_pred["date_forecast"])

            y.plot()
            plt.show()

            # =================  TEST DATA  ================
            X_pred['building'] = self.buildings[i]
            list_X_pred.append(X_pred)

            # =================TRAINING DATA================
            X['building'] = self.buildings[i]

            list_y.append(y)
            list_X.append(X)

        # No scaling is performed; the attribute is kept as an empty list.
        self.scalers = []

        # Stack the buildings on top of each other.
        y = pd.concat(list_y, axis= 0, ignore_index=True)
        X = pd.concat(list_X, axis= 0, ignore_index=True)
        X_pred = pd.concat(list_X_pred, axis = 0, ignore_index=True)

        X = X.reset_index(drop=True)
        X_pred = X_pred.reset_index(drop=True)

        for cat_feat in self.categorical_features:
            X_pred[cat_feat] = X_pred[cat_feat].astype(int)
            X[cat_feat] = X[cat_feat].astype(int)

        X_train, X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.15, shuffle=True)

        self.X_train, self.X_test, self.X_pred = X_train, X_test, X_pred

    def fit_multi_model_h2o(self):
        """
        Train one H2O AutoML model per building and store them in self.models.

        Requires create_training_data_multi_model to have been called.
        """
        # NOTE(review): `h2o` and `H2OAutoML` are not imported in the visible
        # notebook cells; they must be in scope (e.g. `import h2o` and
        # `from h2o.automl import H2OAutoML`) before this is called - confirm.
        h2o.init()
        self.models = []

        for X_train, y_train in zip(self.X_train_sets, self.y_train_sets):
            train_frame =  pd.concat([X_train, y_train], axis = "columns")
            h2o_frame = h2o.H2OFrame(train_frame)

            # The predictors are every column except the response itself.
            x_train_columns = [column for column in h2o_frame.columns if column != "pv_measurement"]

            model = H2OAutoML(sort_metric='MAE', max_models=10, exclude_algos=["DeepLearning"])

            model.train(x = x_train_columns, y = "pv_measurement", training_frame=h2o_frame)
            self.models.append(model)

    def fit_multi_model(self):
        """
        Grid-search one CatBoost regressor per building.

        Requires create_training_data_multi_model to have been called. The
        fitted models are stored in self.models and also returned.
        """
        self.models = []

        for X_train, y_train in zip(self.X_train_sets, self.y_train_sets):

            # An empty list means "no categorical features" (Pool's default of None).
            train_dataset = cb.Pool(X_train, y_train, cat_features = self.categorical_features or None)

            model = cb.CatBoostRegressor(loss_function="MAE", logging_level='Silent')
            model.grid_search(self._default_param_grid(), train_dataset, verbose=False)

            self.models.append(model)
        return self.models

    def get_performance_multi_model_h2o(self):
        """
        Print the mean absolute error of the H2O models on the held-out sets.

        NOTE: the printed value is the SUM of the per-building errors, not
        their average (unlike get_performance_multi_model).
        """
        mae_sum = 0

        for i, X_test in enumerate(self.X_test_sets):
            h2o_frame = h2o.H2OFrame(X_test)

            pred_df = self.models[i].predict(h2o_frame).as_data_frame()
            pred = pred_df['predict'].to_list()
            mae = (mean_absolute_error(self.y_test_sets[i], np.array(pred)))
            mae_sum = mae + mae_sum

        print("MAE: ", mae_sum)

    def predict_multi_model_h2o(self):
        """
        Predict with the per-building H2O models and save the submission file.
        """
        preds = []

        for i, X_pred in enumerate(self.X_pred_sets):
            h2o_frame = h2o.H2OFrame(X_pred)

            pred_df = self.models[i].predict(h2o_frame).as_data_frame()
            unformated_pred = np.array(pred_df['predict'].to_list())

            preds.append(self._build_prediction_frame(i, unformated_pred))

        unformated_pred_df = pd.concat(preds, axis = 0, ignore_index = True)

        pred = self._format_predictions(unformated_pred_df)
        self._save_predictions(pred)

    def predict_multi_model(self):
        """
        Predict with the per-building CatBoost models and save the submission file.

        With feedback enabled the rows are predicted one at a time and each
        prediction is pushed into the 24 lag values used for the next row.
        """
        preds = []

        for i in range(len(self.X_pred_sets)):
            if self.feedback:
                X_pred_without_feedback = self.X_pred_sets[i].drop('date_forecast', axis = 1)
                pred_list = []

                # Initial lag values: the lag columns of the last target row.
                # NOTE(review): these are the values BEFORE the last target
                # row; the last target value itself is not among them - confirm
                # that this is intended.
                y_df = self.y_sets[i].to_frame()
                # .copy() so the row updated below is an independent Series.
                feedback_row = shifted_values_24_h(y_df, "pv_measurement").iloc[-1].copy()

                for idx in X_pred_without_feedback.index.to_list():

                    X_pred_with_feedback = pd.concat([X_pred_without_feedback.loc[idx].reset_index(drop=True), feedback_row.reset_index(drop=True)], axis=0, ignore_index=True)

                    next_value = self.models[i].predict(X_pred_with_feedback)

                    # Shift the lag values one step and insert the newest prediction first.
                    feedback_row.iloc[1:] = feedback_row.iloc[:-1].values
                    feedback_row.iloc[0] = next_value
                    pred_list.append(next_value)

                unformated_pred = np.array(pred_list)

                series = pd.Series(unformated_pred)
                series.plot()
                plt.show()

            else:
                X_pred = self.X_pred_sets[i].drop('date_forecast', axis = 1)
                unformated_pred = self.models[i].predict(X_pred)

            preds.append(self._build_prediction_frame(i, unformated_pred))

        unformated_pred_df = pd.concat(preds, axis = 0, ignore_index = True)

        pred = self._format_predictions(unformated_pred_df)
        self._save_predictions(pred)

    def get_performance_multi_model(self) -> None:
        """
        Plot predictions against the held-out targets for every building and
        print the mean absolute error averaged over the buildings.
        """
        mae_sum = 0
        for i, X_test in enumerate(self.X_test_sets):
            pred = self.models[i].predict(X_test)
            pd.Series(pred).plot()
            pd.Series(self.y_test_sets[i]).plot()
            plt.show()
            mae = (mean_absolute_error(self.y_test_sets[i], np.array(pred)))
            mae_sum = mae + mae_sum

        print("Mean absolute error: ", mae_sum/len(self.X_test_sets))

    def fit_model(self) -> None:
        """
        Grid-search a single CatBoost regressor on the combined data set.

        Requires create_training_data to have been called. The fitted model is
        stored in self.model.
        """
        # NOTE(review): only 'building' is declared categorical here;
        # self.categorical_features is not passed on - confirm this is intended.
        train_dataset = cb.Pool(self.X_train, self.y_train, cat_features=['building'])

        self.model = cb.CatBoostRegressor(loss_function="MAE", logging_level='Silent')

        self.model.grid_search(self._default_param_grid(), train_dataset, verbose=False)

    def get_performance(self) -> None:
        """
        Plot predictions against the held-out targets and print the mean
        absolute error of the single model.
        """
        pred = self.model.predict(self.X_test)
        pd.Series(pred).plot()
        pd.Series(self.y_test).plot()

        mae = (mean_absolute_error(self.y_test, pred))
        print("Mean Abs: {:.2f}".format(mae))

    def predict(self) -> None:
        """
        Predict with the single model and save the submission file.
        """
        X_pred = self.X_pred.drop('date_forecast', axis = 1)
        unformated_pred = self.model.predict(X_pred)

        unformated_pred_df = pd.DataFrame()
        unformated_pred_df["date_forecast"] = self.X_pred["date_forecast"]
        unformated_pred_df["building"] = self.X_pred["building"]

        # Maps integer building codes back to letters; values that already
        # are letters are left untouched by replace().
        replace_dict = {0: 'A', 1: 'B', 2: 'C'}
        unformated_pred_df["building"] = unformated_pred_df["building"].replace(replace_dict)

        plot_ser = pd.Series(unformated_pred)
        plot_ser.plot()
        plt.show()

        # Negative predictions are set to zero.
        unformated_pred_df["pv_measurement"] = pd.Series(unformated_pred)
        unformated_pred_df["pv_measurement"] = unformated_pred_df["pv_measurement"].apply(lambda x: max(0, x))

        pred = self._format_predictions(unformated_pred_df)
        self._save_predictions(pred)

    def save_best_features(self, filename: str, N: int = 0):
        """
        Save the N most important features of the single model.

        filename: Name (without extension) of the CSV file that is written to
            'tests/<save_folder>/'.
        N: Number of features to keep; 0 means the number of columns of the
            test set minus one.
        """
        if N == 0:
            N = len(self.X_test.columns.values) - 1

        best_features_df = pd.DataFrame()
        best_features_df["Model"] = pd.Series(np.array(self._top_feature_names(self.model, N)))

        # The file name now follows the `filename` argument, which was
        # previously ignored in favour of a fixed name.
        best_features_df.to_csv("tests/" + self.save_folder + filename + '.csv', sep ='\t')

    def save_best_features_multi_model(self, filename: str, N: int = 0):
        """
        Save the N most important features of every per-building model.

        filename: Name (without extension) of the CSV file that is written to
            'tests/<save_folder>/'. It gets one column 'Model <building>' per
            building.
        N: Number of features to keep; 0 means the number of columns of the
            first test set minus one.
        """
        if N == 0:
            N = len(self.X_test_sets[0].columns.values) - 1
        best_features_df = pd.DataFrame()

        for i in range(len(self.X_test_sets)):
            best_features = self._top_feature_names(self.models[i], N)
            best_features_df["Model " + self.buildings[i]] = pd.Series(np.array(best_features))

        best_features_df.to_csv("tests/" + self.save_folder + filename + '.csv', sep ='\t')

    @staticmethod
    def _default_param_grid() -> dict:
        # Hyper-parameter grid shared by the CatBoost grid searches; a fresh dict on every call.
        return {'iterations': [100, 150, 200],
                'learning_rate': [0.03, 0.1],
                'depth': [2, 4, 6, 8],
                'l2_leaf_reg': [0.2, 0.5, 1, 3]}

    @staticmethod
    def _top_feature_names(model, n: int) -> list:
        # Names of the n features with the highest importance score, best first.
        importance_by_name = dict(zip(model.feature_names_, model.get_feature_importance()))
        ranked = sorted(importance_by_name.items(), key=lambda item: item[1], reverse=True)
        return [name for name, _ in ranked[:n]]

    def _build_prediction_frame(self, i: int, unformated_pred) -> pd.DataFrame:
        # Combine the timestamps and letter of building i with its predictions
        # (negative values set to zero) and plot the result.
        unformated_pred_df = pd.DataFrame()
        unformated_pred_df["date_forecast"] = self.X_pred_sets[i]["date_forecast"]
        unformated_pred_df["building"] = self.buildings[i]

        unformated_pred_df["pv_measurement"] = pd.Series(np.array(unformated_pred))
        unformated_pred_df["pv_measurement"] = unformated_pred_df["pv_measurement"].apply(lambda x: max(0, x))

        unformated_pred_df["pv_measurement"].plot()
        plt.show()

        return unformated_pred_df

    def _format_predictions(self, unformated_pred: pd.DataFrame) -> pd.DataFrame:
        """
        Keep only the predictions requested in 'test.csv'.

        The predictions are inner-joined with 'test.csv' on
        (date_forecast, building) == (time, location).
        """
        to_be_submitted_index = pd.read_csv("test.csv")

        # Convert the "time" column to datetime so it can be matched with "date_forecast".
        to_be_submitted_index["time"] = pd.to_datetime(to_be_submitted_index["time"])
        pred = pd.merge(unformated_pred, to_be_submitted_index, how='inner', left_on=['date_forecast', 'building'], right_on=["time", "location"])

        return pred

    def _save_predictions(self, pred: pd.DataFrame)->None:
        """
        Write the columns 'id' and 'prediction' to 'tests/<save_folder>/model_pred.csv'.
        """
        pred[["id", "pv_measurement"]].rename(columns={"pv_measurement" : "prediction"}).to_csv("tests/" + self.save_folder + "model_pred.csv", index=False)



In [None]:



# Feature selection for the per-building H2O run.
# NOTE(review): `default_features` and `time_features` are not defined in the
# visible cells; they are presumably defined in an earlier cell - confirm.
# NOTE(review): 'cloud base agl:m' and 'snow density:km3' contain spaces,
# unlike the other column names in this notebook; verify that they match the
# actual column names, otherwise these columns are not filtered out.
features_with_too_many_NaN = ['ceiling_height_agl:m', 'cloud base agl:m', 'snow density:km3']
features_with_bad_qualities = ['elevation:m', 'fresh_snow_3h:cm', 'wind_speed_w_1000hPa:ms', 'snow_drift:idx', 'fresh_snow_12h:cm', 'fresh_snow_24h:cm', 'fresh_snow_1h:cm', 'snow_depth:cm', 'fresh_snow_6h:cm', 'snow_melt_10min:mm']
categorical_features = ['dew_or_rime:idx'] # 'is_day:idx', 'is_in_shadow:idx', 

# Start from the default features and drop the unwanted ones.
features = default_features
features = [item for item in features if item not in features_with_too_many_NaN]
features = [item for item in features if item not in features_with_bad_qualities]

# Column names produced by difference_df for the remaining features.
# (The name was misspelled on assignment, so the line below raised a NameError.)
derivative_features = ["derivative_" + item + "_d" for item in features]

features = features + derivative_features + time_features

cat_features = [] #categorical_features

multi_learner = learner(file_paths = file_paths, features = features, categorical_features = categorical_features)
multi_learner.create_training_data_multi_model()
multi_learner.fit_multi_model_h2o()
multi_learner.predict_multi_model_h2o()

# TODO: iterate through several learners and combine them (the original note was left unfinished).