# Imports

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
import data_func.read_data as read_data
warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")


[           date_forecast  absolute_humidity_2m:gm3  air_density_2m:kgm3  \
0    2023-05-01 00:00:00                       4.4                1.286   
1    2023-05-01 00:15:00                       4.3                1.287   
2    2023-05-01 00:30:00                       4.3                1.287   
3    2023-05-01 00:45:00                       4.3                1.287   
4    2023-05-01 01:00:00                       4.3                1.287   
...                  ...                       ...                  ...   
2875 2023-07-03 22:45:00                       9.1                1.207   
2876 2023-07-03 23:00:00                       9.1                1.207   
2877 2023-07-03 23:15:00                       9.1                1.208   
2878 2023-07-03 23:30:00                       9.0                1.208   
2879 2023-07-03 23:45:00                       9.0                1.208   

      ceiling_height_agl:m  clear_sky_energy_1h:J  clear_sky_rad:W  \
0               912.700012  

# Load datasets

In [2]:
dataframes = read_data.get_training_data()
X_frames_train = dataframes[0]
Y_frames_train = dataframes[1]
X_frames_test = read_data.get_test_data()



[           date_forecast  absolute_humidity_2m:gm3  air_density_2m:kgm3  \
0    2023-05-01 00:00:00                       4.4                1.286   
1    2023-05-01 00:15:00                       4.3                1.287   
2    2023-05-01 00:30:00                       4.3                1.287   
3    2023-05-01 00:45:00                       4.3                1.287   
4    2023-05-01 01:00:00                       4.3                1.287   
...                  ...                       ...                  ...   
2875 2023-07-03 22:45:00                       9.1                1.207   
2876 2023-07-03 23:00:00                       9.1                1.207   
2877 2023-07-03 23:15:00                       9.1                1.208   
2878 2023-07-03 23:30:00                       9.0                1.208   
2879 2023-07-03 23:45:00                       9.0                1.208   

      ceiling_height_agl:m  clear_sky_energy_1h:J  clear_sky_rad:W  \
0               912.700012  

In [3]:
print(len(Y_frames_train))


3


In [4]:
XTRA = X_frames_train[0]
XTRB = X_frames_train[1]
XTRC = X_frames_train[2]

YA = Y_frames_train[0]
YB = Y_frames_train[1]
YC = Y_frames_train[2]


## Cleaning X

Found a hole in the X data on location A

1|2
---|---
![1](./feature_cleaning/img/X_A_1.png) | ![2](./feature_cleaning/img/X_A_2.png)

This prevents our aggregation method to match the target data after these data points

Trying to remove this point and reindex to fix the issue. 

In [5]:
print("A: \n", len(XTRA))
ind = XTRA[(XTRA['date_forecast'] == pd.to_datetime('2022-10-21 00:00:00'))].index
print(ind)
XTRA = XTRA.drop(ind).reset_index(drop=True)
print(len(XTRA))
XTRA.head()


A: 
 136245
Int64Index([118664], dtype='int64')
136244


Unnamed: 0,date_forecast,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,...,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,estimated,estimation_calc_forecast_difference
0,2019-06-02 22:00:00,7.7,1.23,1744.900024,0.0,0.0,1744.900024,0.0,280.299988,0.0,...,0.0,285.899994,100.0,39640.101562,3.7,-3.6,-0.8,-0.0,0,0
1,2019-06-02 22:15:00,7.7,1.229,1734.0,0.0,0.0,1734.0,0.0,280.299988,0.0,...,0.0,286.100006,100.0,40123.898438,3.6,-3.6,-0.6,-0.0,0,0
2,2019-06-02 22:30:00,7.7,1.228,1723.5,0.0,0.0,1723.5,0.0,280.299988,0.0,...,0.0,286.299988,100.0,40628.300781,3.6,-3.6,-0.4,-0.0,0,0
3,2019-06-02 22:45:00,7.7,1.226,1713.400024,0.0,0.0,1713.400024,0.0,280.299988,0.0,...,0.0,286.600006,100.0,41153.601562,3.5,-3.5,-0.2,-0.0,0,0
4,2019-06-02 23:00:00,7.7,1.225,1703.599976,0.0,0.0,1703.599976,0.0,280.299988,0.0,...,0.0,286.799988,100.0,41699.898438,3.5,-3.5,0.0,-0.0,0,0


Same problem for B-set at `2022-05-02 23:00:00` and C-set at `2022-04-25 21:00:00`

In [6]:
print("B: \n", len(XTRB))
ind = XTRB[(XTRB['date_forecast'] == pd.to_datetime('2022-05-02 21:00:00'))].index
print(ind)
XTRB = XTRB.drop(ind).reset_index(drop=True)
print(len(XTRB))

print("C: \n",len(XTRC))
ind = XTRC[(XTRC['date_forecast'] == pd.to_datetime('2022-04-25 21:00:00'))].index
print(ind)
XTRC = XTRC.drop(ind).reset_index(drop=True)
print(len(XTRC))


B: 
 134505
Int64Index([116916], dtype='int64')
134504
C: 
 134401
Int64Index([116244], dtype='int64')
134400


In [7]:
# Assuming X_train and Y_train are lists of DataFrames
X_train = [XTRA, XTRB, XTRC]
Y_train = [YA, YB, YC]


def new_fix_y_holes(y: pd.DataFrame) -> (pd.DataFrame):
    Y = y
    index_to_drop = []
    consecutive_values = []
    
    for i in range(len(Y)):
        current_value = Y['pv_measurement'].iloc[i]
        if not consecutive_values or consecutive_values[0] == current_value:
            consecutive_values.append(current_value)
        else:
            consecutive_values = [current_value]
        if len(consecutive_values) == 24 and consecutive_values[0] == 0:
            start_index = max(i - 23, 0)  # Make sure start index is not negative
            indices = list(range(start_index, i + 1))
            index_to_drop.extend(indices)
            consecutive_values = []
        elif len(consecutive_values) == 3 and consecutive_values[0] > 0:
            start_index = max(i - 2, 0)  # Make sure start index is not negative
            indices = list(range(start_index, i + 1))
            index_to_drop.extend(indices)
            consecutive_values = []
    # Drop the indices from Y and x
    fixed_Y = y.drop(index_to_drop).reset_index(drop=True)
    
    return fixed_Y

# Then you can proceed with your loop:
print(len(Y_train[0]), len(X_train[0]))
for i in range(len(Y_train)):
    Y_train[i] = new_fix_y_holes(Y_train[i])
print(len(Y_train[0]), len(X_train[0]))


34085 136244
34061 136244


In [8]:
print(Y_train[0])


                     time  pv_measurement
0     2019-06-02 22:00:00            0.00
1     2019-06-02 23:00:00            0.00
2     2019-06-03 00:00:00            0.00
3     2019-06-03 01:00:00            0.00
4     2019-06-03 02:00:00           19.36
...                   ...             ...
34056 2023-04-30 19:00:00            9.02
34057 2023-04-30 20:00:00            0.00
34058 2023-04-30 21:00:00            0.00
34059 2023-04-30 22:00:00            0.00
34060 2023-04-30 23:00:00            0.00

[34061 rows x 2 columns]


# Data Aggregation

In [9]:
# making shure that target values line up with x_values
import data_func.aggregation as data_agg

from datetime import datetime, timedelta

import pandas as pd

def calculate_hourly_average(df: pd.DataFrame) -> pd.DataFrame:
    # Convert the 'date_forecast' column to datetime format
    df['date_forecast'] = pd.to_datetime(df['date_forecast'])
    # Round down the datetime values to the nearest hour
    df['hour'] = df['date_forecast'].dt.floor('H')

    # Group by the 'hour' column and calculate the mean for each group
    hourly_average = df.groupby('hour').mean().reset_index()
    print(hourly_average.head(10))

    # 'hourly_average' already has 'hour' as the rounded down datetime column
    # Rename 'hour' to 'date_forecast' if you want that column name
    hourly_average.rename(columns={'hour': 'date_forecast'}, inplace=True)

    return hourly_average

def new_data_align(x: pd.DataFrame, y: pd.DataFrame) -> tuple:

    # Calculate the mean hourly values for the X data
    mean = calculate_hourly_average(x)
    # Make sure the 'time' column in Y is in datetime format
    y['time'] = pd.to_datetime(y['time'])

    # Merge the datasets on the aligned timestamps
    merged = pd.merge(mean, y, how='inner', left_on='date_forecast', right_on='time')
    merged.drop(columns=['time'], inplace=True)

    # Drop all Nan values for the target variable
    merged.dropna(subset=['pv_measurement'], inplace=True)

    y = pd.DataFrame(merged['pv_measurement'])

    print(y.isna().sum())

    return merged, y


for i in range(len(X_train)):
    X_train[i], Y_train[i] = new_data_align(X_train[i], Y_train[i])
    print(f'Aligned lengths for pair {i}: X={len(X_train[i])}, Y={len(Y_train[i])}')

for j in range(len(X_frames_test)):
    X_frames_test[j] = calculate_hourly_average(X_frames_test[j])

# To test the first dataset




# categorical_col = ['dew_or_rime:idx', 'precip_type_5min:idx', 'is_day:idx', 'is_in_shadow:idx', 'estimated']

# def aggregate_correct_x(x: pd.DataFrame) -> pd.DataFrame:
#    '''
#    Takes a given dataframe and returns an aggregated dataframe based on selected categorical functions. 
#    Assumes grouping of 4.
#    '''
#    categorical = x[["date_forecast"] + categorical_col]
#    mean = x.drop(columns=categorical_col)

#    categorical = data_agg.gen_agg(categorical, agg_type=data_agg.stocastic_median, merge_on = 'first')
#    mean = data_agg.gen_agg(mean, "mean", merge_on = 'first')

#    return pd.merge(categorical, mean, on="date_forecast")

# def data_allign(x_train, y_train):

#   y_train.dropna(inplace=True)
#   x_train = aggregate_correct_x(x_train)
#   combined_data = pd.merge(x_train, y_train, left_on='date_forecast', right_on='time')
#   y_train = combined_data['pv_measurement']

#   if 'time' and 'pv_measurement' in combined_data.columns:
#     combined_data.drop(columns=['time', 'pv_measurement'], inplace=True)
    
#   return combined_data, y_train



# X_train = [XTRA, XTRB, XTRC]
# Y_train = [YA, YB, YC]

# for i in range(len(X_train)):
#     X_train[i], Y_train[i] = data_allign(X_train[i], Y_train[i])

# for j in range(len(X_frames_test)):
#     X_frames_test[j] = aggregate_correct_x(X_frames_test[j])


                 hour  absolute_humidity_2m:gm3  air_density_2m:kgm3  \
0 2019-06-02 22:00:00                     7.700              1.22825   
1 2019-06-02 23:00:00                     7.700              1.22350   
2 2019-06-03 00:00:00                     7.875              1.21975   
3 2019-06-03 01:00:00                     8.425              1.21800   
4 2019-06-03 02:00:00                     8.950              1.21800   
5 2019-06-03 03:00:00                     9.250              1.21650   
6 2019-06-03 04:00:00                     9.525              1.21300   
7 2019-06-03 05:00:00                     9.700              1.20750   
8 2019-06-03 06:00:00                     9.550              1.20500   
9 2019-06-03 07:00:00                     9.450              1.20500   

   ceiling_height_agl:m  clear_sky_energy_1h:J  clear_sky_rad:W  \
0           1728.949951           0.000000e+00         0.000000   
1           1689.824951           0.000000e+00         0.000000   
2     

  hourly_average = df.groupby('hour').mean().reset_index()
  hourly_average = df.groupby('hour').mean().reset_index()
  hourly_average = df.groupby('hour').mean().reset_index()
  hourly_average = df.groupby('hour').mean().reset_index()
  hourly_average = df.groupby('hour').mean().reset_index()
  hourly_average = df.groupby('hour').mean().reset_index()


In [10]:
#Verify length matches
for x in range(len(X_train)):
    print("x,y: ", len(X_train[x]), len(Y_train[x]))


x,y:  34037 34037
x,y:  26459 26459
x,y:  21598 21598


In [11]:
# Verifying that all train dataset still have their end data after aggregation in case of allignment issue
for x in X_train:
    print(x["date_forecast"].tail())


34032   2023-04-30 19:00:00
34033   2023-04-30 20:00:00
34034   2023-04-30 21:00:00
34035   2023-04-30 22:00:00
34036   2023-04-30 23:00:00
Name: date_forecast, dtype: datetime64[ns]
26458   2023-04-30 19:00:00
26459   2023-04-30 20:00:00
26460   2023-04-30 21:00:00
26461   2023-04-30 22:00:00
26462   2023-04-30 23:00:00
Name: date_forecast, dtype: datetime64[ns]
27652   2023-04-30 19:00:00
27653   2023-04-30 20:00:00
27654   2023-04-30 21:00:00
27655   2023-04-30 22:00:00
27656   2023-04-30 23:00:00
Name: date_forecast, dtype: datetime64[ns]


In [12]:
Y_train[0].head(20)


Unnamed: 0,pv_measurement
0,0.0
1,0.0
2,0.0
3,0.0
4,19.36
5,251.02
6,263.78
7,522.72
8,904.42
9,1238.82


# Fix consecutive values in the dataset


# Feature engineering

In [13]:
# import data_func.timeseasonality as DTS
# import data_func.one_hot_encoding as OHE
# for i in range(len(X_train)):
#     X_train[i] = DTS.append_seasonal_columns(X_train[i])
    

# for i in range(len(X_frames_test)):
#     X_frames_test[i] = DTS.append_seasonal_columns(X_frames_test[i])
    

# # THIS SECTION CAN ONLY WORK IF THE PREVIOUS AGGREGATION IS DONE INDIVIDUALLY FOR CATEGORICAL DATA
# # import data_func.one_hot_encoding as OHE

# def fix_categorical(train: pd.DataFrame, test: pd.DataFrame):
#      temp = pd.concat((train, test), axis=0, ignore_index=True)
#      index_train = temp[(temp['date_forecast'] < test['date_forecast'].iloc[0])].index
#      return temp.drop(index_train)

# OH_columns = ['dew_or_rime:idx', 'precip_type_5min:idx']

# for i in range(len(X_frames_test)):
#      X_train[i] = OHE.one_hot_encode(X_train[i],OH_columns)
#      X_frames_test[i] = OHE.one_hot_encode(X_frames_test[i], OH_columns)
#      X_frames_test[i] = fix_categorical(X_train[i], X_frames_test[i])
#      #X_train[i].drop(columns=['date_forecast'], inplace=True)
#      #X_frames_test[i].drop(columns=['date_forecast'], inplace=True)


In [14]:
print(X_frames_test)


[          date_forecast  absolute_humidity_2m:gm3  air_density_2m:kgm3  \
0   2023-05-01 00:00:00                     4.325              1.28675   
1   2023-05-01 01:00:00                     4.275              1.28600   
2   2023-05-01 02:00:00                     4.150              1.28375   
3   2023-05-01 03:00:00                     4.025              1.28200   
4   2023-05-01 04:00:00                     3.900              1.28100   
..                  ...                       ...                  ...   
715 2023-07-03 19:00:00                     8.350              1.19725   
716 2023-07-03 20:00:00                     8.525              1.20050   
717 2023-07-03 21:00:00                     8.825              1.20450   
718 2023-07-03 22:00:00                     9.025              1.20700   
719 2023-07-03 23:00:00                     9.050              1.20775   

     ceiling_height_agl:m  clear_sky_energy_1h:J  clear_sky_rad:W  \
0              912.700012               0

## Data Cleaning

In [15]:
# Show null values for all training sets

count_a = X_train[0].isna().sum()
count_b = X_train[1].isna().sum()
count_c = X_train[2].isna().sum()

count_a.rename("A", inplace=True)
count_b.rename("B", inplace=True)
count_c.rename("C", inplace=True)

df = pd.merge(count_a, count_b, left_index=True, right_index=True)
df = pd.merge(df, count_c, left_index=True, right_index=True)

df[(df['A'] != 0) | (df['B'] != 0) | (df['C'] != 0)]


Unnamed: 0,A,B,C
ceiling_height_agl:m,6127,4175,4423
cloud_base_agl:m,2363,1885,1882
snow_density:kgm3,32897,25442,21472


In [16]:
# Show null values for all test sets

count_a = X_frames_test[0].isna().sum()
count_b = X_frames_test[1].isna().sum()
count_c = X_frames_test[2].isna().sum()

count_a.rename("A", inplace=True)
count_b.rename("B", inplace=True)
count_c.rename("C", inplace=True)

df = pd.merge(count_a, count_b, left_index=True, right_index=True)
df = pd.merge(df, count_c, left_index=True, right_index=True)

df[(df['A'] != 0) | (df['B'] != 0) | (df['C'] != 0)]


Unnamed: 0,A,B,C
ceiling_height_agl:m,188,180,202
cloud_base_agl:m,70,65,88
snow_density:kgm3,720,720,720


In [17]:
# Removes nan or fills it
values = {"ceiling_height_agl:m": 20000, "cloud_base_agl:m": 20000}

for i in range(len(X_train)):
    X_train[i].drop(columns=['snow_density:kgm3', 'elevation:m'], inplace=True)
    X_frames_test[i].drop(columns=['snow_density:kgm3', 'elevation:m'], inplace=True)
    X_train[i].fillna(value=values, inplace=True)
    X_frames_test[i].fillna(value=values, inplace=True)
    X_frames_test[i].fillna(0, inplace=True) # fills remaining NaN vals with 0, (specifically for one-hot encoded columns with NaN)


In [18]:
# drop_cols_a = [
#     "fresh_snow_12h:cm", 
#     "fresh_snow_1h:cm",
#     "fresh_snow_24h:cm",
#     "fresh_snow_3h:cm",
#     "fresh_snow_6h:cm",
#     "snow_drift:idx",
#     "snow_depth:cm",
#     "snow_melt_10min:mm",
#     "snow_water:kgm2"
# ]

# X_train[0].drop(columns=drop_cols_a, inplace=True)


# Training the model

In [24]:
x_train_a, x_val_a, y_train_a, y_val_a = train_test_split(X_train[0], Y_train[0], test_size=0.17, random_state=None)
x_train_b, x_val_b, y_train_b, y_val_b = train_test_split(X_train[1], Y_train[1], test_size=0.17, random_state=None)
x_train_c, x_val_c, y_train_c, y_val_c = train_test_split(X_train[2], Y_train[2], test_size=0.17, random_state=None)

print(Y_train[0]['pv_measurement'].head(10))


0       0.00
1       0.00
2       0.00
3       0.00
4      19.36
5     251.02
6     263.78
7     522.72
8     904.42
9    1238.82
Name: pv_measurement, dtype: float64


In [25]:
from autogluon.tabular import TabularDataset, TabularPredictor

train_a = TabularDataset(X_train[0])
predictor = TabularPredictor(label="pv_measurement", eval_metric='mae').fit(train_data=train_a, presets='medium_quality', time_limit=1800)


No path specified. Models will be saved in: "AutogluonModels/ag-20231109_124333/"
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 1800s
AutoGluon will save models to "AutogluonModels/ag-20231109_124333/"
AutoGluon Version:  0.8.2
Python Version:     3.10.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Oct  4 21:26:43 PDT 2023; root:xnu-8796.141.3.701.17~4/RELEASE_ARM64_T8112
Disk Space Avail:   362.02 GB / 494.38 GB (73.2%)
Train Data Rows:    34037
Train Data Columns: 46
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 631.4561, 1166.49673)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one o

[1000]	valid_set's l1: 161.632
[2000]	valid_set's l1: 152.326
[3000]	valid_set's l1: 147.35
[4000]	valid_set's l1: 144.371
[5000]	valid_set's l1: 142.531
[6000]	valid_set's l1: 141.006
[7000]	valid_set's l1: 140.134
[8000]	valid_set's l1: 139.363
[9000]	valid_set's l1: 138.98
[10000]	valid_set's l1: 138.536


	-138.5134	 = Validation score   (-mean_absolute_error)
	34.89s	 = Training   runtime
	0.41s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 1761.9s of the 1761.9s of remaining time.


[1000]	valid_set's l1: 159.711
[2000]	valid_set's l1: 155.391
[3000]	valid_set's l1: 153.82
[4000]	valid_set's l1: 152.742
[5000]	valid_set's l1: 152.077
[6000]	valid_set's l1: 151.771
[7000]	valid_set's l1: 151.574
[8000]	valid_set's l1: 151.471
[9000]	valid_set's l1: 151.413
[10000]	valid_set's l1: 151.36


	-151.3602	 = Validation score   (-mean_absolute_error)
	33.61s	 = Training   runtime
	0.39s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 1727.61s of the 1727.61s of remaining time.
	-178.5466	 = Validation score   (-mean_absolute_error)
	26.36s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 1701.01s of the 1701.01s of remaining time.
	-158.8482	 = Validation score   (-mean_absolute_error)
	72.47s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 1628.53s of the 1628.53s of remaining time.
	-181.4306	 = Validation score   (-mean_absolute_error)
	4.93s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 1623.36s of the 1623.36s of remaining time.
	-179.8632	 = Validation score   (-mean_absolute_error)
	13.71s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBo

[1000]	valid_set's l1: 149.572
[2000]	valid_set's l1: 147.093
[3000]	valid_set's l1: 146.643
[4000]	valid_set's l1: 146.501
[5000]	valid_set's l1: 146.445
[6000]	valid_set's l1: 146.417
[7000]	valid_set's l1: 146.407
[8000]	valid_set's l1: 146.403
[9000]	valid_set's l1: 146.401
[10000]	valid_set's l1: 146.4


	-146.3997	 = Validation score   (-mean_absolute_error)
	125.91s	 = Training   runtime
	0.81s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 1386.5s of remaining time.
	-133.8126	 = Validation score   (-mean_absolute_error)
	0.09s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 413.61s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231109_124333/")


In [None]:
# y_a_test = predictor.predict(x_val_a)


In [None]:
# join_val_a = pd.merge(x_val_a, y_val_a, left_index=True, right_index=True)
# predictor.leaderboard(join_val_a, silent=True)


In [None]:
train_a['pv_measurement'].head(20)


0        0.00
1        0.00
2        0.00
3        0.00
4       19.36
5      251.02
6      263.78
7      522.72
8      904.42
9     1238.82
10    2189.88
11    3047.22
12    2163.26
13    2686.64
14    3175.92
15    2730.86
16    2093.96
17    2774.20
18    1833.48
19    1057.54
Name: pv_measurement, dtype: float64

In [26]:

train_b = TabularDataset(X_train[1])
predictor_b = TabularPredictor(label="pv_measurement", eval_metric="mae").fit(train_data=train_b, presets='medium_quality', time_limit=1800)


No path specified. Models will be saved in: "AutogluonModels/ag-20231109_125143/"
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 1800s
AutoGluon will save models to "AutogluonModels/ag-20231109_125143/"
AutoGluon Version:  0.8.2
Python Version:     3.10.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Oct  4 21:26:43 PDT 2023; root:xnu-8796.141.3.701.17~4/RELEASE_ARM64_T8112
Disk Space Avail:   360.93 GB / 494.38 GB (73.0%)
Train Data Rows:    26459
Train Data Columns: 46
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, -0.0, 104.75969, 210.78482)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one o

[1000]	valid_set's l1: 21.8289
[2000]	valid_set's l1: 20.834
[3000]	valid_set's l1: 20.3879
[4000]	valid_set's l1: 20.1849
[5000]	valid_set's l1: 20.0568
[6000]	valid_set's l1: 19.9755
[7000]	valid_set's l1: 19.9166
[8000]	valid_set's l1: 19.8635
[9000]	valid_set's l1: 19.8267
[10000]	valid_set's l1: 19.8082


	-19.8074	 = Validation score   (-mean_absolute_error)
	33.12s	 = Training   runtime
	0.33s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 1766.04s of the 1766.04s of remaining time.


[1000]	valid_set's l1: 22.6362
[2000]	valid_set's l1: 22.1875
[3000]	valid_set's l1: 22.0267
[4000]	valid_set's l1: 21.9262
[5000]	valid_set's l1: 21.8818
[6000]	valid_set's l1: 21.8662
[7000]	valid_set's l1: 21.8545
[8000]	valid_set's l1: 21.8551
[9000]	valid_set's l1: 21.8507
[10000]	valid_set's l1: 21.8489


	-21.8484	 = Validation score   (-mean_absolute_error)
	32.19s	 = Training   runtime
	0.37s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 1733.23s of the 1733.23s of remaining time.
	-25.0868	 = Validation score   (-mean_absolute_error)
	17.9s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 1715.18s of the 1715.17s of remaining time.
	-22.2484	 = Validation score   (-mean_absolute_error)
	69.29s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 1645.86s of the 1645.86s of remaining time.
	-25.1667	 = Validation score   (-mean_absolute_error)
	3.17s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 1642.55s of the 1642.55s of remaining time.
	-24.0962	 = Validation score   (-mean_absolute_error)
	9.87s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...

[1000]	valid_set's l1: 21.4686
[2000]	valid_set's l1: 21.3151
[3000]	valid_set's l1: 21.277
[4000]	valid_set's l1: 21.2692
[5000]	valid_set's l1: 21.2666
[6000]	valid_set's l1: 21.2656
[7000]	valid_set's l1: 21.2653
[8000]	valid_set's l1: 21.2651
[9000]	valid_set's l1: 21.265
[10000]	valid_set's l1: 21.265


	-21.265	 = Validation score   (-mean_absolute_error)
	124.68s	 = Training   runtime
	0.75s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 1441.42s of remaining time.
	-19.2894	 = Validation score   (-mean_absolute_error)
	0.09s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 358.69s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231109_125143/")


In [None]:
# y_b_test = predictor_b.predict(x_val_b)


In [None]:
# join_val_b = pd.merge(x_val_b, y_val_b, left_index=True, right_index=True)
# predictor_b.leaderboard(join_val_b, silent=True)


In [27]:

train_c = TabularDataset(X_train[2])
predictor_c = TabularPredictor(label="pv_measurement", eval_metric="mae").fit(train_data=train_c, presets='medium_quality', time_limit=1800)


No path specified. Models will be saved in: "AutogluonModels/ag-20231109_125801/"
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 1800s
AutoGluon will save models to "AutogluonModels/ag-20231109_125801/"
AutoGluon Version:  0.8.2
Python Version:     3.10.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Oct  4 21:26:43 PDT 2023; root:xnu-8796.141.3.701.17~4/RELEASE_ARM64_T8112
Disk Space Avail:   360.14 GB / 494.38 GB (72.8%)
Train Data Rows:    21598
Train Data Columns: 46
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, -0.0, 93.57521, 178.09189)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as on

[1000]	valid_set's l1: 19.7185
[2000]	valid_set's l1: 18.9
[3000]	valid_set's l1: 18.5034
[4000]	valid_set's l1: 18.2803
[5000]	valid_set's l1: 18.1528
[6000]	valid_set's l1: 18.0839
[7000]	valid_set's l1: 18.0575
[8000]	valid_set's l1: 18.0371
[9000]	valid_set's l1: 18.0047
[10000]	valid_set's l1: 17.9843


	-17.9826	 = Validation score   (-mean_absolute_error)
	32.54s	 = Training   runtime
	0.3s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 1766.71s of the 1766.71s of remaining time.


[1000]	valid_set's l1: 20.9086
[2000]	valid_set's l1: 20.4416
[3000]	valid_set's l1: 20.3499
[4000]	valid_set's l1: 20.2486
[5000]	valid_set's l1: 20.2017
[6000]	valid_set's l1: 20.184
[7000]	valid_set's l1: 20.1752
[8000]	valid_set's l1: 20.1709
[9000]	valid_set's l1: 20.1682
[10000]	valid_set's l1: 20.165


	-20.165	 = Validation score   (-mean_absolute_error)
	31.52s	 = Training   runtime
	0.31s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 1734.63s of the 1734.63s of remaining time.
	-23.8037	 = Validation score   (-mean_absolute_error)
	13.82s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 1720.63s of the 1720.63s of remaining time.
	-20.3895	 = Validation score   (-mean_absolute_error)
	68.67s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 1651.95s of the 1651.95s of remaining time.
	-23.4816	 = Validation score   (-mean_absolute_error)
	2.38s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 1649.45s of the 1649.45s of remaining time.
	-22.6042	 = Validation score   (-mean_absolute_error)
	7.85s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...

[1000]	valid_set's l1: 19.6889
[2000]	valid_set's l1: 19.6172
[3000]	valid_set's l1: 19.6044
[4000]	valid_set's l1: 19.6011
[5000]	valid_set's l1: 19.6003
[6000]	valid_set's l1: 19.6
[7000]	valid_set's l1: 19.6
[8000]	valid_set's l1: 19.6
[9000]	valid_set's l1: 19.6


	-19.5999	 = Validation score   (-mean_absolute_error)
	115.28s	 = Training   runtime
	0.45s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 1463.85s of remaining time.
	-17.6042	 = Validation score   (-mean_absolute_error)
	0.09s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 336.25s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231109_125801/")


In [None]:
# y_c_test = predictor_c.predict(x_val_c)


In [None]:
# join_val_c = pd.merge(x_val_c, y_val_c, left_index=True, right_index=True)
# predictor_c.leaderboard(join_val_c, silent=True)


## Evaluate

In [None]:
""" from sklearn.metrics import mean_absolute_error
mae_a = mean_absolute_error(y_val_a, predictor.predict(x_val_a))
print("MAE for A: ", mae_a)
mae_b = mean_absolute_error(y_val_b, y_b_test)
print("MAE for B: ", mae_b)
mae_c = mean_absolute_error(y_val_c, y_c_test)
print("MAE for C: ", mae_c)
print("Mean MAE: ", (mae_a + mae_b + mae_c) / 3) """


' from sklearn.metrics import mean_absolute_error\nmae_a = mean_absolute_error(y_val_a, predictor.predict(x_val_a))\nprint("MAE for A: ", mae_a)\nmae_b = mean_absolute_error(y_val_b, y_b_test)\nprint("MAE for B: ", mae_b)\nmae_c = mean_absolute_error(y_val_c, y_c_test)\nprint("MAE for C: ", mae_c)\nprint("Mean MAE: ", (mae_a + mae_b + mae_c) / 3) '

In [28]:
# x_val_a


In [29]:
full_a = pd.merge(X_train[0], Y_train[0], left_index=True, right_index=True)
full_b = pd.merge(X_train[1], Y_train[1], left_index=True, right_index=True)
full_c = pd.merge(X_train[2], Y_train[2], left_index=True, right_index=True)


In [33]:
#predictions before full training
pred_split_a = predictor.predict(X_frames_test[0])
pred_split_b = predictor_b.predict(X_frames_test[1])
pred_split_c = predictor_c.predict(X_frames_test[2])


# Make predictions

In [34]:
# Do some more stuff
y_pred = np.concatenate((pred_split_a, pred_split_b, pred_split_c), axis=0)

print(len(y_pred))


2160


In [35]:
y_pred[0:10]


array([-2.6314905e+00, -1.6035709e+00, -1.7524686e+00,  7.6036308e+01,
        3.8069138e+02,  9.9536182e+02,  1.4958785e+03,  3.0739004e+03,
        2.6127104e+03,  2.7461501e+03], dtype=float32)

In [38]:
for i in range(len(y_pred)):
    if y_pred[i] < 10: 
        y_pred[i] = 0


# Evaluate prediction

# Create submission

In [40]:
y_test_pred = y_pred

test = pd.read_csv('../data/test.csv')
test['prediction'] = y_test_pred
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = sample_submission[['id']].merge(test[['id', 'prediction']], on='id', how='left')
submission.to_csv('submissions/autogluon_new_pipe.csv', index=False)
