# Team description

Team: Ehhhhhhh

Markus Kinn, 106660
Mario Haroun, 543915
Torstein Korten, 543955

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer, IterativeImputer
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_absolute_error
import numpy as np
import itertools

from sklearn.model_selection import train_test_split


def remove_unwanted_rows(df):
    unwanted_rows = (df['direct_rad:W'] == 0) & (df['diffuse_rad:W'] == 0) & (df['pv_measurement'] > 200) & (df['sun_elevation:d'] < 0) & (df['is_day:idx'] == 0)
    cleaned_df = df[~unwanted_rows]
    return cleaned_df
def remove_highly_correlated_features(df, threshold):
    # Compute the Pearson correlation matrix
    correlation_matrix = df.corr(method='pearson')

    # Initialize an empty list to hold features to be removed
    features_to_remove = []

    # Traverse the correlation matrix to find highly correlated features
    for i in range(len(correlation_matrix.columns)):
        for j in range(i+1, len(correlation_matrix.columns)):
            feature1 = correlation_matrix.columns[i]
            feature2 = correlation_matrix.columns[j]

            # Check for high absolute correlation
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                # Add one of the features to the list if it's not already there
                if feature1 not in features_to_remove and feature2 not in features_to_remove:
                    features_to_remove.append(feature1)

    # Drop the identified features from the DataFrame
    filtered_df = df.drop(columns=features_to_remove)

    return filtered_df

def find_long_constant_periods(data, threshold):
    start = None
    segments = []
    for i in range(1, len(data)):
        if data[i] == data[i-1] and data[i] != 0:
            if start is None:
                start = i-1
        else:
            if start is not None:
                if (i - start) > threshold:
                    segments.append((start, i))
                start = None
    return segments

def remove_constant_periods(df, segments):
    drop_indices = []
    for start, end in segments:
        drop_indices.extend(range(start, end))
    return df.drop(drop_indices)

def lag_features_by_one_hour(df, column_names, time_col='time'):

    # Check if the DataFrame has a time-based index
    df['index'] = df[time_col]
    df = df.set_index('index')

    # Loop through each column name to create a lagged feature
    for col in column_names:
        lagged_col_name = f"{col}"
        df[lagged_col_name] = df[col].shift(freq='-1H')

    return df

def is_estimated(df, time_col='time'):
    split_date = '2022-10-27'
    df['is_estimated'] = 0  # Initialize with 0 (indicating observed)
    df.loc[df[time_col] >= pd.Timestamp(split_date), 'is_estimated'] = 1  # Set 1 for estimated data
    return df

def resample_to_hourly(df, datetime_column='date_forecast'):
    df[datetime_column] = pd.to_datetime(df[datetime_column])
    df.sort_values(by=datetime_column, inplace=True)

    df.set_index(datetime_column, inplace=True)

    df_hourly = df.resample('H').mean()

    df_hourly.dropna(how='all', inplace=True)

    df_hourly.reset_index(inplace=True)

    return df_hourly

def generate_solar_features_1(data):
    relevant_features = [
        'direct_rad:W', 'clear_sky_rad:W', 'diffuse_rad:W', 'sun_elevation:d', 'sun_azimuth:d',
        'clear_sky_energy_1h:J', 'direct_rad_1h:J', 'effective_cloud_cover:p', 'diffuse_rad_1h:J',
        'is_in_shadow:idx', 'total_cloud_cover:p', 'wind_speed_u_10m:ms', 'snow_water:kgm2',
        'relative_humidity_1000hPa:p', 'is_day:idx', 'wind_speed_v_10m:ms', 'cloud_base_agl:m',
        'fresh_snow_24h:cm', 'wind_speed_10m:ms', 'pressure_100m:hPa'
    ]

    interactions = {}
    ratios = {}
    differences = {}
    lags = {}
    self_interactions = {}
    additive = {}

    for col_pair in itertools.combinations(relevant_features, 2):
        interactions[f'{col_pair[0]}_times_{col_pair[1]}'] = data[col_pair[0]] * data[col_pair[1]]
        ratios[f'{col_pair[0]}_div_{col_pair[1]}'] = data[col_pair[0]] / (data[col_pair[1]] + 1e-8)
        differences[f'{col_pair[0]}_minus_{col_pair[1]}'] = data[col_pair[0]] - data[col_pair[1]]
        additive[f'{col_pair[0]}_plus_{col_pair[1]}'] = data[col_pair[0]] + data[col_pair[1]]

    for col in relevant_features:
        self_interactions[f'{col}_squared'] = data[col] ** 2

    # Creating lags for all relevant features
    for col in relevant_features:
        lags[f'{col}_lag1'] = data[col].shift(1)
        lags[f'{col}_lag3'] = data[col].shift(3)

    # Concatenate all new features with the original data
    data = pd.concat([data, pd.DataFrame(interactions), pd.DataFrame(ratios),
                      pd.DataFrame(differences), pd.DataFrame(lags), pd.DataFrame(self_interactions), pd.DataFrame(additive)], axis=1)

    data['wind_magnitude'] = np.sqrt(data['wind_speed_u_10m:ms']**2 + data['wind_speed_v_10m:ms']**2)
    data['wind_direction'] = np.arctan2(data['wind_speed_v_10m:ms'], data['wind_speed_u_10m:ms'])
    data['solar_angle_impact'] = np.sin(np.radians(data['sun_elevation:d']))

    return data

def generate_solar_features_2(data):
    relevant_features = [
        'sun_elevation:d', 'clear_sky_rad:W', 'direct_rad:W', 'diffuse_rad:W',
        'sun_azimuth:d', 'clear_sky_energy_1h:J', 'cloud_base_agl:m', 'diffuse_rad_1h:J',
        'effective_cloud_cover:p', 'direct_rad_1h:J', 'snow_water:kgm2', 'is_in_shadow:idx',
        'fresh_snow_24h:cm', 'wind_speed_u_10m:ms', 'total_cloud_cover:p', 'msl_pressure:hPa',
        'is_day:idx', 'relative_humidity_1000hPa:p', 'pressure_100m:hPa', 'ceiling_height_agl:m'
    ]

    interactions = {}
    ratios = {}
    differences = {}
    lags = {}
    self_interactions = {}
    additive = {}

    for col_pair in itertools.combinations(relevant_features, 2):
        interactions[f'{col_pair[0]}_times_{col_pair[1]}'] = data[col_pair[0]] * data[col_pair[1]]
        ratios[f'{col_pair[0]}_div_{col_pair[1]}'] = data[col_pair[0]] / (data[col_pair[1]] + 1e-8)
        differences[f'{col_pair[0]}_minus_{col_pair[1]}'] = data[col_pair[0]] - data[col_pair[1]]
        additive[f'{col_pair[0]}_plus_{col_pair[1]}'] = data[col_pair[0]] + data[col_pair[1]]

    for col in relevant_features:
        self_interactions[f'{col}_squared'] = data[col] ** 2

    # Creating lags for all relevant features
    for col in relevant_features:
        lags[f'{col}_lag1'] = data[col].shift(1)
        lags[f'{col}_lag3'] = data[col].shift(3)

    # Concatenate all new features with the original data
    data = pd.concat([data, pd.DataFrame(interactions), pd.DataFrame(ratios),
                      pd.DataFrame(differences), pd.DataFrame(lags), pd.DataFrame(self_interactions), pd.DataFrame(additive)], axis=1)

    data['wind_magnitude'] = np.sqrt(data['wind_speed_u_10m:ms']**2 + data['wind_speed_v_10m:ms']**2)
    data['wind_direction'] = np.arctan2(data['wind_speed_v_10m:ms'], data['wind_speed_u_10m:ms'])
    data['solar_angle_impact'] = np.sin(np.radians(data['sun_elevation:d']))

    return data

def generate_solar_features_3(data):
    relevant_features = [
        'clear_sky_rad:W', 'clear_sky_energy_1h:J', 'direct_rad_1h:J', 'sun_elevation:d',
        'direct_rad:W', 'diffuse_rad:W', 'sun_azimuth:d', 'diffuse_rad_1h:J',
        'air_density_2m:kgm3', 'wind_speed_v_10m:ms', 'fresh_snow_24h:cm',
        'relative_humidity_1000hPa:p', 'total_cloud_cover:p', 'effective_cloud_cover:p',
        'cloud_base_agl:m', 'snow_water:kgm2', 't_1000hPa:K', 'is_in_shadow:idx', 'dew_point_2m:K',
        'pressure_100m:hPa'
    ]

    interactions = {}
    ratios = {}
    differences = {}
    lags = {}
    self_interactions = {}
    additive = {}

    for col_pair in itertools.combinations(relevant_features, 2):
        interactions[f'{col_pair[0]}_times_{col_pair[1]}'] = data[col_pair[0]] * data[col_pair[1]]
        ratios[f'{col_pair[0]}_div_{col_pair[1]}'] = data[col_pair[0]] / (data[col_pair[1]] + 1e-8)
        differences[f'{col_pair[0]}_minus_{col_pair[1]}'] = data[col_pair[0]] - data[col_pair[1]]
        additive[f'{col_pair[0]}_plus_{col_pair[1]}'] = data[col_pair[0]] + data[col_pair[1]]

    for col in relevant_features:
        self_interactions[f'{col}_squared'] = data[col] ** 2

    # Creating lags for all relevant features
    for col in relevant_features:
        lags[f'{col}_lag1'] = data[col].shift(1)
        lags[f'{col}_lag3'] = data[col].shift(3)

    # Concatenate all new features with the original data
    data = pd.concat([data, pd.DataFrame(interactions), pd.DataFrame(ratios),
                      pd.DataFrame(differences), pd.DataFrame(lags), pd.DataFrame(self_interactions), pd.DataFrame(additive)], axis=1)

    data['wind_magnitude'] = np.sqrt(data['wind_speed_u_10m:ms']**2 + data['wind_speed_v_10m:ms']**2)
    data['wind_direction'] = np.arctan2(data['wind_speed_v_10m:ms'], data['wind_speed_u_10m:ms'])
    data['solar_angle_impact'] = np.sin(np.radians(data['sun_elevation:d']))

    return data

# Location A

In [10]:
train_a = pd.read_parquet('./data/A/train_targets.parquet')

X_train_estimated_a = pd.read_parquet('./data/A/X_train_estimated.parquet')
X_train_observed_a = pd.read_parquet('./data/A/X_train_observed.parquet')
X_test_estimated_a = pd.read_parquet('./data/A/X_test_estimated.parquet')

df = pd.concat([X_train_observed_a, X_train_estimated_a])

df = resample_to_hourly(df)
X_test_estimated_a = resample_to_hourly(X_test_estimated_a)

df = pd.merge(df, train_a, left_on='date_forecast', right_on='time', how='inner')
df = df.drop(columns=['snow_density:kgm3', 'snow_drift:idx', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_12h:cm', 'snow_melt_10min:mm', 'elevation:m', 'prob_rime:p', 'dew_or_rime:idx'])
X_test_estimated_a = X_test_estimated_a.drop(columns=['snow_density:kgm3', 'snow_drift:idx', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_12h:cm', 'snow_melt_10min:mm', 'elevation:m', 'prob_rime:p', 'dew_or_rime:idx'])

  df_hourly = df.resample('H').mean()
  df_hourly = df.resample('H').mean()


In [11]:
cols_to_impute = ['ceiling_height_agl:m', 'cloud_base_agl:m']

imputer = IterativeImputer(max_iter=10, random_state=42)
X_test_estimated_a[cols_to_impute] = imputer.fit_transform(X_test_estimated_a[cols_to_impute])
df[cols_to_impute] = imputer.fit_transform(df[cols_to_impute])
df = df.dropna(subset=['pv_measurement'])

In [12]:
df = is_estimated(df)
df = generate_solar_features_1(df)

X_test_estimated_a = is_estimated(X_test_estimated_a, 'date_forecast')
X_test_estimated_a = generate_solar_features_1(X_test_estimated_a)

In [13]:
train_end_date = '2022-10-21'
df['time'] = pd.to_datetime(df['time'])

train_df = df[df['time'] < train_end_date]
remaining_data = df[df['time'] > train_end_date]

train_data, validation_df = train_test_split(remaining_data, test_size=0.5, random_state=42)
train_df = pd.concat([train_df, train_data], ignore_index=True)

# Identifying the features and the target variable
X_train = train_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_train = train_df['pv_measurement']
X_val = validation_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_val = validation_df['pv_measurement']

In [14]:
# Combine training and validation data into a single dataset for AutoGluon
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# Specify the name of the target variable
label = 'pv_measurement'

# Create a TabularPredictor object
predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error").fit(train_data=train_data, tuning_data=val_data, presets='medium_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20231112_140358\"
Presets specified: ['medium_quality']
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231112_140358\"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   900.21 GB / 2047.46 GB (44.0%)
Train Data Rows:    31863
Train Data Columns: 859
Tuning Data Rows:    2197
Tuning Data Columns: 859
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 649.75117, 1177.67732)
	If 'regression' 

[1000]	valid_set's l1: 94.0947
[2000]	valid_set's l1: 90.9007
[3000]	valid_set's l1: 89.7331
[4000]	valid_set's l1: 89.1377
[5000]	valid_set's l1: 88.8309
[6000]	valid_set's l1: 88.7296
[7000]	valid_set's l1: 88.6401
[8000]	valid_set's l1: 88.5747
[9000]	valid_set's l1: 88.4477
[10000]	valid_set's l1: 88.3347


	-88.3335	 = Validation score   (-mean_absolute_error)
	106.06s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 95.1734
[2000]	valid_set's l1: 93.2164
[3000]	valid_set's l1: 92.7358
[4000]	valid_set's l1: 92.5886
[5000]	valid_set's l1: 92.4035
[6000]	valid_set's l1: 92.2893
[7000]	valid_set's l1: 92.2514
[8000]	valid_set's l1: 92.2207
[9000]	valid_set's l1: 92.2271
[10000]	valid_set's l1: 92.2203


	-92.2191	 = Validation score   (-mean_absolute_error)
	157.85s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-106.3874	 = Validation score   (-mean_absolute_error)
	548.28s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost ...
	-95.4077	 = Validation score   (-mean_absolute_error)
	647.9s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-104.7218	 = Validation score   (-mean_absolute_error)
	95.41s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-107.9299	 = Validation score   (-mean_absolute_error)
	46.1s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: XGBoost ...
	-99.4111	 = Validation score   (-mean_absolute_error)
	32.18s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-89.0747	 = Validation score   (-mean_absolute_error)
	103.73s	 = Training   runtime
	0.4s	 = Validation runtime
Fittin

[1000]	valid_set's l1: 94.7572
[2000]	valid_set's l1: 94.1652
[3000]	valid_set's l1: 94.105
[4000]	valid_set's l1: 94.0865
[5000]	valid_set's l1: 94.08
[6000]	valid_set's l1: 94.0778
[7000]	valid_set's l1: 94.0771
[8000]	valid_set's l1: 94.0768
[9000]	valid_set's l1: 94.0767
[10000]	valid_set's l1: 94.0767


	-94.0767	 = Validation score   (-mean_absolute_error)
	475.5s	 = Training   runtime
	0.31s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-84.527	 = Validation score   (-mean_absolute_error)
	0.2s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 2224.14s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20231112_140358\")


In [15]:
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                  model   score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -84.526975       0.618043  367.825929                0.000000           0.195613            2       True         12
1            LightGBMXT  -88.333488       0.116524  106.057042                0.116524         106.057042            1       True          3
2        NeuralNetTorch  -89.074721       0.395000  103.727405                0.395000         103.727405            1       True         10
3              LightGBM  -92.219139       0.106518  157.845870                0.106518         157.845870            1       True          4
4         LightGBMLarge  -94.076696       0.310780  475.501889                0.310780         475.501889            1       True         11
5              CatBoost  -95.407715       0.039000  647.904157                0.039000      



In [16]:
feature_importance = predictor.feature_importance(val_data)

Computing feature importance via permutation shuffling for 859 features using 2197 rows with 5 shuffle sets...
	3392.82s	= Expected runtime (678.56s per shuffle set)
	1696.77s	= Actual runtime (Completed 5 of 5 shuffle sets)


In [17]:
best_features = feature_importance[feature_importance['importance'] > 0.1].index.tolist()

X_train = X_train[best_features]
train_data = pd.concat([X_train, y_train], axis=1)

X_val = X_val[best_features]
val_data = pd.concat([X_val, y_val], axis=1)

label = 'pv_measurement'

predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error").fit(train_data=train_data, tuning_data=val_data, presets='medium_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20231112_150921\"
Presets specified: ['medium_quality']
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231112_150921\"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   898.92 GB / 2047.46 GB (43.9%)
Train Data Rows:    31863
Train Data Columns: 529
Tuning Data Rows:    2197
Tuning Data Columns: 529
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 649.75117, 1177.67732)
	If 'regression' 

[1000]	valid_set's l1: 94.8094
[2000]	valid_set's l1: 91.1564
[3000]	valid_set's l1: 89.6507
[4000]	valid_set's l1: 89.0496
[5000]	valid_set's l1: 88.6239
[6000]	valid_set's l1: 88.4249
[7000]	valid_set's l1: 88.142
[8000]	valid_set's l1: 87.9455
[9000]	valid_set's l1: 87.8285
[10000]	valid_set's l1: 87.6959


	-87.6948	 = Validation score   (-mean_absolute_error)
	71.69s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 93.4136
[2000]	valid_set's l1: 92.0652
[3000]	valid_set's l1: 91.7453
[4000]	valid_set's l1: 91.4531
[5000]	valid_set's l1: 91.3031
[6000]	valid_set's l1: 91.2216
[7000]	valid_set's l1: 91.174
[8000]	valid_set's l1: 91.1615
[9000]	valid_set's l1: 91.1432
[10000]	valid_set's l1: 91.1323


	-91.1321	 = Validation score   (-mean_absolute_error)
	99.92s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-105.971	 = Validation score   (-mean_absolute_error)
	334.66s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost ...
	-92.205	 = Validation score   (-mean_absolute_error)
	467.95s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-104.6491	 = Validation score   (-mean_absolute_error)
	51.87s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-111.421	 = Validation score   (-mean_absolute_error)
	37.23s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoost ...
	-94.3357	 = Validation score   (-mean_absolute_error)
	455.41s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-92.247	 = Validation score   (-mean_absolute_error)
	85.75s	 = Training   runtime
	0.25s	 = Validation runtime
Fitting 

[1000]	valid_set's l1: 94.1835
[2000]	valid_set's l1: 93.418
[3000]	valid_set's l1: 93.2886
[4000]	valid_set's l1: 93.2495
[5000]	valid_set's l1: 93.2393
[6000]	valid_set's l1: 93.2345
[7000]	valid_set's l1: 93.2334
[8000]	valid_set's l1: 93.233
[9000]	valid_set's l1: 93.2328
[10000]	valid_set's l1: 93.2328


	-93.2328	 = Validation score   (-mean_absolute_error)
	302.54s	 = Training   runtime
	0.34s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-85.379	 = Validation score   (-mean_absolute_error)
	0.21s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 1915.25s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20231112_150921\")


In [18]:
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                  model   score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -85.379029       0.687539  1180.933998                0.000000           0.214641            2       True         12
1            LightGBMXT  -87.694764       0.110260    71.693029                0.110260          71.693029            1       True          3
2              LightGBM  -91.132115       0.123772    99.917832                0.123772          99.917832            1       True          4
3              CatBoost  -92.205027       0.022024   467.951797                0.022024         467.951797            1       True          6
4        NeuralNetTorch  -92.247037       0.254052    85.750385                0.254052          85.750385            1       True         10
5         LightGBMLarge  -93.232782       0.335674   302.541405                0.33567



In [19]:
X_test_estimated_a = X_test_estimated_a[best_features]

y_pred = predictor.predict(X_test_estimated_a)
y_pred = y_pred.clip(lower=0)
y_pred = y_pred.reset_index(drop=True)
y_pred.index.name = 'id'

In [20]:
df = pd.DataFrame(y_pred)
df.to_csv('result_a.csv')

# Location B

In [21]:
train_b = pd.read_parquet('./data/B/train_targets.parquet')

X_train_estimated_b = pd.read_parquet('./data/B/X_train_estimated.parquet')
X_train_observed_b = pd.read_parquet('./data/B/X_train_observed.parquet')
X_test_estimated_b = pd.read_parquet('./data/B/X_test_estimated.parquet')

df = pd.concat([X_train_observed_b, X_train_estimated_b])

df = resample_to_hourly(df)
X_test_estimated_b = resample_to_hourly(X_test_estimated_b)

df = pd.merge(df, train_b, left_on='date_forecast', right_on='time', how='inner')
df = df.drop(columns=['snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'snow_melt_10min:mm', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_12h:cm', 'precip_5min:mm', 'rain_water:kgm2', 'snow_drift:idx', 'snow_melt_10min:mm', 'wind_speed_w_1000hPa:ms', 'cloud_base_agl:m'])
X_test_estimated_b = X_test_estimated_b.drop(columns=['snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'snow_melt_10min:mm', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_12h:cm', 'precip_5min:mm', 'rain_water:kgm2', 'snow_drift:idx', 'snow_melt_10min:mm', 'wind_speed_w_1000hPa:ms', 'cloud_base_agl:m'])

  df_hourly = df.resample('H').mean()
  df_hourly = df.resample('H').mean()


In [22]:
df = df.dropna(subset=['pv_measurement'])

datetime_features = df[['time', 'date_forecast']]
df = df.drop(['time', 'date_forecast'], axis=1)

imputer = IterativeImputer(random_state=123)
df_imputed = imputer.fit_transform(df)

df = pd.DataFrame(df_imputed, columns=df.columns)
df = pd.concat([df, datetime_features.reset_index(drop=True)], axis=1)

In [23]:
segments = find_long_constant_periods(train_b['pv_measurement'], threshold=5)
df = remove_constant_periods(df, segments)
df = remove_unwanted_rows(df)
df = is_estimated(df)
df = lag_features_by_one_hour(df, ['diffuse_rad_1h:J', 'direct_rad_1h:J', 'clear_sky_energy_1h:J'])

X_test_estimated_b = is_estimated(X_test_estimated_b, 'date_forecast')
X_test_estimated_b = lag_features_by_one_hour(X_test_estimated_b, ['diffuse_rad_1h:J', 'direct_rad_1h:J', 'clear_sky_energy_1h:J'], 'date_forecast')

In [24]:
train_end_date = '2022-10-21'
df['time'] = pd.to_datetime(df['time'])

train_df = df[df['time'] < train_end_date]
remaining_data = df[df['time'] > train_end_date]

train_data, validation_df = train_test_split(remaining_data, test_size=0.5, random_state=42)
train_df = pd.concat([train_df, train_data], ignore_index=True)

# Identifying the features and the target variable
X_train = train_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_train = train_df['pv_measurement']
X_val = validation_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_val = validation_df['pv_measurement']

In [25]:
# Combine training and validation data into a single dataset for AutoGluon
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# Specify the name of the target variable
label = 'pv_measurement'

# Create a TabularPredictor object
predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error").fit(train_data=train_data, tuning_data=val_data, presets='medium_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20231112_154122\"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231112_154122\"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   897.60 GB / 2047.46 GB (43.8%)
Train Data Rows:    27807
Train Data Columns: 34
Tuning Data Rows:    1801
Tuning Data Columns: 34
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, -0.0, 96.61183, 205.14064)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generator

[1000]	valid_set's l1: 12.5085
[2000]	valid_set's l1: 12.2277
[3000]	valid_set's l1: 12.0675
[4000]	valid_set's l1: 12.0195
[5000]	valid_set's l1: 11.9903
[6000]	valid_set's l1: 12.0009
[7000]	valid_set's l1: 11.9943
[8000]	valid_set's l1: 11.9742
[9000]	valid_set's l1: 11.9374
[10000]	valid_set's l1: 11.9382


	-11.9253	 = Validation score   (-mean_absolute_error)
	16.2s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 12.9825
[2000]	valid_set's l1: 12.8098
[3000]	valid_set's l1: 12.7336
[4000]	valid_set's l1: 12.7099
[5000]	valid_set's l1: 12.7158


	-12.7029	 = Validation score   (-mean_absolute_error)
	9.36s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-14.2155	 = Validation score   (-mean_absolute_error)
	12.83s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	-13.0591	 = Validation score   (-mean_absolute_error)
	36.11s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-13.7027	 = Validation score   (-mean_absolute_error)
	2.03s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-13.4804	 = Validation score   (-mean_absolute_error)
	21.75s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	-13.5175	 = Validation score   (-mean_absolute_error)
	1.16s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-10.9905	 = Validation score   (-mean_absolute_error)
	80.35s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model

[1000]	valid_set's l1: 12.0577
[2000]	valid_set's l1: 12.0334


	-12.0183	 = Validation score   (-mean_absolute_error)
	8.91s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-10.3098	 = Validation score   (-mean_absolute_error)
	0.18s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 192.3s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20231112_154122\")


In [26]:
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2 -10.309786       0.200825  136.763167                0.000000           0.182827            2       True         12
1        NeuralNetTorch -10.990536       0.017999   80.354894                0.017999          80.354894            1       True         10
2            LightGBMXT -11.925316       0.094237   16.201539                0.094237          16.201539            1       True          3
3         LightGBMLarge -12.018280       0.025002    8.908605                0.025002           8.908605            1       True         11
4              LightGBM -12.702907       0.039589    9.362858                0.039589           9.362858            1       True          4
5              CatBoost -13.059088       0.005616   36.112140                0.005616          36.



In [27]:
y_pred = predictor.predict(X_test_estimated_b)
y_pred = y_pred.clip(lower=0)
y_pred = y_pred.reset_index(drop=True)
y_pred.index.name = 'id'

In [28]:
df = pd.DataFrame(y_pred)
df.to_csv('result_b.csv')

# Location C

In [29]:
train_c = pd.read_parquet('./data/C/train_targets.parquet')

X_train_estimated_c = pd.read_parquet('./data/C/X_train_estimated.parquet')
X_train_observed_c = pd.read_parquet('./data/C/X_train_observed.parquet')
X_test_estimated_c = pd.read_parquet('./data/C/X_test_estimated.parquet')

df = pd.concat([X_train_observed_c, X_train_estimated_c])

df = resample_to_hourly(df)
X_test_estimated_c = resample_to_hourly(X_test_estimated_c)

df = pd.merge(df, train_c, left_on='date_forecast', right_on='time', how='inner')
df = df.drop(columns=['snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'snow_melt_10min:mm', 'wind_speed_w_1000hPa:ms', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_12h:cm', 'precip_5min:mm', 'rain_water:kgm2', 'snow_drift:idx', 'snow_melt_10min:mm', 'wind_speed_w_1000hPa:ms'])
X_test_estimated_c = X_test_estimated_c.drop(columns=['snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'snow_melt_10min:mm', 'wind_speed_w_1000hPa:ms', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_12h:cm', 'precip_5min:mm', 'rain_water:kgm2', 'snow_drift:idx', 'snow_melt_10min:mm', 'wind_speed_w_1000hPa:ms'])

  df_hourly = df.resample('H').mean()
  df_hourly = df.resample('H').mean()


In [30]:
cols_to_impute = ['ceiling_height_agl:m', 'cloud_base_agl:m']

imputer = IterativeImputer(max_iter=10, random_state=42)
X_test_estimated_c[cols_to_impute] = imputer.fit_transform(X_test_estimated_c[cols_to_impute])
df[cols_to_impute] = imputer.fit_transform(df[cols_to_impute])
df = df.dropna(subset=['pv_measurement'])

In [31]:
segments = find_long_constant_periods(train_c['pv_measurement'], threshold=5)
df = remove_constant_periods(df, segments)
df = is_estimated(df)
df = lag_features_by_one_hour(df, ['diffuse_rad_1h:J', 'direct_rad_1h:J', 'clear_sky_energy_1h:J'])

X_test_estimated_c = is_estimated(X_test_estimated_c, 'date_forecast')
X_test_estimated_c = lag_features_by_one_hour(X_test_estimated_c, ['diffuse_rad_1h:J', 'direct_rad_1h:J', 'clear_sky_energy_1h:J'], 'date_forecast')

In [32]:
train_end_date = '2022-10-21'
df['time'] = pd.to_datetime(df['time'])

train_df = df[df['time'] < train_end_date]
remaining_data = df[df['time'] > train_end_date]

train_data, validation_df = train_test_split(remaining_data, test_size=0.5, random_state=42)
train_df = pd.concat([train_df, train_data], ignore_index=True)

# Identifying the features and the target variable
X_train = train_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_train = train_df['pv_measurement']
X_val = validation_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_val = validation_df['pv_measurement']

In [33]:
# Combine training and validation data into a single dataset for AutoGluon
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# Specify the name of the target variable
label = 'pv_measurement'

# Create a TabularPredictor object
predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error").fit(train_data=train_data, tuning_data=val_data, presets='best_quality', num_gpus=1, num_stack_levels=0, use_bag_holdout=True)

No path specified. Models will be saved in: "AutogluonModels\ag-20231112_154436\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231112_154436\"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   896.96 GB / 2047.46 GB (43.8%)
Train Data Rows:    24600
Train Data Columns: 35
Tuning Data Rows:    1465
Tuning Data Columns: 35
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 79.84979, 168.49708)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may spe

In [34]:
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L2 -11.122399       8.530623   890.767996                0.001000           0.185380            2       True         12
1        LightGBMXT_BAG_L1 -11.261465       8.384051   418.490465                8.384051         418.490465            1       True          3
2     LightGBMLarge_BAG_L1 -12.509137      13.955961  1613.097727               13.955961        1613.097727            1       True         11
3          LightGBM_BAG_L1 -12.611008      10.929166   389.093676               10.929166         389.093676            1       True          4
4    NeuralNetTorch_BAG_L1 -13.024384       0.145572   472.092152                0.145572         472.092152            1       True         10
5           XGBoost_BAG_L1 -13.368729       1.441644   340.109518         



In [35]:
y_pred = predictor.predict(X_test_estimated_c)
y_pred = y_pred.clip(lower=0)
y_pred = y_pred.reset_index(drop=True)
y_pred.index.name = 'id'

In [36]:
df = pd.DataFrame(y_pred)
df.to_csv('result_c.csv')

In [None]:
df_a = pd.read_csv('result_a.csv')
df_b = pd.read_csv('result_b.csv')
df_c = pd.read_csv('result_c.csv')

def combine_dataframes(df1, df2, df3):
    # Concatenate the dataframes in the specified order
    combined = pd.concat([df1, df2, df3], ignore_index=True)

    # Rename the 'index' column to 'id'
    combined.rename(columns={'index': 'id'}, inplace=True)

    # Ensure the 'id' values range from 0 to 'x'
    combined['id'] = range(len(combined))

    return combined


df = combine_dataframes(df_a, df_b, df_c)
df.to_csv('result.csv', index=False)