# Team description

Team: Ehhhhhhh

Markus Kinn, 106660
Mario Haroun, 543915
Torstein Korten, 543955

In [None]:
%matplotlib inline
from utils import *
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_absolute_error
import numpy as np
import itertools

In [None]:
def remove_unwanted_rows(df):
    unwanted_rows = (df['direct_rad:W'] == 0) & (df['diffuse_rad:W'] == 0) & (df['pv_measurement'] > 200) & (df['sun_elevation:d'] < 0) & (df['is_day:idx'] == 0)
    cleaned_df = df[~unwanted_rows]
    return cleaned_df

def find_long_constant_periods(data, threshold):
    start = None
    segments = []
    for i in range(1, len(data)):
        if data[i] == data[i-1] and data[i] != 0:
            if start is None:
                start = i-1
        else:
            if start is not None:
                if (i - start) > threshold:
                    segments.append((start, i))
                start = None
    return segments

def remove_constant_periods(df, segments):
    drop_indices = []
    for start, end in segments:
        drop_indices.extend(range(start, end))
    return df.drop(drop_indices)

def lag_features_by_one_hour(df, column_names, time_col='time'):

    # Check if the DataFrame has a time-based index
    df['index'] = df[time_col]
    df = df.set_index('index')

    # Loop through each column name to create a lagged feature
    for col in column_names:
        lagged_col_name = f"{col}"
        df[lagged_col_name] = df[col].shift(freq='-1H')

    return df

def is_estimated(df, time_col='time'):
    split_date = '2022-10-27'
    df['is_estimated'] = 0  # Initialize with 0 (indicating observed)
    df.loc[df[time_col] >= pd.Timestamp(split_date), 'is_estimated'] = 1  # Set 1 for estimated data
    return df

def resample_to_hourly(df, datetime_column='date_forecast'):
    df[datetime_column] = pd.to_datetime(df[datetime_column])
    df.sort_values(by=datetime_column, inplace=True)

    df.set_index(datetime_column, inplace=True)

    df_hourly = df.resample('H').mean()

    df_hourly.dropna(how='all', inplace=True)

    df_hourly.reset_index(inplace=True)

    return df_hourly

def generate_simple_features(data):
    data['wind_magnitude'] = np.sqrt(data['wind_speed_u_10m:ms']**2 + data['wind_speed_v_10m:ms']**2)
    data['wind_direction'] = np.arctan2(data['wind_speed_v_10m:ms'], data['wind_speed_u_10m:ms'])
    data['solar_angle_impact'] = np.sin(np.radians(data['sun_elevation:d']))

    data = data.drop(columns=['wind_speed_10m:ms', 'wind_speed_u_10m:ms', 'wind_speed_10m:ms',
                              'wind_speed_w_1000hPa:ms', 'wind_speed_v_10m:ms'])

    return data

# Location A

In [2]:
train_a = pd.read_parquet('./data/A/train_targets.parquet')

X_train_estimated_a = pd.read_parquet('./data/A/X_train_estimated.parquet')
X_train_observed_a = pd.read_parquet('./data/A/X_train_observed.parquet')
X_test_estimated_a = pd.read_parquet('./data/A/X_test_estimated.parquet')

df = pd.concat([X_train_observed_a, X_train_estimated_a])

df = resample_to_hourly(df)
X_test_estimated_a = resample_to_hourly(X_test_estimated_a)

df = pd.merge(df, train_a, left_on='date_forecast', right_on='time', how='inner')
df = df.drop(columns=['snow_density:kgm3', 'snow_drift:idx', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'snow_melt_10min:mm', 'elevation:m', 'cloud_base_agl:m'])
X_test_estimated_a = X_test_estimated_a.drop(columns=['snow_density:kgm3', 'snow_drift:idx', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'snow_melt_10min:mm', 'elevation:m', 'cloud_base_agl:m'])

  df_hourly = df.resample('H').mean()
  df_hourly = df.resample('H').mean()


In [3]:
datetime_features = df[['time', 'date_forecast']]
df = df.drop(['time', 'date_forecast'], axis=1)

imputer = IterativeImputer(random_state=42)
df_imputed = imputer.fit_transform(df)

df = pd.DataFrame(df_imputed, columns=df.columns)
df = pd.concat([df, datetime_features.reset_index(drop=True)], axis=1)

In [4]:
df = is_estimated(df)
df = generate_simple_features(df)

X_test_estimated_a = is_estimated(X_test_estimated_a, 'date_forecast')
X_test_estimated_a = generate_simple_features(X_test_estimated_a)

In [5]:
train_end_date = '2022-10-21'
df['time'] = pd.to_datetime(df['time'])

train_df = df[df['time'] < train_end_date]
remaining_data = df[df['time'] > train_end_date]

train_data, validation_df = train_test_split(remaining_data, test_size=0.5, random_state=50)
train_df = pd.concat([train_df, train_data], ignore_index=True)

# Identifying the features and the target variable
X_train = train_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_train = train_df['pv_measurement']
X_val = validation_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_val = validation_df['pv_measurement']

In [6]:
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

label = 'pv_measurement'

predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error").fit(train_data=train_data, tuning_data=val_data, presets='best_quality', num_gpus=1, num_stack_levels=0, use_bag_holdout=True)

No path specified. Models will be saved in: "AutogluonModels\ag-20231112_122520\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231112_122520\"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   906.18 GB / 2047.46 GB (44.3%)
Train Data Rows:    31863
Train Data Columns: 37
Tuning Data Rows:    2197
Tuning Data Columns: 37
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 650.65228, 1178.80649)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may spec

In [7]:
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                     model   score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L2  -85.098833      23.319916  2711.967278                0.000000           0.210576            2       True         12
1    NeuralNetTorch_BAG_L1  -88.964440       0.187825   575.829899                0.187825         575.829899            1       True         10
2     LightGBMLarge_BAG_L1  -89.147817      13.550098  1724.725685               13.550098        1724.725685            1       True         11
3        LightGBMXT_BAG_L1  -89.323748       9.581992   411.201118                9.581992         411.201118            1       True          3
4          LightGBM_BAG_L1  -92.042318      10.390874   367.736543               10.390874         367.736543            1       True          4
5           XGBoost_BAG_L1  -96.298775       0.467416    82.341583  



In [8]:
y_pred = predictor.predict(X_test_estimated_a)
y_pred = y_pred.clip(lower=0)
y_pred = y_pred.reset_index(drop=True)
y_pred.index.name = 'id'

In [None]:
df = pd.DataFrame(y_pred)
df.to_csv('result_a.csv')

# Location B

In [2]:
train_b = pd.read_parquet('./data/B/train_targets.parquet')

X_train_estimated_b = pd.read_parquet('./data/B/X_train_estimated.parquet')
X_train_observed_b = pd.read_parquet('./data/B/X_train_observed.parquet')
X_test_estimated_b = pd.read_parquet('./data/B/X_test_estimated.parquet')

df = pd.concat([X_train_observed_b, X_train_estimated_b])

df = resample_to_hourly(df)
X_test_estimated_b = resample_to_hourly(X_test_estimated_b)

df = pd.merge(df, train_b, left_on='date_forecast', right_on='time', how='inner')
df = df.drop(columns=['snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'snow_melt_10min:mm', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_12h:cm', 'precip_5min:mm', 'rain_water:kgm2', 'snow_drift:idx', 'snow_melt_10min:mm', 'wind_speed_w_1000hPa:ms'])
X_test_estimated_b = X_test_estimated_b.drop(columns=['snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'snow_melt_10min:mm', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_12h:cm', 'precip_5min:mm', 'rain_water:kgm2', 'snow_drift:idx', 'snow_melt_10min:mm', 'wind_speed_w_1000hPa:ms'])

  df_hourly = df.resample('H').mean()
  df_hourly = df.resample('H').mean()


In [3]:
df = df.dropna(subset=['pv_measurement'])

datetime_features = df[['time', 'date_forecast']]
df = df.drop(['time', 'date_forecast'], axis=1)

imputer = IterativeImputer(random_state=123)
df_imputed = imputer.fit_transform(df)

df = pd.DataFrame(df_imputed, columns=df.columns)
df = pd.concat([df, datetime_features.reset_index(drop=True)], axis=1)

In [4]:
segments = find_long_constant_periods(train_b['pv_measurement'], threshold=5)
df = remove_constant_periods(df, segments)
df = remove_unwanted_rows(df)
df = is_estimated(df)
df = lag_features_by_one_hour(df, ['diffuse_rad_1h:J', 'direct_rad_1h:J', 'clear_sky_energy_1h:J'])

X_test_estimated_b = is_estimated(X_test_estimated_b, 'date_forecast')
X_test_estimated_b = lag_features_by_one_hour(X_test_estimated_b, ['diffuse_rad_1h:J', 'direct_rad_1h:J', 'clear_sky_energy_1h:J'], 'date_forecast')

In [5]:
train_end_date = '2022-10-21'
df['time'] = pd.to_datetime(df['time'])

train_df = df[df['time'] < train_end_date]
remaining_data = df[df['time'] > train_end_date]

train_data, validation_df = train_test_split(remaining_data, test_size=0.5, random_state=42)
train_df = pd.concat([train_df, train_data], ignore_index=True)

# Identifying the features and the target variable
X_train = train_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_train = train_df['pv_measurement']
X_val = validation_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_val = validation_df['pv_measurement']

In [6]:
# Combine training and validation data into a single dataset for AutoGluon
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# Specify the name of the target variable
label = 'pv_measurement'

# Create a TabularPredictor object
predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error").fit(train_data=train_data, tuning_data=val_data, presets='medium_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20231112_095423\"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231112_095423\"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   897.90 GB / 2047.46 GB (43.9%)
Train Data Rows:    27807
Train Data Columns: 35
Tuning Data Rows:    1801
Tuning Data Columns: 35
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, -0.0, 96.61183, 205.14064)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generator

[1000]	valid_set's l1: 12.8494
[2000]	valid_set's l1: 12.6426
[3000]	valid_set's l1: 12.5573


	-12.5173	 = Validation score   (-mean_absolute_error)
	5.3s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 12.9818
[2000]	valid_set's l1: 12.8635
[3000]	valid_set's l1: 12.8417


	-12.828	 = Validation score   (-mean_absolute_error)
	4.58s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-14.2066	 = Validation score   (-mean_absolute_error)
	14.9s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost ...
	-13.2545	 = Validation score   (-mean_absolute_error)
	9.63s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-13.6995	 = Validation score   (-mean_absolute_error)
	1.81s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-12.9604	 = Validation score   (-mean_absolute_error)
	19.81s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	-13.658	 = Validation score   (-mean_absolute_error)
	0.8s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-10.8596	 = Validation score   (-mean_absolute_error)
	94.66s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: Light

[1000]	valid_set's l1: 12.2347


	-12.2166	 = Validation score   (-mean_absolute_error)
	6.65s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-10.5244	 = Validation score   (-mean_absolute_error)
	0.17s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 164.38s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20231112_095423\")


In [7]:
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2 -10.524436       0.169514  132.983032                0.000000           0.174001            2       True         12
1        NeuralNetTorch -10.859605       0.018000   94.657072                0.018000          94.657072            1       True         10
2         LightGBMLarge -12.216640       0.017002    6.645430                0.017002           6.645430            1       True         11
3            LightGBMXT -12.517338       0.025510    5.300957                0.025510           5.300957            1       True          3
4              LightGBM -12.828019       0.023003    4.582390                0.023003           4.582390            1       True          4
5       NeuralNetFastAI -12.960386       0.023999   19.808258                0.023999          19.



In [8]:
y_pred = predictor.predict(X_test_estimated_b)
y_pred = y_pred.clip(lower=0)
y_pred = y_pred.reset_index(drop=True)
y_pred.index.name = 'id'

In [None]:
df = pd.DataFrame(y_pred)
df.to_csv('result_b.csv')

# Location C

In [11]:
train_c = pd.read_parquet('./data/C/train_targets.parquet')

X_train_estimated_c = pd.read_parquet('./data/C/X_train_estimated.parquet')
X_train_observed_c = pd.read_parquet('./data/C/X_train_observed.parquet')
X_test_estimated_c = pd.read_parquet('./data/C/X_test_estimated.parquet')

df = pd.concat([X_train_observed_c, X_train_estimated_c])

df = resample_to_hourly(df)
X_test_estimated_c = resample_to_hourly(X_test_estimated_c)

df = pd.merge(df, train_c, left_on='date_forecast', right_on='time', how='inner')
df = df.drop(columns=['snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'rain_water:kgm2', 'snow_drift:idx', 'cloud_base_agl:m'])

X_test_estimated_c = X_test_estimated_c.drop(columns=['snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'rain_water:kgm2', 'snow_drift:idx', 'cloud_base_agl:m'])

  df_hourly = df.resample('H').mean()
  df_hourly = df.resample('H').mean()


In [12]:
df = df.dropna(subset=['pv_measurement'])

datetime_features = df[['time', 'date_forecast']]
df = df.drop(['time', 'date_forecast'], axis=1)

imputer = IterativeImputer(random_state=123)
df_imputed = imputer.fit_transform(df)

df = pd.DataFrame(df_imputed, columns=df.columns)
df = pd.concat([df, datetime_features.reset_index(drop=True)], axis=1)

In [13]:
segments = find_long_constant_periods(train_c['pv_measurement'], threshold=5)
df = remove_constant_periods(df, segments)
df = is_estimated(df)
df = generate_simple_features(df)

X_test_estimated_c = is_estimated(X_test_estimated_c, 'date_forecast')
X_test_estimated_c = generate_simple_features(X_test_estimated_c)

In [14]:
train_end_date = '2022-10-21'
df['time'] = pd.to_datetime(df['time'])

train_df = df[df['time'] < train_end_date]
remaining_data = df[df['time'] > train_end_date]

train_data, validation_df = train_test_split(remaining_data, test_size=0.5, random_state=100)
train_df = pd.concat([train_df, train_data], ignore_index=True)

X_train = train_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_train = train_df['pv_measurement']
X_val = validation_df.drop(columns=['pv_measurement', 'time', 'date_forecast'])
y_val = validation_df['pv_measurement']

In [15]:
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

label = 'pv_measurement'

predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error").fit(train_data=train_data, tuning_data=val_data, presets='medium_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20231111_094818\"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231111_094818\"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   904.58 GB / 2047.46 GB (44.2%)
Train Data Rows:    24600
Train Data Columns: 40
Tuning Data Rows:    1465
Tuning Data Columns: 40
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 80.09705, 168.78947)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Genera

[1000]	valid_set's l1: 11.2419
[2000]	valid_set's l1: 10.495
[3000]	valid_set's l1: 10.4243
[4000]	valid_set's l1: 10.2802
[5000]	valid_set's l1: 10.2093
[6000]	valid_set's l1: 10.1382
[7000]	valid_set's l1: 10.08
[8000]	valid_set's l1: 10.0475
[9000]	valid_set's l1: 10.0345
[10000]	valid_set's l1: 10.017


	-10.011	 = Validation score   (-mean_absolute_error)
	11.54s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 11.8042
[2000]	valid_set's l1: 11.281


	-11.2531	 = Validation score   (-mean_absolute_error)
	3.49s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-14.3738	 = Validation score   (-mean_absolute_error)
	9.02s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	-11.4457	 = Validation score   (-mean_absolute_error)
	175.71s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-13.9507	 = Validation score   (-mean_absolute_error)
	1.52s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-12.985	 = Validation score   (-mean_absolute_error)
	35.95s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoost ...
	-12.6692	 = Validation score   (-mean_absolute_error)
	85.95s	 = Training   runtime
	0.19s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-10.9963	 = Validation score   (-mean_absolute_error)
	194.29s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting mode

[1000]	valid_set's l1: 11.7745
[2000]	valid_set's l1: 11.5543
[3000]	valid_set's l1: 11.5441
[4000]	valid_set's l1: 11.5372
[5000]	valid_set's l1: 11.5354
[6000]	valid_set's l1: 11.5346
[7000]	valid_set's l1: 11.5346


	-11.5344	 = Validation score   (-mean_absolute_error)
	28.26s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-9.3701	 = Validation score   (-mean_absolute_error)
	0.17s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 549.54s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20231111_094818\")


In [16]:
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -9.370141       0.115195  385.202267                0.000000           0.172373            2       True         12
1            LightGBMXT -10.011036       0.070190   11.542578                0.070190          11.542578            1       True          3
2        NeuralNetTorch -10.996333       0.018003  194.288291                0.018003         194.288291            1       True         10
3              LightGBM -11.253090       0.016001    3.491979                0.016001           3.491979            1       True          4
4              CatBoost -11.445651       0.011000  175.707046                0.011000         175.707046            1       True          6
5         LightGBMLarge -11.534445       0.067533   28.258561                0.067533          28.



In [17]:
y_pred = predictor.predict(X_test_estimated_c)
y_pred = y_pred.clip(lower=0)
y_pred = y_pred.reset_index(drop=True)
y_pred.index.name = 'id'

In [None]:
df = pd.DataFrame(y_pred)
df.to_csv('result_c.csv')

In [None]:
df_a = pd.read_csv('result_a.csv')
df_b = pd.read_csv('result_b.csv')
df_c = pd.read_csv('result_c.csv')

def combine_dataframes(df1, df2, df3):
    # Concatenate the dataframes in the specified order
    combined = pd.concat([df1, df2, df3], ignore_index=True)

    # Rename the 'index' column to 'id'
    combined.rename(columns={'index': 'id'}, inplace=True)

    # Ensure the 'id' values range from 0 to 'x'
    combined['id'] = range(len(combined))

    return combined


df = combine_dataframes(df_a, df_b, df_c)
df.to_csv('result.csv', index=False)