In [29]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor

X_train_estimated_a = pd.read_parquet('data/A/X_train_estimated.parquet')
X_train_observed_a = pd.read_parquet('data/A/X_train_observed.parquet')
Y_a = pd.read_parquet('data/A/train_targets.parquet')
X_test_a = pd.read_parquet('data/A/X_test_estimated.parquet')

print(X_train_estimated_a.columns)

Index(['date_calc', 'date_forecast', 'absolute_humidity_2m:gm3',
       'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J',
       'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx',
       'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W',
       'direct_rad_1h:J', 'effective_cloud_cover:p', 'elevation:m',
       'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm',
       'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'is_day:idx',
       'is_in_shadow:idx', 'msl_pressure:hPa', 'precip_5min:mm',
       'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa',
       'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p',
       'sfc_pressure:hPa', 'snow_density:kgm3', 'snow_depth:cm',
       'snow_drift:idx', 'snow_melt_10min:mm', 'snow_water:kgm2',
       'sun_azimuth:d', 'sun_elevation:d', 'super_cooled_liquid_water:kgm2',
       't_1000hPa:K', 'total_cloud_cover:p', 'visibility:m',
       'wind_speed_10m:ms', 'wind_spe

#### Making combined dataframes

In [30]:
X_train_observed_a["date_calc"] = X_train_observed_a["date_forecast"]
X_train_all_a = pd.concat([X_train_observed_a, X_train_estimated_a], ignore_index=True)

In [31]:
#Remove all non hourly values
X_train_all_a.set_index('date_forecast', inplace=True)
mask = X_train_all_a.index.minute == 0
X_train_all_a = X_train_all_a[mask]

In [32]:
#Set the index as the date forecast for Y aswell
Y_a.set_index('time', inplace=True)

In [33]:
Y_a.head()

Unnamed: 0_level_0,pv_measurement
time,Unnamed: 1_level_1
2019-06-02 22:00:00,0.0
2019-06-02 23:00:00,0.0
2019-06-03 00:00:00,0.0
2019-06-03 01:00:00,0.0
2019-06-03 02:00:00,19.36


In [34]:
data = pd.concat([X_train_all_a, Y_a], axis=1)
#data = data.drop('time', axis=1)
print(data.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 34086 entries, 2019-06-02 22:00:00 to 2023-04-30 23:00:00
Data columns (total 47 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   absolute_humidity_2m:gm3        34062 non-null  float32       
 1   air_density_2m:kgm3             34062 non-null  float32       
 2   ceiling_height_agl:m            26284 non-null  float32       
 3   clear_sky_energy_1h:J           34062 non-null  float32       
 4   clear_sky_rad:W                 34062 non-null  float32       
 5   cloud_base_agl:m                30998 non-null  float32       
 6   dew_or_rime:idx                 34062 non-null  float32       
 7   dew_point_2m:K                  34062 non-null  float32       
 8   diffuse_rad:W                   34062 non-null  float32       
 9   diffuse_rad_1h:J                34062 non-null  float32       
 10  direct_rad:W                    340

#### Removing missing values

In [35]:
data = data.dropna(axis=0)

In [36]:
data.isnull().sum()

absolute_humidity_2m:gm3          0
air_density_2m:kgm3               0
ceiling_height_agl:m              0
clear_sky_energy_1h:J             0
clear_sky_rad:W                   0
cloud_base_agl:m                  0
dew_or_rime:idx                   0
dew_point_2m:K                    0
diffuse_rad:W                     0
diffuse_rad_1h:J                  0
direct_rad:W                      0
direct_rad_1h:J                   0
effective_cloud_cover:p           0
elevation:m                       0
fresh_snow_12h:cm                 0
fresh_snow_1h:cm                  0
fresh_snow_24h:cm                 0
fresh_snow_3h:cm                  0
fresh_snow_6h:cm                  0
is_day:idx                        0
is_in_shadow:idx                  0
msl_pressure:hPa                  0
precip_5min:mm                    0
precip_type_5min:idx              0
pressure_100m:hPa                 0
pressure_50m:hPa                  0
prob_rime:p                       0
rain_water:kgm2             

In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 828 entries, 2019-12-01 10:00:00 to 2023-03-17 23:00:00
Data columns (total 47 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   absolute_humidity_2m:gm3        828 non-null    float32       
 1   air_density_2m:kgm3             828 non-null    float32       
 2   ceiling_height_agl:m            828 non-null    float32       
 3   clear_sky_energy_1h:J           828 non-null    float32       
 4   clear_sky_rad:W                 828 non-null    float32       
 5   cloud_base_agl:m                828 non-null    float32       
 6   dew_or_rime:idx                 828 non-null    float32       
 7   dew_point_2m:K                  828 non-null    float32       
 8   diffuse_rad:W                   828 non-null    float32       
 9   diffuse_rad_1h:J                828 non-null    float32       
 10  direct_rad:W                    828 n

#### Making the Model 

In [38]:
X = data.drop(["pv_measurement",'date_calc'], axis=1)
y = data.pv_measurement

model = DecisionTreeRegressor(random_state=1)
model.fit(X, y)
predictions = model.predict(X_test_a.drop(['date_forecast', 'date_calc'], axis=1))
print(predictions)

[0.   0.   0.   ... 0.   8.58 8.58]


In [23]:
X_test_a.head(20)
X_test_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 47 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   date_calc                       2880 non-null   datetime64[ns]
 1   date_forecast                   2880 non-null   datetime64[ns]
 2   absolute_humidity_2m:gm3        2880 non-null   float32       
 3   air_density_2m:kgm3             2880 non-null   float32       
 4   ceiling_height_agl:m            2087 non-null   float32       
 5   clear_sky_energy_1h:J           2880 non-null   float32       
 6   clear_sky_rad:W                 2880 non-null   float32       
 7   cloud_base_agl:m                2582 non-null   float32       
 8   dew_or_rime:idx                 2880 non-null   float32       
 9   dew_point_2m:K                  2880 non-null   float32       
 10  diffuse_rad:W                   2880 non-null   float32       
 11  diff

In [24]:
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')
test['prediction'] = np.random.rand(len(test))
test.prediction[0..predictions.size] = predictions
sample_submission = sample_submission[['id']].merge(test[['id', 'prediction']], on='id', how='left')
sample_submission.to_csv('my_first_submission.csv', index=False)

AttributeError: 'float' object has no attribute 'predictions'