#### Simple notebook to run model tests only

In [18]:
import pandas as pd
import numpy as np

In [19]:
#Load inn datasets
X_test  = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_test.parquet')
X_train = pd.read_parquet('data/prepared_datasets/only_y_cleaned/X_train.parquet')
y_train = pd.read_parquet('data/prepared_datasets/only_y_cleaned/Y_train.parquet')

## Catboost

In [22]:
from catboost import Pool, CatBoostRegressor

#Create a pool of data
train_pool = Pool(X_train, y_train, cat_features=["location"])
test_pool = Pool(X_test, cat_features=["location"]) 

#init model and fit it
catboost_model = CatBoostRegressor()
catboost_model.fit(train_pool)

Learning rate set to 0.083785
0:	learn: 716.4527423	total: 125ms	remaining: 2m 5s
1:	learn: 670.9118888	total: 253ms	remaining: 2m 6s
2:	learn: 628.4369142	total: 398ms	remaining: 2m 12s
3:	learn: 589.9885156	total: 541ms	remaining: 2m 14s
4:	learn: 555.6329504	total: 664ms	remaining: 2m 12s
5:	learn: 524.5986269	total: 803ms	remaining: 2m 12s
6:	learn: 496.9887725	total: 943ms	remaining: 2m 13s
7:	learn: 472.6342534	total: 1.08s	remaining: 2m 14s
8:	learn: 450.6170261	total: 1.28s	remaining: 2m 21s
9:	learn: 430.9377343	total: 1.39s	remaining: 2m 18s
10:	learn: 413.6633435	total: 1.52s	remaining: 2m 16s
11:	learn: 398.3036969	total: 1.62s	remaining: 2m 13s
12:	learn: 384.9110695	total: 1.72s	remaining: 2m 10s
13:	learn: 373.0316318	total: 1.82s	remaining: 2m 8s
14:	learn: 362.6021809	total: 1.92s	remaining: 2m 6s
15:	learn: 353.3445370	total: 2.02s	remaining: 2m 4s
16:	learn: 345.0748651	total: 2.13s	remaining: 2m 3s
17:	learn: 337.9010459	total: 2.23s	remaining: 2m 1s
18:	learn: 331.

<catboost.core.CatBoostRegressor at 0x20afd5afc10>

In [23]:
#make predictions
predictions = pd.DataFrame(catboost_model.predict(test_pool))
print(predictions)

              0
0    -13.779534
1     -9.736663
2    -11.785107
3    -11.440284
4    -16.473028
...         ...
8635   5.034486
8636   4.565914
8637   5.413928
8638   7.708985
8639   9.375538

[8640 rows x 1 columns]


In [25]:
#Local testing
from catboost import cv
cv_results = cv(train_pool, catboost_model.get_params(), fold_count=2, type="TimeSeries")

Training on fold [0/2]
0:	learn: 1312.8228543	test: 387.5876260	best: 387.5876260 (0)	total: 14.2ms	remaining: 14.2s
1:	learn: 1280.4187574	test: 368.4074392	best: 368.4074392 (1)	total: 26.3ms	remaining: 13.1s
2:	learn: 1248.8584366	test: 352.0701202	best: 352.0701202 (2)	total: 40.4ms	remaining: 13.4s
3:	learn: 1218.5941554	test: 338.8313140	best: 338.8313140 (3)	total: 52.1ms	remaining: 13s
4:	learn: 1189.6026450	test: 328.0818354	best: 328.0818354 (4)	total: 66.6ms	remaining: 13.3s
5:	learn: 1162.3631465	test: 320.2438678	best: 320.2438678 (5)	total: 83.1ms	remaining: 13.8s
6:	learn: 1135.8704857	test: 316.0048521	best: 316.0048521 (6)	total: 97.9ms	remaining: 13.9s
7:	learn: 1110.0830953	test: 313.1415179	best: 313.1415179 (7)	total: 115ms	remaining: 14.2s
8:	learn: 1084.8561608	test: 312.7655298	best: 312.7655298 (8)	total: 135ms	remaining: 14.8s
9:	learn: 1060.0198322	test: 314.9758612	best: 312.7655298 (8)	total: 153ms	remaining: 15.2s
10:	learn: 1036.2887450	test: 318.8695148	

## Preparing submission

In [26]:
def prepare_submission(predictions, X_test):
    index_df = X_test.index.to_frame()
    out_pd = pd.concat([index_df.reset_index(drop=True), predictions.reset_index(drop=True)], axis=1)
    out_pd = out_pd.rename(columns={0: 'prediction', 'date_forecast': 'time'})
    print(X_test.info())
    print(out_pd.info())
    out_pd['location'] = X_test['location'].reset_index(drop=True)
    out_pd.set_index('time', inplace=True)
    return out_pd

def merge_with_sample(out_pd):
    test = pd.read_csv('data/test.csv')
    test.time = pd.to_datetime(test.time)
    sample_submission = pd.read_csv('data/sample_submission.csv')
    test.set_index('time', inplace=True)
    
    merged_df = test.reset_index().merge(out_pd.reset_index(), on=['time', 'location'], how='left', suffixes=('_original', '_new'))
    merged_df['prediction_new'] = merged_df['prediction_new'].combine_first(merged_df['prediction_original'])
    merged_df.drop('prediction_original', axis=1, inplace=True)
    merged_df.rename(columns={'prediction_new': 'prediction'}, inplace=True)
    return sample_submission[['id']].merge(merged_df[['id', 'prediction']], on='id', how='left')


out_pd = prepare_submission(predictions, X_test)
sample_submission = merge_with_sample(out_pd)
sample_submission.to_csv('submission.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8640 entries, 2023-05-01 00:00:00 to 2023-07-03 23:45:00
Data columns (total 42 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   absolute_humidity_2m:gm3        8640 non-null   float32
 1   air_density_2m:kgm3             8640 non-null   float32
 2   ceiling_height_agl:m            6243 non-null   float32
 3   clear_sky_energy_1h:J           8640 non-null   float32
 4   clear_sky_rad:W                 8640 non-null   float32
 5   cloud_base_agl:m                7690 non-null   float32
 6   dew_point_2m:K                  8640 non-null   float32
 7   diffuse_rad:W                   8640 non-null   float32
 8   diffuse_rad_1h:J                8640 non-null   float32
 9   direct_rad:W                    8640 non-null   float32
 10  direct_rad_1h:J                 8640 non-null   float32
 11  effective_cloud_cover:p         8640 non-null   float32
 12