In [2]:
import sys
import os

# The project root is taken to be the parent of the notebook's working directory.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))

# Register the project root for imports exactly once, so re-running this cell
# does not pile up duplicate sys.path entries.
root_already_registered = project_root in sys.path
if not root_already_registered:
    sys.path.append(project_root)

# Now you should be able to import your modules
# from src.data_utils import load_config, load_and_prepare_data, split_data_chronologically
# from src.preprocess_utils import scale_data, save_scaler, load_scaler, inverse_transform_predictions
# from src.feature_utils import engineer_features

In [3]:
import pandas as pd
import numpy as np
#import robustscaler
from sklearn.preprocessing import RobustScaler
from src.data_utils import load_config, load_and_prepare_data, split_data_chronologically
from src.preprocess_utils import scale_data, save_scaler, load_scaler, inverse_transform_predictions
from src.feature_utils import engineer_features





In [4]:
# Input / output locations used by this cell.
data_path = "../data/full.csv"
data_path_new = "../data/full_newtime.csv"
cfg = config_file_for_pipeline = "../config/xgb_global/config_XGBoostGlobal_SPEI.yaml"

# Load the raw table.
full_df = pd.read_csv(data_path)

# Timestamps arrive as day/month/year text (e.g. 16/1/1901); parse them and
# store them back as ISO 'YYYY-MM-DD' strings.
parsed_time = pd.to_datetime(full_df['time'], dayfirst=True, format='%d/%m/%Y')
full_df["time"] = parsed_time.dt.strftime('%Y-%m-%d')

# Order rows by time, then latitude, then longitude (original index labels are kept),
# and write the re-timestamped table next to the raw file.
full_df = full_df.sort_values(by=['time', "lat", "lon"])
full_df.to_csv(data_path_new, index=False)

In [13]:
# Inclusive upper bounds of the training and validation periods.
train_end_date = "2017-12-31"
validation_end_date = "2020-12-31"

# 'time' holds ISO 'YYYY-MM-DD' strings at this point, so plain string
# comparison orders rows chronologically.
time_values = full_df['time']
after_train_period = time_values > train_end_date
within_validation_period = time_values <= validation_end_date

train_df = full_df[time_values <= train_end_date]
val_df = full_df[after_train_period & within_validation_period]
test_df = full_df[time_values > validation_end_date]

# Show the train and test splits (the validation split is not printed).
print("Train Data:")
print(train_df)
print("\nTest Data:")
print(test_df)

Train Data:
           lon    lat        time        tmp   dtr        cld        tmx  \
0       101.25   6.25  1901-01-16  25.300001   9.3  62.500000  30.000000   
1       101.75   6.25  1901-01-16  25.800001   8.0  65.100000  29.800001   
2        99.75   6.75  1901-01-16  27.800001   9.8  55.000000  32.700000   
3       100.25   6.75  1901-01-16  26.900000   9.8  58.800000  31.800001   
4       100.75   6.75  1901-01-16  26.100000   9.0  62.500000  30.600000   
...        ...    ...         ...        ...   ...        ...        ...   
251311   99.25  19.75  2017-12-16  19.300001  12.3  42.100002  25.500000   
251312   99.75  19.75  2017-12-16  19.900000  12.1  42.500000  26.000000   
251313  100.25  19.75  2017-12-16  19.800001  12.3  43.200000  26.000000   
251314   99.75  20.25  2017-12-16  19.500000  12.3  43.000000  25.700000   
251315  100.25  20.25  2017-12-16  20.200000  12.0  43.100002  26.200000   

              tmn         pre    wet        vap      spei    soi    dmi  \


In [14]:
# Feature columns and the names of the three target columns.
features_to_grid = [
    'tmp', 'dtr', 'cld', 'tmx',
    'tmn', 'wet', 'vap', 'soi',
    'dmi', 'pdo', 'nino4',
    'nino34', 'nino3'
]
target_pet, target_pre, target_spei = 'pet', 'pre', 'spei'

# Independent RobustScaler instances. Only the first four are fitted in this
# cell; the "_full" scalers and scaler_full are merely created here.
scaler_feature, scaler_pet, scaler_pre, scaler_spei = (RobustScaler() for _ in range(4))
scaler_pet_full, scaler_pre_full, scaler_spei_full, scaler_full = (RobustScaler() for _ in range(4))

# Fit on the training split only.
# NOTE(review): scaler_feature is refit on features + targets in a later cell,
# which supersedes this features-only fit.
scaler_feature.fit(train_df[features_to_grid])
# Double brackets keep each target as a single-column DataFrame (2-D input).
scaler_pet.fit(train_df[[target_pet]])
scaler_pre.fit(train_df[[target_pre]])
scaler_spei.fit(train_df[[target_spei]])



In [16]:
import joblib

# Fit the "_full" target scalers on the complete dataset (all rows of full_df).
for full_data_scaler, target_column in (
    (scaler_pet_full, target_pet),
    (scaler_pre_full, target_pre),
    (scaler_spei_full, target_spei),
):
    full_data_scaler.fit(full_df[[target_column]])

# Persist every per-target scaler into the current working directory.
# Insertion order of the mapping fixes the order in which files are written.
scaler_files = {
    "robust_scaler_pet_full.joblib": scaler_pet_full,
    "robust_scaler_pre_full.joblib": scaler_pre_full,
    "robust_scaler_spei_full.joblib": scaler_spei_full,
    "robust_scaler_pet.joblib": scaler_pet,
    "robust_scaler_pre.joblib": scaler_pre,
    "robust_scaler_spei.joblib": scaler_spei,
}
written_files = [
    joblib.dump(scaler_obj, file_name)
    for file_name, scaler_obj in scaler_files.items()
]
# joblib.dump returns the list of filenames it wrote; echo the last result so
# the cell output is the same as with six consecutive dump calls.
written_files[-1]


['robust_scaler_spei.joblib']

In [23]:
# Columns scaled jointly: every feature followed by the three targets.
columns_to_scale = features_to_grid + [target_pet, target_pre, target_spei]

# NOTE(review): scaler_full is fitted on the whole dataset here but is not used
# by any transform below; full_df is transformed with the train-fitted
# scaler_feature instead. Confirm this is intended.
scaler_full.fit(full_df[columns_to_scale])

# Refit scaler_feature on the training split over features + targets
# (this replaces its earlier features-only fit).
scaler_feature.fit(train_df[columns_to_scale])

# Apply the train-fitted scaler to every split, and to the full table.
train_scaled_values, val_scaled_values, test_scaled_values, full_scaled_values = (
    scaler_feature.transform(frame[columns_to_scale])
    for frame in (train_df, val_df, test_df, full_df)
)

In [24]:
# Output column layout: identifying (meta) columns first, then the scaled
# features followed by the three targets.
meta_cols = ['lat', 'lon', 'time']
feature_cols = [*features_to_grid, target_pet, target_pre, target_spei]
all_cols = [*meta_cols, *feature_cols]

# Helper to reattach meta columns and reorder
def rebuild_scaled_df(scaled_values, ref_df):
    """Wrap scaled values in a DataFrame and reattach the meta columns.

    Relies on the notebook-level lists ``feature_cols``, ``meta_cols`` and
    ``all_cols``.

    Args:
        scaled_values: 2-D array-like with one column per entry of
            ``feature_cols``, in that order.
        ref_df: DataFrame the scaled values were derived from. It supplies the
            ``meta_cols`` columns; rows are matched to ``scaled_values`` by
            position, not by index label.

    Returns:
        A DataFrame whose columns are ordered as ``all_cols``.

    Raises:
        ValueError: If ``scaled_values`` and ``ref_df`` have different numbers
            of rows. Without this check the index-aligned assignment below
            would silently fill NaN or drop rows.
    """
    scaled = pd.DataFrame(scaled_values, columns=feature_cols)
    if len(scaled) != len(ref_df):
        raise ValueError(
            f"Row count mismatch: scaled_values has {len(scaled)} rows "
            f"but ref_df has {len(ref_df)} rows"
        )
    # Drop ref_df's own index so its rows line up positionally with `scaled`.
    scaled[meta_cols] = ref_df[meta_cols].reset_index(drop=True)
    return scaled[all_cols]

# Rebuild a labelled, scaled DataFrame for each split and for the full table,
# pairing every scaled array with the frame it was computed from.
train_scaled, val_scaled, test_scaled, full_scaled = (
    rebuild_scaled_df(values, source_df)
    for values, source_df in (
        (train_scaled_values, train_df),
        (val_scaled_values, val_df),
        (test_scaled_values, test_df),
        (full_scaled_values, full_df),
    )
)

In [28]:
# Preview the first scaled training rows: meta columns (lat, lon, time) come
# first, followed by the scaled features and targets.
train_scaled.head()

Unnamed: 0,lat,lon,time,tmp,dtr,cld,tmx,tmn,wet,vap,soi,dmi,pdo,nino4,nino34,nino3,pet,pre,spei
0,6.25,101.25,1901-01-16,-0.533333,-0.090909,-0.014563,-0.586207,-0.365854,-0.066216,-0.236111,-0.023076,-0.723502,0.815156,0.8875,0.989362,0.645833,0.376518,-0.129184,-0.275561
1,6.25,101.75,1901-01-16,-0.366666,-0.386364,0.048544,-0.655172,-0.097561,0.122973,-0.027778,-0.023076,-0.723502,0.815156,0.8875,0.989362,0.645833,0.125506,0.116109,-0.233257
2,6.75,99.75,1901-01-16,0.3,0.022727,-0.196602,0.344828,0.170732,-0.49054,-0.125,-0.023076,-0.723502,0.815156,0.8875,0.989362,0.645833,1.380567,-0.376046,-0.437377
3,6.75,100.25,1901-01-16,0.0,0.022727,-0.104369,0.034483,-0.04878,-0.398649,-0.152778,-0.023076,-0.723502,0.815156,0.8875,0.989362,0.645833,1.004049,-0.373954,-0.415114
4,6.75,100.75,1901-01-16,-0.266667,-0.159091,-0.014563,-0.37931,-0.146341,-0.268919,-0.111111,-0.023076,-0.723502,0.815156,0.8875,0.989362,0.645833,0.502024,-0.299163,-0.236862


In [29]:
# Preview the first unscaled training rows, for comparison with the scaled
# values shown in the previous cell.
train_df.head()

Unnamed: 0,lon,lat,time,tmp,dtr,cld,tmx,tmn,pre,wet,vap,spei,soi,dmi,pdo,nino4,nino34,nino3,pet
0,101.25,6.25,1901-01-16,25.300001,9.3,62.5,30.0,20.7,84.6,10.28,25.2,-0.384595,-0.09,-0.54,1.114457,0.59,0.82,0.46,108.5
1,101.75,6.25,1901-01-16,25.800001,8.0,65.1,29.800001,21.800001,131.5,13.08,26.7,-0.32492,-0.09,-0.54,1.114457,0.59,0.82,0.46,102.3
2,99.75,6.75,1901-01-16,27.800001,9.8,55.0,32.7,22.9,37.4,4.0,26.0,-0.612856,-0.09,-0.54,1.114457,0.59,0.82,0.46,133.3
3,100.25,6.75,1901-01-16,26.9,9.8,58.8,31.800001,22.0,37.8,5.36,25.800001,-0.581451,-0.09,-0.54,1.114457,0.59,0.82,0.46,124.0
4,100.75,6.75,1901-01-16,26.1,9.0,62.5,30.6,21.6,52.100002,7.28,26.1,-0.330006,-0.09,-0.54,1.114457,0.59,0.82,0.46,111.600003


In [30]:
# Write each scaled table to a CSV file in the current working directory,
# leaving out the DataFrame index.
for csv_name, scaled_frame in (
    ("train_scaled.csv", train_scaled),
    ("val_scaled.csv", val_scaled),
    ("test_scaled.csv", test_scaled),
    ("full_scaled.csv", full_scaled),
):
    scaled_frame.to_csv(csv_name, index=False)