You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I am trying to fit a RandomForestRegressor on the training dataset, but when I use the predict function, the predicted values are constant over time. What is the reason for this? My goal is to forecast up to 72 hours ahead.
Here is my code:
from data_preparation import Preparation
from missing_timestamps import remove_duplicates
import pandas as pd
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from sklearn.ensemble import RandomForestRegressor
from skforecast.utils import save_forecaster
from skforecast.utils import load_forecaster
# NOTE(review): `marvin` is a bare name that is not defined anywhere in the
# visible code; evaluating it would raise NameError. It looks like paste
# residue -- confirm and remove.
marvin
# Build the data helper for the AMBIENT_TEMPERATURE column of the station file.
# NOTE(review): Preparation comes from the project-local data_preparation
# module, which is not shown here -- its exact behaviour must be confirmed there.
data = Preparation(r'/home/ieftimska/operato-meteo-1/data/MAS_processed/ELES-MAS-5001.csv.gz', "AMBIENT_TEMPERATURE")
#data = Preparation(r'/home/iva/Desktop/operato-meteo-1/data/MAS_processed/ELES-MAS-5001.csv.gz', "AMBIENT_TEMPERATURE")
train, test = data.split()
# remove_duplicates drops repeated timestamps, strips timezone information and
# returns the frame indexed by timestamp.
train_processed = remove_duplicates(train)
#train_processed_ = train_processed["AMBIENT_TEMPERATURE"].copy().squeeze()
test_processed = remove_duplicates(test)
#test_processed_ = test_processed["AMBIENT_TEMPERATURE"].copy().squeeze()
# Stitch both partitions back together and give the target column and the
# index the names "y" and "datetime".
whole_data = pd.concat([train_processed, test_processed])
whole_data = whole_data.rename(columns={"AMBIENT_TEMPERATURE": "y"})
whole_data.index = whole_data.index.rename("datetime")
# NOTE(review): `forecaster` is never created in the visible code, although
# ForecasterAutoreg and RandomForestRegressor are imported. The line that
# builds it appears to be missing from the paste -- confirm.
# The label slice :"2022" presumably keeps every row through the end of 2022
# (assumes a DatetimeIndex -- TODO confirm).
forecaster.fit(y=whole_data.loc[:"2022", "y"])
# Persist the fitted forecaster and read it back.
# NOTE(review): the file name ends in `.py` although it holds a serialized
# model, not Python source -- consider a different extension.
save_forecaster(forecaster, file_name='forecaster_random_forest.py', verbose=False)
forecaster_loaded = load_forecaster('forecaster_random_forest.py', verbose=True)
predictions = forecaster_loaded.predict(steps=864)
Here is the dataset: [ELES-MAS-5001.csv.gz](https://github.com/JoaquinAmatRodrigo/skforecast/files/12646861/ELES-MAS-5001.csv.gz) This is the missing_timestamps script:
import pandas as pd
from datetime import timedelta
import numpy as np
def remove_duplicates(data):
    """
    Return a copy of ``data`` indexed by unique, timezone-naive timestamps.

    The ``timestamp`` column is parsed to datetimes and its timezone
    information is dropped. When several rows share a timestamp only the
    last one is kept. The result is indexed by ``timestamp`` and sorted by
    that index.

    The input frame is left untouched: the converted column is attached to
    a new frame instead of being written back into ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        De-duplicated frame with ``timestamp`` as a sorted index.
    """
    naive_timestamps = pd.to_datetime(data["timestamp"]).dt.tz_localize(None)
    # assign() returns a new frame, so the caller's object is not mutated
    # (and no SettingWithCopyWarning is raised when ``data`` is a slice).
    data_processed = data.assign(timestamp=naive_timestamps)
    data_processed = data_processed.drop_duplicates(subset="timestamp", keep="last")
    return data_processed.set_index("timestamp").sort_index()
def missing_data(data_processed):
    """
    Find gaps in the timestamp index and forward-fill them for plotting.

    Two consecutive index entries that are more than 5 minutes apart
    delimit a gap. For every gap a 5-minute grid is generated that starts
    at the last timestamp before the gap and ends at the first timestamp
    after it. The starting timestamp is dropped because it is already
    present in the data. Every generated timestamp is paired with the
    first-column value of the row that precedes the gap.

    ``pd.date_range`` includes its end point, so the first timestamp after
    the gap is part of the output whenever it lies on the 5-minute grid.

    Parameters
    ----------
    data_processed : pandas.DataFrame
        Frame indexed by timestamps; assumed to be sorted ascending, as
        produced by ``remove_duplicates``.

    Returns
    -------
    tuple of (list, list)
        The generated timestamps and, aligned with them, the fill values.
        Both lists are empty when the index has no gap.
    """
    timestamps_series = pd.Series(data_processed.index)
    # Position of the first row *after* each gap; the row before it is the
    # last observation preceding the gap. The first diff is NaT, so position
    # 0 can never be selected and ``- 1`` never wraps around.
    after_gap = np.where(timestamps_series.diff() > timedelta(minutes=5))[0]
    before_gap = after_gap - 1

    first_column = data_processed.iloc[:, 0]
    missing_timestamps = []
    filled_values = []
    for start_pos, end_pos in zip(before_gap, after_gap):
        # "5min" replaces the deprecated "5T" offset alias.
        grid = pd.date_range(
            start=data_processed.index[start_pos],
            end=data_processed.index[end_pos],
            freq="5min",
        )[1:]  # skip the existing timestamp that opens the gap
        missing_timestamps.extend(grid)
        # fill the missing timestamps with the value of the previous row
        filled_values.extend([first_column.iloc[start_pos]] * len(grid))
    return missing_timestamps, filled_values
We are closing this issue due to inactivity. If you believe this issue is still relevant, please feel free to reopen it or create a new issue with updated details.
I am trying to fit a RandomForestRegressor on the training dataset, but when I use the predict function, the predicted values are constant over time. What is the reason for this? My goal is to forecast up to 72 hours ahead.
Here is my code:
from data_preparation import Preparation
from missing_timestamps import remove_duplicates
import pandas as pd
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from sklearn.ensemble import RandomForestRegressor
from skforecast.utils import save_forecaster
from skforecast.utils import load_forecaster
# NOTE(review): `marvin` is a bare name that is not defined anywhere in the
# visible code; evaluating it would raise NameError. It looks like paste
# residue -- confirm and remove.
marvin
# Build the data helper for the AMBIENT_TEMPERATURE column of the station file.
# NOTE(review): Preparation comes from the project-local data_preparation
# module, which is not shown here -- its exact behaviour must be confirmed there.
data = Preparation(r'/home/ieftimska/operato-meteo-1/data/MAS_processed/ELES-MAS-5001.csv.gz', "AMBIENT_TEMPERATURE")
#data = Preparation(r'/home/iva/Desktop/operato-meteo-1/data/MAS_processed/ELES-MAS-5001.csv.gz', "AMBIENT_TEMPERATURE")
train, test = data.split()
# remove_duplicates is meant to drop repeated timestamps and strip timezone
# information (per its docstring); the result is used as a frame below.
train_processed = remove_duplicates(train)
#train_processed_ = train_processed["AMBIENT_TEMPERATURE"].copy().squeeze()
test_processed = remove_duplicates(test)
#test_processed_ = test_processed["AMBIENT_TEMPERATURE"].copy().squeeze()
# Stitch both partitions back together and give the target column and the
# index the names "y" and "datetime".
whole_data = pd.concat([train_processed, test_processed])
whole_data = whole_data.rename(columns={"AMBIENT_TEMPERATURE": "y"})
whole_data.index = whole_data.index.rename("datetime")
# NOTE(review): `forecaster` is never created in the visible code, although
# ForecasterAutoreg and RandomForestRegressor are imported. The line that
# builds it appears to be missing from the paste -- confirm.
# The label slice :"2022" presumably keeps every row through the end of 2022
# (assumes a DatetimeIndex -- TODO confirm).
forecaster.fit(y=whole_data.loc[:"2022", "y"])
# Persist the fitted forecaster and read it back.
# NOTE(review): the file name ends in `.py` although it holds a serialized
# model, not Python source -- consider a different extension.
save_forecaster(forecaster, file_name='forecaster_random_forest.py', verbose=False)
forecaster_loaded = load_forecaster('forecaster_random_forest.py', verbose=True)
# 864 steps equal 72 hours if the series has one sample every 5 minutes
# (864 * 5 min = 4320 min) -- sampling interval presumed, TODO confirm.
predictions = forecaster_loaded.predict(steps=864)
Here is the dataset: [ELES-MAS-5001.csv.gz](https://github.com/JoaquinAmatRodrigo/skforecast/files/12646861/ELES-MAS-5001.csv.gz) This is the missing_timestamps script:
import pandas as pd
from datetime import timedelta
import numpy as np
def remove_duplicates(data):
    """
    Return a copy of ``data`` indexed by unique, timezone-naive timestamps.

    The ``timestamp`` column is parsed to datetimes and its timezone
    information is dropped. When several rows share a timestamp only the
    last one is kept. The result is indexed by ``timestamp`` and sorted by
    that index.

    The input frame is left untouched: the converted column is attached to
    a new frame instead of being written back into ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        De-duplicated frame with ``timestamp`` as a sorted index.
    """
    naive_timestamps = pd.to_datetime(data["timestamp"]).dt.tz_localize(None)
    # assign() returns a new frame, so the caller's object is not mutated
    # (and no SettingWithCopyWarning is raised when ``data`` is a slice).
    data_processed = data.assign(timestamp=naive_timestamps)
    data_processed = data_processed.drop_duplicates(subset="timestamp", keep="last")
    # The processed frame must be handed back explicitly; without a return
    # statement callers receive None.
    return data_processed.set_index("timestamp").sort_index()
def missing_data(data_processed):
    """
    Find gaps in the timestamp index and forward-fill them for plotting.

    Two consecutive index entries that are more than 5 minutes apart
    delimit a gap. For every gap a 5-minute grid is generated that starts
    at the last timestamp before the gap and ends at the first timestamp
    after it. The starting timestamp is dropped because it is already
    present in the data. Every generated timestamp is paired with the
    first-column value of the row that precedes the gap.

    ``pd.date_range`` includes its end point, so the first timestamp after
    the gap is part of the output whenever it lies on the 5-minute grid.

    Parameters
    ----------
    data_processed : pandas.DataFrame
        Frame indexed by timestamps; assumed to be sorted ascending.

    Returns
    -------
    tuple of (list, list)
        The generated timestamps and, aligned with them, the fill values.
        Both lists are empty when the index has no gap.
    """
    timestamps_series = pd.Series(data_processed.index)
    # Position of the first row *after* each gap; the row before it is the
    # last observation preceding the gap. The first diff is NaT, so position
    # 0 can never be selected and ``- 1`` never wraps around.
    after_gap = np.where(timestamps_series.diff() > timedelta(minutes=5))[0]
    before_gap = after_gap - 1

    first_column = data_processed.iloc[:, 0]
    missing_timestamps = []
    filled_values = []
    for start_pos, end_pos in zip(before_gap, after_gap):
        # "5min" replaces the deprecated "5T" offset alias.
        grid = pd.date_range(
            start=data_processed.index[start_pos],
            end=data_processed.index[end_pos],
            freq="5min",
        )[1:]  # skip the existing timestamp that opens the gap
        missing_timestamps.extend(grid)
        # fill the missing timestamps with the value of the previous row
        filled_values.extend([first_column.iloc[start_pos]] * len(grid))
    return missing_timestamps, filled_values
The text was updated successfully, but these errors were encountered: