In [72]:
### INITIALIZATION #############################

import pandas as pd
import os
import csv

############ Libraries ##############

from matplotlib import pyplot as plt
import numpy as np

# Evaluation metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape

epislon = 1e-20  # Define a small epsilon value for division by zero cases

def rmse(y_true, y_pred):
  return np.sqrt(mse(y_true, y_pred))

def mase(y_true, y_pred, y_baseline):
    # Calcula o MAE do modelo
    mae_pred = np.mean(np.abs(y_true - y_pred))
    # Calcula o MAE do modelo baseline Persistent Window (i.e., últimas h observações antes do teste)
    mae_naive = np.mean(np.abs(y_true - y_baseline))
    result = mae_pred/mae_naive
    return result

def pbe(y_true, y_pred):
  if np.sum(y_true)!=0:
    return 100*(np.sum(y_true - y_pred)/np.sum(y_true))
  else:
    return 100*(np.sum(y_true - y_pred)/(np.sum(y_true) + epislon))

def pocid(y_true, y_pred):
  n = len(y_true)
  D = [1 if (y_pred[i] - y_pred[i-1]) * (y_true[i] - y_true[i-1]) > 0 else 0 for i in range(1, n)]
  POCID = 100 * np.sum(D) / (n-1)
  return POCID

def mcpm(rmse_result, mape_result, pocid_result):
  er_result = 100 - pocid_result

  A1 = (rmse_result * mape_result * np.sin((2*np.pi)/3))/2
  A2 = (mape_result * er_result * np.sin((2*np.pi)/3))/2
  A3 = (er_result * rmse_result * np.sin((2*np.pi)/3))/2
  total = A1 + A2 + A3
  return total

def znorm(x):
  if np.std(x) != 0: 
      x_znorm = (x - np.mean(x)) / np.std(x)
  else:
      x_znorm = (x - np.mean(x)) / (np.std(x) + epislon)
  return x_znorm

def znorm_reverse(x, mean_x, std_x):
  x_denormalized = (np.array(x) * std_x) + mean_x
  return x_denormalized

def get_stats_norm(series, horizon, window):
  last_subsequence = series[-(horizon+window):-horizon].values
  last_mean = np.mean(last_subsequence)
  last_std = np.std(last_subsequence)
  return last_mean, last_std

# Para predição de vendas por UF (mensal), será considerado horizon = 12
# Para predição de vendas por município (anual), será considerado horizon = 1
def train_test_split(data, horizon):
  X = data.iloc[:,:-1] # features
  y = data.iloc[:,-1] # target

  X_train = X[:-horizon] # features train
  X_test =  X[-horizon:] # features test

  y_train = y[:-horizon] # target train
  y_test = y[-horizon:] # target test
  return X_train, X_test, y_train, y_test

def recursive_multistep_forecasting(X_test, model, horizon):
  # example é composto pelas últimas observações vistas
  # na prática, é o pbeprimeiro exemplo do conjunto de teste
  example = X_test.iloc[0].values.reshape(1,-1)

  preds = []
  for i in range(horizon):
    pred = model.predict(example)[0]
    preds.append(pred)

    # Descartar o valor da primeira posição do vetor de características
    example = example[:,1:]

    # Adicionar o valor predito na última posição do vetor de características
    example = np.append(example, pred)
    example = example.reshape(1,-1)
  return preds

def baseline_mean(series, horizon):
  # como as séries são normalizadas, esse baseline irá retornar uma reta próxima de zero
  pred = np.repeat(np.mean(znorm(series[:-horizon])), horizon)
  return pred

def baseline_persistent(series, horizon):
  return np.repeat(znorm(series[-2*horizon:-horizon]).values[-1], horizon)

def baseline_persistent_window(series, horizon):
  subsequence = znorm(series[-horizon*2:-horizon]).values
  return subsequence

def baseline_persistent_windowR(series, horizon):
  subsequence2 = series[-horizon*2:-horizon].values
  return subsequence2

# Em geral, considera-se um tamanho de janela capaz de capturar um ciclo dos dados
# Por exemplo, 12 observações no caso dos dados com frequência mensal
def rolling_window(series, window):
  data = []
  for i in range(len(series)-window):
    example = znorm(np.array(series[i:i+window+1]))
    data.append(example)
  df = pd.DataFrame(data)
  return df


def extract_estado(file_name):
    parts = file_name.split('_')
    estado = parts[1]
    return estado

def read_csv_files(folder_path):
    estados = []
    files = os.listdir(folder_path)
    for file_name in files:
        if file_name.endswith('.csv'):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', newline='') as csvfile:
                reader = csv.reader(csvfile)
                headers = next(reader)
                estado = extract_estado(file_name)
                estados.append(estado)
                estados.sort()
    return estados


In [None]:
######### ARROW INDIVIDUAL

from pathlib import Path
from typing import List, Union

import numpy as np
from gluonts.dataset.arrow import ArrowWriter


def extract_estado(file_name):
    parts = file_name.split('_')
    estado = parts[1]
    return estado

def read_csv_files(folder_path):
    estados = []
    files = os.listdir(folder_path)
    for file_name in files:
        if file_name.endswith('.csv'):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', newline='') as csvfile:
                reader = csv.reader(csvfile)
                headers = next(reader)
                estado = extract_estado(file_name)
                estados.append(estado)
                estados.sort()
    return estados

def convert_to_arrow(
    path: Union[str, Path],
    time_series: Union[List[np.ndarray], np.ndarray],
    compression: str = "lz4",
):
    assert isinstance(time_series, list) or (
        isinstance(time_series, np.ndarray) and
        time_series.ndim == 2
    )

    # Set an arbitrary start time
    # start = np.datetime64("1990-01", "ns")
    start = "1990-01"

    dataset = [
        {"start": start, "target": ts} for ts in time_series
    ]

    ArrowWriter(compression=compression).write_to_file(
        dataset,
        path=path,
    )


if __name__ == "__main__":

    horizon = 12

    products = sorted([name for name in os.listdir('../uf/') if os.path.isdir(os.path.join('../uf/', name))])

    for ano in range(2024, 2019, -1):
        periodo = (2025 - ano) * 12

        for product in products:
            folder_path = f'../uf/{product}/'
            # Read the CSV files and extract estado names
            estados = read_csv_files(folder_path)
            for estado in estados:

                # Generate 20 random time series of length 1024
                time_series_t = pd.read_csv(f"../uf/{product}/mensal_{estado}_{product}.csv", header=0, sep=";")
                time_series = time_series_t['m3'].iloc[:-periodo]
                time_series = [time_series.to_numpy()]
                # Convert to GluonTS arrow format
                convert_to_arrow(f"../{product}_{estado}_{ano}-data.arrow", time_series=time_series)

In [None]:
############ FINE-TUNE

import subprocess

# Read the paths from list.txt
with open('listarrows.txt', 'r') as file:
    paths = file.readlines()


###### MODELS
# chronos-gpt2.yaml
# chronos-t5-base.yaml
# chronos-t5-large.yaml
# chronos-t5-mini.yaml
# chronos-t5-small.yaml
# chronos-t5-tiny.yaml


modelo = 'chronos-gpt2'
# Strip any extra spaces or newline characters
paths = [path.strip() for path in paths]

# Run the command for each path
for path in paths:
    # Read the yaml configuration file
    with open(f'./chronos-forecasting/scripts/training/configs/{modelo}.yaml', 'r') as yaml_file:
        yaml_lines = yaml_file.readlines()
    
    # Modify the second line of the yaml file with the current path
    yaml_lines[1] = f"  - \"{path}\"\n"

    # Write the modified yaml file back
    with open(f'./chronos-forecasting/scripts/training/configs/{modelo}.yaml', 'w') as yaml_file:
        yaml_file.writelines(yaml_lines)
    
    # Run the torchrun command
    command = f"torchrun --nproc-per-node=2 ./chronos-forecasting/scripts/training/train.py --config ./chronos-forecasting/scripts/training/configs/{modelo}.yaml"
    subprocess.run(command, shell=True)  # This will wait for the command to finish before moving to the next


#1 GPU
# CUDA_VISIBLE_DEVICES=0 python ./chronos-forecasting/scripts/training/train.py --config ./chronos-forecasting/scripts/training/configs/chronos-gpt2.yaml

#Multiplas GPUs
# torchrun --nproc-per-node=2 ./chronos-forecasting/scripts/training/train.py --config ./chronos-forecasting/scripts/training/configs/chronos-gpt2.yaml




In [8]:
######## TEST INDIV #############

import os
import csv
import pandas as pd  # requires: pip install pandas
import torch
from chronos import BaseChronosPipeline

####### MUDAR PARA CADA ANO!!! ################

ano = 2024

periodo = (2025-ano)*12
i = 0

horizon = 12



# List of products
products = sorted([name for name in os.listdir('../uf/') if os.path.isdir(os.path.join('../uf/', name))])

# Read the first model folder from list2024.txt ####### MUDAR PARA CADA ANO!!! ################
# with open(f'list{ano}.txt', 'r') as file:
with open(f'list24.txt', 'r') as file:
    model_folders = [line.strip() for line in file if line.strip()]  # Remove empty lines and strip spaces



# Collect all rows for the final CSV
all_data = []

try:
    for product in products:
        folder_path = f'../uf/{product}/'
        # Read the CSV files and extract estado names
        estados = read_csv_files(folder_path)  # Assuming you have this function defined elsewhere
        for estado in estados:
            df = pd.read_csv(f"../uf/{product}/mensal_{estado}_{product}.csv", header=0, sep=";")
            df2 = df.copy()
            df = df.iloc[:-periodo]
            if model_folders:
                first_model_folder = model_folders[i]

                # Load the first model pipeline
                version = "config.json"
                pipeline = BaseChronosPipeline.from_pretrained(
                    f"./output-P-7/{first_model_folder}/checkpoint-final/",
                    device_map="cuda",  # Use "cpu" if you don't have a GPU
                    torch_dtype=torch.bfloat16,
                )


            # Predict using the pipeline
            quantiles, mean = pipeline.predict_quantiles(
                context=torch.tensor(df['m3']),
                prediction_length=12,
                quantile_levels=[0.1, 0.5, 0.9],
            )

            low, median, high = quantiles[0, :, 0], quantiles[0, :, 1], quantiles[0, :, 2]

            ########################################################################

            Valores_Reais = df2['m3'].tail(horizon).reset_index(drop=True)
            basepredictions = baseline_persistent_window(df['m3'], horizon)

            predictions_df2 = pd.DataFrame({'Predictions': median})

            rmse_result2 = rmse(Valores_Reais, predictions_df2['Predictions'])
            mape_result2 = mape(Valores_Reais, predictions_df2['Predictions'])
            pocid_result2 = pocid(Valores_Reais, predictions_df2['Predictions'])
            mcpm_result2 = mcpm(rmse_result2, mape_result2, pocid_result2)
            pbe_result2 = pbe(Valores_Reais, predictions_df2['Predictions'])
            mase_result2 = mase(Valores_Reais, predictions_df2['Predictions'], basepredictions)
            i += 1
            # Collect data for final CSV
            all_data.append([
                product, estado, first_model_folder, 'Chronos', horizon, mape_result2, pocid_result2, pbe_result2, mase_result2,
                *predictions_df2['Predictions'].values
            ])
except IndexError as e:
    print('hhhiiiiiiii')

# Write all data to a single CSV file
with open(f'consolidated_output{ano}.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # Write header (modify as needed)
    writer.writerow([
        'Product', 'Estado', 'Model Folder', 'Model', 'Horizon', 'MAPE', 'POCID', 'PBE', 'MASE',
        *[f'P_{i+1}' for i in range(horizon)]
    ])
    writer.writerows(all_data)

print("All data consolidated into 'consolidated_output.csv'.")

hhhiiiiiiii
All data consolidated into 'consolidated_output.csv'.
