# Análisis y Modelado de Series de Tiempo de Combustibles en Guatemala: Consumo, Importación y Precios (2000–2025)

## Carga, limpieza y preparación de datos

### Librerías

In [305]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA

# import pmdarima as pm  # auto_arima

from prophet import Prophet

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPRegressor  # NN simple o usar tensorflow dependerá por ahora el simple

### Variables - Constantes

In [306]:

# Folder layout, all relative to the notebook's working directory:
# raw Excel workbooks, generated CSV files and plain-text reports.
DATA_DIR = "./data-input"
OUTPUT_DIR = "./data-output"
REPORTES = "./reportes"

# Input workbooks. Consumption and import data each come as a "2024-12" file
# plus a "2025-05" file; prices come from a single multi-sheet workbook.
# NOTE(review): the 2025 consumption workbook is named VENTAS-... rather than
# CONSUMO-...; confirm it is the intended counterpart of the 2024 file.
CONSUMO_2024_XLSX = f"{DATA_DIR}/CONSUMO-HIDROCARBUROS-2024-12.xlsx"
CONSUMO_2025_XLSX = f"{DATA_DIR}/VENTAS-HIDROCARBUROS-2025-05.xlsx"
IMPORT_2024_XLSX = f"{DATA_DIR}/IMPORTACION-HIDROCARBUROS-VOLUMEN-2024-12.xlsx"
IMPORT_2025_XLSX = f"{DATA_DIR}/IMPORTACION-HIDROCARBUROS-VOLUMEN-2025-05.xlsx"
PRECIOS_2025_XLSX = f"{DATA_DIR}/Precios-Promedio-Nacionales-Diarios-2025-1.xlsx"

# Output CSV files written by the loaders when their save flag is enabled.
CONSUMO_CSV = f"{OUTPUT_DIR}/consumo_combustibles.csv"
IMPORT_CSV = f"{OUTPUT_DIR}/importacion_combustibles.csv"
PRECIOS_CSV = f"{OUTPUT_DIR}/precios_diarios.csv"
# Final column order of the frame returned by loadAndProcessFuelData
# (used for both the consumption and the import data sets).
TARGET_COLUMNS_CI = ["fecha", "regular", "superior", "diesel", "glp"]
# Final column order of the price frame. Apart from 'fecha', the names follow
# the "<header level 0> - <header level 1>" lower-cased pattern that
# loadAndProcessMultipleSheets builds from the two-row Excel header.
TARGET_COLUMNS_P = [
    'fecha',
    'regular - gtq/galon',
    'superior - gtq/galon',
    'diesel - gtq/galon',
    'glp cilindro 25lbs. - gtq/lb'
]

# Control flags: toggle which artifacts (CSV files, text reports) get written.
SAVE_CONSUMO_CSV = True
SAVE_IMPORT_CSV = True
SAVE_PRECIOS_CSV = True
SAVE_REPORTES = True

# Create the output folders up front; exist_ok=True makes re-running this cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(REPORTES, exist_ok=True)

### Carga y conversión de archivos Excel a CSV

In [307]:
def _readFuelWorkbook(filePath, label, targetColumns, skipRows=6, footerRows=3):
    """Read one fuel workbook and return only ``targetColumns`` as an independent frame.

    Args:
        filePath: Path of the Excel workbook (first sheet is read).
        label: Short tag used in the progress messages (e.g. ``"2024"``).
        targetColumns: Lower-cased, stripped column names to keep, in order.
        skipRows: Rows skipped at the top of the sheet before the header row.
        footerRows: Trailing rows discarded after loading.

    Returns:
        A copy of the selected columns, so callers can add or overwrite
        columns without touching (or being warned about) the parent frame.

    Raises:
        KeyError: If any of ``targetColumns`` is missing from the workbook.
    """
    df = pd.read_excel(filePath, skiprows=skipRows)
    # Header cells are normalised so lookups do not depend on case or padding.
    df.columns = df.columns.str.strip().str.lower()
    print(f"{label} Data: {df.shape[0]} rows, {df.shape[1]} columns")

    # ``iloc[:-0]`` would return an empty frame, hence the guard.
    if footerRows:
        df = df.iloc[:-footerRows]
    print(f"Removed unnecessary rows from {label} data.")
    print(f"Updated {label} Data: {df.shape[0]} rows, {df.shape[1]} columns")

    missing = [col for col in targetColumns if col not in df.columns]
    if missing:
        raise KeyError(f"Columns {missing} not found in {filePath}")

    return df[targetColumns].copy()


def loadAndProcessFuelData(file2024, file2025, saveCsv=False, outputCsvPath=None):
    """Load the 2024 and 2025 fuel workbooks and merge them into one tidy frame.

    Both workbooks are read with the same layout (6 rows skipped at the top,
    last 3 rows dropped). In the 2024 workbook the two diesel grades
    ("diesel alto azufre" and "diesel bajo azufre") are added into a single
    diesel figure; a grade that is missing for a row counts as 0, so the
    total is only missing when both grades are missing.

    Args:
        file2024: Path of the workbook that contains both diesel grades.
        file2025: Path of the workbook that only has "diesel bajo azufre".
        saveCsv: When true (and ``outputCsvPath`` is given) the result is
            also written as CSV without the index.
        outputCsvPath: Destination of the CSV file.

    Returns:
        DataFrame with the columns listed in ``TARGET_COLUMNS_CI``; the 2024
        rows come first, followed by the 2025 rows, with a fresh integer index.

    Raises:
        KeyError: If an expected column is missing from either workbook.
    """
    df2024 = _readFuelWorkbook(
        file2024,
        "2024",
        [
            "fecha",
            "gas licuado de petróleo",
            "gasolina regular",
            "gasolina superior",
            "diesel alto azufre",
            "diesel bajo azufre",
        ],
    )
    # fill_value=0 keeps the reported grade when the other one is NaN;
    # a plain ``+`` would turn the whole total into NaN.
    df2024["diesel bajo azufre"] = df2024["diesel bajo azufre"].add(
        df2024["diesel alto azufre"], fill_value=0
    )
    df2024 = df2024.drop(columns=["diesel alto azufre"])
    print("Processed 2024 columns.")

    df2025 = _readFuelWorkbook(
        file2025,
        "2025",
        [
            "fecha",
            "gas licuado de petróleo",
            "gasolina regular",
            "gasolina superior",
            "diesel bajo azufre",
        ],
    )
    print("Processed 2025 columns.")

    dfResult = pd.concat([df2024, df2025], ignore_index=True)
    print("Merged 2024 and 2025 data.")

    # Short column names, then the canonical column order.
    dfResult = dfResult.rename(columns={
        "gasolina regular": "regular",
        "gasolina superior": "superior",
        "diesel bajo azufre": "diesel",
        "gas licuado de petróleo": "glp",
    })[TARGET_COLUMNS_CI]

    if saveCsv and outputCsvPath:
        dfResult.to_csv(outputCsvPath, index=False)
        print(f"Data saved to {outputCsvPath}")

    return dfResult


In [308]:
def loadAndProcessMultipleSheets(configList, saveCsv=False, outputCsvPath=None):
    """Load several price sheets with a two-row header and stack them into one frame.

    Each entry of ``configList`` is a dict with the keys:

    * ``filePath``: workbook to read.
    * ``sheetName``: sheet name or index passed to ``pd.read_excel``.
    * ``skipRows``: rows skipped at the top of the sheet before the header.
    * ``rowsToRemove``: number of trailing rows to discard (0 keeps them all).

    For every sheet the date column and the four price columns are selected,
    the two header levels are flattened into ``"<level0> - <level1>"``
    (stripped and lower-cased), the date column is renamed to ``fecha`` and
    the columns are ordered as in ``TARGET_COLUMNS_P``.

    Args:
        configList: List of per-sheet configuration dicts (see above).
        saveCsv: When true (and ``outputCsvPath`` is given) the stacked frame
            is also written as CSV without the index.
        outputCsvPath: Destination of the CSV file.

    Returns:
        DataFrame with the sheets concatenated in ``configList`` order and a
        fresh integer index.

    Raises:
        ValueError: If ``rowsToRemove`` is negative.
        KeyError: If a sheet lacks one of the expected header columns.
    """
    target_columns_multi = [
        ('FECHA', 'Unnamed: 0_level_1'),
        ('Regular', 'GTQ/GALON'),
        ('Superior', 'GTQ/GALON'),
        ('Diesel', 'GTQ/GALON'),
        ('Glp Cilindro 25Lbs.', 'GTQ/LB')
    ]

    dfs = []

    for config in configList:
        filePath = config["filePath"]
        sheetName = config["sheetName"]
        skipRows = config["skipRows"]
        rowsToRemove = config["rowsToRemove"]

        if rowsToRemove < 0:
            raise ValueError(f"rowsToRemove must be >= 0, got {rowsToRemove}")

        df = pd.read_excel(filePath, header=[0, 1], skiprows=skipRows, sheet_name=sheetName)
        print(f"Loaded sheet '{sheetName}': {df.shape[0]} rows")

        # ``iloc[:-0]`` equals ``iloc[:0]`` and would silently drop every row,
        # so the slice is only applied when there is something to remove.
        if rowsToRemove:
            df = df.iloc[:-rowsToRemove]

        # Work on a copy: the header is rewritten below and must not touch
        # (or warn about) the frame returned by read_excel.
        df = df[target_columns_multi].copy()

        df.columns = [f"{col1.strip()} - {col2.strip()}".lower() for col1, col2 in df.columns]
        df = df.rename(columns={'fecha - unnamed: 0_level_1': 'fecha'})
        df = df[TARGET_COLUMNS_P]

        print(f"Processed sheet '{sheetName}' with {df.shape[0]} rows")
        dfs.append(df)

    # Stack all sheets in the order they were configured.
    df_final = pd.concat(dfs, ignore_index=True)

    if saveCsv and outputCsvPath:
        df_final.to_csv(outputCsvPath, index=False)
        print(f"Final data saved to {outputCsvPath}")

    return df_final

In [309]:
# One entry per sheet of the price workbook. Each tuple is
# (sheet index, rows skipped at the top of the sheet, trailing rows dropped).
configList = [
    {"filePath": PRECIOS_2025_XLSX, "sheetName": sheet, "skipRows": skip, "rowsToRemove": drop}
    for sheet, skip, drop in [(0, 7, 9), (1, 7, 2), (2, 7, 2), (3, 6, 2), (4, 6, 2)]
]

# Consumption and imports share the same workbook layout and loader.
df_consumo = loadAndProcessFuelData(
    file2024=CONSUMO_2024_XLSX,
    file2025=CONSUMO_2025_XLSX,
    saveCsv=SAVE_CONSUMO_CSV,
    outputCsvPath=CONSUMO_CSV,
)
df_importaciones = loadAndProcessFuelData(
    file2024=IMPORT_2024_XLSX,
    file2025=IMPORT_2025_XLSX,
    saveCsv=SAVE_IMPORT_CSV,
    outputCsvPath=IMPORT_CSV,
)

# Daily prices are spread over several sheets of a single workbook.
df_precios = loadAndProcessMultipleSheets(configList, SAVE_PRECIOS_CSV, PRECIOS_CSV)

2024 Data: 303 rows, 24 columns
Removed unnecessary rows from 2024 data.
Updated 2024 Data: 300 rows, 24 columns
Processed 2024 columns.
2025 Data: 8 rows, 19 columns
Removed unnecessary rows from 2025 data.
Updated 2025 Data: 5 rows, 19 columns
Processed 2025 columns.
Merged 2024 and 2025 data.
Data saved to ./data-output/consumo_combustibles.csv
2024 Data: 291 rows, 25 columns
Removed unnecessary rows from 2024 data.
Updated 2024 Data: 288 rows, 25 columns
Processed 2024 columns.
2025 Data: 8 rows, 19 columns
Removed unnecessary rows from 2025 data.
Updated 2025 Data: 5 rows, 19 columns
Processed 2025 columns.
Merged 2024 and 2025 data.
Data saved to ./data-output/importacion_combustibles.csv
Loaded sheet '0': 203 rows
Processed sheet '0' with 194 rows
Loaded sheet '1': 368 rows
Processed sheet '1' with 366 rows
Loaded sheet '2': 367 rows
Processed sheet '2' with 365 rows
Loaded sheet '3': 367 rows
Processed sheet '3' with 365 rows
Loaded sheet '4': 367 rows
Processed sheet '4' with 

## Generación de reportes para explorar data

In [310]:
def generateBasicReport(df, name):
    """Write a plain-text profile of ``df`` to ``<REPORTES>/<name>_reporte.txt``.

    The report contains, in this order: row and column counts, the column
    names, dtypes, null counts, null percentages (rounded to 2 decimals),
    the number of duplicated rows, the ``describe()`` table and the median
    of every numeric column. The file is written as UTF-8 and overwritten
    if it already exists.

    Args:
        df: DataFrame to profile.
        name: Prefix of the report file name.

    Returns:
        None. The path of the report is printed once the file is written.
    """
    destination = os.path.join(REPORTES, f"{name}_reporte.txt")

    def _sections():
        # Lazily produces the report text piece by piece, so every statistic
        # is computed right before it is written.
        yield "REPORTE BÁSICO DE DATAFRAME\n"
        yield "=" * 40 + "\n\n"

        # Shape
        yield f"Total filas: {df.shape[0]}\n"
        yield f"Total columnas: {df.shape[1]}\n\n"

        # Column names, one bullet per column
        yield "Columnas:\n"
        for column in df.columns:
            yield f"  - {column}\n"
        yield "\n"

        # Dtypes
        yield "Tipos de datos por columna:\n"
        yield df.dtypes.to_string()
        yield "\n\n"

        # Null counts
        yield "Valores nulos por columna:\n"
        yield df.isnull().sum().to_string()
        yield "\n\n"

        # Null share as a percentage
        yield "Porcentaje de nulos por columna:\n"
        yield df.isnull().mean().mul(100).round(2).to_string()
        yield "\n\n"

        # Fully duplicated rows
        yield f"Total filas duplicadas: {df.duplicated().sum()}\n\n"

        # Summary statistics
        yield "Estadísticas básicas (solo numéricas):\n"
        yield df.describe().to_string()
        yield "\n\n"

        # Medians of the numeric columns
        yield "Mediana por columna numérica:\n"
        yield df.median(numeric_only=True).to_string()
        yield "\n"

    with open(destination, "w", encoding="utf-8") as handle:
        for piece in _sections():
            handle.write(piece)

    print(f"Reporte generado en: {destination}")


In [311]:
# Write one text report per loaded data set when reporting is enabled.
if SAVE_REPORTES:
    for reportFrame, reportName in (
        (df_consumo, "consumo_combustibles"),
        (df_importaciones, "importacion_combustibles"),
        (df_precios, "precios_combustibles"),
    ):
        generateBasicReport(reportFrame, reportName)

Reporte generado en: ./reportes\consumo_combustibles_reporte.txt
Reporte generado en: ./reportes\importacion_combustibles_reporte.txt
Reporte generado en: ./reportes\precios_combustibles_reporte.txt


In [312]:
# serie_precio_gasolina_regular      # Series de tiempo de precios mensual (Ciudad Capital)
# serie_import_diesel_total          # Serie mensual de importación total de diésel
# serie_consumo_gas_licuado          # Serie mensual de consumo de gas propano
