# forecasting_demand



## Startup cells

In [0]:
# Set environment variables for sagemaker_studio imports

import os
os.environ['DataZoneProjectId'] = 'cgk5ugdebva2ef'
os.environ['DataZoneDomainId'] = 'dzd-bsa8pqfbpcm7on'
os.environ['DataZoneEnvironmentId'] = 'cbebxulwrbe6g7'
os.environ['DataZoneDomainRegion'] = 'us-east-1'

# create both a function and variable for metadata access
_resource_metadata = None

def _get_resource_metadata():
    global _resource_metadata
    if _resource_metadata is None:
        _resource_metadata = {
            "AdditionalMetadata": {
                "DataZoneProjectId": "cgk5ugdebva2ef",
                "DataZoneDomainId": "dzd-bsa8pqfbpcm7on",
                "DataZoneEnvironmentId": "cbebxulwrbe6g7",
                "DataZoneDomainRegion": "us-east-1",
            }
        }
    return _resource_metadata
metadata = _get_resource_metadata()

In [0]:
"""
Logging Configuration

Purpose:
--------
This sets up the logging framework for code executed in the user namespace.
"""

from typing import Optional


def _set_logging(log_dir: str, log_file: str, log_name: Optional[str] = None):
    import os
    import logging
    from logging.handlers import RotatingFileHandler

    level = logging.INFO
    max_bytes = 5 * 1024 * 1024
    backup_count = 5

    # fallback to /tmp dir on access, helpful for local dev setup
    try:
        os.makedirs(log_dir, exist_ok=True)
    except Exception:
        log_dir = "/tmp/kernels/"

    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, log_file)

    logger = logging.getLogger() if not log_name else logging.getLogger(log_name)
    logger.handlers = []
    logger.setLevel(level)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Rotating file handler
    fh = RotatingFileHandler(filename=log_path, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info(f"Logging initialized for {log_name}.")


_set_logging("/var/log/computeEnvironments/kernel/", "kernel.log")
_set_logging("/var/log/studio/data-notebook-kernel-server/", "metrics.log", "metrics")

In [0]:
import logging
from sagemaker_studio import ClientConfig, sqlutils, sparkutils, dataframeutils

logger = logging.getLogger(__name__)
logger.info("Initializing sparkutils")
spark = sparkutils.init()
logger.info("Finished initializing sparkutils")

In [0]:
def _reset_os_path():
    """
    Reset the process's working directory to handle mount timing issues.
    
    This function resolves a race condition where the Python process starts
    before the filesystem mount is complete, causing the process to reference
    old mount paths and inodes. By explicitly changing to the mounted directory
    (/home/sagemaker-user), we ensure the process uses the correct, up-to-date
    mount point.
    
    The function logs stat information (device ID and inode) before and after
    the directory change to verify that the working directory is properly
    updated to reference the new mount.
    
    Note:
        This is executed at module import time to ensure the fix is applied
        as early as possible in the kernel initialization process.
    """
    try:
        import os
        import logging

        logger = logging.getLogger(__name__)
        logger.info("---------Before------")
        logger.info("CWD: %s", os.getcwd())
        logger.info("stat('.'): %s %s", os.stat('.').st_dev, os.stat('.').st_ino)
        logger.info("stat('/home/sagemaker-user'): %s %s", os.stat('/home/sagemaker-user').st_dev, os.stat('/home/sagemaker-user').st_ino)

        os.chdir("/home/sagemaker-user")

        logger.info("---------After------")
        logger.info("CWD: %s", os.getcwd())
        logger.info("stat('.'): %s %s", os.stat('.').st_dev, os.stat('.').st_ino)
        logger.info("stat('/home/sagemaker-user'): %s %s", os.stat('/home/sagemaker-user').st_dev, os.stat('/home/sagemaker-user').st_ino)
    except Exception as e:
        logger.exception(f"Failed to reset working directory: {e}")

_reset_os_path()

## Notebook

# Project 1: Demand Forecasting with LightGBM

## 1. Introduction
This notebook demonstrates a complete pipeline for demand forecasting using LightGBM.
We will:
- Load data from S3
- Explore and preprocess the dataset
- Create temporal features
- Split the data into train and validation sets
- Train a LightGBM model
- Track parameters, metrics, and the model using MLflow

## 2. Data Loading
Load the dataset from the S3 bucket.

In [0]:
import pandas as pd
import numpy as np
import sklearn
import mlflow

print("OK")

OK


In [0]:
import boto3
import pandas as pd
from io import BytesIO

print("STEP 1: creating S3 client")
s3 = boto3.client("s3")

BUCKET = "ml-portfolio-av"
KEY = "train.csv"

obj = s3.get_object(Bucket=BUCKET, Key=KEY)
df = pd.read_csv(BytesIO(obj["Body"].read()))

print("DF LOADED")
print(df.shape)

STEP 1: creating S3 client


DF LOADED
(3000888, 6)


# 3. Exploratory Data Analysis

Check columns, data types, and date range.

In [0]:
print(df.columns)
print(df.dtypes)

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion'], dtype='object')
id               int64
date            object
store_nbr        int64
family          object
sales          float64
onpromotion      int64
dtype: object


In [0]:
df["date"] = pd.to_datetime(df["date"])
print(df["date"].min(), df["date"].max())

2013-01-01 00:00:00 2017-08-15 00:00:00


# 4. Sorting and Aggregation

Sort data by date and aggregate sales by date and family.

In [0]:
df = df.sort_values("date").reset_index(drop=True)
print("OK - sorted")

OK - sorted


In [0]:
df_agg = (
    df
    .groupby(["date", "family"], as_index=False)
    .agg({"sales": "sum"})
)

print(df_agg.shape)
df_agg.head()


(55572, 3)


Unnamed: 0,date,family,sales
0,2013-01-01,AUTOMOTIVE,0.0
1,2013-01-01,BABY CARE,0.0
2,2013-01-01,BEAUTY,2.0
3,2013-01-01,BEVERAGES,810.0
4,2013-01-01,BOOKS,0.0


# 5. Train / Validation Split

Split the dataset using a temporal split.

In [0]:
split_date = "2017-01-01"

train_df = df_agg[df_agg["date"] < split_date]
val_df   = df_agg[df_agg["date"] >= split_date]

print("TRAIN:", train_df.shape)
print("VAL:", val_df.shape)


TRAIN: (48081, 3)
VAL: (7491, 3)


# 6. Feature Engineering

Add temporal features: day of week, month, day.

In [0]:
def add_time_features(df):
    df = df.copy()
    df["dayofweek"] = df["date"].dt.dayofweek
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    return df

train_df = add_time_features(train_df)
val_df   = add_time_features(val_df)

print(train_df.head())


        date      family  sales  dayofweek  month  day
0 2013-01-01  AUTOMOTIVE    0.0          1      1    1
1 2013-01-01   BABY CARE    0.0          1      1    1
2 2013-01-01      BEAUTY    2.0          1      1    1
3 2013-01-01   BEVERAGES  810.0          1      1    1
4 2013-01-01       BOOKS    0.0          1      1    1


In [0]:
import lightgbm as lgb
print("LightGBM OK", lgb.__version__)

LightGBM OK 4.6.0


Encode the categorical variable family.

In [0]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df["family_enc"] = le.fit_transform(train_df["family"])
val_df["family_enc"] = le.transform(val_df["family"])

print(train_df[["family", "family_enc"]].head())

       family  family_enc
0  AUTOMOTIVE           0
1   BABY CARE           1
2      BEAUTY           2
3   BEVERAGES           3
4       BOOKS           4


# 7. Prepare Features and Target

In [0]:
FEATURES = ["family_enc", "dayofweek", "month", "day"]
TARGET = "sales"

X_train = train_df[FEATURES]
y_train = train_df[TARGET]

X_val = val_df[FEATURES]
y_val = val_df[TARGET]

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (48081, 4)
y_train shape: (48081,)


# 8. LightGBM Training

Train a LightGBM model on CPU.

In [0]:
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

params = {
    "objective": "regression",
    "metric": "rmse",
    "verbose": -1,
    "boosting_type": "gbdt",
    "num_threads": 2  # CPU ligero
}

print("Training started...")

callbacks = [lgb.early_stopping(stopping_rounds=10)]

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=50,
    callbacks=callbacks
)

print("Training finished")


Training started...
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[50]	training's rmse: 14853.5	valid_1's rmse: 26160.3
Training finished


# 9. MLflow Tracking

Log parameters, metrics, and model in MLflow.

In [0]:
import mlflow
import mlflow.lightgbm
from sklearn.metrics import mean_squared_error
import numpy as np

mlflow.set_experiment("Forecasting_Demand")

with mlflow.start_run():
    mlflow.log_params(params)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)
    mlflow.lightgbm.log_model(model, name="model")

print("MLflow run logged, RMSE:", rmse)





MLflow run logged, RMSE: 26160.26321213675


# 10. Summary

- Dataset loaded from S3 
- Temporal features added
- Train / Validation split applied
- LightGBM trained on CPU
- Metrics and model tracked in MLflow
- Ready to deploy or integrate in production

In [0]:
import os
print(os.path.expanduser("~"))



/home/sagemaker-user


In [0]:
import os

# Carpeta raíz del proyecto (local en tu home)
project_root = "/home/sagemaker-user/project_forecasting_demand"

# Subcarpetas
subfolders = ["notebooks", "scripts", "data"]

# Crear carpeta principal
os.makedirs(project_root, exist_ok=True)

# Crear subcarpetas
for f in subfolders:
    os.makedirs(os.path.join(project_root, f), exist_ok=True)

print("Project folders created locally:")
print(os.listdir(project_root))


Project folders created locally:
['notebooks', 'scripts', 'data']


In [0]:
import subprocess

# Git configuration with your data
git_name = "Javier-DataScience"
git_email = "alvaro.vega.vargas@gmail.com"

subprocess.run(["git", "config", "--global", "user.name", git_name])
subprocess.run(["git", "config", "--global", "user.email", git_email])

print("Git configured successfully")

Git configured successfully


In [0]:
import shutil
import os

# Nombre actual de tu notebook tal como aparece en SageMaker
current_notebook = "/home/sagemaker-user/Forecasting Demand.ipynb"

# Carpeta destino
destination_folder = "/home/sagemaker-user/project_forecasting_demand/notebooks/"

# Crear carpeta destino si no existe
os.makedirs(destination_folder, exist_ok=True)

# Nombre final del notebook
destination_notebook = os.path.join(destination_folder, "forecasting_demand.ipynb")

# Mover el notebook
shutil.move(current_notebook, destination_notebook)

print("Notebook moved to:", destination_notebook)


FileNotFoundError: [Errno 2] No such file or directory: '/home/sagemaker-user/Forecasting Demand.ipynb'

In [0]:
import subprocess
import os

# Carpeta raíz del proyecto
project_root = "/home/sagemaker-user/project_forecasting_demand"

# Navegar a la carpeta del proyecto
os.chdir(project_root)

# Crear README.md con contenido básico
readme_content = """
# Project 1: Demand Forecasting with LightGBM

## Overview
This project demonstrates a complete pipeline for demand forecasting using LightGBM.
Tracked with MLflow, fully reproducible in Amazon SageMaker.

## Dataset
- Source: Kaggle or internal CSV
- Columns: id, date, store_nbr, family, sales, onpromotion
- Stored in S3 bucket: ml-portfolio-av

## Features
- Temporal features: day of week, month, day
- Categorical encoding for 'family'

## Model
- LightGBM regression
- CPU training to save costs
- Metrics: RMSE (Validation)

## Folder Structure
project_forecasting_demand/
├─ notebooks/
├─ scripts/
├─ data/
├─ README.md
"""

# Guardar README.md
with open("README.md", "w") as f:
    f.write(readme_content)

print("README.md created")

# Inicializar git si no está inicializado
subprocess.run(["git", "init"])

# Agregar archivos al staging
subprocess.run(["git", "add", "notebooks/forecasting_demand.ipynb", "README.md"])

# Primer commit
subprocess.run(["git", "commit", "-m", "Initial commit: Project 1 - Forecasting Demand"])

print("First commit done")


README.md created
Initialized empty Git repository in /home/sagemaker-user/project_forecasting_demand/.git/
On branch master

Initial commit

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	README.md

nothing added to commit but untracked files present (use "git add" to track)
First commit done


hint: Using 'master' as the name for the initial branch. This default branch name
hint: is subject to change. To configure the initial branch name to use in all
hint: 
hint: 	git config --global init.defaultBranch <name>
hint: 
hint: Names commonly chosen instead of 'master' are 'main', 'trunk' and
hint: 'development'. The just-created branch can be renamed via this command:
hint: 
hint: 	git branch -m <name>
fatal: pathspec 'notebooks/forecasting_demand.ipynb' did not match any files


## Shutdown cells

In [0]:
"""
Stop spark session and associated Athena Spark session
"""

from IPython import get_ipython as _get_ipython
_get_ipython().user_ns["spark"].stop()