# A typical forecasting preparation & modules

## Set Up Libraries & Imports

In [None]:
%pip install openpyxl lightgbm xgboost

In [None]:
# Restart the Python process via the Databricks utility.
# NOTE(review): presumably needed so the packages installed by the %pip cell become importable — confirm.
dbutils.library.restartPython()

In [None]:
# Decorator to calculate running time.
def with_time_review(func):
    """Wrap ``func`` so that every call prints how long it ran.

    Args:
        func: Any callable.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed seconds (rounded to 5 decimals) followed
        by a separator line, and returns ``func``'s result unchanged. Nothing
        is printed when ``func`` raises.
    """
    import functools
    import time

    # functools.wraps copies func's __name__/__doc__ onto the wrapper, so
    # decorated functions stay identifiable in tracebacks, logs and help().
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments the way time.time() can.
        begin = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = round(time.perf_counter() - begin, 5)

        print (f"Function: '{func.__name__}' runs for: {elapsed} seconds.")
        print ("----------------------------\n")
        return result
    return wrapper

In [None]:
import pandas as pd
import glob
import os

# Import parallel libraries
import pyspark.pandas as ps_pd
import ray

# Import machine learning model
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgbm
import xgboost as xgb

# Get Number of CPUs from Spark
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is the
# SparkContext injected by the Databricks runtime — confirm.
SPARK_CPUS = sc.defaultParallelism
print(SPARK_CPUS)

## Setup folder and Read data

In [None]:
%fs ls dbfs:/mnt

In [None]:
# TODO: replace the dotted placeholder with the output folder path (a string);
# as written this line is not valid Python and the cell cannot run.
OUTPUT_FOLDER = .................
# Create the folder through the local /dbfs/ mount; exist_ok avoids failing on re-runs.
os.makedirs('/dbfs/' + OUTPUT_FOLDER, exist_ok=True)

In [None]:
FILE_PATH = '../dataset/Stallion-AbinBev-kaggle.csv'
# Load the raw CSV, add a composite KEY column ("<Agency>_<SKU>") and drop Price.
# Price is removed because it is a leakage variable (original author's note: PRICE = SALES + PROMO).
df_dataset = (
    pd.read_csv(FILE_PATH)
    .assign(KEY=lambda df_raw: df_raw['Agency'] + '_' + df_raw['SKU'])
    .drop(columns=['Price'])
)

In [None]:
# Render the prepared dataset (KEY added, Price dropped) for a visual check.
display(df_dataset)

## Forecast module

In [None]:
def codeblock_feature_engineering(key, df_group, lags=range(1, 13), windows=(3, 6, 9, 12)):
    """Build lag and rolling-window features for one group's time series.

    The group is sorted by ``YearMonth`` and re-indexed from 0. For each of the
    ``Sales`` and ``Promotions`` columns the following features are added:

    * ``f__LAG_<col>_<lag>``: the value ``lag`` rows earlier.
    * ``f__MA_<col>_<window>``: rolling mean over ``window`` rows, shifted by 1.
    * ``f__MSTD_<col>_<window>``: rolling standard deviation, shifted by 1.

    Args:
        key: Identifier of the group. Accepted for interface symmetry with the
            other codeblock_* functions; not used in the computation.
        df_group: pandas DataFrame containing at least the columns
            ``YearMonth``, ``Sales``, ``Promotions``, ``Agency`` and ``SKU``.
            It is not modified.
        lags: Iterable of lag offsets (default 1..12).
        windows: Iterable of rolling-window sizes (default 3, 6, 9, 12).

    Returns:
        A new DataFrame without ``Agency``/``SKU``, with the feature columns
        appended, and with every row that contains any missing value removed.
        The index keeps the post-sort row positions of the surviving rows.

    Raises:
        KeyError: if a required column is missing from ``df_group``.
    """
    # Materialise once so one-shot iterables (e.g. generators) work for both columns.
    lags = list(lags)
    windows = list(windows)

    df_group = df_group.sort_values(by=['YearMonth'])
    df_group = df_group.reset_index(drop=True)

    # Collect all features first and attach them with a single concat: inserting
    # dozens of columns one by one fragments the frame's internal blocks.
    features = {}
    for col in ['Sales', 'Promotions']:
        series = df_group[col]
        for lag in lags:
            features[f'f__LAG_{col}_{lag}'] = series.shift(lag)
        for window in windows:
            rolling = series.rolling(window)
            # shift(1) makes the window end at the previous row, so a row's own
            # value never contributes to its own features.
            features[f'f__MA_{col}_{window}'] = rolling.mean().shift(1)
            features[f'f__MSTD_{col}_{window}'] = rolling.std().shift(1)

    df_group = pd.concat(
        [df_group, pd.DataFrame(features, index=df_group.index)], axis=1
    )
    df_group = df_group.drop(columns=['Agency', 'SKU'])
    # Rows without enough history for the longest lag/window contain NaN and are dropped.
    df_group = df_group.dropna()
    return df_group

def codeblock_model_forecasting(key, df_train, df_test):
    """Fit three regressors on ``df_train`` and predict ``Sales`` for ``df_test``.

    The models are RandomForestRegressor, LGBMRegressor and XGBRegressor, each
    single-threaded (``n_jobs=1``) with ``random_state=1234``. Only the numeric
    columns of the training frame (excluding ``Sales``) are used as features.

    Args:
        key: Identifier of the group; not used in the computation.
        df_train: pandas DataFrame with a ``Sales`` column and feature columns.
        df_test: pandas DataFrame with the same columns as ``df_train``.
            It is not modified.

    Returns:
        A copy of ``df_test`` with one extra column per model, named after the
        model class, holding that model's predictions.
    """
    import numpy as np
    # Seeds NumPy's global RNG (a process-wide side effect) in addition to the
    # per-model random_state values below.
    np.random.seed(1234)

    models_list = [
        RandomForestRegressor(n_jobs=1, random_state=1234),
        lgbm.LGBMRegressor(n_jobs=1, random_state=1234),
        xgb.XGBRegressor(n_jobs=1, random_state=1234),
    ]

    X_train, y_train = df_train.drop(columns=['Sales']), df_train['Sales']
    X_test = df_test.drop(columns=['Sales'])

    list_numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()

    # Write predictions into a copy: df_test is typically a slice of a larger
    # frame, and assigning into it would mutate the caller's data and trigger
    # pandas' SettingWithCopyWarning.
    df_forecast = df_test.copy()
    for model in models_list:
        model.fit(X_train[list_numeric_cols], y_train)
        df_forecast[model.__class__.__name__] = model.predict(X_test[list_numeric_cols])

    return df_forecast

def codeblock_evalutation_pipeline(key, df_group, training_yearmonth):
    """Run feature engineering, a time-based split and forecasting for one group.

    Args:
        key: Identifier of the group, forwarded to the helper functions.
        df_group: pandas DataFrame holding the rows of a single group.
        training_yearmonth: Cut-off value interpolated into the ``YearMonth``
            query expressions.

    Returns:
        Whatever ``codeblock_model_forecasting`` returns for the held-out rows.
    """
    features = codeblock_feature_engineering(key, df_group)

    # Rows strictly before the cut-off train the models; rows at or after it are forecast.
    train_part = features.query(f"YearMonth < {training_yearmonth}")
    test_part = features.query(f"YearMonth >= {training_yearmonth}")

    return codeblock_model_forecasting(key, train_part, test_part)

## Demo on 1 Key

In [None]:
# Cut-off used by the pipeline: rows with YearMonth < 201707 train, the rest are forecast.
training_yearmonth = 201707
key = 'Agency_01_SKU_01'
# Select the rows of a single KEY and run the whole pipeline on them in plain pandas.
df_group = df_dataset.query(f"KEY == '{key}'")
df_demo = codeblock_evalutation_pipeline(key, df_group, training_yearmonth)

In [None]:
# Render the single-key pipeline output for inspection.
display(df_demo)

In [None]:
# Convert the demo output to a Spark DataFrame only to capture its schema;
# test_SPARK passes this schema to applyInPandas.
schema_spark = spark.createDataFrame(df_demo).schema

# Spark Man vs. Ray Captain Competition 😏

## PySpark applyInPandas

In [None]:
# Spark copy of the full pandas dataset, used as input for the applyInPandas benchmarks.
spark_df = spark.createDataFrame(df_dataset)

In [None]:
@with_time_review
def test_SPARK(spark_df, training_yearmonth):
    """Run the evaluation pipeline per KEY with applyInPandas and save the result.

    Reads the notebook globals ``schema_spark`` (output schema) and
    ``OUTPUT_FOLDER`` (destination folder). The output is written in
    overwrite mode as parquet; the function itself returns nothing.

    Args:
        spark_df: Spark DataFrame with a ``KEY`` column.
        training_yearmonth: Cut-off forwarded to the evaluation pipeline.
    """
    def forecast_one_key(df_group):
        # Every row of a group shares one KEY, so the first row identifies it.
        return codeblock_evalutation_pipeline(
            key=df_group["KEY"].iloc[0],
            df_group=df_group,
            training_yearmonth=training_yearmonth,
        )

    grouped = spark_df.groupBy("KEY")
    spark_df_output = grouped.applyInPandas(forecast_one_key, schema=schema_spark)

    target_path = f"dbfs:/{OUTPUT_FOLDER}/SPARK_OUTPUT_parquet"
    spark_df_output.write.mode('overwrite').parquet(target_path)

In [None]:
# Run the Spark benchmark five times; each call prints its own timing via with_time_review.
for i in range(5):
    print(f"Test: {i}")
    test_SPARK(spark_df=spark_df, training_yearmonth=training_yearmonth)

## PySpark applyInPandas with Repartition (The Magic before Ray)

In [None]:
# Repartition by KEY into SPARK_CPUS * 3 partitions before running the grouped benchmark.
spark_df_repartition = spark_df.repartition(SPARK_CPUS * 3, 'KEY')

In [None]:
# Same five-run benchmark as before, this time on the repartitioned Spark DataFrame.
for i in range(5):
    print(f"Test: {i}")
    test_SPARK(spark_df=spark_df_repartition, training_yearmonth=training_yearmonth)

## Parallel setup for Ray

In [None]:
# TODO: replace the dotted placeholder with the actual ray.init arguments;
# as written this line is not valid Python and the cell cannot run.
ray.init(..................................)

In [None]:
@ray.remote
def ray_verify(i):
    """Return ``i`` unchanged; a trivial remote task used to check that Ray executes work."""
    return i
# Launch one remote task per Spark CPU, then block until every result is back.
tasks = [ray_verify.remote(i) for i in range(SPARK_CPUS)]
# `tasks` is rebound here from the list of object references to the list of results.
tasks = ray.get(tasks)
print(tasks)

In [None]:
# Get Number of CPUs from Ray
# Ray reports resources as a dict of resource name -> quantity, not as a single
# number, so extract the "CPU" entry and convert it to an int (it is later used
# as a partition count). cluster_resources() is used instead of
# available_resources() because it reports the cluster total rather than
# whatever happens to be idle at this instant.
RAY_CPUS = int(ray.cluster_resources().get("CPU", 0))

print(SPARK_CPUS, ' | ', RAY_CPUS)

## Ray Core Demo

In [None]:
@with_time_review
def test_RAY_CORE(pandas_df, training_yearmonth):
    """Placeholder for the Ray Core benchmark.

    TODO: not implemented yet. The body is empty, so the timing printed by
    with_time_review measures nothing.

    Args:
        pandas_df: Dataset to process (currently unused).
        training_yearmonth: Train/test cut-off (currently unused).
    """
    pass

In [None]:
# Five-run benchmark loop for the Ray Core variant; each call prints its timing via with_time_review.
for i in range(5):
    print(f"Test: {i}")
    test_RAY_CORE(pandas_df=df_dataset, training_yearmonth=training_yearmonth)

## Ray Data Demo

In [None]:
# Build a Ray Dataset from the pandas dataset.
ray_df = ray.data.from_pandas(df_dataset)
# NOTE(review): repartition presumably expects an integer block count, so RAY_CPUS
# must be an int here — confirm that the cell defining RAY_CPUS yields a number.
ray_df = ray_df.repartition(RAY_CPUS)

In [None]:
@with_time_review
def test_RAY_DATA(ray_df, training_yearmonth):
    """Placeholder for the Ray Data benchmark.

    TODO: not implemented yet. The body is empty, so the timing printed by
    with_time_review measures nothing.

    Args:
        ray_df: Ray Dataset to process (currently unused).
        training_yearmonth: Train/test cut-off (currently unused).
    """
    pass

In [None]:
# Five-run benchmark loop for the Ray Data variant; each call prints its timing via with_time_review.
for i in range(5):
    print(f"Test: {i}")
    test_RAY_DATA(ray_df=ray_df, training_yearmonth=training_yearmonth)

# Run the evaluation 5 times

In [None]:
# ray_output = pd.read_parquet(f"/dbfs/{OUTPUT_FOLDER}/RAY_PANDAS.parquet")
# spark_output = pd.read_parquet(f"/dbfs/{OUTPUT_FOLDER}/SPARK_OUTPUT_parquet")

# print( ray_output['RandomForestRegressor'].sum() == spark_output['RandomForestRegressor'].sum())
# print( ray_output['LGBMRegressor'].sum() == spark_output['LGBMRegressor'].sum() )
# print( ray_output['XGBRegressor'].sum() == spark_output['XGBRegressor'].sum() )