# Environment

In [1]:
!python -V

Python 3.10.8


In [2]:
!pip install plotly
!pip install Prophet



In [3]:
import os
import re
import warnings
from datetime import datetime, timedelta

import plotly.express as px
import numpy as np

import pyspark
import pyspark.pandas as ps

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from prophet import Prophet



In [4]:
ps.options.display.max_rows = 10

ps.set_option('plotting.backend', 'plotly')

os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "lab"

warnings.filterwarnings("ignore") 

In [5]:
print(f'pyspark version: {pyspark.__version__}')

pyspark version: 3.3.1


# Spark Secion

In [6]:
conf = pyspark.SparkConf()

conf.setAppName('Task1')
conf.setMaster('local[2]')

sc = SparkContext.getOrCreate(conf)
spark = SparkSession(sc)

# Load Data

In [7]:
wd = os.getcwd()
path_data = os.path.join(wd, 'data', 'forcasting_cs_data.csv')
df = ps.read_csv(path_data)
df.head()

Unnamed: 0,Product,date,Sales,Price Discount (%),In-Store Promo,Catalogue Promo,Store End Promo,Google_Mobility,Covid_Flag,V_DAY,EASTER,CHRISTMAS
0,SKU1,05/02/17,27750,0%,0,0,0,0.0,0,0,0,0
1,SKU1,12/02/17,29023,0%,1,0,1,0.0,0,1,0,0
2,SKU1,19/02/17,45630,17%,0,0,0,0.0,0,0,0,0
3,SKU1,26/02/17,26789,0%,1,0,1,0.0,0,0,0,0
4,SKU1,05/03/17,41999,17%,0,0,0,0.0,0,0,0,0


## Column name homogenization

In [8]:
columns = [re.sub("[^A-Z0-9 _]", "", column, 0, re.IGNORECASE) for column in df.columns]
columns = [column.replace('_',' ').title().replace(' ','') for column in columns]
df.columns = columns

## Type convertion

In [9]:
df['Day'] = df['Date'].apply(lambda x: x.split('/')[0])
df['Month'] = df['Date'].apply(lambda x: x.split('/')[1])
df['Year'] = df['Date'].apply(lambda x: x.split('/')[2])
df['Date'] = df['Month'] + '/' + df['Day'] + '/' + df['Year']
df['Date'] = ps.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Cw'] = df['Date'].dt.week
df['Quarter'] = df['Date'].dt.quarter

In [10]:
df['SkuNumber'] = df['Product'].apply(lambda x: int(x.replace('SKU','')))
df = df.drop(['Product'], axis= 1)
df['PriceDiscount'] = df['PriceDiscount'].apply(lambda x: float(x.replace('%','')))

In [11]:
df.shape

(1218, 16)

# Model

In [12]:
from pyspark.sql.functions import pandas_udf, PandasUDFType, sum, max, col, concat, lit
from pyspark.sql.types import *
import pandas as pd

## Schema of the modle output

In [13]:
schema = StructType([
        StructField('SkuNumber', IntegerType()),
        StructField('ds', TimestampType()),
        StructField('yhat', DoubleType()),
])

## PD UDF for training modles

In [14]:
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def apply_model(df):

    def train_fitted_prophet(df, cutoff):

        ts_train = (df
                    .query('Date <= @cutoff')
                    .rename(columns={'Date': 'ds', 'Sales': 'y'})
                    .sort_values('ds')
                    
                    )

        ts_test = (df
                   .query('Date > @cutoff')
                   .rename(columns={'Date': 'ds', 'Sales': 'y'})
                   .sort_values('ds')
                   .assign(ds=lambda x: pd.to_datetime(x["ds"]))
                   .drop('y', axis=1)
                   )
                
        # init model
        model = Prophet(
                    interval_width=0.95,
                    growth='linear',
                    daily_seasonality=False,
                    weekly_seasonality=True,
                    yearly_seasonality=True,
                    seasonality_mode='multiplicative'
        )

        model.fit(ts_train)

        #df["Date"] = pd.to_datetime(df["Date"])

        y_hat = model.predict(ts_test)

        ts_hat = (y_hat[["ds", "yhat"]]
                  .assign(ds=lambda x: pd.to_datetime(x["ds"]))
                  ).merge(ts_test, on=["ds"], how="left") 
        
    
        return pd.DataFrame(ts_hat, columns=schema.fieldNames())

    return train_fitted_prophet(df, cutoff)

In [15]:
from sklearn.metrics import mean_absolute_percentage_error

In [16]:
days_to_subtract = 90
cutoff = df['Date'].max() - timedelta(days=days_to_subtract)

global_predictions = (df.to_spark()
                        .groupBy("SkuNumber")
                        .apply(apply_model)
                        )
                        
global_predictions.show()

+---------+-------------------+------------------+
|SkuNumber|                 ds|              yhat|
+---------+-------------------+------------------+
|        1|2020-10-05 00:00:00| 57229.92198227653|
|        1|2020-10-18 00:00:00|35246.882501721295|
|        1|2020-10-25 00:00:00|35001.662114038525|
|        1|2020-11-10 00:00:00| 65918.67587289202|
|        1|2020-11-15 00:00:00| 63865.50055251804|
|        1|2020-11-22 00:00:00| 48818.10135401698|
|        1|2020-11-29 00:00:00| 37893.66656567042|
|        1|2020-12-01 00:00:00| 39956.50101103427|
|        1|2020-12-04 00:00:00| 38435.29210996273|
|        1|2020-12-07 00:00:00| 62696.00001104938|
|        1|2020-12-13 00:00:00| 96102.86789396201|
|        1|2020-12-20 00:00:00|133219.53355112256|
|        1|2020-12-27 00:00:00|133771.82948524706|
|        2|2020-10-05 00:00:00|3290.3942239504318|
|        2|2020-10-18 00:00:00|10690.375654232455|
|        2|2020-10-25 00:00:00|11763.380397670184|
|        2|2020-11-10 00:00:00|