# Environment

In [28]:
!python -V

Python 3.10.8


In [29]:
!pip install plotly
!pip install Prophet



In [30]:
import os
import re
import warnings

import plotly.express as px
import numpy as np

import pyspark
import pyspark.pandas as ps

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [31]:
ps.options.display.max_rows = 10

ps.set_option('plotting.backend', 'plotly')

os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "lab"

warnings.filterwarnings("ignore") 

In [32]:
print(f'pyspark version: {pyspark.__version__}')

pyspark version: 3.3.1


# Spark Secion

In [33]:
conf = pyspark.SparkConf()

conf.setAppName('Task1')
conf.setMaster('local[2]')

sc = SparkContext.getOrCreate(conf)
spark = SparkSession(sc)

# Load Data

In [34]:
wd = os.getcwd()
path_data = os.path.join(wd, 'study_case/Task1/data/', 'forcasting_cs_data.csv')
#path_data = os.path.join(wd, 'data', 'forcasting_cs_data.csv')
df = ps.read_csv(path_data)
df.head()

Unnamed: 0,Product,date,Sales,Price Discount (%),In-Store Promo,Catalogue Promo,Store End Promo,Google_Mobility,Covid_Flag,V_DAY,EASTER,CHRISTMAS
0,SKU1,05/02/17,27750,0%,0,0,0,0.0,0,0,0,0
1,SKU1,12/02/17,29023,0%,1,0,1,0.0,0,1,0,0
2,SKU1,19/02/17,45630,17%,0,0,0,0.0,0,0,0,0
3,SKU1,26/02/17,26789,0%,1,0,1,0.0,0,0,0,0
4,SKU1,05/03/17,41999,17%,0,0,0,0.0,0,0,0,0


## Column name homogenization

In [35]:
columns = [re.sub("[^A-Z0-9 _]", "", column, 0, re.IGNORECASE) for column in df.columns]
columns = [column.replace('_',' ').title().replace(' ','') for column in columns]
df.columns = columns

In [36]:
products = np.sort(df['Product'].unique().values)
for product in products:
    df[df['Product']==product].plot.line(x='Date', y='Sales', color="Product", title=f"{product} sales over 2017-2020").show()

## Type convertion

In [37]:
df.dtypes

Product            object
Date               object
Sales               int32
PriceDiscount      object
InstorePromo        int32
CataloguePromo      int32
StoreEndPromo       int32
GoogleMobility    float64
CovidFlag           int32
VDay                int32
Easter              int32
Christmas           int32
dtype: object

In [38]:
df['Day'] = df['Date'].apply(lambda x: x.split('/')[0])
df['Month'] = df['Date'].apply(lambda x: x.split('/')[1])
df['Year'] = df['Date'].apply(lambda x: x.split('/')[2])
df['Date'] = df['Month'] + '/' + df['Day'] + '/' + df['Year']
df['Date'] = ps.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Cw'] = df['Date'].dt.week
df['Quarter'] = df['Date'].dt.quarter

In [39]:
df['SkuNumber'] = df['Product'].apply(lambda x: int(x.replace('SKU','')))
df['PriceDiscount'] = df['PriceDiscount'].apply(lambda x: float(x.replace('%','')))

## Shape and missing values 

In [40]:
df.shape

(1218, 18)

In [41]:
df.isna().sum()

Product           0
Date              0
Sales             0
PriceDiscount     0
InstorePromo      0
CataloguePromo    0
StoreEndPromo     0
GoogleMobility    0
CovidFlag         0
VDay              0
dtype: int64
Showing only the first 10

In [42]:
df.isnull().sum()

Product           0
Date              0
Sales             0
PriceDiscount     0
InstorePromo      0
CataloguePromo    0
StoreEndPromo     0
GoogleMobility    0
CovidFlag         0
VDay              0
dtype: int64
Showing only the first 10

# Exploration

## Stats

In [43]:
df.head()

Unnamed: 0,Product,Date,Sales,PriceDiscount,InstorePromo,CataloguePromo,StoreEndPromo,GoogleMobility,CovidFlag,VDay,Easter,Christmas,Day,Month,Year,Cw,Quarter,SkuNumber
0,SKU1,2017-02-05,27750,0.0,0,0,0,0.0,0,0,0,0,5,2,2017,5,1,1
1,SKU1,2017-02-12,29023,0.0,1,0,1,0.0,0,1,0,0,12,2,2017,6,1,1
2,SKU1,2017-02-19,45630,17.0,0,0,0,0.0,0,0,0,0,19,2,2017,7,1,1
3,SKU1,2017-02-26,26789,0.0,1,0,1,0.0,0,0,0,0,26,2,2017,8,1,1
4,SKU1,2017-03-05,41999,17.0,0,0,0,0.0,0,0,0,0,5,3,2017,9,1,1


In [44]:
df.describe()

Unnamed: 0,Date,Sales,PriceDiscount,InstorePromo,CataloguePromo,StoreEndPromo,GoogleMobility,CovidFlag,VDay,Easter,Christmas,Month,Year,Cw,Quarter,SkuNumber
count,1218,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0,1218.0
mean,2019-01-13 02:04:08.275862,30294.0,25.104269,0.0,0.0,0.0,-2.377406,0.0,0.0,0.0,0.0,6.0,2018.0,26.0,2.0,3.0
min,2017-02-05 00:00:00,0.0,0.0,0.0,0.0,0.0,-28.49,0.0,0.0,0.0,0.0,1.0,2017.0,1.0,1.0,1.0
25%,2018-01-21 00:00:00,7212.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2018.0,14.0,2.0,2.0
50%,2019-01-13 00:00:00,19735.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,2019.0,27.0,3.0,3.0
75%,2020-01-05 00:00:00,40295.0,40.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,10.0,2020.0,40.0,4.0,5.0
max,2020-12-27 00:00:00,288322.0,83.0,1.0,1.0,1.0,3.9,1.0,1.0,1.0,1.0,12.0,2020.0,52.0,4.0,6.0
std,,35032.527297,21.54935,0.499425,0.409346,0.476828,5.806291,0.418804,0.13904,0.13904,0.136169,3.376247,1.107197,14.725899,1.101964,1.703666


In [45]:
df.corr()

Unnamed: 0,Sales,PriceDiscount,InstorePromo,CataloguePromo,StoreEndPromo,GoogleMobility,CovidFlag,VDay,Easter,Christmas,Month,Year,Cw,Quarter,SkuNumber
Sales,1.0,0.403196,0.240891,-0.121081,0.235439,0.058781,-0.085571,-0.008442,-0.010266,-0.035916,0.076782,-0.078891,0.07616,0.072527,-0.085193
PriceDiscount,0.403196,1.0,0.217904,-0.074055,0.216127,-0.213534,0.281537,-0.044565,0.000959,-0.012153,0.055461,0.250843,0.062121,0.053238,0.279167
InstorePromo,0.240891,0.217904,1.0,-0.491438,0.367004,0.056074,-0.024732,0.019761,0.019761,0.013799,0.0813,-0.031572,0.082982,0.074042,0.021807
CataloguePromo,-0.121081,-0.074055,-0.491438,1.0,0.116301,0.07326,-0.094373,-0.044805,-0.044805,0.045834,0.037491,-0.03851,0.03181,0.051696,0.113924
StoreEndPromo,0.235439,0.216127,0.367004,0.116301,1.0,0.084955,-0.075321,0.020148,-0.06661,-0.000322,0.050312,-0.071472,0.047589,0.049433,0.095823
GoogleMobility,0.058781,-0.213534,0.056074,0.07326,0.084955,1.0,-0.756752,0.078227,-0.108645,0.003461,-0.017752,-0.546968,-0.030071,-0.03185,0.004846
CovidFlag,-0.085571,0.281537,-0.024732,-0.094373,-0.075321,-0.756752,1.0,0.007924,0.007924,-0.003052,0.050348,0.722783,0.06232,0.0411,-0.01336
VDay,-0.008442,-0.044565,0.019761,-0.044805,0.020148,0.078227,0.007924,1.0,-0.020101,-0.019669,-0.196251,-0.00284,-0.198504,-0.199089,0.001025
Easter,-0.010266,0.000959,0.019761,-0.044805,-0.06661,-0.108645,0.007924,-0.020101,1.0,-0.019669,-0.112232,-0.00284,-0.121451,-0.070379,0.001025
Christmas,-0.035916,-0.012153,0.013799,0.045834,-0.000322,0.003461,-0.003052,-0.019669,-0.019669,1.0,0.219039,-0.010954,0.205393,0.183028,-0.007852


In [46]:
px.imshow(df.corr().to_pandas(), text_auto=True, aspect="auto")

In [47]:
df.groupby(['Product']).agg(total_sales_month = ('Date','count')).reset_index()

Unnamed: 0,Product,total_sales_month
0,SKU3,204
1,SKU4,204
2,SKU1,204
3,SKU5,204
4,SKU6,198
5,SKU2,204


Note: SKU6 has 6 missing values

## Plots

In [48]:
df.groupby(['Product', 'Year', 'Month']).agg(total_sales_month = ('Sales','sum')).reset_index()

Unnamed: 0,Product,Year,Month,total_sales_month
0,SKU1,2019,5,159307
1,SKU5,2020,3,36434
2,SKU6,2019,5,144149
3,SKU4,2018,7,72545
4,SKU4,2019,10,56929
5,SKU1,2020,10,194136
6,SKU2,2017,11,13721
7,SKU3,2020,6,198209
8,SKU2,2017,7,25227
9,SKU6,2020,9,140430


In [49]:
df.groupby(['Product', 'Cw']).agg(total_sales_week = ('Sales','sum')).reset_index().plot.scatter(x='Cw', y='total_sales_week', 
color="Product", title="Total Weekly sales over 2017-2020")

In [53]:
df.groupby(['Product', 'Date']).agg(mean_sales = ('Sales','mean')).reset_index().plot.scatter(x='Date', y='mean_sales', color="Product", title="Mean sales over 2017-2020")

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 57042)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/

In [51]:
df[df['Product'] == 'SKU1'].groupby(['Year','Cw']).agg(mean_sales = ('Sales','mean')).reset_index().plot.scatter(x='Cw', y='mean_sales', color="Year", title="Mean sales over 2017-2020")