**Store Sales Forecasting** an ongoing Kaggle competition. This note book is about the ML and predictive modeling of the data.

In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
[?25h  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845513 sha256=7508aae9ca99287c658e1f18a8fd3e330429c11c732f5aaa86d83f276fc7ffd0
  Stored in directory

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
#lets assign var names to the source files for easy references

holidays = '/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv'
oil = '/kaggle/input/store-sales-time-series-forecasting/oil.csv'
stores = '/kaggle/input/store-sales-time-series-forecasting/stores.csv'
train = '/kaggle/input/store-sales-time-series-forecasting/train.csv'
txn = '/kaggle/input/store-sales-time-series-forecasting/transactions.csv'
#We wont be needing those for quite some time
test = '/kaggle/input/store-sales-time-series-forecasting/test.csv'
sample = '/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv'

In [4]:
#starting the spark session and getting the database setup.

spark = SparkSession.builder.appName('sales_fc').getOrCreate()
sparkql= spark.sql
sparkreader = spark.read

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/02 11:15:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
sparkql("SET spark.sql.warehouse.dir").show(truncate=False)

+-----------------------+------------------------------------+
|key                    |value                               |
+-----------------------+------------------------------------+
|spark.sql.warehouse.dir|file:/kaggle/working/spark-warehouse|
+-----------------------+------------------------------------+



In [6]:
#creating local database, even though not having hive file system
sparkql("CREATE DATABASE IF NOT EXISTS sales_forecast")
sparkql("USE sales_forecast")

DataFrame[]

In [7]:
#Reading in the data
holidays_data = sparkreader.csv(holidays,inferSchema=True,header=True)
oil_data = sparkreader.csv(oil,inferSchema=True,header=True)
stores_data = sparkreader.csv(stores,inferSchema=True,header=True)
train_data = sparkreader.csv(train,inferSchema=True,header=True)
txn_data = sparkreader.csv(txn,inferSchema=True,header=True)

                                                                                

In [8]:
test_data = sparkreader.csv(test,inferSchema=True,header=True)

Anything that is outside the database is data, once it is 
inside then it is a table. That will keep things separate

In [9]:
#Lets create temp views of the tables first. 
holidays_data.createOrReplaceTempView("holidays_table")
oil_data.createOrReplaceTempView("oil_table")
stores_data.createOrReplaceTempView("stores_table")
train_data.createOrReplaceTempView("train_table")
txn_data.createOrReplaceTempView("txn_table")

In [10]:
test_data.createOrReplaceTempView("test_table")

The temp tables are dropped like the usual sql tables. sparkql("DROP TABLE holidays_data")

### Creating the date sequence that we want

In [11]:
data_date_series = holidays_data.select(date_trunc("mm", max(to_date("date", "dd/MM/yyyy"))).\
            alias("max_date"),
            date_trunc("mm", min(to_date("date", "dd/MM/yyyy"))). \
            alias("min_date")). \
    select(expr("sequence(min_date, max_date, interval 1 day)").alias("date_seq")). \
        withColumn("date_new",explode("date_seq")). \
        withColumn("date_form",date_format("date_new", "yyyy-MM-dd"))

In [12]:
holidays_data.select(date_trunc("mm", max(to_date("date", "yyyy-mm-dd"))).\
            alias("max_date"),
            date_trunc("mm", min(to_date("date", "yyyy-mm-dd"))). \
            alias("min_date")).show()

+-------------------+-------------------+
|           max_date|           min_date|
+-------------------+-------------------+
|2017-12-01 00:00:00|2012-03-01 00:00:00|
+-------------------+-------------------+



In [13]:
date_series=data_date_series.drop("date_seq","date_new")

In [14]:
date_series.tail(2)

[Row(date_form='2017-11-30'), Row(date_form='2017-12-01')]

In [15]:
date_series.count()

2102

In [16]:
date_series.createOrReplaceTempView('date_table')

In [17]:
sparkql("""SELECT * from oil_table""").count()

1218

In [18]:
#Resorting to the Temp view creation route instead
sparkql(""" SELECT date_form, COALESCE(dcoilwtico,0) as dcoilwtico
        FROM date_table dt LEFT JOIN oil_table ot
        ON dt.date_form = ot.date"""). \
    createOrReplaceTempView('full_oil_table')

In [19]:
sparkql("""SELECT * 
            FROM full_oil_table""").count()

2102

In [20]:
sparkql("""SELECT * FROM full_oil_table""").tail(10)

[Row(date_form='2017-11-22', dcoilwtico=0.0),
 Row(date_form='2017-11-23', dcoilwtico=0.0),
 Row(date_form='2017-11-24', dcoilwtico=0.0),
 Row(date_form='2017-11-25', dcoilwtico=0.0),
 Row(date_form='2017-11-26', dcoilwtico=0.0),
 Row(date_form='2017-11-27', dcoilwtico=0.0),
 Row(date_form='2017-11-28', dcoilwtico=0.0),
 Row(date_form='2017-11-29', dcoilwtico=0.0),
 Row(date_form='2017-11-30', dcoilwtico=0.0),
 Row(date_form='2017-12-01', dcoilwtico=0.0)]

In [21]:
sparkql(""" SELECT ot.date_form, COALESCE(ht.type,'Working') as type, 
        COALESCE(ht.locale,'National') as locale,
        COALESCE(ht.locale_name,'National') as locale_name,
        COALESCE(ht.transferred,false) as transferred,
        ot.dcoilwtico
        FROM holidays_table ht RIGHT JOIN full_oil_table ot
        ON date_format(ht.date,'yyyy-MM-dd') = ot.date_form
""").createOrReplaceTempView('full_oil_with_holidays')
## The tables are joining

In [22]:
sparkql("""SELECT * 
            FROM full_oil_with_holidays""").show(2)

+----------+-------+--------+-----------+-----------+----------+
| date_form|   type|  locale|locale_name|transferred|dcoilwtico|
+----------+-------+--------+-----------+-----------+----------+
|2012-03-01|Working|National|   National|      false|       0.0|
|2012-03-02|Holiday|   Local|      Manta|      false|       0.0|
+----------+-------+--------+-----------+-----------+----------+
only showing top 2 rows



In [23]:
sparkql("""SELECT * 
            FROM full_oil_with_holidays""").count()

2138

In [24]:
sparkql(""" SELECT tt.id, date_format(tt.date,'yyyy-MM-dd') as date,
            tt.store_nbr, tt.family, 
            tt.sales, tt.onpromotion
        FROM train_table tt""").createOrReplaceTempView('full_train_table')

In [25]:
sparkql(""" SELECT te.id, date_format(te.date,'yyyy-MM-dd') as date,
            te.store_nbr, te.family, te.onpromotion
        FROM test_table te""").createOrReplaceTempView('full_test_table')

In [26]:
sparkql("""SELECT * FROM full_train_table""").count()

                                                                                

3000888

In [27]:
sparkql("""SELECT * FROM full_test_table""").count()

28512

In [28]:
# table short form used is ftt. and table created is train_store_oil_table
sparkql(""" SELECT ftt.id, ftt.date,ftt.store_nbr,ftt.family,
            ftt.sales, ftt.onpromotion, st.city, st.state,st.type,
            st.cluster,fot.dcoilwtico, fot.type, fot.locale,fot.locale_name,
            fot.transferred
        FROM full_train_table ftt JOIN stores_table st
        on ftt.store_nbr = st.store_nbr
        LEFT JOIN full_oil_with_holidays fot
        on fot.date_form = ftt.date"""). \
        createOrReplaceTempView("train_store_oil_table")

In [29]:
sparkql("""SELECT * FROM train_store_oil_table
                    ORDER BY date""").tail(2)

                                                                                

[Row(id=3000886, date='2017-08-15', store_nbr=9, family='SCHOOL AND OFFICE SUPPLIES', sales=121.0, onpromotion=8, city='Quito', state='Pichincha', type='B', cluster=6, dcoilwtico=47.57, type='Holiday', locale='Local', locale_name='Riobamba', transferred=False),
 Row(id=3000887, date='2017-08-15', store_nbr=9, family='SEAFOOD', sales=16.0, onpromotion=0, city='Quito', state='Pichincha', type='B', cluster=6, dcoilwtico=47.57, type='Holiday', locale='Local', locale_name='Riobamba', transferred=False)]

In [30]:
sparkql("""SELECT * FROM train_store_oil_table
                    ORDER BY date""").show(2)

[Stage 76:>                                                         (0 + 4) / 4]

+---+----------+---------+----------+-----+-----------+-----+---------+----+-------+----------+-------+--------+-----------+-----------+
| id|      date|store_nbr|    family|sales|onpromotion| city|    state|type|cluster|dcoilwtico|   type|  locale|locale_name|transferred|
+---+----------+---------+----------+-----+-----------+-----+---------+----+-------+----------+-------+--------+-----------+-----------+
|  2|2013-01-01|        1|    BEAUTY|  0.0|          0|Quito|Pichincha|   D|     13|       0.0|Holiday|National|    Ecuador|      false|
|  0|2013-01-01|        1|AUTOMOTIVE|  0.0|          0|Quito|Pichincha|   D|     13|       0.0|Holiday|National|    Ecuador|      false|
+---+----------+---------+----------+-----+-----------+-----+---------+----+-------+----------+-------+--------+-----------+-----------+
only showing top 2 rows



                                                                                

In [31]:
# table short form used is ftt. and table created is test_store_oil_table
# There is no sales column.
sparkql(""" SELECT ftt.id, ftt.date,ftt.store_nbr,ftt.family,
            ftt.onpromotion, st.city, st.state,st.type,
            st.cluster,fot.dcoilwtico, fot.type, fot.locale,fot.locale_name,
            fot.transferred
        FROM full_test_table ftt JOIN stores_table st
        on ftt.store_nbr = st.store_nbr
        LEFT JOIN full_oil_with_holidays fot
        on fot.date_form = ftt.date"""). \
        createOrReplaceTempView("test_store_oil_table")

In [32]:
sparkql("""SELECT tsot.*
            FROM train_store_oil_table tsot""").count()

                                                                                

3054348

In [33]:
sparkql("""SELECT tsot.*
            FROM test_store_oil_table tsot
                ORDER BY date""").show(2)

+-------+----------+---------+----------+-----------+-----+---------+----+-------+----------+-------+--------+-----------+-----------+
|     id|      date|store_nbr|    family|onpromotion| city|    state|type|cluster|dcoilwtico|   type|  locale|locale_name|transferred|
+-------+----------+---------+----------+-----------+-----+---------+----+-------+----------+-------+--------+-----------+-----------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|Quito|Pichincha|   D|     13|      46.8|Working|National|   National|      false|
|3000890|2017-08-16|        1|    BEAUTY|          2|Quito|Pichincha|   D|     13|      46.8|Working|National|   National|      false|
+-------+----------+---------+----------+-----------+-----+---------+----+-------+----------+-------+--------+-----------+-----------+
only showing top 2 rows



In [34]:
sparkql("""SELECT tsot.*
            FROM test_store_oil_table tsot
                ORDER BY date""").count()

28512

In [35]:
sparkql("""SELECT tsot.*, 
            COALESCE(DATE_FORMAT(txt.date,'yyyy-MM-dd'),tsot.date) as txn_date,
            COALESCE(txt.transactions,0) as store_txns 
            FROM train_store_oil_table tsot LEFT JOIN txn_table txt
            on tsot.date = date_format(txt.date,'yyyy-MM-dd')
            and tsot.store_nbr = txt.store_nbr
        """).createOrReplaceTempView("all_data_joined_train_data")

In [36]:
sparkql("""SELECT * FROM txn_table""").tail(2)

# There will be no txn data for the test_data time interval. So no need to join that 
# table

[Row(date=datetime.datetime(2017, 8, 15, 0, 0), store_nbr=53, transactions=932),
 Row(date=datetime.datetime(2017, 8, 15, 0, 0), store_nbr=54, transactions=802)]

In [37]:
sparkql("""SELECT tsot.*, 
            COALESCE(DATE_FORMAT(txt.date,'yyyy-MM-dd'),tsot.date) as txn_date,
            COALESCE(txt.transactions,0) as store_txns 
            FROM test_store_oil_table tsot LEFT JOIN txn_table txt
            on tsot.date = date_format(txt.date,'yyyy-MM-dd')
            and tsot.store_nbr = txt.store_nbr
        """).createOrReplaceTempView("all_data_joined_train_data")

In [38]:
sparkql("""SELECT * FROM all_data_joined_train_data adj""").show(2)

+-------+----------+---------+----------+-----------+-----+---------+----+-------+----------+-------+--------+-----------+-----------+----------+----------+
|     id|      date|store_nbr|    family|onpromotion| city|    state|type|cluster|dcoilwtico|   type|  locale|locale_name|transferred|  txn_date|store_txns|
+-------+----------+---------+----------+-----------+-----+---------+----+-------+----------+-------+--------+-----------+-----------+----------+----------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|Quito|Pichincha|   D|     13|      46.8|Working|National|   National|      false|2017-08-16|         0|
|3000889|2017-08-16|        1| BABY CARE|          0|Quito|Pichincha|   D|     13|      46.8|Working|National|   National|      false|2017-08-16|         0|
+-------+----------+---------+----------+-----------+-----+---------+----+-------+----------+-------+--------+-----------+-----------+----------+----------+
only showing top 2 rows

