**Store Sales Forecasting** an ongoing Kaggle competition. This note book is about the ML and predictive modeling of the data.

In [4]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845513 sha256=39ad9f9c70960e7b95f185b160e03647c573b2c08ec121ef5cbcda97310dd8dd
  Stored in directory: /root/.cache/pip/wheels/42/59/f5/79a5bf931714dcd201b26025347785f087370a10a3329a899c
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.7
  

In [5]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [6]:
%%sh
cd /kaggle/input/store-sales-time-series-forecasting/
ls

holidays_events.csv
oil.csv
sample_submission.csv
stores.csv
test.csv
train.csv
transactions.csv


In [7]:
#lets assign var names to the source files for easy references

holidays = '/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv'
oil = '/kaggle/input/store-sales-time-series-forecasting/oil.csv'
stores = '/kaggle/input/store-sales-time-series-forecasting/stores.csv'
train = '/kaggle/input/store-sales-time-series-forecasting/train.csv'
txn = '/kaggle/input/store-sales-time-series-forecasting/transactions.csv'
#We wont be needing those for quite some time
test = '/kaggle/input/store-sales-time-series-forecasting/test.csv'
sample = '/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv'

In [8]:
#starting the spark session and getting the database setup.

spark = SparkSession.builder.appName('sales_fc').getOrCreate()
sparkql= spark.sql
sparkreader = spark.read

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/02 09:02:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
sparkql("SET spark.sql.warehouse.dir").show(truncate=False)

+-----------------------+------------------------------------+
|key                    |value                               |
+-----------------------+------------------------------------+
|spark.sql.warehouse.dir|file:/kaggle/working/spark-warehouse|
+-----------------------+------------------------------------+



In [10]:
#creating local database, even though not having hive file system
sparkql("CREATE DATABASE IF NOT EXISTS sales_forecast")
sparkql("USE sales_forecast")

DataFrame[]

In [11]:
#Reading in the data
holidays_data = sparkreader.csv(holidays,inferSchema=True,header=True)
oil_data = sparkreader.csv(oil,inferSchema=True,header=True)
stores_data = sparkreader.csv(stores,inferSchema=True,header=True)
train_data = sparkreader.csv(train,inferSchema=True,header=True)
txn_data = sparkreader.csv(txn,inferSchema=True,header=True)

                                                                                

Anything that is outside the database is data, once it is 
inside then it is a table. That will keep things separate

In [12]:
#Lets create temp views of the tables first. 
holidays_data.createOrReplaceTempView("holidays_table")
oil_data.createOrReplaceTempView("oil_table")
stores_data.createOrReplaceTempView("stores_table")
train_data.createOrReplaceTempView("train_table")
txn_data.createOrReplaceTempView("txn_table")

The temp tables are dropped like the usual sql tables. sparkql("DROP TABLE holidays_data")

In [13]:
#We have the data inside the spark data base to start manipulation
#using sql
sparkql("SHOW TABLES").show()

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|         |holidays_table|       true|
|         |     oil_table|       true|
|         |  stores_table|       true|
|         |   train_table|       true|
|         |     txn_table|       true|
+---------+--------------+-----------+



### Lets get the data into tables

In [14]:
sparkql("""SELECT MIN(date) as min_date,MAX(date) as max_date,
                MIN(date) - MAX(date) as interval
                from holidays_table""").show()

+-------------------+-------------------+--------------------+
|           min_date|           max_date|            interval|
+-------------------+-------------------+--------------------+
|2012-03-02 00:00:00|2017-12-26 00:00:00|INTERVAL '-2125 0...|
+-------------------+-------------------+--------------------+



In [16]:
holidays_data.select(date_trunc("mm", max(to_date("date", "dd/MM/yyyy"))).\
            alias("max_date"),
            date_trunc("mm", min(to_date("date", "dd/MM/yyyy"))). \
            alias("min_date"),
            (date_trunc("mm", max(to_date("date", "dd/MM/yyyy"))) - \
            date_trunc("mm", min(to_date("date", "dd/MM/yyyy")))).alias('diff_date')).show()

+-------------------+-------------------+--------------------+
|           max_date|           min_date|           diff_date|
+-------------------+-------------------+--------------------+
|2017-12-01 00:00:00|2012-03-01 00:00:00|INTERVAL '2101 00...|
+-------------------+-------------------+--------------------+



### Creating the date sequence that we want

In [17]:
data_date_series = holidays_data.select(date_trunc("mm", max(to_date("date", "dd/MM/yyyy"))).\
            alias("max_date"),
            date_trunc("mm", min(to_date("date", "dd/MM/yyyy"))). \
            alias("min_date")). \
    select(expr("sequence(min_date, max_date, interval 1 day)").alias("date_seq")). \
        withColumn("date_new",explode("date_seq")). \
        withColumn("date_form",date_format("date_new", "yyyy-MM-dd"))

In [18]:
date_series=data_date_series.drop("date_seq","date_new")

In [19]:
date_series.count()

2102

In [20]:
date_series.createOrReplaceTempView('date_table')

In [22]:
sparkql("""SELECT date_form 
            from date_table""").tail(2)

[Row(date_form='2017-11-30'), Row(date_form='2017-12-01')]

In [25]:
#Resorting to the Temp view creation route instead
sparkql(""" SELECT date_form, COALESCE(dcoilwtico,0) as dcoilwtico
        FROM date_table dt LEFT JOIN oil_table ot
        ON dt.date_form = ot.date
        where ot.date > '2013-01-01'"""). \
    createOrReplaceTempView('full_oil_table')

In [26]:
# Creating table the sql style
sparkql("""SELECT * 
            FROM full_oil_table""").show(2)

+----------+----------+
| date_form|dcoilwtico|
+----------+----------+
|2013-01-02|     93.14|
|2013-01-03|     92.97|
+----------+----------+
only showing top 2 rows



In [28]:
sparkql(""" SELECT ot.date_form, ht.date, ht.type, ht.locale,
        ht.locale_name,ot.dcoilwtico
        FROM holidays_table ht JOIN full_oil_table ot
        ON date_format(ht.date,'yyyy-MM-dd') = ot.date_form
""").show(2, truncate=False)
## The tables are joining

+----------+-------------------+-------+--------+-----------+----------+
|date_form |date               |type   |locale  |locale_name|dcoilwtico|
+----------+-------------------+-------+--------+-----------+----------+
|2013-02-11|2013-02-11 00:00:00|Holiday|National|Ecuador    |97.01     |
|2013-02-12|2013-02-12 00:00:00|Holiday|National|Ecuador    |97.48     |
+----------+-------------------+-------+--------+-----------+----------+
only showing top 2 rows



In [29]:
sparkql(""" SELECT ot.date_form, COALESCE(ht.type,'Working') as type, 
        COALESCE(ht.locale,'National') as locale,
        COALESCE(ht.locale_name,'National') as locale_name,
        ot.dcoilwtico
        FROM holidays_table ht RIGHT JOIN full_oil_table ot
        ON date_format(ht.date,'yyyy-MM-dd') = ot.date_form
""").createOrReplaceTempView('full_oil_with_holidays')
## The tables are joining

In [30]:
sparkql("""SELECT * 
            FROM full_oil_with_holidays""").show(2)

+----------+-------+--------+-----------+----------+
| date_form|   type|  locale|locale_name|dcoilwtico|
+----------+-------+--------+-----------+----------+
|2013-01-02|Working|National|   National|     93.14|
|2013-01-03|Working|National|   National|     92.97|
+----------+-------+--------+-----------+----------+
only showing top 2 rows



In [31]:
sparkql("""SELECT * 
            FROM full_oil_with_holidays""").tail(2)

[Row(date_form='2017-08-30', type='Working', locale='National', locale_name='National', dcoilwtico=45.96),
 Row(date_form='2017-08-31', type='Working', locale='National', locale_name='National', dcoilwtico=47.26)]

In [32]:
sparkql("""SELECT * 
            FROM full_oil_with_holidays""").count()

1235

Validating the table join

- Check if there is extra rows

- Find the extra rows 

- Ensure there is no duplication

In [None]:
sparkql(""" SELECT *
        FROM full_oil_with_holidays
""").count()

In [None]:
sparkql(""" SELECT distinct date_form
        FROM full_oil_with_holidays
""").count()

We can observe the date has been duplicated. The reason must be linked with the locales and types. Running a group by with those 
columns must assure there is no data duplication

In [None]:
sparkql(""" SELECT COUNT(1) as typ_counts, date_form, type
        FROM full_oil_with_holidays
        GROUP BY date_form, type
        HAVING COUNT(1) > 1
""").show()

In [None]:
sparkql(""" SELECT ft.date_form, ft.dcoilwtico
        FROM full_oil_with_holidays ft
        EXCEPT
        SELECT ot.date_form, ot.dcoilwtico 
        FROM full_oil_table ot
""").show()

Based on above checks the table join and new view creation is successful. Proceeding to the next join

Stores table shown below looks like a dimension table. The store_nbr can be the unique id. Lets check that.

The store-nbr is arbitrary, to identify a particular store. There are multiple store in same city, state, type and cluster. It is a valid joiner.

In [None]:
sparkql(""" SELECT st.*
        FROM stores_table st
""").show(5)

In [None]:
sparkql(""" SELECT tt.id, date_format(tt.date,'yyyy-MM-dd') as date,
            tt.store_nbr, tt.family, 
            tt.sales, tt.onpromotion
        FROM train_table tt""").createOrReplaceTempView('full_train_table')

In [None]:
sparkql("""select COUNT(1) as day_data,tt.date
            FROM full_train_table tt
            GROUP BY tt.date
            ORDER BY tt.date""").show(2)

In [None]:
sparkql("""SELECT MAX(date) as max_date,
            MIN(date) as min_date,
            MAX(date) - MIN(date) as avbl_span
            FROM train_table""").show(2)

#Number of days is 1687 which is 14 days more than data
# available in oil_data. 

Remember the SQL follows the 
                        
                        From
                                Join
                                
                             Where
                             
                         Groupby
                         
                                 Select
                                 
                                        Order by for execution.
                                        

Based on above execution, order by can see variables present in Select. But G / W cannot see them

Lets get joining the train table with the stores and full_oil_table.

In [None]:
sparkql(""" SELECT ftt.*, st.*,fot.*
        FROM full_train_table ftt JOIN stores_table st
        on ftt.store_nbr = st.store_nbr
        join full_oil_table fot
        on fot.date_form = ftt.date
""").show(2)

In [None]:
sparkql(""" SELECT ftt.*, st.*,fot.*
        FROM full_train_table ftt JOIN stores_table st
        on ftt.store_nbr = st.store_nbr
        RIGHT JOIN full_oil_table fot
        on fot.date_form = ftt.date
""").show(2)

Lets try validating the join by the usual process of checking the data

-- Row Counts of store Number of individual tables and final 
joined tables

In [None]:
sparkql("""SELECT * FROM train_table""").count()

In [None]:
sparkql(""" SELECT ftt.*, st.*,fot.*
        FROM full_train_table ftt JOIN stores_table st
        on ftt.store_nbr = st.store_nbr
        join full_oil_table fot
        on fot.date_form = ftt.date
""").count()

Hmm the rows has been lost... I guess some of the full oil table 
has lesser date rows... 

In [None]:
sparkql("""select COUNT(1) as day_data,tt.date
            FROM full_train_table tt
            GROUP BY tt.date
            ORDER BY tt.date""").count()

In [None]:
sparkql("""select COUNT(1) as day_data,tt.date_form
            FROM full_oil_table tt
            GROUP BY tt.date_form
            ORDER BY tt.date_form""").count()

In [None]:
# That provides part of the answer.
3000888 - 1782 * 10 

In [None]:
# Lets check the store numbers. That tallys up with the 
# 54 store numbers
sparkql("""select COUNT(1) as day_data,tt.store_nbr
            FROM full_train_table tt
            GROUP BY tt.store_nbr
            ORDER BY tt.store_nbr""").count()

So there we found the culprit. We had to do left outer join.
There might be days which is present in train_table and not 
in oil_data. We need to work on that next

In [None]:
sparkql(""" SELECT ftt.*, st.*,fot.*
        FROM full_train_table ftt JOIN stores_table st
        on ftt.store_nbr = st.store_nbr
        LEFT JOIN full_oil_table fot
        on fot.date_form = ftt.date""").count()

In [None]:
sparkql(""" SELECT ftt.id, ftt.date,ftt.store_nbr,ftt.family,
            ftt.sales, ftt.onpromotion, st.city, st.state,st.type,
            st.cluster,fot.dcoilwtico
        FROM full_train_table ftt JOIN stores_table st
        on ftt.store_nbr = st.store_nbr
        LEFT JOIN full_oil_table fot
        on fot.date_form = ftt.date"""). \
        createOrReplaceTempView("train_store_oil_table")

In [None]:
sparkql("""SELECT tsot.*
            FROM train_store_oil_table tsot
            WHERE tsot.date = '2017-08-02'""").show(2)

In [None]:
sparkql("""SELECT date_format(txt.date,'yyyy-MM-dd') as date,
                sum(txt.transactions) as total_txn
                FROM txn_table txt
            GROUP BY date_format(txt.date,'yyyy-MM-dd')
            ORDER BY date""").show(5)

In [None]:
sparkql("""SELECT MAX(date) as max_date,
            MIN(date) as min_date,
            MAX(date) - MIN(date) as avbl_span
            FROM txn_table""").show(2)

In [None]:
sparkql("""SELECT date_format(txt.date,'yyyy-MM-dd') as date,
                sum(txt.transactions) as total_txn
                FROM txn_table txt
            GROUP BY date_format(txt.date,'yyyy-MM-dd')
            ORDER BY date""").tail(5)

There are missing txn data in the middle of the span. Which the above way of checking will not show. Lets proceed with the joining

In [None]:
sparkql("""SELECT tsot.*, date_format(txt.date,'yyyy-MM-dd') as txn_date,
            txt.transactions, txt.store_nbr
            FROM train_store_oil_table tsot LEFT JOIN txn_table txt
            on tsot.date = date_format(txt.date,'yyyy-MM-dd')
        """).show(2)

In [None]:
sparkql("""SELECT tsot.*, date_format(txt.date,'yyyy-MM-dd') as txn_date,
            txt.transactions, txt.store_nbr
            FROM train_store_oil_table tsot LEFT JOIN txn_table txt
            on tsot.date = date_format(txt.date,'yyyy-MM-dd') and
                tsot.store_nbr = txt.store_nbr
        """).count()

In [None]:
sparkql("""SELECT date_format(txt.date,'yyyy-MM-dd') as txn_date,
            txt.transactions, txt.store_nbr
            FROM txn_table txt""").count()

In [None]:
83488 * 3000888

In [None]:
sparkql("""SELECT tsot.*, 
            COALESCE(DATE_FORMAT(txt.date,'yyyy-MM-dd'),tsot.date) as txn_date,
            COALESCE(txt.transactions,0) as store_txns, 
            COALESCE(txt.store_nbr, tsot.store_nbr) as store_nbr
            FROM train_store_oil_table tsot LEFT JOIN txn_table txt
            on tsot.date = date_format(txt.date,'yyyy-MM-dd')
            and tsot.store_nbr = txt.store_nbr
        """).createOrReplaceTempView("all_data_joined_data")

In [None]:
sparkql("""SELECT * FROM all_data_joined_data adj
            WHERE adj.date = '2013-01-01'
            and adj.store_txns != 0""").show()