**Store Sales Forecasting** an ongoing Kaggle competition. This note book is about the ML and predictive modeling of the data.

In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | done
[?25h  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845513 sha256=8b90331b66ffbaabd393e0cbe47043ce383215aac74f6cd1dd9a773edbda5b9d
  Stored in directory: /root/.cache/pip/wheels/42/59/f5/7

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
#lets assign var names to the source files for easy references

holidays = '/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv'
oil = '/kaggle/input/store-sales-time-series-forecasting/oil.csv'
stores = '/kaggle/input/store-sales-time-series-forecasting/stores.csv'
train = '/kaggle/input/store-sales-time-series-forecasting/train.csv'
txn = '/kaggle/input/store-sales-time-series-forecasting/transactions.csv'
#We wont be needing those for quite some time
test = '/kaggle/input/store-sales-time-series-forecasting/test.csv'
sample = '/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv'

In [4]:
#starting the spark session and getting the database setup.

spark = SparkSession.builder.appName('sales_fc').getOrCreate()
sparkql= spark.sql
sparkreader = spark.read

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/02 09:39:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
sparkql("SET spark.sql.warehouse.dir").show(truncate=False)

+-----------------------+------------------------------------+
|key                    |value                               |
+-----------------------+------------------------------------+
|spark.sql.warehouse.dir|file:/kaggle/working/spark-warehouse|
+-----------------------+------------------------------------+



In [6]:
#creating local database, even though not having hive file system
sparkql("CREATE DATABASE IF NOT EXISTS sales_forecast")
sparkql("USE sales_forecast")

DataFrame[]

In [7]:
#Reading in the data
holidays_data = sparkreader.csv(holidays,inferSchema=True,header=True)
oil_data = sparkreader.csv(oil,inferSchema=True,header=True)
stores_data = sparkreader.csv(stores,inferSchema=True,header=True)
train_data = sparkreader.csv(train,inferSchema=True,header=True)
txn_data = sparkreader.csv(txn,inferSchema=True,header=True)

                                                                                

Anything that is outside the database is data, once it is 
inside then it is a table. That will keep things separate

In [8]:
#Lets create temp views of the tables first. 
holidays_data.createOrReplaceTempView("holidays_table")
oil_data.createOrReplaceTempView("oil_table")
stores_data.createOrReplaceTempView("stores_table")
train_data.createOrReplaceTempView("train_table")
txn_data.createOrReplaceTempView("txn_table")

The temp tables are dropped like the usual sql tables. sparkql("DROP TABLE holidays_data")

### Creating the date sequence that we want

In [9]:
data_date_series = oil_data.select(date_trunc("mm", max(to_date("date", "dd/MM/yyyy"))).\
            alias("max_date"),
            date_trunc("mm", min(to_date("date", "dd/MM/yyyy"))). \
            alias("min_date")). \
    select(expr("sequence(min_date, max_date, interval 1 day)").alias("date_seq")). \
        withColumn("date_new",explode("date_seq")). \
        withColumn("date_form",date_format("date_new", "yyyy-MM-dd"))

In [10]:
date_series=data_date_series.drop("date_seq","date_new")

In [11]:
date_series.count()

1674

In [12]:
date_series.createOrReplaceTempView('date_table')

In [13]:
sparkql("""SELECT * from oil_table""").count()

1218

In [14]:
#Resorting to the Temp view creation route instead
sparkql(""" SELECT date_form, COALESCE(dcoilwtico,0) as dcoilwtico
        FROM date_table dt LEFT JOIN oil_table ot
        ON dt.date_form = ot.date"""). \
    createOrReplaceTempView('full_oil_table')

In [15]:
sparkql("""SELECT * 
            FROM full_oil_table""").count()

1674

In [16]:
sparkql(""" SELECT ot.date_form, COALESCE(ht.type,'Working') as type, 
        COALESCE(ht.locale,'National') as locale,
        COALESCE(ht.locale_name,'National') as locale_name,
        ot.dcoilwtico
        FROM holidays_table ht RIGHT JOIN full_oil_table ot
        ON date_format(ht.date,'yyyy-MM-dd') = ot.date_form
""").createOrReplaceTempView('full_oil_with_holidays')
## The tables are joining

In [17]:
sparkql("""SELECT * 
            FROM full_oil_with_holidays""").show(2)

+----------+-------+--------+-----------+----------+
| date_form|   type|  locale|locale_name|dcoilwtico|
+----------+-------+--------+-----------+----------+
|2013-01-01|Holiday|National|    Ecuador|       0.0|
|2013-01-02|Working|National|   National|     93.14|
+----------+-------+--------+-----------+----------+
only showing top 2 rows



In [18]:
sparkql("""SELECT * 
            FROM full_oil_with_holidays""").count()

1704

In [19]:
sparkql(""" SELECT tt.id, date_format(tt.date,'yyyy-MM-dd') as date,
            tt.store_nbr, tt.family, 
            tt.sales, tt.onpromotion
        FROM train_table tt""").createOrReplaceTempView('full_train_table')

In [20]:
sparkql("""SELECT * FROM train_table""").count()

                                                                                

3000888

In [21]:
sparkql(""" SELECT ftt.*, st.*,fot.*
        FROM full_train_table ftt JOIN stores_table st
        on ftt.store_nbr = st.store_nbr
        LEFT JOIN full_oil_table fot
        on fot.date_form = ftt.date""").count()

                                                                                

3000888

In [22]:
sparkql(""" SELECT ftt.id, ftt.date,ftt.store_nbr,ftt.family,
            ftt.sales, ftt.onpromotion, st.city, st.state,st.type,
            st.cluster,fot.dcoilwtico
        FROM full_train_table ftt JOIN stores_table st
        on ftt.store_nbr = st.store_nbr
        LEFT JOIN full_oil_table fot
        on fot.date_form = ftt.date"""). \
        createOrReplaceTempView("train_store_oil_table")

In [23]:
sparkql("""SELECT tsot.*
            FROM train_store_oil_table tsot""").count()

                                                                                

3000888

In [24]:
sparkql("""SELECT tsot.*, 
            COALESCE(DATE_FORMAT(txt.date,'yyyy-MM-dd'),tsot.date) as txn_date,
            COALESCE(txt.transactions,0) as store_txns, 
            COALESCE(txt.store_nbr, tsot.store_nbr) as store_nbr
            FROM train_store_oil_table tsot LEFT JOIN txn_table txt
            on tsot.date = date_format(txt.date,'yyyy-MM-dd')
            and tsot.store_nbr = txt.store_nbr
        """).createOrReplaceTempView("all_data_joined_data")

In [25]:
sparkql("""SELECT * FROM all_data_joined_data adj
            WHERE adj.date = '2013-01-01'
            and adj.store_txns != 0""").show()

+---+----------+---------+-------------------+---------+-----------+-------+-----------+----+-------+----------+----------+----------+---------+
| id|      date|store_nbr|             family|    sales|onpromotion|   city|      state|type|cluster|dcoilwtico|  txn_date|store_txns|store_nbr|
+---+----------+---------+-------------------+---------+-----------+-------+-----------+----+-------+----------+----------+----------+---------+
|561|2013-01-01|       25|         AUTOMOTIVE|      0.0|          0|Salinas|Santa Elena|   D|      1|       0.0|2013-01-01|       770|       25|
|562|2013-01-01|       25|          BABY CARE|      0.0|          0|Salinas|Santa Elena|   D|      1|       0.0|2013-01-01|       770|       25|
|563|2013-01-01|       25|             BEAUTY|      2.0|          0|Salinas|Santa Elena|   D|      1|       0.0|2013-01-01|       770|       25|
|564|2013-01-01|       25|          BEVERAGES|    810.0|          0|Salinas|Santa Elena|   D|      1|       0.0|2013-01-01|       