## Gathering Data and Imports

In [1]:
import os
# Enumerate every file available under the Kaggle input mount and echo its
# full path, so the dataset locations used by later cells are visible.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/walmart-sales-forecast/features.csv
/kaggle/input/walmart-sales-forecast/stores.csv
/kaggle/input/walmart-sales-forecast/train.csv
/kaggle/input/walmart-sales-forecast/test.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Installing Spark and Creating a Spark Session

In [None]:
# Installing pyspark
!pip install pyspark
import pyspark.sql.functions as f
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Time Series analysis').getOrCreate()

### Loading the train.csv Dataset into a Spark DataFrame

In [7]:
# Load the weekly sales training data. The header row supplies the column
# names and Spark infers the column types from the data.
# The separator option is spelled `sep`; the previous key `delimeter` was a
# typo that Spark silently ignored (it only worked because ',' is the default).
df = (
    spark.read
    .options(sep=',', inferSchema=True, header=True)
    .csv('/kaggle/input/walmart-sales-forecast/train.csv')
)
# Preview the first 10 rows as a pandas table without the index column.
df.limit(10).toPandas().style.hide(axis="index")

                                                                                

Store,Dept,Date,Weekly_Sales,IsHoliday
1,1,2010-02-05,24924.5,False
1,1,2010-02-12,46039.49,True
1,1,2010-02-19,41595.55,False
1,1,2010-02-26,19403.54,False
1,1,2010-03-05,21827.9,False
1,1,2010-03-12,21043.39,False
1,1,2010-03-19,22136.64,False
1,1,2010-03-26,26229.21,False
1,1,2010-04-02,57258.43,False
1,1,2010-04-09,42960.91,False


In [13]:
# Summarise the training data: row count plus the first and last Date.
# All three statistics are computed in ONE aggregation so the (uncached)
# CSV is scanned once instead of three times.
train_rows, mn_dt, mx_dt = df.agg(
    f.count('*'),    # total number of rows, nulls included
    f.min('Date'),
    f.max('Date'),
).first()
print('Total Rows: ', train_rows)
print('Min Date: ', mn_dt)
print('Max Date: ', mx_dt)

                                                                                

Total Rows:  421570
Min Date:  2010-02-05
Max Date:  2012-10-26


In [8]:
# Load the store lookup table. The header row supplies the column names and
# Spark infers the column types from the data.
# The separator option is spelled `sep`; the previous key `delimeter` was a
# typo that Spark silently ignored (it only worked because ',' is the default).
stores = (
    spark.read
    .options(sep=',', inferSchema=True, header=True)
    .csv('/kaggle/input/walmart-sales-forecast/stores.csv')
)
# Preview the first 10 rows as a pandas table without the index column.
stores.limit(10).toPandas().style.hide(axis="index")

Store,Type,Size
1,A,151315
2,A,202307
3,B,37392
4,A,205863
5,B,34875
6,A,202505
7,B,70713
8,A,155078
9,B,125833
10,B,126512


In [16]:
# Sanity-check the store table: compare the row count with the number of
# distinct Store values, then list the distinct store types.
store_row_count = stores.count()
print('Total Rows: ', store_row_count)

distinct_store_count = stores.select('Store').distinct().count()
print('Total Stores: ', distinct_store_count)

store_types = stores.select('Type').distinct()
store_types.show()

Total Rows:  45
Total Stores:  45
+----+
|Type|
+----+
|   B|
|   C|
|   A|
+----+



In [9]:
# Load the features table. The header row supplies the column names and
# Spark infers the column types from the data.
# The separator option is spelled `sep`; the previous key `delimeter` was a
# typo that Spark silently ignored (it only worked because ',' is the default).
features = (
    spark.read
    .options(sep=',', inferSchema=True, header=True)
    .csv('/kaggle/input/walmart-sales-forecast/features.csv')
)
# Preview the first 10 rows as a pandas table without the index column.
features.limit(10).toPandas().style.hide(axis="index")

Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
1,2010-02-05,42.31,2.572,,,,,,211.0963582,8.106,False
1,2010-02-12,38.51,2.548,,,,,,211.2421698,8.106,True
1,2010-02-19,39.93,2.514,,,,,,211.2891429,8.106,False
1,2010-02-26,46.63,2.561,,,,,,211.3196429,8.106,False
1,2010-03-05,46.5,2.625,,,,,,211.3501429,8.106,False
1,2010-03-12,57.79,2.667,,,,,,211.3806429,8.106,False
1,2010-03-19,54.58,2.72,,,,,,211.215635,8.106,False
1,2010-03-26,51.45,2.732,,,,,,211.0180424,8.106,False
1,2010-04-02,62.27,2.719,,,,,,210.8204499,7.808,False
1,2010-04-09,65.86,2.77,,,,,,210.6228574,7.808,False


In [17]:
# Summarise the features table: row count plus the first and last Date.
# All three statistics are computed in ONE aggregation so the (uncached)
# CSV is scanned once instead of three times.
# NOTE: mn_dt / mx_dt are reassigned here and now describe `features`,
# not the training data.
features_rows, mn_dt, mx_dt = features.agg(
    f.count('*'),    # total number of rows, nulls included
    f.min('Date'),
    f.max('Date'),
).first()
print('Total Rows: ', features_rows)
print('Min Date: ', mn_dt)
print('Max Date: ', mx_dt)

Total Rows:  8190
Min Date:  2010-02-05
Max Date:  2013-07-26
