In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import pandas_udf, PandasUDFType, sum, max, col, concat, lit
import sys
import os

In [2]:
from prophet import Prophet
import pandas as pd
import numpy as np

In [3]:
# Deployment-specific settings used throughout the notebook.
# NOTE: each <<...>> below is a template placeholder, not valid Python as written;
# replace it with a quoted string (e.g. "my-project") before running this cell.
# project_name: used as the connector's parentProject and in the output table id.
project_name=<<your_project_name>>
# dataset_name: BigQuery dataset part of the output table id.
dataset_name=<<your_dataset_name>>
# bucket_name: GCS bucket holding the input CSVs; also used as temporaryGcsBucket.
bucket_name=<<your_bucket_name>>
# user_name: prefix of the output table name (<user_name>_global_predictions).
user_name=<<your_user_name>>

In [4]:
# Build (or reuse) the Spark session, attaching the BigQuery connector jar
spark = (
    SparkSession.builder
    .appName("Timeseries")
    .config(
        'spark.jars',
        'gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar',
    )
    .getOrCreate()
)

In [5]:
# Session settings consumed later by the BigQuery write:
# the temporary GCS bucket and the parent project.
bucket = bucket_name
spark.conf.set("temporaryGcsBucket", bucket)
spark.conf.set("parentProject", project_name)

In [6]:
# Output schema of the forecasting UDF: one nullable field per
# (column name, Spark type) pair, in the order listed here.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("store", StringType()),
        ("item", StringType()),
        ("ds", DateType()),
        ("yhat", DoubleType()),
    )
])

In [7]:
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def fit_pandas_udf(df):
    """
    Fit a Prophet model on the training rows of one group and forecast its
    test rows.

    The function reads two module-level names that must exist before the UDF
    is applied: ``cutoff`` (rows with ``date <= cutoff`` are training data,
    later rows are forecast targets) and ``schema`` (output column names).

    :param df: pandas DataFrame (train + test data) for a single group, with
        columns ``date``, ``store``, ``item`` and ``sales``
    :return: predictions as defined in the output schema
    """

    def train_fitted_prophet(df, cutoff):
        # Prophet expects the timestamp column to be 'ds' and the target 'y'.
        prophet_names = {'date': 'ds', 'sales': 'y'}

        # train: everything up to and including the cutoff date
        ts_train = (df
                    .query('date <= @cutoff')
                    .rename(columns=prophet_names)
                    .sort_values('ds')
                    )
        # test: rows after the cutoff; 'y' is dropped because it is what we
        # predict, and 'ds' is parsed so it can be joined with Prophet's output
        ts_test = (df
                   .query('date > @cutoff')
                   .rename(columns=prophet_names)
                   .sort_values('ds')
                   .assign(ds=lambda x: pd.to_datetime(x["ds"]))
                   .drop(columns='y')
                   )

        # init model
        m = Prophet(yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=True)
        m.fit(ts_train)

        # predict() returns many extra columns; keep only ds/yhat, then merge
        # back onto the test rows to retrieve the item and store columns
        ts_hat = (m.predict(ts_test)[["ds", "yhat"]]
                  .assign(ds=lambda x: pd.to_datetime(x["ds"]))
                  .merge(ts_test, on=["ds"], how="left")
                  )
        # select/reorder columns to match the declared output schema
        return pd.DataFrame(ts_hat, columns=schema.fieldNames())

    return train_fitted_prophet(df, cutoff)

In [8]:
# Load the training CSV (with header row), letting Spark infer column types
tr_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('gs://' + bucket_name + '/timeseries_forecasting/01-datasets/train.csv')
)

In [9]:
# Load the test CSV (with header row), letting Spark infer column types
ts_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('gs://' + bucket_name + '/timeseries_forecasting/01-datasets/test.csv')
)

In [10]:
# Preview the first five training rows without truncating cell values
tr_df.show(n=5, truncate=False)

In [11]:
# Preview the first five test rows without truncating cell values
ts_df.show(n=5, truncate=False)

In [12]:
if __name__ == '__main__':
    # Driver: build the combined train+test frame and apply the forecasting
    # UDF once per (store, item) group. Defines `cutoff` (read by the UDF)
    # and `global_predictions` (used by the following cells).
    spark = (SparkSession
             .builder
             .appName("forecasting")
             # Arrow-based pandas conversion; on Spark 2.x the key is
             # 'spark.sql.execution.arrow.enabled' instead.
             .config('spark.sql.execution.arrow.pyspark.enabled', 'true')
             .getOrCreate()
             )

    # read input data from :https://www.kaggle.com/c/demand-forecasting-kernels-only/data
    # (no schema inference here, so every column is loaded as a string)
    data_train = (spark
                  .read
                  .format("csv")
                  .option('header', 'true')
                  .load('gs://'+bucket_name+'/timeseries_forecasting/01-datasets/train.csv')
                  )

    data_test = (spark
                 .read
                 .format("csv")
                 .option('header', 'true')
                 .load('gs://'+bucket_name+'/timeseries_forecasting/01-datasets/test.csv')
                 .drop('id')
                 )
    # max train date; `max` here is pyspark.sql.functions.max, and the dates
    # are strings, so this assumes a sortable format such as yyyy-MM-dd — TODO confirm
    cutoff = data_train.select(max(col('date'))).collect()[0][0]
    # add an all-null sales column so the test frame has the same columns as train
    data_test = data_test.withColumn('sales', lit(None))
    # concat train and test; columns are matched by name so the result does
    # not depend on the column order of the two CSV files
    df = (data_train.unionByName(data_test)).sort(col('date'))
    # fit one model per (store, item) group and collect the forecasts
    global_predictions = (df
                          .groupBy("store", "item")
                          .apply(fit_pandas_udf)
                          )
    

In [13]:
# Preview the forecasts (20 rows is the default for show())
global_predictions.show(n=20)

In [14]:
# Save the forecasts to the BigQuery table
# <project_name>:<dataset_name>.<user_name>_global_predictions in overwrite mode
(
    global_predictions.write
    .format('bigquery')
    .option('table', project_name+':'+dataset_name+'.'+user_name+'_global_predictions')
    .mode("overwrite")
    .save()
)