In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, to_timestamp, sum, lag
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Obtain the Spark session for this notebook (reuses an active one if present),
# registering the application under the name 'Sales'.
spark = (
    SparkSession.builder
    .appName('Sales')
    .getOrCreate()
)

In [None]:
# Load the retail CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
# NOTE(review): '/content/...' is an absolute path (looks like a Colab
# location) — confirm it exists in the environment this notebook runs in.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/Online Retail.csv')
)

In [None]:
# Preview the first 20 rows, truncating long cell values (the defaults of show()).
df.show(n=20, truncate=True)

+---------+---------+--------------------+--------+---------+----------+--------------+-------------------+----+-----+----+---+---------+
|InvoiceNo|StockCode|         Description|Quantity|UnitPrice|CustomerID|       Country|        InvoiceDate|Year|Month|Week|Day|DayOfWeek|
+---------+---------+--------------------+--------+---------+----------+--------------+-------------------+----+-----+----+---+---------+
|   536365|   85123A|WHITE HANGING HEA...|       6|     2.55|     17850|United Kingdom|2010-01-12 08:26:00|2010|    1|   2| 12|        1|
|   536365|    71053| WHITE METAL LANTERN|       6|     3.39|     17850|United Kingdom|2010-01-12 08:26:00|2010|    1|   2| 12|        1|
|   536365|   84406B|CREAM CUPID HEART...|       8|     2.75|     17850|United Kingdom|2010-01-12 08:26:00|2010|    1|   2| 12|        1|
|   536365|   84029G|KNITTED UNION FLA...|       6|     3.39|     17850|United Kingdom|2010-01-12 08:26:00|2010|    1|   2| 12|        1|
|   536365|   84029E|RED WOOLLY HO

In [None]:
# Replace InvoiceDate with a timestamp parsed using the 'MM/dd/yyyy HH:mm' pattern.
# This rebinds `df`, so later cells see the converted column.
# NOTE(review): the preview printed after loading already renders InvoiceDate
# as 'yyyy-MM-dd HH:mm:ss', which does not match this pattern — confirm the
# column's type/format in the raw file before relying on this conversion.
df = df.withColumn(
    'InvoiceDate',
    to_timestamp('InvoiceDate', format='MM/dd/yyyy HH:mm'),
)

In [None]:
# Keep only rows with a strictly positive Quantity and a UnitPrice below 30.
# This rebinds `df`; the unfiltered frame is no longer reachable under that name.
has_positive_quantity = col('Quantity') > 0
is_below_price_cap = col('UnitPrice') < 30
df = df.where(has_positive_quantity & is_below_price_cap)

In [None]:
# Aggregate per (StockCode, Country, InvoiceDate): total Quantity and the sum
# of UnitPrice across the rows in each group.
# `sum` here is the Spark aggregate imported from pyspark.sql.functions at the
# top of the notebook, which shadows Python's builtin sum.
group_keys = ['StockCode', 'Country', 'InvoiceDate']
total_quantity = sum(col('Quantity')).alias('TotalQuantity')
unit_price_total = sum(col('UnitPrice')).alias('UnitPriceTotal')
grouped_df = df.groupBy(*group_keys).agg(total_quantity, unit_price_total)

In [None]:
# Preview the first 20 aggregated rows, truncating long cell values (the defaults of show()).
grouped_df.show(n=20, truncate=True)

+---------+--------------+-------------------+-------------+--------------+
|StockCode|       Country|        InvoiceDate|TotalQuantity|UnitPriceTotal|
+---------+--------------+-------------------+-------------+--------------+
|    22900|United Kingdom|2010-01-12 11:45:00|            2|           5.9|
|    22926|United Kingdom|2010-01-12 12:43:00|            1|          5.95|
|    22963|United Kingdom|2010-01-12 13:17:00|            2|          0.85|
|   46000S|United Kingdom|2010-01-12 14:41:00|            1|          1.45|
|    21486|United Kingdom|2010-01-12 14:41:00|            2|          3.75|
|    37449|United Kingdom|2010-01-12 15:08:00|            6|          19.9|
|    21070|United Kingdom|2010-02-12 10:39:00|           12|          1.25|
|    82483|United Kingdom|2010-02-12 10:54:00|            4|          4.95|
|    82482|United Kingdom|2010-02-12 12:23:00|            6|           2.1|
|    84945|United Kingdom|2010-02-12 13:11:00|           12|          0.85|
|    22191|U