In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *


In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Silver Layer')
    .getOrCreate()
)

## Loading the input file

In [4]:
# Declare the schema explicitly instead of relying on inferSchema=True:
# inference needs an extra pass over the file and the resulting types can
# drift if the input changes. The types below are the ones inference
# produced for this file (see the printSchema() output in the next cell).
retail_schema = StructType([
    StructField('InvoiceNo', StringType(), True),
    StructField('StockCode', StringType(), True),
    StructField('Description', StringType(), True),
    StructField('Quantity', IntegerType(), True),
    StructField('InvoiceDate', StringType(), True),  # kept as raw text, not parsed here
    StructField('UnitPrice', DoubleType(), True),
    StructField('CustomerID', IntegerType(), True),
    StructField('Country', StringType(), True),
])

# The first line of the CSV is a header row and is not loaded as data.
data = spark.read.csv('data.csv', header=True, schema=retail_schema)

In [5]:
# Print the column names and types of the loaded DataFrame.
data.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [10]:
# Preview the first three rows without truncating long cell values.
data.show(n=3, truncate=False)

+---------+---------+----------------------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate   |UnitPrice|CustomerID|Country       |
+---------+---------+----------------------------------+--------+--------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |12/1/2010 8:26|2.55     |17850     |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN               |6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER    |8       |12/1/2010 8:26|2.75     |17850     |United Kingdom|
+---------+---------+----------------------------------+--------+--------------+---------+----------+--------------+
only showing top 3 rows



In [9]:
# Total number of rows in the loaded DataFrame.
data.count()

541909

In [11]:
# Count the rows whose CustomerID is missing.
# when(...) without otherwise() yields NULL for non-matching rows,
# and count() skips NULLs, so only the missing-ID rows are counted.
data.agg(
    count(when(col('CustomerID').isNull(), 1)).alias('CustomerID_nulls')
).show()

+----------------+
|CustomerID_nulls|
+----------------+
|          135080|
+----------------+



In [16]:
# One-row DataFrame holding, for every column, the number of NULL values in it.
null_counts = data.agg(*[
    count(when(col(column_name).isNull(), 1)).alias(column_name)
    for column_name in data.columns
])

In [17]:
# Display the per-column NULL counts computed above.
null_counts.show()


+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
|        0|        0|       1454|       0|          0|        0|    135080|      0|
+---------+---------+-----------+--------+-----------+---------+----------+-------+



In [19]:
# Number of rows with a negative Quantity.
data.where(col('Quantity') < 0).count()

10624

In [21]:
# Number of rows whose UnitPrice is zero or negative.
data.where(col('UnitPrice') <= 0).count()

2517

In [22]:
# Peek at a few of the negative-Quantity rows.
data.where(col('Quantity') < 0).show(n=3)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|  C536379|        D|            Discount|      -1| 12/1/2010 9:41|     27.5|     14527|United Kingdom|
|  C536383|   35004C|SET OF 3 COLOURED...|      -1| 12/1/2010 9:49|     4.65|     15311|United Kingdom|
|  C536391|    22556|PLASTERS IN TIN C...|     -12|12/1/2010 10:24|     1.65|     17548|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 3 rows



In [26]:
# Negative-Quantity rows whose InvoiceNo starts with "C".
returns_count = (
    data
    .where(col('Quantity') < 0)
    .where(col('InvoiceNo').startswith("C"))
    .count()
)
returns_count

9288

In [28]:
# Negative-Quantity rows whose InvoiceNo does NOT start with "C".
error_counts = (
    data
    .where(col('Quantity') < 0)
    .where(~col('InvoiceNo').startswith("C"))
    .count()
)
error_counts

1336

In [30]:
# Keep the negative-Quantity rows without a "C" invoice prefix for closer inspection.
suspicious = (
    data
    .where(col('Quantity') < 0)
    .where(~col('InvoiceNo').startswith("C"))
)

In [34]:
# How many of the suspicious rows have no CustomerID.
null_customer = suspicious.where(col('CustomerID').isNull()).count()
null_customer

1336

In [33]:
# How many of the suspicious rows have no Description.
null_descrption = suspicious.where(col('Description').isNull()).count()
null_descrption

862

In [36]:
# Negative-Quantity, non-"C" rows that lack a CustomerID but do carry a Description.
has_descr = (
    data
    .where(col('Quantity') < 0)
    .where(~col('InvoiceNo').startswith("C"))
    .where(col('CustomerID').isNull())
    .where(col('Description').isNotNull())
)

In [38]:
# Number of rows in has_descr.
has_descr.count()

474

In [39]:
# Show ten of these rows in full to read their free-text descriptions.
has_descr.show(n=10, truncate=False)


+---------+---------+--------------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description               |Quantity|InvoiceDate    |UnitPrice|CustomerID|Country       |
+---------+---------+--------------------------+--------+---------------+---------+----------+--------------+
|537032   |21275    |?                         |-30     |12/3/2010 16:50|0.0      |NULL      |United Kingdom|
|537425   |84968F   |check                     |-20     |12/6/2010 15:35|0.0      |NULL      |United Kingdom|
|537426   |84968E   |check                     |-35     |12/6/2010 15:36|0.0      |NULL      |United Kingdom|
|537432   |35833G   |damages                   |-43     |12/6/2010 16:10|0.0      |NULL      |United Kingdom|
|538072   |22423    |faulty                    |-13     |12/9/2010 14:10|0.0      |NULL      |United Kingdom|
|538090   |20956    |?                         |-723    |12/9/2010 14:48|0.0      |NULL      |United Kingdom|
|538161   

In [40]:
# Drop the rows that are simultaneously negative-Quantity, not "C"-prefixed,
# and missing a CustomerID. Written as the De Morgan equivalent of the
# exclusion: a row is kept when at least one of those three conditions fails.
data_clean = data.where(
    (col("Quantity") >= 0)
    | col("InvoiceNo").startswith("C")
    | col("CustomerID").isNotNull()
)

In [43]:
# Rows with positive Quantity but no CustomerID.
normal_null_customers = (
    data
    .where(col('CustomerID').isNull())
    .where(col('Quantity') > 0)
)

In [44]:
# Preview five of those rows without truncating cell values.
normal_null_customers.show(n=5, truncate=False)

+---------+---------+-------------------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                    |Quantity|InvoiceDate    |UnitPrice|CustomerID|Country       |
+---------+---------+-------------------------------+--------+---------------+---------+----------+--------------+
|536414   |22139    |NULL                           |56      |12/1/2010 11:52|0.0      |NULL      |United Kingdom|
|536544   |21773    |DECORATIVE ROSE BATHROOM BOTTLE|1       |12/1/2010 14:32|2.51     |NULL      |United Kingdom|
|536544   |21774    |DECORATIVE CATS BATHROOM BOTTLE|2       |12/1/2010 14:32|2.51     |NULL      |United Kingdom|
|536544   |21786    |POLKADOT RAIN HAT              |4       |12/1/2010 14:32|0.85     |NULL      |United Kingdom|
|536544   |21787    |RAIN PONCHO RETROSPOT          |2       |12/1/2010 14:32|1.66     |NULL      |United Kingdom|
+---------+---------+-------------------------------+--------+---------------+--

In [45]:
# Sum of Quantity * UnitPrice over the positive-Quantity rows without a CustomerID.
# `sum` here is the Spark aggregate brought in by the wildcard import,
# not the Python builtin.
revenue_null = (
    normal_null_customers
    .select((col("Quantity") * col("UnitPrice")).alias("TotalPrice"))
    .agg(sum("TotalPrice").alias("revenue"))
    .first()["revenue"]
)

In [46]:
# Display the revenue total computed for rows without a CustomerID.
revenue_null

1733152.52000013

In [49]:
# Sum of Quantity * UnitPrice over every row of the raw data.
# `sum` here is the Spark aggregate brought in by the wildcard import,
# not the Python builtin.
total_revenue = (
    data
    .select((col("Quantity") * col("UnitPrice")).alias("TotalPrice"))
    .agg(sum("TotalPrice").alias("revenue"))
    .first()["revenue"]
)
total_revenue

9747747.93399951

In [50]:
# Tag each row as 'Guest' (no CustomerID) or 'Registered'.
# Built from data_clean rather than the raw `data`, so the rows removed when
# data_clean was created (negative Quantity, non-"C" invoice, no CustomerID)
# stay excluded. Deriving this from `data` silently brought them back.
# NOTE(review): confirm that data_cleaned is meant to continue from data_clean.
data_cleaned = data_clean.withColumn(
    'customertype',
    when(col('CustomerID').isNull(), 'Guest').otherwise('Registered'),
)

In [51]:
# Preview the first three rows, including the new customertype column.
data_cleaned.show(n=3)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|customertype|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|  Registered|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|  Registered|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|  Registered|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+------------+
only showing top 3 rows



In [54]:
# Rows with a zero or negative UnitPrice on "C"-prefixed invoices.
bad_price_returns = (
    data
    .where(col('UnitPrice') <= 0)
    .where(col('InvoiceNo').startswith('C'))
)
bad_price_returns.count()

0

In [55]:
# Rows with a zero or negative UnitPrice on invoices without the "C" prefix.
bad_price_sales = (
    data
    .where(col('UnitPrice') <= 0)
    .where(~col('InvoiceNo').startswith('C'))
)
bad_price_sales.count()

2517

In [58]:
# Split the bad-price rows into exactly-zero and strictly-negative prices.
price_zero = bad_price_sales.where(col('UnitPrice') == 0).count()
price_negative = bad_price_sales.where(col('UnitPrice') < 0).count()

In [59]:
# Display the number of bad_price_sales rows with a strictly negative UnitPrice.
price_negative

2

In [65]:
# Zero-price rows that have no Description.
(
    bad_price_sales
    .where(col('UnitPrice') == 0)
    .where(col('Description').isNull())
    .count()
)

1454

In [66]:
# Zero-price rows that do have a Description.
(
    bad_price_sales
    .where(col('UnitPrice') == 0)
    .where(col('Description').isNotNull())
    .count()
)

1061

In [68]:
# Zero-price rows missing both Description and CustomerID.
(
    bad_price_sales
    .where(col('UnitPrice') == 0)
    .where(col('Description').isNull())
    .where(col('CustomerID').isNull())
    .count()
)

1454

In [69]:
# Zero-price rows missing a Description but carrying a CustomerID.
(
    bad_price_sales
    .where(col('UnitPrice') == 0)
    .where(col('Description').isNull())
    .where(col('CustomerID').isNotNull())
    .count()
)

0

In [71]:
# Zero-price rows that have a Description and a negative Quantity.
(
    bad_price_sales
    .where(col('UnitPrice') == 0)
    .where(col('Description').isNotNull())
    .where(col('Quantity') < 0)
    .count()
)

474

In [73]:
# Same selection taken from the raw data: zero price, Description present,
# negative Quantity (no condition on the invoice prefix here).
zero_price_negative = (
    data
    .where(col("UnitPrice") == 0)
    .where(col("Description").isNotNull())
    .where(col("Quantity") < 0)
)
zero_price_negative.count()

474

In [74]:
# Split zero_price_negative by whether the InvoiceNo starts with "C".
is_c_invoice = col("InvoiceNo").startswith("C")
with_C = zero_price_negative.where(is_c_invoice).count()
without_C = zero_price_negative.where(~is_c_invoice).count()

In [75]:
# Display the count of zero_price_negative rows whose InvoiceNo starts with "C".
with_C

0

In [76]:
# Display the count of zero_price_negative rows whose InvoiceNo does not start with "C".
without_C

474

In [78]:
# Rows with a zero UnitPrice and a positive Quantity.
zero_price_positive = (
    data
    .where(col("UnitPrice") == 0)
    .where(col("Quantity") > 0)
)

In [80]:
# Number of rows in zero_price_positive.
zero_price_positive.count()

1179

In [81]:
# Split zero_price_positive by whether a Description is present.
with_desc = zero_price_positive.where(col("Description").isNotNull()).count()
without_desc = zero_price_positive.where(col("Description").isNull()).count()

In [82]:
# Display the count of zero_price_positive rows that have a Description.
with_desc

587

In [83]:
# Display the count of zero_price_positive rows that have no Description.
without_desc

592

In [85]:
# Zero-price, positive-Quantity rows that have no Description.
incomplete = (
    data
    .where(col("UnitPrice") == 0)
    .where(col("Quantity") > 0)
    .where(col("Description").isNull())
)

In [86]:
# Preview three of the incomplete rows.
incomplete.show(n=3)

+---------+---------+-----------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-----------+--------+---------------+---------+----------+--------------+
|   536414|    22139|       NULL|      56|12/1/2010 11:52|      0.0|      NULL|United Kingdom|
|   536545|    21134|       NULL|       1|12/1/2010 14:32|      0.0|      NULL|United Kingdom|
|   536546|    22145|       NULL|       1|12/1/2010 14:33|      0.0|      NULL|United Kingdom|
+---------+---------+-----------+--------+---------------+---------+----------+--------------+
only showing top 3 rows



In [87]:
# Zero-price, positive-Quantity rows that do have a Description.
possibly_free = (
    data
    .where(col("UnitPrice") == 0)
    .where(col("Quantity") > 0)
    .where(col("Description").isNotNull())
)

# Show twenty of them in full so the descriptions can be read.
possibly_free.show(n=20, truncate=False)

+---------+---------+--------------------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                     |Quantity|InvoiceDate    |UnitPrice|CustomerID|Country       |
+---------+---------+--------------------------------+--------+---------------+---------+----------+--------------+
|536941   |22734    |amazon                          |20      |12/3/2010 12:08|0.0      |NULL      |United Kingdom|
|536942   |22139    |amazon                          |15      |12/3/2010 12:08|0.0      |NULL      |United Kingdom|
|537197   |22841    |ROUND CAKE TIN VINTAGE GREEN    |1       |12/5/2010 14:02|0.0      |12647     |Germany       |
|537534   |85064    |CREAM SWEETHEART LETTER RACK    |1       |12/7/2010 11:48|0.0      |NULL      |United Kingdom|
|537534   |84832    |ZINC WILLIE WINKIE  CANDLE STICK|1       |12/7/2010 11:48|0.0      |NULL      |United Kingdom|
|537534   |84692    |BOX OF 24 COCKTAIL PARASOLS     |2       |12/7/2010

In [88]:
# Split possibly_free by whether a CustomerID is present.
has_customer_id = col("CustomerID").isNotNull()
free_with_customer = possibly_free.where(has_customer_id).count()
free_without_customer = possibly_free.where(~has_customer_id).count()

In [89]:
# Display the count of possibly_free rows that have a CustomerID.
free_with_customer

40

In [90]:
# Display the count of possibly_free rows that have no CustomerID.
free_without_customer

547