In [1]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F, types as T

In [2]:
user = gp.getuser()
spark = SparkSession.builder \
    .appName(f'{user}-Week-5-Assignment-2') \
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse') \
    .config('spark.sql.catalogImplementation', 'hive') \
    .enableHiveSupport() \
    .master('yarn') \
    .getOrCreate()

In [3]:
spark

In [4]:
!hadoop fs -ls /public/trendytech/retail_db/products

Found 1 items
-rw-r--r--   3 itv005857 supergroup     174155 2023-04-26 16:47 /public/trendytech/retail_db/products/part-00000


In [5]:
!hadoop fs -head /public/trendytech/retail_db/products/part-00000

1,2,Quest Q64 10 FT. x 10 FT. Slant Leg Instant U,,59.98,http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy
2,2,Under Armour Men's Highlight MC Football Clea,,129.99,http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat
3,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat
4,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat
5,2,Riddell Youth Revolution Speed Custom Footbal,,199.99,http://images.acmesports.sports/Riddell+Youth+Revolution+Speed+Custom+Football+Helmet
6,2,Jordan Men's VI Retro TD Football Cleat,,134.99,http://images.acmesports.sports/Jordan+Men%27s+VI+Retro+TD+Football+Cleat
7,2,Schutt Youth Recruit Hybrid Custom Football H,,99.99,http://images.acmesports.sports/Schutt+Youth+Recruit+Hybrid+Custom+Football+Helmet+2014
8,2,Nike Men's Vapor Ca

In [6]:
schema = T.StructType([
    T.StructField('ProductID', T.IntegerType()), \
    T.StructField('Category', T.IntegerType()), \
    T.StructField('ProductName', T.StringType()), \
    T.StructField('Description', T.StringType()), \
    T.StructField('Price', T.FloatType()), \
    T.StructField('ImageURL', T.StringType()),
])

In [7]:
df_prod = spark.read \
    .schema(schema) \
    .csv('/public/trendytech/retail_db/products/part-00000')

In [8]:
df_prod.printSchema()

root
 |-- ProductID: integer (nullable = true)
 |-- Category: integer (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- ImageURL: string (nullable = true)



In [9]:
df_prod.show(5, truncate=False)

+---------+--------+---------------------------------------------+-----------+------+-------------------------------------------------------------------------------------+
|ProductID|Category|ProductName                                  |Description|Price |ImageURL                                                                             |
+---------+--------+---------------------------------------------+-----------+------+-------------------------------------------------------------------------------------+
|1        |2       |Quest Q64 10 FT. x 10 FT. Slant Leg Instant U|null       |59.98 |http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy|
|2        |2       |Under Armour Men's Highlight MC Football Clea|null       |129.99|http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat     |
|3        |2       |Under Armour Men's Renegade D Mid Football Cl|null       |89.99 |http://images.acmesports.sports/Under+Armour+Men%27s+Re

In [10]:
df_prod.createOrReplaceTempView('products')

In [11]:
spark.sql('show tables').filter('isTemporary = "true" and tableName = "products"').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        | products|       true|
+--------+---------+-----------+



In [12]:
spark.sql('select * from products limit 10').show()

+---------+--------+--------------------+-----------+------+--------------------+
|ProductID|Category|         ProductName|Description| Price|            ImageURL|
+---------+--------+--------------------+-----------+------+--------------------+
|        1|       2|Quest Q64 10 FT. ...|       null| 59.98|http://images.acm...|
|        2|       2|Under Armour Men'...|       null|129.99|http://images.acm...|
|        3|       2|Under Armour Men'...|       null| 89.99|http://images.acm...|
|        4|       2|Under Armour Men'...|       null| 89.99|http://images.acm...|
|        5|       2|Riddell Youth Rev...|       null|199.99|http://images.acm...|
|        6|       2|Jordan Men's VI R...|       null|134.99|http://images.acm...|
|        7|       2|Schutt Youth Recr...|       null| 99.99|http://images.acm...|
|        8|       2|Nike Men's Vapor ...|       null|129.99|http://images.acm...|
|        9|       2|Nike Adult Vapor ...|       null|  50.0|http://images.acm...|
|       10|     

### Find the total number of products in the given dataset.

In [13]:
df_prod.select('ProductID').count()

1345

In [14]:
spark.sql('select count(ProductID) as count from products').show()

+-----+
|count|
+-----+
| 1345|
+-----+



### Find the number of unique categories of products in the given dataset.

In [15]:
df_prod.select('Category').distinct().count()

55

In [16]:
spark.sql('select count(distinct Category) as count from products').show()

+-----+
|count|
+-----+
|   55|
+-----+



### Find the top 5 most expensive products based on their price, along with their product name, category, and image URL.

In [17]:
df_prod.select('Category', 'ProductName', 'Price', 'ImageURL').orderBy('Price', ascending=False).limit(5).show()

+--------+--------------------+-------+--------------------+
|Category|         ProductName|  Price|            ImageURL|
+--------+--------------------+-------+--------------------+
|      10| SOLE E35 Elliptical|1999.99|http://images.acm...|
|       4|  SOLE F85 Treadmill|1799.99|http://images.acm...|
|      10|  SOLE F85 Treadmill|1799.99|http://images.acm...|
|      22|  SOLE F85 Treadmill|1799.99|http://images.acm...|
|      47|"Spalding Beast 6...|1099.99|http://images.acm...|
+--------+--------------------+-------+--------------------+



In [18]:
spark.sql('select Category, ProductName, Price, ImageURL from products order by price desc limit 5').show()

+--------+--------------------+-------+--------------------+
|Category|         ProductName|  Price|            ImageURL|
+--------+--------------------+-------+--------------------+
|      10| SOLE E35 Elliptical|1999.99|http://images.acm...|
|       4|  SOLE F85 Treadmill|1799.99|http://images.acm...|
|      10|  SOLE F85 Treadmill|1799.99|http://images.acm...|
|      22|  SOLE F85 Treadmill|1799.99|http://images.acm...|
|      47|"Spalding Beast 6...|1099.99|http://images.acm...|
+--------+--------------------+-------+--------------------+



### Find the number of products in each category that have a price greater than $100. Display the results in a tabular format that shows the category name and the number of products that satisfy the condition

In [19]:
df_prod.filter('Price > 100') \
.groupBy('Category').count().show()

+--------+-----+
|Category|count|
+--------+-----+
|      31|   17|
|      53|   16|
|      34|   15|
|      44|    9|
|      12|    3|
|      22|    4|
|      47|   10|
|      52|    5|
|      13|    1|
|       6|    5|
|      16|   11|
|       3|    5|
|      20|    7|
|      57|    6|
|      54|    6|
|      48|   17|
|       5|   11|
|      19|   13|
|      41|   11|
|      43|   23|
+--------+-----+
only showing top 20 rows



In [20]:
spark.sql('select Category, count(ProductID) as count from products where Price > 100 group by Category').show()

+--------+-----+
|Category|count|
+--------+-----+
|      31|   17|
|      53|   16|
|      34|   15|
|      44|    9|
|      12|    3|
|      22|    4|
|      47|   10|
|      52|    5|
|      13|    1|
|       6|    5|
|      16|   11|
|       3|    5|
|      20|    7|
|      57|    6|
|      54|    6|
|      48|   17|
|       5|   11|
|      19|   13|
|      41|   11|
|      43|   23|
+--------+-----+
only showing top 20 rows



### What are the product names and prices of products that have a price greater than $200 and belong to category 5?

In [21]:
df_prod.filter('Price > 200 and Category == 5').select('ProductName', 'Price').show()

+--------------------+------+
|         ProductName| Price|
+--------------------+------+
|"Goaliath 54"" In...|499.99|
|Fitness Gear 300 ...|209.99|
|Teeter Hang Ups N...|299.99|
+--------------------+------+



In [22]:
spark.sql('select ProductName, Price from products where Price > 200 and Category = 5').show()

+--------------------+------+
|         ProductName| Price|
+--------------------+------+
|"Goaliath 54"" In...|499.99|
|Fitness Gear 300 ...|209.99|
|Teeter Hang Ups N...|299.99|
+--------------------+------+



In [23]:
spark.stop()