In [1]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("Amazon Sales Analysis").getOrCreate()


In [7]:
file_path = "/content/sample_data/amazon.csv"  # Change this path to your file location
df = spark.read.csv(file_path, header=True, inferSchema=True)


In [8]:
df.printSchema()  # Show column names and types
df.show(5)  # Show first 5 rows


root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- discounted_price: string (nullable = true)
 |-- actual_price: string (nullable = true)
 |-- discount_percentage: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- rating_count: string (nullable = true)
 |-- about_product: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_content: string (nullable = true)
 |-- img_link: string (nullable = true)
 |-- product_link: string (nullable = true)

+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|product_

In [9]:
df = df.na.drop()


In [11]:
from pyspark.sql.functions import to_date

df.orderBy("discount_percentage", ascending=False).show(10)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+
|product_id|        product_name|            category|    discounted_price|        actual_price| discount_percentage|              rating|        rating_count|about_product|             user_id|           user_name|           review_id|        review_title|       review_content|            img_link|        product_link|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+
|B0BLV1GNLN|"WZATCO Pixel | P...| 

In [13]:
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

# Cast 'rating' column to DoubleType
df = df.withColumn("rating", col("rating").cast(DoubleType()))

# Now you can perform the aggregation
df.groupBy("category").avg("rating").orderBy("avg(rating)", ascending=False).show()

+--------------------+------------------+
|            category|       avg(rating)|
+--------------------+------------------+
|Computers&Accesso...|               4.6|
|Computers&Accesso...|               4.5|
|Computers&Accesso...|               4.5|
|OfficeProducts|Of...|               4.5|
|Electronics|Camer...|               4.5|
|Electronics|Power...|               4.5|
|Home&Kitchen|Kitc...|               4.5|
|Electronics|HomeA...|               4.5|
|HomeImprovement|E...|               4.5|
|Home&Kitchen|Craf...|               4.5|
|Electronics|HomeT...|               4.5|
|Home&Kitchen|Kitc...|               4.5|
|Electronics|Mobil...|4.4714285714285715|
|Home&Kitchen|Kitc...|              4.46|
|OfficeProducts|Of...|              4.45|
|Home&Kitchen|Craf...| 4.433333333333334|
|Electronics|Gener...| 4.414285714285714|
|Home&Kitchen|Kitc...|               4.4|
|Electronics|Acces...|               4.4|
|Computers&Accesso...|               4.4|
+--------------------+------------

In [14]:
df.orderBy("rating_count", ascending=False).show(10)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+------+------------+-------------+-------+---------+--------------------+--------------------+---------------------+--------------------+--------------------+
|product_id|        product_name|            category|    discounted_price|        actual_price| discount_percentage|rating|rating_count|about_product|user_id|user_name|           review_id|        review_title|       review_content|            img_link|        product_link|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+------+------------+-------------+-------+---------+--------------------+--------------------+---------------------+--------------------+--------------------+
|B0B3RRWSF6|Fire-Boltt Phoeni...|   120+ Sports Modes| 240*240 PX High ...| Heart Rate Monit...|Electronics|Weara...|  NULL|      ₹9,999|          80%|    4.3|   27,696|"Fi

In [15]:
from pyspark.sql.functions import corr
df.select(corr("discount_percentage", "rating")).show()


+---------------------------------+
|corr(discount_percentage, rating)|
+---------------------------------+
|                             NULL|
+---------------------------------+

