<a href="https://colab.research.google.com/github/HiepHuynh09/structure-api/blob/main/StructureAPI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Bước 1: Cài đặt Spark trong Colab
!pip install pyspark



In [2]:
# Step 2: Initialize the SparkSession
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, sum, desc

# Build the session for this notebook under a fixed application name;
# getOrCreate() hands back an existing session if one is already running.
spark = (
    SparkSession.builder
    .appName("Online Retail Analysis")
    .getOrCreate()
)

In [3]:
# Step 3: Upload the dataset file
# Calls google.colab's files.upload(); the recorded output of this cell shows
# the file being saved as "Online Retail Dataset.csv", which is the name the
# CSV-reading step uses.
from google.colab import files
# NOTE(review): `uploaded` presumably maps each file name to its content - confirm against the Colab docs.
uploaded = files.upload()

Saving Online Retail Dataset.csv to Online Retail Dataset.csv


In [6]:
# Step 4: Read the CSV file with the Structured API
# Load the uploaded file, treating the first row as the header and letting
# Spark infer the column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Online Retail Dataset.csv")
)

# Print the resulting schema to check column names and types.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [7]:
# Step 5: Analyze the data
# Spark column functions are referenced through the `F` namespace so that the
# aggregations below cannot be confused with Python's builtin `sum`.

# Revenue per transaction line: add the column Revenue = Quantity * UnitPrice
df = df.withColumn("Revenue", F.col("Quantity") * F.col("UnitPrice"))
# Show the first 5 rows
df.show(5)

# Total revenue per country
revenue_by_country = df.groupBy("Country").agg(F.sum("Revenue").alias("TotalRevenue"))
# Display the 10 countries with the highest total revenue, in descending order
revenue_by_country.orderBy(F.desc("TotalRevenue")).show(10, truncate=False)

# Top 10 best-selling products:
# group by product (StockCode, Description) and sum the quantity sold
top_products = (
    df.groupBy("StockCode", "Description")
    .agg(F.sum("Quantity").alias("TotalQuantity"))
    .orderBy(F.desc("TotalQuantity"))
    .limit(10)
)
top_products.show(truncate=False)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+------------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|           Revenue|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|15.299999999999999|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|             20.34|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|              22.0|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|             20.34|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|             20.34|
+---------+---------+---