# PySpark Online Retail II Dataset Analysis

This notebook demonstrates how to load and analyze the Online Retail II dataset using PySpark in Google Colab.


## 1. Install Required Packages

First, install PySpark and related dependencies in Google Colab.


In [1]:
# Install required packages
!pip install pyspark pandas openpyxl


Collecting pyspark
  Downloading pyspark-4.0.1.tar.gz (434.2 MB)
     ---------------------------------------- 0.0/434.2 MB ? eta -:--:--
     --------------------------------------- 1.6/434.2 MB 10.3 MB/s eta 0:00:42
     --------------------------------------- 3.9/434.2 MB 11.2 MB/s eta 0:00:39
      -------------------------------------- 6.6/434.2 MB 11.4 MB/s eta 0:00:38
      -------------------------------------- 8.9/434.2 MB 11.5 MB/s eta 0:00:37
     - ------------------------------------ 11.5/434.2 MB 11.6 MB/s eta 0:00:37
     - ------------------------------------ 13.9/434.2 MB 11.6 MB/s eta 0:00:37
     - ------------------------------------ 16.3/434.2 MB 11.6 MB/s eta 0:00:36
     - ------------------------------------ 18.6/434.2 MB 11.6 MB/s eta 0:00:36
     - ------------------------------------ 21.2/434.2 MB 11.7 MB/s eta 0:00:36
     -- ----------------------------------- 23.9/434.2 MB 11.7 MB/s eta 0:00:36
     -- ----------------------------------- 26.2/434.2 MB 11.7

  DEPRECATION: Building 'pyspark' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'pyspark'. Discussion can be found at https://github.com/pypa/pip/issues/6334


## 2. Import Libraries and Initialize Spark Session


In [None]:
# Import necessary libraries
import os
import sys

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum as spark_sum, count, when, isnan, isnull, desc, min as spark_min, max as spark_max

# Initialize Spark session
#
# Spark launches its Python workers with the interpreter named by PYSPARK_PYTHON.
# When that variable is unset on a local machine (notably Windows), the default
# executable name may not resolve to the interpreter running this kernel, and jobs
# abort with "Python worker failed to connect back". Point both the driver and the
# workers at the kernel's own interpreter, without overriding a value the user has
# already chosen. This must happen before the session is created; if a session
# already exists in this kernel, restart the kernel for it to take effect.
os.environ.setdefault("PYSPARK_PYTHON", sys.executable)
os.environ.setdefault("PYSPARK_DRIVER_PYTHON", sys.executable)

# Configure Spark for both local and Colab environments
spark = (
    SparkSession.builder
    .appName("OnlineRetailAnalysis")
    # Adaptive query execution, including coalescing of small shuffle partitions
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # Arrow-based pandas <-> Spark conversion, with fallback to the non-Arrow path
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", "10000")
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true")
    # Memory limits
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    # Python worker behavior
    .config("spark.python.worker.timeout", "300")
    .config("spark.python.worker.reuse", "true")
    .getOrCreate()
)

# Set log level to reduce output noise
spark.sparkContext.setLogLevel("WARN")

print("Spark session initialized successfully!")
print(f"Spark version: {spark.version}")


Spark session initialized successfully!
Spark version: 4.0.1


## 3. Load Data from GitHub

PySpark has no built-in Excel reader, so we read the workbook from GitHub with pandas (which uses `openpyxl` for `.xlsx` files) and then convert the resulting pandas DataFrame to a Spark DataFrame.


In [6]:
# Fetch the Excel workbook from GitHub with pandas, then hand it over to Spark
print("Reading Excel file from GitHub...")

# Where the workbook lives
github_user, github_repo = "Hachi630", "BDAS"
file_path = "online_retail_II.xlsx"

# Raw-content URL: <host>/<user>/<repo>/main/<path>
github_url = "/".join(
    ["https://raw.githubusercontent.com", github_user, github_repo, "main", file_path]
)

# Download and parse the workbook into a pandas DataFrame
pandas_df = pd.read_excel(github_url)

# Build the Spark DataFrame; it is bound to the name df for use by the cells that follow
df = spark.createDataFrame(pandas_df)

print("Data successfully loaded from GitHub into Spark DataFrame!")


Reading Excel file from GitHub...
Data successfully loaded from GitHub into Spark DataFrame!


## 4. Check Data Dimensions

Determine the number of rows and columns in the dataset.


In [5]:
# Report the size of the dataset: rows, columns, and column names
print("=== Data Dimension Information ===")

# Counting rows runs a Spark job; if that job fails, fall back to the pandas
# DataFrame that df was built from so the notebook can still report a row count.
try:
    row_count = df.count()
except Exception as spark_error:
    print(f"Error getting row count with Spark: {spark_error}")
    print("Using pandas DataFrame for row count...")
    row_count = len(pandas_df)
    print(f"Dataset row count (from pandas): {row_count:,}")
else:
    print(f"Dataset row count: {row_count:,}")

# The number of columns comes from the DataFrame's column list
column_count = len(df.columns)
print(f"Dataset column count: {column_count}")

# List the column names themselves
print(f"Column names: {df.columns}")


=== Data Dimension Information ===
Error getting row count with Spark: An error occurred while calling o55.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 1.0 failed 1 times, most recent failure: Lost task 7.0 in stage 1.0 (TID 27) (windows10.microdone.cn executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:252)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:143)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:158)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:178)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:261)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.a

## 5. Preview Data

Display the first few rows to understand the content structure.


In [None]:
# Show the first five rows, without truncating long cell values
print("=== Data Preview (First 5 Rows) ===")
df.show(n=5, truncate=False)


## 6. Data Schema

Print the DataFrame schema to verify data types.


In [None]:
# Print the schema of df so the column types Spark derived from the pandas
# DataFrame can be checked before any analysis relies on them
print("=== Data Schema ===")
df.printSchema()


## 7. Statistical Summary of Numeric Columns

Get basic statistical information for numeric columns.


In [None]:
# Display basic statistical summary for numeric columns
print("=== Numeric Columns Statistical Summary ===")

# describe() with no arguments also summarizes string columns, so the numeric
# columns are selected explicitly to match what this section promises.
# df.dtypes yields (column name, type name) pairs; decimal types carry their
# precision in the name (e.g. "decimal(10,2)"), hence the prefix check.
numeric_type_names = {"tinyint", "smallint", "int", "bigint", "float", "double"}
numeric_columns = [
    name
    for name, dtype in df.dtypes
    if dtype in numeric_type_names or dtype.startswith("decimal")
]

if numeric_columns:
    df.describe(*numeric_columns).show()
else:
    # Calling describe() with an empty list would summarize every column instead
    print("No numeric columns found.")


In [None]:
# Second statistical view of the data, produced by summary() instead of describe()
print("=== Detailed Statistical Summary ===")
detailed_stats = df.summary()
detailed_stats.show()


## 8. Missing Values Check

Check for missing values in the data.


In [None]:
# Check for missing values
print("=== Missing Values Check ===")

# isnan() is only meaningful for floating-point columns; applying it to other
# types (for example a timestamp column) makes Spark reject the query with a
# data-type mismatch. NaN is therefore tested only on float/double columns,
# while NULL is tested on every column.
floating_columns = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

# Calculate missing value count for each column: each row contributes 1 when the
# value is missing and 0 otherwise, and the per-column sum is the missing count.
missing_values = df.select([
    spark_sum(
        when(
            (isnull(c) | isnan(c)) if c in floating_columns else isnull(c),
            1,
        ).otherwise(0)
    ).alias(c)
    for c in df.columns
])
missing_values.show()


## 9. Specific Column Analysis

Analyze special cases in Quantity and UnitPrice columns.


In [None]:
# Summarize the Quantity column: its range, plus how many rows are negative
# (treated here as returns) and how many are positive (normal sales)
print("=== Quantity Column Analysis ===")

quantity = col("Quantity")
quantity_stats = df.select(
    spark_min(quantity).alias("Min Quantity"),
    spark_max(quantity).alias("Max Quantity"),
    # when() without otherwise() yields NULL for non-matching rows, and count()
    # skips NULLs, so each expression counts only the rows meeting its condition
    count(when(quantity < 0, 1)).alias("Return Records Count"),
    count(when(quantity > 0, 1)).alias("Normal Sales Records Count"),
)
quantity_stats.show()


In [None]:
# Check unit price column range
print("=== UnitPrice Column Analysis ===")

# NOTE(review): the published Online Retail II workbook names this column "Price",
# while the earlier Online Retail workbook uses "UnitPrice" — confirm against the
# file actually loaded. Use whichever of the two is present so the cell works
# with either header.
price_column = "UnitPrice" if "UnitPrice" in df.columns else "Price"
unit_price = col(price_column)

price_stats = df.select(
    spark_min(unit_price).alias("Min Unit Price"),
    spark_max(unit_price).alias("Max Unit Price"),
    # count() ignores the NULLs produced by when() for non-matching rows
    count(when(unit_price < 0, 1)).alias("Negative Price Records Count"),
    count(when(unit_price == 0, 1)).alias("Zero Price Records Count"),
)
price_stats.show()


## 10. Group Analysis

Perform group analysis by country and customer.


In [None]:
# Ten countries with the most records, largest first
print("=== Record Count by Country (Top 10) ===")
country_counts = df.groupBy("Country").count()
country_counts.orderBy(col("count").desc()).show(10)


In [None]:
# Ten customer IDs with the most records, largest first
print("=== Record Count by Customer (Top 10) ===")
customer_counts = df.groupBy("Customer ID").count()
customer_counts.orderBy(col("count").desc()).show(10)


## 11. Summary

Dataset basic information summary.


In [None]:
# Recap of the figures gathered by the dimension-check cell (row_count, column_count)
print("=== Analysis Complete ===")
print("Dataset basic information summary:")
print(f"- Total records: {row_count:,}")
print(f"- Column count: {column_count}")
# Report the columns actually present in df instead of a hard-coded list,
# which can silently disagree with the loaded workbook's headers
print(f"- Main columns: {', '.join(df.columns)}")
print("- Data types verified through printSchema()")
print("- Statistical summary shows distribution of numeric columns")
print("- Missing values and anomalies checked")

# Stop Spark session (optional, usually not needed in Colab)
# spark.stop()


## Alternative: Pandas-Only Analysis

If PySpark encounters issues in your local environment, you can use this pandas-only version for analysis.


In [None]:
# Alternative pandas-only analysis (use if PySpark fails)
# This cell is self-contained: it re-imports pandas and reloads the workbook so it
# can be run on its own without any of the Spark cells having succeeded.
import pandas as pd
import numpy as np

# Load data using pandas from GitHub
print("=== Loading Data with Pandas from GitHub ===")

# GitHub repository information
github_user = "Hachi630"
github_repo = "BDAS"
file_path = "online_retail_II.xlsx"

# Construct GitHub raw URL
github_url = f"https://raw.githubusercontent.com/{github_user}/{github_repo}/main/{file_path}"

# Use pandas to read Excel file from GitHub
pandas_df = pd.read_excel(github_url)
print("Data successfully loaded from GitHub!")

# Check data dimensions
print("\n=== Data Dimension Information ===")
row_count = len(pandas_df)
column_count = len(pandas_df.columns)
print(f"Dataset row count: {row_count:,}")
print(f"Dataset column count: {column_count}")
print(f"Column names: {list(pandas_df.columns)}")

# Preview data
print("\n=== Data Preview (First 5 Rows) ===")
print(pandas_df.head())

# Data types
print("\n=== Data Schema ===")
print(pandas_df.dtypes)

# Statistical summary
print("\n=== Statistical Summary ===")
print(pandas_df.describe())

# Missing values
print("\n=== Missing Values Check ===")
print(pandas_df.isnull().sum())

# Quantity analysis: negative quantities are reported as returns, positive as sales
print("\n=== Quantity Column Analysis ===")
quantity = pandas_df['Quantity']
print(f"Min Quantity: {quantity.min()}")
print(f"Max Quantity: {quantity.max()}")
print(f"Return Records Count: {(quantity < 0).sum()}")
print(f"Normal Sales Records Count: {(quantity > 0).sum()}")

# UnitPrice analysis
# NOTE(review): the published Online Retail II workbook names this column "Price",
# while the earlier Online Retail workbook uses "UnitPrice" — confirm against the
# file actually loaded. Use whichever of the two is present.
print("\n=== UnitPrice Column Analysis ===")
price_column = "UnitPrice" if "UnitPrice" in pandas_df.columns else "Price"
unit_price = pandas_df[price_column]
print(f"Min Unit Price: {unit_price.min()}")
print(f"Max Unit Price: {unit_price.max()}")
print(f"Negative Price Records Count: {(unit_price < 0).sum()}")
print(f"Zero Price Records Count: {(unit_price == 0).sum()}")

# Group analysis
print("\n=== Record Count by Country (Top 10) ===")
print(pandas_df['Country'].value_counts().head(10))

print("\n=== Record Count by Customer (Top 10) ===")
print(pandas_df['Customer ID'].value_counts().head(10))
