In [2]:
# 1. Install PySpark
# The leading "!" is the notebook shell escape, so this line only works inside
# an IPython/Colab cell, not in a plain Python script.
!pip install pyspark

# 2. Mount Google Drive
# Exposes the Drive contents under /content/drive; the CSV loaded below is
# read from that mount point.
from google.colab import drive
drive.mount('/content/drive')

# 3. Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, rank, lag, year, month, dayofmonth, udf
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

# 4. Create SparkSession
# getOrCreate() reuses an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("RetailTransactions")
    .getOrCreate()
)

# 5. Load CSV with Manual Schema
file_path = "/content/drive/My Drive/retail_data.csv"

# One (column name, Spark type) pair per CSV column. TransactionDate is read
# as a plain string here and converted to a date right after loading.
schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ("TransactionID", StringType()),
        ("Customer", StringType()),
        ("City", StringType()),
        ("Product", StringType()),
        ("Category", StringType()),
        ("Quantity", IntegerType()),
        ("UnitPrice", DoubleType()),
        ("TotalPrice", DoubleType()),
        ("TransactionDate", StringType()),
        ("PaymentMode", StringType()),
    ]
])

# The first line of the file is a header row; the explicit schema is applied
# instead of letting Spark infer column types.
df = spark.read.option("header", True).schema(schema).csv(file_path)
df = df.withColumn("TransactionDate", df["TransactionDate"].cast("date"))

# 6. Data Exploration & Filtering
# Transactions whose total is above 40000.
df.where(df["TotalPrice"] > 40000).show()

# Every distinct city that appears in the data.
df.select(col("City")).distinct().show()

# The same Delhi-only filter written twice: once as a Column expression and
# once as a SQL string (where() and filter() are aliases of each other).
df.where(df["City"] == "Delhi").show()
df.filter("City = 'Delhi'").show()

# 7. Data Manipulation
# Keep a handle on the frame as loaded, while UnitPrice is still present.
# DataFrames are immutable, so the transformations below leave it untouched.
# This replaces a second read of the CSV with inferSchema=True, which cost
# extra passes over the file and ignored the explicitly declared schema.
df_original = df

df = df.withColumn("DiscountedPrice", col("TotalPrice") * 0.9)  # price after a 10% discount
df = df.withColumnRenamed("TransactionDate", "TxnDate")
df = df.drop("UnitPrice")

# 8. Aggregations
# Total sales per city.
df.groupBy("City").sum("TotalPrice").withColumnRenamed("sum(TotalPrice)", "TotalSales").show()

# Average unit price per category. UnitPrice was dropped from df above, so
# this is computed from the pre-drop frame.
df_original.groupBy("Category").avg("UnitPrice").withColumnRenamed("avg(UnitPrice)", "AvgUnitPrice").show()

# Number of transactions per payment mode.
df.groupBy("PaymentMode").count().show()

# 9. Window Functions
# One window per city, rows ordered from the highest TotalPrice to the lowest.
windowSpec = (
    Window
    .partitionBy("City")
    .orderBy(col("TotalPrice").desc())
)

# Rank of each transaction within its city by TotalPrice.
df.select(
    "TransactionID",
    "City",
    "TotalPrice",
    rank().over(windowSpec).alias("Rank"),
).show()

# TotalPrice of the preceding row in the same city window.
df.select(
    "TransactionID",
    "City",
    "TotalPrice",
    lag("TotalPrice").over(windowSpec).alias("PrevAmount"),
).show()

# 10. Joins
# Small in-memory lookup table mapping a city to its region.
city_region_data = [
    ("Mumbai", "West"),
    ("Delhi", "North"),
    ("Bangalore", "South"),
    ("Hyderabad", "South"),
]
city_region_schema = ["City", "Region"]
city_region_df = spark.createDataFrame(city_region_data, schema=city_region_schema)

# Left join: every transaction is kept, and a city missing from the lookup
# ends up with a null Region.
df_joined = df.join(city_region_df, "City", "left")

# Total sales per region.
(
    df_joined
    .groupBy("Region")
    .sum("TotalPrice")
    .withColumnRenamed("sum(TotalPrice)", "TotalSales")
    .show()
)

# 11. Nulls & Data Cleaning
# Null out Quantity for transaction T1002 so there is a missing value to clean.
df_with_null = df.withColumn(
    "Quantity",
    when(col("TransactionID") == "T1002", None).otherwise(col("Quantity")),
)

# Drop the rows where Quantity is null.
df_with_null.dropna(subset=["Quantity"]).show()

# Replace null PaymentMode values with "Unknown"; other columns are untouched.
df_filled = df_with_null.fillna({"PaymentMode": "Unknown"})
df_filled.show()

# 12. UDF for Order Labeling
def label_order(amount):
    """Classify an order total as "High", "Medium" or "Low".

    Args:
        amount: Order total as a number, or None when the value is missing.

    Returns:
        "High" when amount is above 50000, "Medium" when it is between
        30000 and 50000 inclusive, "Low" otherwise. None when amount is None.
    """
    # A SQL NULL reaches a Python UDF as None, and comparing None with a
    # number raises TypeError, so missing totals stay unlabeled instead.
    if amount is None:
        return None
    if amount > 50000:
        return "High"
    if amount >= 30000:
        return "Medium"
    return "Low"

# Wrap the plain Python function as a Spark UDF that returns a string column.
label_udf = udf(label_order, returnType=StringType())

# Label every transaction from its TotalPrice.
df = df.withColumn("OrderLabel", label_udf(df["TotalPrice"]))
df.select(["TransactionID", "TotalPrice", "OrderLabel"]).show()

# 13. Date & Time
# Split TxnDate into separate Year / Month / Day columns.
df_date_parts = (
    df
    .withColumn("Year", year(col("TxnDate")))
    .withColumn("Month", month(col("TxnDate")))
    .withColumn("Day", dayofmonth(col("TxnDate")))
)

df_date_parts.select(["TransactionID", "Year", "Month", "Day"]).show()

# Transactions that took place in February (Month == 2).
(
    df_date_parts
    .where(df_date_parts["Month"] == 2)
    .select(["TransactionID", "TxnDate"])
    .show()
)

# 14. Union & Deduplication
# union() keeps duplicate rows, so stacking df on itself doubles the row count.
df_dup = df.union(df)
# Removing fully identical rows collapses the doubled rows again.
df_no_dup = df_dup.dropDuplicates()

# Report the row count at each stage, in the order the frames were built.
for caption, frame in (
    ("Original count:", df),
    ("After union() :", df_dup),
    ("After dropDuplicates():", df_no_dup),
):
    print(caption, frame.count())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|  70000.0|   70000.0|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|  30000.0|   60000.0|     2024-01-20|        UPI|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|  50000.0|   50000.0|     2024-02-15|       Card|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+

+---------+
|     City|
+---------+
|Bangalore|
|   Mumbai|
|    Delhi|
|Hydera