In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Initialize the Spark session (reuses an already-running session if one exists).
spark = SparkSession.builder \
    .appName("Data Transformation") \
    .getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even when loading or transforming the data fails (e.g. missing file
# or missing column); otherwise the session would be left running.
try:
    # Load the data from CSV into a Spark DataFrame; the first line is the
    # header and column types are inferred from the data.
    df = spark.read.csv('/content/btw.csv', header=True, inferSchema=True)

    # 1. Keep only rows whose 'order_id' is present and convertible to int.
    #    Casting NULL yields NULL, so this single check also rejects missing
    #    values; a separate isNotNull() on the raw column is redundant.
    #    Note: the column itself is only filtered, not converted to int.
    df = df.filter(col("order_id").cast("int").isNotNull())

    # 2. Drop rows where 'product_id' is 0.
    #    A NULL 'product_id' makes the comparison evaluate to NULL, so those
    #    rows are dropped by the filter as well.
    df = df.filter(col("product_id") != 0)

    # 3. Cap 'amount' at 1500; values at or below the cap (and NULLs) are
    #    left unchanged.
    df = df.withColumn(
        "amount",
        when(col("amount") > 1500, 1500).otherwise(col("amount")),
    )

    # 4. Remove rows where 'status' is NULL.
    df = df.filter(col("status").isNotNull())

    # Display the transformed DataFrame.
    df.show()
finally:
    # Stop the Spark session whether or not the pipeline succeeded.
    spark.stop()



+--------------------+--------+----------+-------+---------+--------------+--------------------+------------------+--------------------+--------------------+--------------------+
|             item_id|order_id|product_id| amount|   status|item_timestamp|            location|     customer_name|      customer_phone|             country|         description|
+--------------------+--------+----------+-------+---------+--------------+--------------------+------------------+--------------------+--------------------+--------------------+
|4dc01ae9-c1a8-461...|  160794|       647| 1500.0|Cancelled|       34:34.6|        East Cameron|   Richard Stevens|   (774)709-6342x106|           Guatemala|Room as address h...|
|77944e0c-f500-456...|  510841|       243| 848.88|Cancelled|       05:37.4|South Christinaburgh|     Patrick Allen|    001-734-642-3018|          Mauritania|Accept part crime...|
|1019711d-53c9-401...|  259964|       209| 614.64| Returned|       18:15.2|    South Jeremybury|       We

# New Section