In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create the Spark session for this cleaning job (getOrCreate reuses an
# already-running session instead of starting a second one)
spark = (
    SparkSession.builder
    .appName("Big Data Cleaning with PySpark")
    .getOrCreate()
)

# Step 1: Load the dataset
# The first CSV row supplies the column names, and Spark infers each
# column's type from the data.
df = spark.read.csv("Titanic.csv", header=True, inferSchema=True)

print("Original Dataset:")
df.show(10)
total_records = df.count()
print(f"Total Records: {total_records}")

# Step 2: Handle Missing Values
# Keep only the rows that have no null in any column.
df_clean = df.na.drop()

print("\nAfter Handling Missing Values:")
df_clean.show(10)
records_after_dropna = df_clean.count()
print(f"Total Records After Dropping Missing Values: {records_after_dropna}")

# Step 3: Remove Duplicates
# With no column subset given, two rows count as duplicates only when every
# column matches.
df_clean = df_clean.dropDuplicates()

print("\nAfter Removing Duplicates:")
df_clean.show(5)

# count() is a Spark action, and nothing in this cell is cached, so each call
# launches another job over the read/clean pipeline. Compute the number once
# and reuse it in the final summary below.
final_count = df_clean.count()
print(f"Total Records After Removing Duplicates: {final_count}")

# Step 4: Final Schema & Summary
print("\nFinal Dataset Schema:")
df_clean.printSchema()

print("\nFinal Record Count:", final_count)

# Spark writes a directory of part files at this path (not a single CSV file);
# mode="overwrite" replaces any output left there by a previous run.
df_clean.write.csv("cleaned_titanic_dataset.csv", header=True, mode="overwrite")

# Stop Spark session
spark.stop()


Original Dataset:
+------+----+-----+-----+-------+--------+------+-----+-----+--------+
|   sex| age|sibsp|parch|   fare|embarked| class|  who|alone|survived|
+------+----+-----+-----+-------+--------+------+-----+-----+--------+
|  male|22.0|    1|    0|   7.25|       S| Third|  man|false|       0|
|female|38.0|    1|    0|71.2833|       C| First|woman|false|       1|
|female|26.0|    0|    0|  7.925|       S| Third|woman| true|       1|
|female|35.0|    1|    0|   53.1|       S| First|woman|false|       1|
|  male|35.0|    0|    0|   8.05|       S| Third|  man| true|       0|
|  male|NULL|    0|    0| 8.4583|       Q| Third|  man| true|       0|
|  male|54.0|    0|    0|51.8625|       S| First|  man| true|       0|
|  male| 2.0|    3|    1| 21.075|       S| Third|child|false|       0|
|female|27.0|    0|    2|11.1333|       S| Third|woman|false|       1|
|female|14.0|    1|    0|30.0708|       C|Second|child|false|       1|
+------+----+-----+-----+-------+--------+------+-----+----