In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count

# Titanic analysis script: load the CSV, show the female passengers,
# aggregate them per passenger class, then count survival outcomes by gender.

# Step 1: Spark session
spark = SparkSession.builder \
    .appName("Task 2 - Titanic Data Analysis") \
    .getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even when a step fails (e.g. Titanic.csv is missing).
try:
    # Step 2: Load Titanic dataset (first row is the header, types inferred)
    df = spark.read.option("header", True).option("inferSchema", True).csv("Titanic.csv")

    # Step 3: Filtering - Female passengers
    # NOTE: the column names in the data are lowercase ("sex", "survived");
    # "Sex"/"Survived" resolve only because Spark SQL column resolution is
    # case-insensitive by default (spark.sql.caseSensitive=false).
    female_passengers = df.filter(col("Sex") == "female")
    print("Female Passengers:")
    female_passengers.show(5)

    # Step 4: Aggregation of the female passengers by passenger class.
    # The input is already filtered to females, so "sex" is constant within
    # every group; it is kept as a grouping key so it appears in the output.
    grouped_df = female_passengers.groupBy("class", "sex").agg(
        avg("age").alias("Average_Age"),
        avg("fare").alias("Average_Fare"),
        # survived is 0/1 in the data, so its mean is the survival rate
        avg("survived").alias("Survival_Rate"),
        count("*").alias("Total_Passengers")
    )

    print("\nGrouped and Aggregated Results (Class & Gender):")
    grouped_df.show()

    # Step 5: Survival count by gender (computed over all passengers)
    survival_by_gender = df.groupBy("Sex", "Survived").agg(count("*").alias("Count"))
    print("\nSurvival by Gender:")
    survival_by_gender.show()
finally:
    # Always release the Spark session and its resources
    spark.stop()


Female Passengers:
+------+----+-----+-----+-------+--------+------+-----+-----+--------+
|   sex| age|sibsp|parch|   fare|embarked| class|  who|alone|survived|
+------+----+-----+-----+-------+--------+------+-----+-----+--------+
|female|38.0|    1|    0|71.2833|       C| First|woman|false|       1|
|female|26.0|    0|    0|  7.925|       S| Third|woman| true|       1|
|female|35.0|    1|    0|   53.1|       S| First|woman|false|       1|
|female|27.0|    0|    2|11.1333|       S| Third|woman|false|       1|
|female|14.0|    1|    0|30.0708|       C|Second|child|false|       1|
+------+----+-----+-----+-------+--------+------+-----+-----+--------+
only showing top 5 rows


Grouped and Aggregated Results (Class & Gender):
+------+------+------------------+------------------+------------------+----------------+
| class|   sex|       Average_Age|      Average_Fare|     Survival_Rate|Total_Passengers|
+------+------+------------------+------------------+------------------+---------------