In [None]:

from pyspark.sql import SparkSession

# Create (or reuse, via getOrCreate) the notebook's Spark session: local
# master using all available cores, with 4 GB configured for the driver.
spark = (
    SparkSession.builder
    .appName("FlightsAnalysis")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)


In [None]:
# Read the flights CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
flights = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("flights.csv")
)

# Print the inferred column names and types.
flights.printSchema()

# Preview the first five rows.
flights.show(n=5)

In [None]:
# Report the number of rows in the flights DataFrame.
print(f"total flights: {flights.count()}")

In [None]:
from pyspark.sql import functions as F

# Count rows per (ORIGIN, DEST) pair and sort the pairs by that count,
# largest first.
busiest_routes = (
    flights
    .groupBy("ORIGIN", "DEST")
    .count()
    .orderBy(F.col("count").desc())
)

# Show the ten most frequent pairs without truncating cell contents.
busiest_routes.show(10, truncate=False)

In [None]:
# Per-carrier mean of DEP_DELAY and ARR_DELAY, sorted by CARRIER ascending.
# Each (source column, output alias) pair below becomes one aggregate column.
avg_delay = (
    flights
    .groupBy("CARRIER")
    .agg(
        *(
            F.avg(source_col).alias(output_name)
            for source_col, output_name in (
                ("DEP_DELAY", "AvgDepDelay"),
                ("ARR_DELAY", "AvgArrDelay"),
            )
        )
    )
    .orderBy("CARRIER")
)

avg_delay.show()

In [None]:
# Per-month summary, sorted by MONTH ascending:
#   TotalFlights - number of rows in that month
#   CancelRate   - mean of the CANCELLED column
# NOTE(review): CancelRate is only a true rate if CANCELLED is a 0/1 flag;
# confirm against the printed schema.
monthly = (
    flights
    .groupBy("MONTH")
    .agg(
        F.count("*").alias("TotalFlights"),
        F.avg("CANCELLED").alias("CancelRate"),
    )
    .orderBy("MONTH")
)

monthly.show()

In [None]:
import matplotlib.pyplot as plt

# Collect the per-month aggregate (one row per distinct MONTH) to the driver
# as a pandas DataFrame so matplotlib can plot it.
monthly_pd = monthly.toPandas()

# Draw the line on the current axes, then label everything in one call.
ax = plt.gca()
ax.plot(monthly_pd["MONTH"], monthly_pd["TotalFlights"])
ax.set(xlabel="Month", ylabel="Total Flights", title="Flights per Month")
plt.show()