In [7]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("NYC TLC Trip Analysis - RDD")
    .getOrCreate()
)

In [8]:
# Input Parquet file with the trip records.
file_path = "/data/yellow_tripdata_2025-06.parquet"

# Load the Parquet file into a DataFrame.
df = spark.read.format("parquet").load(file_path)

# Expose the same data as an RDD of Row objects.
rdd = df.rdd

In [9]:
# Extract the columns needed downstream: (pickup_date, fare_amount, trip_distance).
from pyspark.sql.functions import to_date

# Keep only the calendar date of the pickup timestamp.
df = df.withColumn("pickup_date", to_date(df.tpep_pickup_datetime))


def _is_valid_trip(row):
    """Return True when fare_amount and trip_distance are both present and positive.

    Null columns reach Python as None, and `None > 0` raises TypeError in
    Python 3, so missing values are rejected explicitly before comparing.
    """
    fare = row["fare_amount"]
    distance = row["trip_distance"]
    if fare is None or distance is None:
        return False
    return fare > 0 and distance > 0


# RDD of Rows restricted to trips with a positive fare and a positive distance.
rdd = (
    df.select("pickup_date", "fare_amount", "trip_distance")
    .rdd
    .filter(_is_valid_trip)
)

In [10]:
# `rdd` feeds every action in this cell. Cache it so its lineage (source read,
# Row conversion, filter) is evaluated once instead of once per action.
rdd.cache()

# Total number of trips.
total_trips = rdd.count()

# Total revenue: sum of fare_amount over all trips.
total_revenue = rdd.map(lambda row: row["fare_amount"]).sum()

# Mean trip distance.
avg_distance = rdd.map(lambda row: row["trip_distance"]).mean()

# Trips per day, collected to the driver as a list of (pickup_date, count) pairs.
trips_per_day = (
    rdd.map(lambda row: (row["pickup_date"], 1))
    .reduceByKey(lambda a, b: a + b)
    .collect()
)

# Revenue per day, collected as a list of (pickup_date, summed fare_amount) pairs.
revenue_per_day = (
    rdd.map(lambda row: (row["pickup_date"], row["fare_amount"]))
    .reduceByKey(lambda a, b: a + b)
    .collect()
)

# Distinct pickup dates. NOTE: this is a lazy RDD, not a number; call
# `.count()` on it to obtain the number of days with at least one trip.
distinct_dates = rdd.map(lambda row: row["pickup_date"]).distinct()

# 10% sample without replacement. The fixed seed makes the sample reproducible
# across re-runs (for the same input and partitioning).
sampled_rdd = rdd.sample(withReplacement=False, fraction=0.1, seed=42)

                                                                                

In [12]:
from pyspark.sql import Row

# Build a two-column (metric, value) summary in which every value is a float.
# RDD-valued results are reduced to row counts first: str() of an RDD is only
# its object repr, and mixing str with float values would keep the `value`
# column from being numeric.
summary_rows = [
    Row(metric="total_trips", value=float(total_trips)),
    Row(metric="total_revenue", value=float(total_revenue)),
    Row(metric="avg_distance", value=float(avg_distance)),
]
summary_rows += [
    Row(metric=f"daily_trips_{k}", value=float(v)) for k, v in trips_per_day
]
summary_rows += [
    Row(metric=f"daily_revenue_{k}", value=float(v)) for k, v in revenue_per_day
]
summary_rows += [
    # Number of distinct pickup dates.
    Row(metric="distinct_dates", value=float(distinct_dates.count())),
    # Number of rows in the sampled RDD.
    Row(metric="sampled_rdd", value=float(sampled_rdd.count())),
]

summary = spark.createDataFrame(summary_rows)

# Write the summary as CSV with a header row, replacing any previous output.
summary.write.csv("/output/summary", header=True, mode="overwrite")

                                                                                