In [1]:
# Initialize findspark
import findspark
findspark.init()

# Import SparkSession
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os
# Load AWS credentials from a local .env file (if one exists) and the process
# environment, so that no secret is written into the notebook itself.
load_dotenv()
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")

# Fail fast when a credential is missing or empty. os.getenv() returns None for
# an unset variable, and that value would otherwise be handed to the S3A
# configuration below and only show up later as an S3 access failure.
_missing_credentials = [
    name
    for name, value in (
        ("AWS_ACCESS_KEY", AWS_ACCESS_KEY),
        ("AWS_SECRET_KEY", AWS_SECRET_KEY),
    )
    if not value
]
if _missing_credentials:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_credentials)
        + ". Define them in the environment or in a .env file."
    )

# Create SparkSession with the hadoop-aws / AWS SDK packages needed for s3a:// paths
spark = (
    SparkSession.builder
    .appName("Flight Delays Streaming in Notebook")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.1026",
    )
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .getOrCreate()
)

# Set log level to avoid too much noise
spark.sparkContext.setLogLevel("WARN")



:: loading settings :: url = jar:file:/usr/local/Cellar/apache-spark/3.5.5/libexec/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/soham/.ivy2/cache
The jars for the packages stored in: /Users/soham/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9e29d862-b098-4ec7-aed7-49b5f49b296a;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.1026 in central
:: resolution report :: resolve 893ms :: artifacts dl 21ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.1026 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.1 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	:: evicted modules:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 by [com.amazonaws#aws-java-sdk-bundle;1.11.1026] in [default]
	------------------------------------------------------------

In [5]:
from pyspark.sql.functions import avg, month, when, col, count, sum as _sum
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, PCA
from pyspark.ml.clustering import KMeans

# 1. Load every 2024 extract from S3; the first row is the header and column
#    types are inferred by Spark.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3a://bigdata-flight-delays/2024_*.csv")
)

# 2. Force the delay columns to float, whatever type was inferred for them
delay_cols = [
    "DepDelayMinutes", "ArrDelayMinutes", "WeatherDelay",
    "CarrierDelay", "NASDelay", "LateAircraftDelay"
]
df = df.withColumns({name: col(name).cast("float") for name in delay_cols})

# 3. Keep flights that were not cancelled (a missing Cancelled value is kept too)
df = df.filter(col("Cancelled").isNull() | (col("Cancelled") == 0))

# 4. Derive Month from FlightDate, then map it to a Season.
#    Anything not matched by the three explicit branches (months 9-11, or a
#    missing month) falls through to "Fall".
season_expr = (
    when(col("Month").isin([12, 1, 2]), "Winter")
    .when(col("Month").isin([3, 4, 5]), "Spring")
    .when(col("Month").isin([6, 7, 8]), "Summer")
    .otherwise("Fall")
)
df = (
    df.withColumn("Month", month("FlightDate").cast("int"))
      .withColumn("Season", season_expr)
)

# 5. Replace nulls with 0. No subset is passed, so this is not restricted to
#    the delay columns listed above.
df = df.fillna(0)



                                                                                

In [None]:
# 7. Group and aggregate per route / airline / season / day of week.
#    `_sum` is pyspark.sql.functions.sum as imported in the import cell. The
#    bare name `sum` is Python's builtin unless a later cell has shadowed it,
#    so using it here breaks a clean top-to-bottom run.
df_grouped = df.groupBy("Origin", "Dest", "Reporting_Airline", "Season", "DayOfWeek").agg(
    count("*").alias("TotalFlights"),
    _sum("DepDel15").alias("DepDelayedCount"),
    _sum("ArrDel15").alias("ArrDelayedCount"),

    # when() without otherwise() yields null for non-matching rows and avg()
    # skips nulls, so each average is taken over the matching flights only.
    avg(when(col("DepDel15") == 1, col("DepDelayMinutes"))).alias("AvgDepDelayDelayed"),
    avg(when(col("ArrDel15") == 1, col("ArrDelayMinutes"))).alias("AvgArrDelayDelayed"),
    avg(when(col("WeatherDelay") > 0, col("WeatherDelay"))).alias("AvgWeatherDelayDelayed"),
    avg(when(col("CarrierDelay") > 0, col("CarrierDelay"))).alias("AvgCarrierDelayDelayed"),
    avg(when(col("NASDelay") > 0, col("NASDelay"))).alias("AvgNASDelayDelayed"),
    avg(when(col("LateAircraftDelay") > 0, col("LateAircraftDelay"))).alias("AvgLateAircraftDelayDelayed")
)

# 8. Compute % delayed
df_grouped = df_grouped.withColumn("PctDepDelayed", (col("DepDelayedCount") / col("TotalFlights")) * 100)
df_grouped = df_grouped.withColumn("PctArrDelayed", (col("ArrDelayedCount") / col("TotalFlights")) * 100)

# 9. Save the grouped data to S3 in 1 file using coalesce.
#    The write returns None, so its result must not be assigned back to
#    df_grouped; the DataFrame stays usable after this cell.
df_grouped.coalesce(1).write.csv("s3a://bigdata-flight-delays/flight_grouped", header=True, mode="overwrite")


25/05/09 12:33:55 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/05/09 12:33:57 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                

In [14]:
# Reload the grouped aggregates written to S3 and replace nulls with 0
# (no subset is given to fillna).
df_grouped = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3a://bigdata-flight-delays/flight_grouped")
    .fillna(0)
)

                                                                                

In [15]:
# Inspect the column types Spark inferred when the grouped CSV was read back.
df_grouped.printSchema()

root
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Reporting_Airline: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- TotalFlights: integer (nullable = true)
 |-- DepDelayedCount: double (nullable = false)
 |-- ArrDelayedCount: double (nullable = false)
 |-- AvgDepDelayDelayed: double (nullable = false)
 |-- AvgArrDelayDelayed: double (nullable = false)
 |-- AvgWeatherDelayDelayed: double (nullable = false)
 |-- AvgCarrierDelayDelayed: double (nullable = false)
 |-- AvgNASDelayDelayed: double (nullable = false)
 |-- AvgLateAircraftDelayDelayed: double (nullable = false)
 |-- PctDepDelayed: double (nullable = false)
 |-- PctArrDelayed: double (nullable = false)



In [None]:

# 9. Encode categorical variables as numeric indices
indexers = [
    StringIndexer(inputCol="Origin", outputCol="OriginIdx"),
    StringIndexer(inputCol="Dest", outputCol="DestIdx"),
    StringIndexer(inputCol="Reporting_Airline", outputCol="AirlineIdx"),
    StringIndexer(inputCol="Season", outputCol="SeasonIdx")
]

# Keep the cell re-runnable: df_grouped is rebound below, so on a second run it
# already carries the *Idx columns and the indexers would refuse to overwrite
# them. Dropping a column that does not exist is a no-op, so the first run is
# unaffected.
df_grouped = df_grouped.drop(*[indexer.getOutputCol() for indexer in indexers])

for indexer in indexers:
    df_grouped = indexer.fit(df_grouped).transform(df_grouped)

# 10. Assemble features into a single vector column
assembler = VectorAssembler(
    inputCols=[
        "OriginIdx", "DestIdx", "AirlineIdx", "SeasonIdx", "DayOfWeek",
        "PctDepDelayed", "PctArrDelayed",
        "AvgDepDelayDelayed", "AvgArrDelayDelayed",
        "AvgWeatherDelayDelayed", "AvgCarrierDelayDelayed",
        "AvgNASDelayDelayed", "AvgLateAircraftDelayDelayed"
    ],
    outputCol="features"
)
df_vector = assembler.transform(df_grouped)

# 11. Scale features (centre on the mean and divide by the standard deviation)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withMean=True, withStd=True)
df_scaled = scaler.fit(df_vector).transform(df_vector)

# 12. PCA (reduce to 5 principal components)
pca = PCA(k=5, inputCol="scaledFeatures", outputCol="pcaFeatures")
pca_model = pca.fit(df_scaled)
df_pca = pca_model.transform(df_scaled)







In [18]:
# Persist the PCA output so the repeated KMeans fits below reuse it instead of
# recomputing the indexing / scaling / PCA steps each time.
# MEMORY_AND_DISK is the storage level requested for the persisted data.
from pyspark import StorageLevel
df_pca = df_pca.persist(StorageLevel.MEMORY_AND_DISK)

In [19]:
# Preview the first 5 rows, including the features / scaledFeatures / pcaFeatures columns.
df_pca.show(5)



+------+----+-----------------+------+---------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+------------------+---------------------------+------------------+------------------+---------+-------+----------+---------+--------------------+--------------------+--------------------+
|Origin|Dest|Reporting_Airline|Season|DayOfWeek|TotalFlights|DepDelayedCount|ArrDelayedCount|AvgDepDelayDelayed|AvgArrDelayDelayed|AvgWeatherDelayDelayed|AvgCarrierDelayDelayed|AvgNASDelayDelayed|AvgLateAircraftDelayDelayed|     PctDepDelayed|     PctArrDelayed|OriginIdx|DestIdx|AirlineIdx|SeasonIdx|            features|      scaledFeatures|         pcaFeatures|
+------+----+-----------------+------+---------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+------------------+---------------------------+------------------+------------------+--------

                                                                                

In [20]:
# 13. Elbow search: fit KMeans on the PCA features for k = 2..10 and record the
#     training cost reported by each fitted model.
wssse_list = []
for k in range(2, 11):
    model = KMeans(
        featuresCol="pcaFeatures",
        predictionCol="cluster",
        k=k,
        seed=42,
    ).fit(df_pca)
    wssse = model.summary.trainingCost
    print(f"K={k}, WSSSE={wssse}")
    wssse_list.append((k, wssse))

# Store the (k, WSSSE) pairs on S3 as a single CSV so they can be plotted later
elbow_df = spark.createDataFrame(wssse_list, ["k", "WSSSE"])
(
    elbow_df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("s3a://bigdata-flight-delays/kmeans_elbow_values")
)

                                                                                

K=2, WSSSE=1748529.6019895023


                                                                                

K=3, WSSSE=1452806.7347469297


                                                                                

K=4, WSSSE=1317582.592265395


                                                                                

K=5, WSSSE=1203555.978888232


                                                                                

K=6, WSSSE=1097472.7017834478


                                                                                

K=7, WSSSE=1019806.9181546681


                                                                                

K=8, WSSSE=1005972.6096542766


                                                                                

K=9, WSSSE=894631.8437328222


                                                                                

K=10, WSSSE=848847.6727699636


25/05/09 13:42:53 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/05/09 13:42:54 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                

In [21]:
# 14. Fit the final model with k=7, the value chosen after inspecting the elbow values
optimal_k = 7
final_kmeans = KMeans(
    featuresCol="pcaFeatures",
    predictionCol="cluster",
    k=optimal_k,
    seed=42,
)
final_model = final_kmeans.fit(df_pca)
df_clustered = final_model.transform(df_pca)

# 15. Export the grouping keys, the delay profile columns and the assigned cluster
cluster_export_cols = [
    "Origin", "Dest", "Reporting_Airline", "Season", "DayOfWeek",
    "PctDepDelayed", "PctArrDelayed",
    "AvgDepDelayDelayed", "AvgArrDelayDelayed",
    "AvgWeatherDelayDelayed", "AvgCarrierDelayDelayed",
    "AvgNASDelayDelayed", "AvgLateAircraftDelayDelayed",
    "cluster",
]
(
    df_clustered.select(*cluster_export_cols)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("s3a://bigdata-flight-delays/flight_7_clusters_delay_profiles")
)


25/05/09 13:53:16 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/05/09 13:53:17 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/05/09 13:53:17 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/05/09 13:53:17 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/05/09 13:53:17 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                

In [24]:
# Reload the clustered profiles from S3 (schema inferred from the CSV)
df_clustered = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3a://bigdata-flight-delays/flight_7_clusters_delay_profiles")
)

# Write the same columns to the local filesystem as a single CSV part file
local_export_cols = [
    "Origin", "Dest", "Reporting_Airline", "Season", "DayOfWeek",
    "PctDepDelayed", "PctArrDelayed",
    "AvgDepDelayDelayed", "AvgArrDelayDelayed",
    "AvgWeatherDelayDelayed", "AvgCarrierDelayDelayed",
    "AvgNASDelayDelayed", "AvgLateAircraftDelayDelayed",
    "cluster",
]
(
    df_clustered.select(*local_export_cols)
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("/Users/soham/PycharmProjects/bigdata-project/output/flight_clusters_7_delays")
)

                                                                                

In [25]:
# Preview the first 5 rows of the grouped aggregates.
df_grouped.show(5)

[Stage 674:>                                                        (0 + 1) / 1]

+------+----+-----------------+------+---------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+------------------+---------------------------+------------------+------------------+---------+-------+----------+---------+
|Origin|Dest|Reporting_Airline|Season|DayOfWeek|TotalFlights|DepDelayedCount|ArrDelayedCount|AvgDepDelayDelayed|AvgArrDelayDelayed|AvgWeatherDelayDelayed|AvgCarrierDelayDelayed|AvgNASDelayDelayed|AvgLateAircraftDelayDelayed|     PctDepDelayed|     PctArrDelayed|OriginIdx|DestIdx|AirlineIdx|SeasonIdx|
+------+----+-----------------+------+---------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+------------------+---------------------------+------------------+------------------+---------+-------+----------+---------+
|   TUL| ATL|               9E|Winter|        5|           8|            1.0|            1.0| 

                                                                                

In [26]:
# Preview the first 5 rows of the clustered profiles read back from S3.
df_clustered.show(5)

+------+----+-----------------+------+---------+------------------+-----------------+------------------+------------------+----------------------+----------------------+------------------+---------------------------+-------+
|Origin|Dest|Reporting_Airline|Season|DayOfWeek|     PctDepDelayed|    PctArrDelayed|AvgDepDelayDelayed|AvgArrDelayDelayed|AvgWeatherDelayDelayed|AvgCarrierDelayDelayed|AvgNASDelayDelayed|AvgLateAircraftDelayDelayed|cluster|
+------+----+-----------------+------+---------+------------------+-----------------+------------------+------------------+----------------------+----------------------+------------------+---------------------------+-------+
|   MYR| LBE|               NK|  Fall|        1| 5.555555555555555|5.555555555555555|             182.0|             173.0|                   0.0|                  48.0|               1.0|                      124.0|      5|
|   TPA| IND|               NK|  Fall|        5|               0.0|              0.0|               

In [27]:
# Attach the cluster label to each grouped row, matching on the five grouping keys
grouped_join_keys = ["Origin", "Dest", "Reporting_Airline", "Season", "DayOfWeek"]
grouped_cluster_labels = df_clustered.select(*grouped_join_keys, "cluster")
df_joined = df_grouped.join(grouped_cluster_labels, on=grouped_join_keys, how="inner")


In [29]:
# Preview the first 5 rows of the grouped data joined with its cluster labels.
df_joined.show(5)

[Stage 681:>                                                        (0 + 1) / 1]

+------+----+-----------------+------+---------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+------------------+---------------------------+------------------+------------------+---------+-------+----------+---------+-------+
|Origin|Dest|Reporting_Airline|Season|DayOfWeek|TotalFlights|DepDelayedCount|ArrDelayedCount|AvgDepDelayDelayed|AvgArrDelayDelayed|AvgWeatherDelayDelayed|AvgCarrierDelayDelayed|AvgNASDelayDelayed|AvgLateAircraftDelayDelayed|     PctDepDelayed|     PctArrDelayed|OriginIdx|DestIdx|AirlineIdx|SeasonIdx|cluster|
+------+----+-----------------+------+---------+------------+---------------+---------------+------------------+------------------+----------------------+----------------------+------------------+---------------------------+------------------+------------------+---------+-------+----------+---------+-------+
|   ABE| ATL|               9E|  Fall|        2|          36|         

                                                                                

In [None]:
# Keep only the grouping keys, counts, averages, percentages and cluster label
joined_keep_cols = [
    "Origin", "Dest", "Reporting_Airline", "Season", "DayOfWeek",
    "TotalFlights", "DepDelayedCount", "ArrDelayedCount",
    "AvgDepDelayDelayed", "AvgArrDelayDelayed",
    "AvgWeatherDelayDelayed", "AvgCarrierDelayDelayed",
    "AvgNASDelayDelayed", "AvgLateAircraftDelayDelayed",
    "PctDepDelayed", "PctArrDelayed",
    "cluster",
]
df_joined = df_joined.select(*joined_keep_cols)

In [38]:

from pyspark.sql.functions import count, sum, when, col

# Per route / airline / season / weekday: flight counts plus summed delay columns.
# `sum` here is the pyspark.sql.functions.sum imported at the top of this cell.
dep_is_delayed = col("DepDel15") == 1
arr_is_delayed = col("ArrDel15") == 1

# (source column, output alias) for the delay causes that are summed as-is
cause_totals = [
    ("WeatherDelay", "TotalWeatherDelay"),
    ("CarrierDelay", "TotalCarrierDelay"),
    ("NASDelay", "TotalNASDelay"),
    ("LateAircraftDelay", "TotalLateAircraftDelay"),
]

df_group2 = df.groupBy("Origin", "Dest", "Reporting_Airline", "Season", "DayOfWeek").agg(
    count("*").alias("TotalFlights"),
    # Number of flights with the delayed flag set
    sum(when(dep_is_delayed, 1).otherwise(0)).alias("DepDelayedCount"),
    sum(when(arr_is_delayed, 1).otherwise(0)).alias("ArrDelayedCount"),
    # Delay totals restricted to flights with the delayed flag set
    sum(when(dep_is_delayed, col("DepDelayMinutes")).otherwise(0)).alias("TotalDepDelay"),
    sum(when(arr_is_delayed, col("ArrDelayMinutes")).otherwise(0)).alias("TotalArrDelay"),
    *[sum(col(source)).alias(alias) for source, alias in cause_totals]
)



In [39]:
# Attach the cluster label to the per-group totals, matching on the five grouping keys
totals_join_keys = ["Origin", "Dest", "Reporting_Airline", "Season", "DayOfWeek"]
totals_cluster_labels = df_clustered.select(*totals_join_keys, "cluster")
df_joined = df_group2.join(totals_cluster_labels, on=totals_join_keys, how="inner")

In [40]:
# Write the joined totals to the local filesystem as a single CSV part file,
# replacing any previous output at that path.
(
    df_joined.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("/Users/soham/PycharmProjects/bigdata-project/output/flight_clusters_final")
)

                                                                                

In [41]:
# Read the final output back from disk (schema inferred) and preview 5 rows
df_joined = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Users/soham/PycharmProjects/bigdata-project/output/flight_clusters_final")
)
df_joined.show(5)



+------+----+-----------------+------+---------+------------+---------------+---------------+-------------+-------------+-----------------+-----------------+-------------+----------------------+-------+
|Origin|Dest|Reporting_Airline|Season|DayOfWeek|TotalFlights|DepDelayedCount|ArrDelayedCount|TotalDepDelay|TotalArrDelay|TotalWeatherDelay|TotalCarrierDelay|TotalNASDelay|TotalLateAircraftDelay|cluster|
+------+----+-----------------+------+---------+------------+---------------+---------------+-------------+-------------+-----------------+-----------------+-------------+----------------------+-------+
|   ABE| ATL|               9E|  Fall|        2|          36|              3|              1|         71.0|         44.0|              0.0|              0.0|         14.0|                  30.0|      2|
|   ABE| ATL|               9E|  Fall|        5|          36|              3|              3|        256.0|        227.0|            132.0|             61.0|         12.0|                 

                                                                                

In [42]:
# Number of grouped rows assigned to each cluster
cluster_sizes = df_joined.groupBy("cluster").count()
cluster_sizes.show()

                                                                                

+-------+-----+
|cluster|count|
+-------+-----+
|      1| 1781|
|      6|86587|
|      3|17906|
|      5|44324|
|      4|46786|
|      2|41994|
|      0|30902|
+-------+-----+



In [44]:
# Attach the cluster label to every flight-level row of df, matching on the
# five grouping keys, so the per-flight detail columns can be summarised by cluster.
flight_join_keys = ["Origin", "Dest", "Reporting_Airline", "Season", "DayOfWeek"]
flight_cluster_labels = df_clustered.select(*flight_join_keys, "cluster")
df_overall_cluster = df.join(flight_cluster_labels, on=flight_join_keys, how="inner")

In [45]:
# Preview the first 5 flight-level rows together with their cluster label.
df_overall_cluster.show(5)

25/05/09 17:19:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Year, Quarter, DayofMonth, DayOfWeek, FlightDate, Reporting_Airline, DOT_ID_Reporting_Airline, IATA_CODE_Reporting_Airline, Tail_Number, Flight_Number_Reporting_Airline, OriginAirportID, OriginAirportSeqID, OriginCityMarketID, Origin, OriginCityName, OriginState, OriginStateFips, OriginStateName, OriginWac, DestAirportID, DestAirportSeqID, DestCityMarketID, Dest, DestCityName, DestState, DestStateFips, DestStateName, DestWac, CRSDepTime, DepTime, DepDelay, DepDelayMinutes, DepDel15, DepartureDelayGroups, DepTimeBlk, TaxiOut, WheelsOff, WheelsOn, TaxiIn, CRSArrTime, ArrTime, ArrDelay, ArrDelayMinutes, ArrDel15, ArrivalDelayGroups, ArrTimeBlk, Cancelled, CancellationCode, Diverted, CRSElapsedTime, ActualElapsedTime, AirTime, Flights, Distance, DistanceGroup, CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay, FirstDepTime, TotalAddGTime, LongestAddGTime, DivAirportLandings, Di

+------+----+-----------------+------+---------+----+-------+-----+----------+----------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+------------+---------+-------------+-------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+----------------+--------+--------------+-----------------+-------+-------+--------+-------------+------------+------------+--------+-------------+-----------------+------------+-------------+---------------+------------------+--------------+--------------------+-----------+-----------+-----------+-------------+----------------+------------+--------------+--------

                                                                                

In [48]:
# Register the flight-level frame (with cluster labels) as a temp view so it
# can be queried with Spark SQL under the name "df_overall_cluster".
df_overall_cluster.createOrReplaceTempView("df_overall_cluster")
# Summarise per origin / destination / airline / season / weekday / cluster
# (state and city name columns are carried along as extra grouping keys):
#   - DivertedFlights: rows where Diverted = 1
#   - TotalFlights: all rows in the group
#   - Departure/ArrivalDelayedFlights: sums of the DepDel15 / ArrDel15 columns
#   - *_gt_15M totals: DepDelay / ArrDelay summed only where the matching
#     DepDel15 / ArrDel15 value is 1
#   - remaining columns: plain sums of the overall and per-cause delay columns
df_overall_summ = spark.sql("""
    SELECT Origin, Season, DayOfWeek,OriginState,OriginStateName, Dest, 
                       DestCityName,    DestState, DestStateName, Reporting_Airline, cluster,
            SUM(CASE WHEN Diverted = 1 THEN 1 ELSE 0 END) AS DivertedFlights,
            COUNT(*) AS TotalFlights,
            Sum(DepDel15) AS DepartureDelayedFlights,
            Sum(ArrDel15) AS ArrivalDelayedFlights,
            sum(CASE WHEN DepDel15 =1 THEN DepDelay ELSE 0 END) AS TotalDepDelay_gt_15M,
            sum(CASE WHEN ArrDel15 =1 THEN ArrDelay ELSE 0 END) AS TotalArrDelay_gt_15M,
            sum(DepDelay) AS TotalDepDelay,
            sum(ArrDelay) AS TotalArrDelay,
            sum(CarrierDelay) AS TotalCarrierDelay,
            sum(WeatherDelay) AS TotalWeatherDelay,
            sum(NASDelay) AS TotalNASDelay,
            sum(SecurityDelay) AS TotalSecurityDelay,
            sum(LateAircraftDelay) AS TotalLateAircraftDelay
            
    FROM df_overall_cluster
    GROUP BY Origin, Season, DayOfWeek,OriginState,OriginStateName, Dest, 
                       DestCityName,    DestState, DestStateName, Reporting_Airline, cluster
    
""")

In [49]:
# Write the per-cluster summary to the local filesystem as CSV with a header,
# replacing any previous output at that path.
(
    df_overall_summ.write
    .mode("overwrite")
    .option("header", True)
    .csv("/Users/soham/PycharmProjects/bigdata-project/output/flight_clusters_overall_summ")
)

25/05/09 17:33:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/05/09 17:33:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/05/09 17:33:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/05/09 17:33:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/05/09 17:33:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/05/09 17:33:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/05/09 17:33:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/05/09 17:33:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/05/09 17:33:12 WARN RowBasedKeyValueBatch: Calling spill() on