# Preprocessing and EDA

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import (
    col, when, count, trim, avg, round as spark_round, 
    to_date, split, size, substring
)
from pyspark.sql import Window
import os

# Build (or reuse) the notebook's Spark session; shuffle partitions are set to 8.
spark = (
    SparkSession.builder
    .appName("NOAA_Parquet_Processor")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

# Root folder holding the per-year inputs, and the glob for the 2025 input.
base_path = "/home/alumno/reposirotio/Grupo3"
path_2025 = base_path + "/2025.tar/*"

25/12/31 12:59:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
from pyspark.sql.types import *

# Explicit schema for the raw CSV input, listed in column order.
# Every field is declared nullable; DATE is kept as a string here.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ("STATION", StringType),
        ("DATE", StringType),
        ("LATITUDE", DoubleType),
        ("LONGITUDE", DoubleType),
        ("ELEVATION", DoubleType),
        ("NAME", StringType),
        ("TEMP", DoubleType),
        ("TEMP_ATTRIBUTES", StringType),
        ("DEWP", DoubleType),
        ("DEWP_ATTRIBUTES", StringType),
        ("SLP", DoubleType),
        ("SLP_ATTRIBUTES", StringType),
        ("STP", DoubleType),
        ("STP_ATTRIBUTES", StringType),
        ("VISIB", DoubleType),
        ("VISIB_ATTRIBUTES", StringType),
        ("WDSP", DoubleType),
        ("WDSP_ATTRIBUTES", StringType),
        ("MXSPD", DoubleType),
        ("GUST", DoubleType),
        ("MAX", DoubleType),
        ("MAX_ATTRIBUTES", StringType),
        ("MIN", DoubleType),
        ("MIN_ATTRIBUTES", StringType),
        ("PRCP", DoubleType),
        ("PRCP_ATTRIBUTES", StringType),
        ("SNDP", DoubleType),
        ("FRSHTT", StringType),
    ]
])

In [3]:
def _is_missing(name):
    """Return a Column predicate: *name* is null or one of the 999.9 / 9999.9 placeholders."""
    c = col(name)
    return c.isNull() | (c == 999.9) | (c == 9999.9)


def _valid_or_null(name):
    """Return column *name* with nulls and the 999.9 / 9999.9 placeholders mapped to null.

    avg() ignores nulls, so wrapping a column with this keeps placeholders
    out of the computed means.
    """
    c = col(name)
    return when((c != 999.9) & (c != 9999.9) & c.isNotNull(), c)


def process_year(year, base_path, output_path):
    """
    Process a single year of climate data and save as parquet

    The pipeline reads the year's CSV data with the module-level ``schema``,
    cleans SNDP, drops attribute columns, imputes selected numeric columns
    (station mean first, then a 2-degree lat/lon grid mean), drops rows that
    are still incomplete, converts units, expands FRSHTT into six integer
    flag columns and writes the result to
    ``<output_path>/climate_<year>.parquet`` (overwriting any previous output).

    Args:
        year: Year to process (e.g., 2010)
        base_path: Base directory containing year folders
        output_path: Directory to save processed parquet files

    Returns:
        Number of rows written for the year.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Processing year: {year}")
    print(f"{banner}\n")

    # Read data for the year
    # NOTE(review): this glob points inside "<year>.tar". Spark's CSV reader
    # does not unpack tar archives; the CSVHeaderChecker warnings in the run
    # log (tar member headers fused to the CSV header) suggest raw tar content
    # is being parsed as text. Consider extracting the archives first.
    year_path = f"{base_path}/{year}.tar/*"
    print(f"Reading from: {year_path}")

    df = (
        spark.read
        .option("header", "true")
        .option("quote", '"')
        .option("escape", '"')
        .schema(schema)
        .csv(year_path)
    )

    initial_count = df.count()
    print(f"Initial row count: {initial_count:,}")

    # STEP 1: Clean SNDP (Snow Depth)

    print("\n[1/7] Cleaning SNDP (snow depth)...")
    boundary_f = 5 * 9/5 + 32  # 5°C in Fahrenheit = 41°F

    # Column expressions resolve by name at use time, so this predicate can
    # be reused after SNDP has been rewritten below.
    sndp_missing = _is_missing("SNDP")

    # Missing snow depth on a day whose minimum stayed above the boundary is
    # taken to mean "no snow".
    df = df.withColumn(
        "SNDP",
        when(sndp_missing & (col("MIN") > boundary_f), 0).otherwise(col("SNDP"))
    )

    # Drop rows where snow is possible but missing
    # (rows with a null MIN evaluate to null here and are dropped as well)
    df = df.filter(~(sndp_missing & (col("MIN") <= boundary_f)))
    print(f"   Rows after SNDP cleaning: {df.count():,}")

    # STEP 2: Drop unnecessary columns

    print("\n[2/7] Dropping attribute columns and GUST...")
    cols_to_drop = [
        "TEMP_ATTRIBUTES", "DEWP_ATTRIBUTES", "SLP_ATTRIBUTES", "STP_ATTRIBUTES",
        "VISIB_ATTRIBUTES", "WDSP_ATTRIBUTES", "GUST", "MAX_ATTRIBUTES",
        "MIN_ATTRIBUTES", "PRCP_ATTRIBUTES"
    ]
    df = df.drop(*cols_to_drop)

    # STEP 3: Drop critical missing values

    print("\n[3/7] Dropping rows with missing critical columns...")
    critical_cols = ["LATITUDE", "LONGITUDE", "ELEVATION", "NAME"]
    df = df.na.drop(subset=critical_cols)
    print(f"   Rows after dropping critical missing: {df.count():,}")

    # STEP 4: Impute missing values

    print("\n[4/7] Imputing missing values...")
    numeric_cols = ["DEWP", "SLP", "STP", "VISIB", "WDSP", "MXSPD"]

    # Station-wise mean imputation
    print("   - Station-wise mean imputation...")
    station_window = Window.partitionBy("STATION")  # same spec for every column
    for c in numeric_cols:
        df = df.withColumn(
            c,
            when(_is_missing(c), avg(_valid_or_null(c)).over(station_window))
            .otherwise(col(c))
        )

    # Grid-wise mean imputation for remaining missing
    print("   - Grid-wise mean imputation...")
    grid_precision = 2.0
    grid_keys = ["LAT_GRID", "LON_GRID"]
    df = (
        df
        .withColumn("LAT_GRID", spark_round(col("LATITUDE") / grid_precision) * grid_precision)
        .withColumn("LON_GRID", spark_round(col("LONGITUDE") / grid_precision) * grid_precision)
    )

    # Each column's grid mean depends only on that column, so all means are
    # computed in one aggregation and attached with a single join instead of
    # one groupBy + join per column.
    grid_means = df.groupBy(*grid_keys).agg(
        *[avg(_valid_or_null(c)).alias(f"{c}_grid_mean") for c in numeric_cols]
    )
    df = df.join(grid_means, on=grid_keys, how="left")

    for c in numeric_cols:
        df = df.withColumn(
            c,
            when(_is_missing(c), col(f"{c}_grid_mean")).otherwise(col(c))
        )

    df = df.drop(*[f"{c}_grid_mean" for c in numeric_cols], *grid_keys)

    # STEP 5: Drop remaining missing values

    print("\n[5/7] Dropping remaining rows with missing values...")
    df = df.dropna()

    all_numeric_cols = ["TEMP", "DEWP", "SLP", "STP", "VISIB", "WDSP", "MXSPD", "MAX", "MIN", "PRCP", "SNDP"]
    for c in all_numeric_cols:
        df = df.filter(~col(c).isin([999.9, 9999.9]))

    print(f"   Rows after final cleaning: {df.count():,}")

    # STEP 6: Data transformations

    print("\n[6/7] Applying data transformations...")
    df = (
        df
        # Convert DATE to datetime
        .withColumn("DATE", to_date(col("DATE"), "yyyy-MM-dd"))

        # Extract country from NAME (last ", "-separated token)
        .withColumn("COUNTRY", split(col("NAME"), ", ")[size(split(col("NAME"), ", ")) - 1])

        # Convert temperatures from °F to °C
        .withColumn("TEMP", (col("TEMP") - 32) * 5/9)
        .withColumn("DEWP", (col("DEWP") - 32) * 5/9)
        .withColumn("MAX", (col("MAX") - 32) * 5/9)
        .withColumn("MIN", (col("MIN") - 32) * 5/9)

        # Convert visibility from miles to km
        .withColumn("VISIB", col("VISIB") * 1.60934)

        # Convert wind speed from knots to km/h
        .withColumn("WDSP", col("WDSP") * 1.852)
        .withColumn("MXSPD", col("MXSPD") * 1.852)

        # Convert precipitation from inches to liters/m²
        .withColumn("PRCP", col("PRCP") * 25.4)

        # Convert snow depth from inches to cm
        .withColumn("SNDP", col("SNDP") * 2.54)
    )

    # STEP 7: One-hot encode weather events

    print("\n[7/7] One-hot encoding weather events...")
    weather_cols = ["is_Fog", "is_Rain", "is_Snow", "is_Hail", "is_Thunder", "is_Tornado"]

    # Character i+1 of FRSHTT becomes the integer flag column weather_cols[i].
    df = df.select(
        "*",
        *[substring(col("FRSHTT"), i + 1, 1).cast("int").alias(w) for i, w in enumerate(weather_cols)]
    ).drop("FRSHTT")

    # STEP 8: Save as Parquet

    # Strip trailing slashes so "processed_data/" does not yield "processed_data//...".
    output_dir = str(output_path).rstrip("/")
    output_file = f"{output_dir}/climate_{year}.parquet"

    # Cache the finished frame so the count and the write share a single
    # evaluation instead of each recomputing the pipeline from the CSV input.
    df = df.cache()
    try:
        final_count = df.count()
        # Guard against an empty input year (avoids ZeroDivisionError).
        retained_pct = 100 * final_count / initial_count if initial_count else 0.0

        print(f"\n{banner}")
        print(f"Final row count: {final_count:,}")
        print(f"Rows retained: {retained_pct:.2f}%")
        print(f"{banner}\n")

        print(f"Saving to: {output_file}")

        df.write.mode("overwrite").parquet(output_file)
    finally:
        df.unpersist()

    print(f"✓ Year {year} successfully processed and saved!\n")

    return final_count

In [10]:
import traceback

base_path = "/home/alumno/reposirotio/Grupo3"
output_path = "processed_data/"

# Create output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Years to process; range() excludes its end, so this is 2011 through 2024.
years = range(2011, 2025)

try:
    print("\n" + "="*60)
    # Banner is derived from the range so it cannot drift from what actually runs.
    print(f"PROCESSING ALL YEARS ({years[0]}-{years[-1]})")
    print("="*60)

    total_rows = 0
    failed_years = []
    for year in years:
        try:
            total_rows += process_year(year, base_path, output_path)
        except Exception as e:
            # Best effort: report the failure with its traceback and move on
            # to the next year instead of aborting the whole batch.
            print(f"\nERROR processing year {year}: {str(e)}")
            traceback.print_exc()
            failed_years.append(year)

    print("\n" + "="*60)
    if failed_years:
        print(f"FINISHED WITH ERRORS - failed years: {failed_years}")
    else:
        print("ALL YEARS PROCESSED!")
    print(f"Total rows processed: {total_rows:,}")
    print("="*60)
finally:
    # Always stop the session, even if the run is interrupted.
    spark.stop()


TESTING WITH YEAR 2010

PROCESSING ALL YEARS (2010-2025)

Processing year: 2011

Reading from: /home/alumno/reposirotio/Grupo3/2011.tar/*


25/12/31 13:06:57 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /home/alumno/reposirotio/Grupo3/2011.tar/*.
java.io.FileNotFoundException: File /home/alumno/reposirotio/Grupo3/2011.tar/* does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.ap

Initial row count: 7,382,831

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 4,920,926

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 4,917,070

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 13:08:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00826899999.csv                                                                                     0100644 0000000 0000000 00000013507 14542562725 011550  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 00826899999.csv                                                                                     0100644 0000000 0000000 00000013507 14542562725 011550  0                                                                          

   Rows after final cleaning: 4,187,992

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 13:11:04 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00826899999.csv                                                                                     0100644 0000000 0000000 00000013507 14542562725 011550  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, WDSP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, WDSP, MIN, SNDP
Expected: STATION but found: 00826899999.csv                                                                                     0100644 0000000 0000000 00000013507 14542562725 011550  0                                                                        


Final row count: 4,187,992
Rows retained: 56.73%

Saving to: processed_data//climate_2011.parquet


25/12/31 13:13:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00826899999.csv                                                                                     0100644 0000000 0000000 00000013507 14542562725 011550  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, VISIB, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, VISIB, MIN, SNDP
Expected: STATION but found: 00826899999.csv                                                                                     0100644 0000000 0000000 00000013507 14542562725 011550  0                                                                      

✓ Year 2011 successfully processed and saved!


Processing year: 2012

Reading from: /home/alumno/reposirotio/Grupo3/2012.tar/*


                                                                                

Initial row count: 7,724,790

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,189,226

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,180,786

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 13:17:42 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00826899999.csv                                                                                     0100644 0000000 0000000 00000002275 14542562773 011553  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, SLP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, SLP, MIN, SNDP
Expected: STATION but found: 00826899999.csv                                                                                     0100644 0000000 0000000 00000002275 14542562773 011553  0                                                                          

   Rows after final cleaning: 4,476,770

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 13:20:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00826899999.csv                                                                                     0100644 0000000 0000000 00000002275 14542562773 011553  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, SLP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, SLP, MIN, SNDP
Expected: STATION but found: 00826899999.csv                                                                                     0100644 0000000 0000000 00000002275 14542562773 011553  0                                                                          


Final row count: 4,476,770
Rows retained: 57.95%

Saving to: processed_data//climate_2012.parquet


25/12/31 13:22:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00826899999.csv                                                                                     0100644 0000000 0000000 00000002275 14542562773 011553  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, DEWP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, DEWP, MIN, SNDP
Expected: STATION but found: 00826899999.csv                                                                                     0100644 0000000 0000000 00000002275 14542562773 011553  0                                                                        

✓ Year 2012 successfully processed and saved!


Processing year: 2013

Reading from: /home/alumno/reposirotio/Grupo3/2013.tar/*


                                                                                

Initial row count: 7,740,011

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,216,702

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,209,136

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 13:26:58 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00701899999.csv                                                                                     0100644 0000000 0000000 00000005461 14542563046 011535  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, MXSPD, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, MXSPD, MIN, SNDP
Expected: STATION but found: 00701899999.csv                                                                                     0100644 0000000 0000000 00000005461 14542563046 011535  0                                                                      

   Rows after final cleaning: 4,544,148

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 13:29:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00701899999.csv                                                                                     0100644 0000000 0000000 00000005461 14542563046 011535  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 00701899999.csv                                                                                     0100644


Final row count: 4,544,148
Rows retained: 58.71%

Saving to: processed_data//climate_2013.parquet


25/12/31 13:32:11 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00701899999.csv                                                                                     0100644 0000000 0000000 00000005461 14542563046 011535  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 00701899999.csv                                                                                     0100644 0000000 0000000 00000005461 14542563046 011535  0                                                                          

✓ Year 2013 successfully processed and saved!


Processing year: 2014

Reading from: /home/alumno/reposirotio/Grupo3/2014.tar/*


                                                                                

Initial row count: 7,873,501

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,324,034

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,315,424

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 13:36:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00702699999.csv                                                                                     0100644 0000000 0000000 00000004657 14542563121 011534  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, VISIB, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, VISIB, MIN, SNDP
Expected: STATION but found: 00702699999.csv                                                                                     0100644 0000000 0000000 00000004657 14542563121 011534  0                                                                      

   Rows after final cleaning: 4,554,964

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 13:39:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00702699999.csv                                                                                     0100644 0000000 0000000 00000004657 14542563121 011534  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, DEWP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, DEWP, MIN, SNDP
Expected: STATION but found: 00702699999.csv                                                                                     0100644 0000000 0000000 00000004657 14542563121 011534  0                                                                        


Final row count: 4,554,964
Rows retained: 57.85%

Saving to: processed_data//climate_2014.parquet


25/12/31 13:42:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00702699999.csv                                                                                     0100644 0000000 0000000 00000004657 14542563121 011534  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, SLP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, SLP, MIN, SNDP
Expected: STATION but found: 00702699999.csv                                                                                     0100644 0000000 0000000 00000004657 14542563121 011534  0                                                                          

✓ Year 2014 successfully processed and saved!


Processing year: 2015

Reading from: /home/alumno/reposirotio/Grupo3/2015.tar/*


25/12/31 13:45:52 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /home/alumno/reposirotio/Grupo3/2015.tar/*.
java.io.FileNotFoundException: File /home/alumno/reposirotio/Grupo3/2015.tar/* does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.ap

Initial row count: 8,055,835

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,474,224

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,458,668

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 13:47:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000247731 14542563173 011530  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, DEWP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, DEWP, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000247731 14542563173 011530  0                                                                        

   Rows after final cleaning: 4,737,350

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 13:50:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000247731 14542563173 011530  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, VISIB, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, VISIB, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000247731 14542563173 011530  0                                                                      


Final row count: 4,737,350
Rows retained: 58.81%

Saving to: processed_data//climate_2015.parquet


25/12/31 13:53:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000247731 14542563173 011530  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 01001099999.csv                                                                                     0100644

✓ Year 2015 successfully processed and saved!


Processing year: 2016

Reading from: /home/alumno/reposirotio/Grupo3/2016.tar/*


25/12/31 13:57:08 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /home/alumno/reposirotio/Grupo3/2016.tar/*.
java.io.FileNotFoundException: File /home/alumno/reposirotio/Grupo3/2016.tar/* does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.ap

Initial row count: 8,007,368

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,532,984

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,518,264

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 13:58:58 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00702699999.csv                                                                                     0100644 0000000 0000000 00000003775 14542563247 011545  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 00702699999.csv                                                                                     0100644 0000000 0000000 00000003775 14542563247 011545  0                                                                          

   Rows after final cleaning: 4,772,986

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 14:01:39 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00702699999.csv                                                                                     0100644 0000000 0000000 00000003775 14542563247 011545  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, SLP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, SLP, MIN, SNDP
Expected: STATION but found: 00702699999.csv                                                                                     0100644 0000000 0000000 00000003775 14542563247 011545  0                                                                          


Final row count: 4,772,986
Rows retained: 59.61%

Saving to: processed_data//climate_2016.parquet


25/12/31 14:04:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00702699999.csv                                                                                     0100644 0000000 0000000 00000003775 14542563247 011545  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 00702699999.csv                                                                                     0100644

✓ Year 2016 successfully processed and saved!


Processing year: 2017

Reading from: /home/alumno/reposirotio/Grupo3/2017.tar/*


                                                                                

Initial row count: 8,325,522

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,806,590

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,792,892

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 14:09:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00702699999.csv                                                                                     0100644 0000000 0000000 00000042405 14542563322 011530  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 00702699999.csv                                                                                     0100644

   Rows after final cleaning: 4,989,296

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 14:12:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00702699999.csv                                                                                     0100644 0000000 0000000 00000042405 14542563322 011530  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 00702699999.csv                                                                                     0100644 0000000 0000000 00000042405 14542563322 011530  0                                                                          


Final row count: 4,989,296
Rows retained: 59.93%

Saving to: processed_data//climate_2017.parquet


25/12/31 14:15:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 00702699999.csv                                                                                     0100644 0000000 0000000 00000042405 14542563322 011530  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, MXSPD, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, MXSPD, MIN, SNDP
Expected: STATION but found: 00702699999.csv                                                                                     0100644 0000000 0000000 00000042405 14542563322 011530  0                                                                      

✓ Year 2017 successfully processed and saved!


Processing year: 2018

Reading from: /home/alumno/reposirotio/Grupo3/2018.tar/*


                                                                                

Initial row count: 8,274,167

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,759,392

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,745,050

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 14:21:03 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01002099999.csv                                                                                     0100644 0000000 0000000 00000235561 14542563400 011520  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, MXSPD, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, MXSPD, MIN, SNDP
Expected: STATION but found: 01002099999.csv                                                                                     0100644 0000000 0000000 00000235561 14542563400 011520  0                                                                      

   Rows after final cleaning: 4,972,378

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 14:24:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01002099999.csv                                                                                     0100644 0000000 0000000 00000235561 14542563400 011520  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 01002099999.csv                                                                                     0100644



25/12/31 14:27:09 WARN TaskMemoryManager: Failed to allocate a page (16777216 bytes), try again.
                                                                                


Final row count: 4,972,378
Rows retained: 60.10%

Saving to: processed_data//climate_2018.parquet


25/12/31 14:27:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01002099999.csv                                                                                     0100644 0000000 0000000 00000235561 14542563400 011520  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 01002099999.csv                                                                                     0100644

✓ Year 2018 successfully processed and saved!


Processing year: 2019

Reading from: /home/alumno/reposirotio/Grupo3/2019.tar/*


25/12/31 14:31:50 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /home/alumno/reposirotio/Grupo3/2019.tar/*.
java.io.FileNotFoundException: File /home/alumno/reposirotio/Grupo3/2019.tar/* does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.ap

Initial row count: 8,345,512

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,729,088

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,716,696

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 14:33:40 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14542563457 011531  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14542563457 011531  0                                                                          

   Rows after final cleaning: 4,949,120

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 14:37:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14542563457 011531  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 01001099999.csv                                                                                     0100644


Final row count: 4,949,120
Rows retained: 59.30%

Saving to: processed_data//climate_2019.parquet


25/12/31 14:41:02 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14542563457 011531  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14542563457 011531  0                                                                          

✓ Year 2019 successfully processed and saved!


Processing year: 2020

Reading from: /home/alumno/reposirotio/Grupo3/2020.tar/*


                                                                                

Initial row count: 8,250,708

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,632,956

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,617,482

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 14:47:03 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14543153122 011512  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, SLP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, SLP, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14543153122 011512  0                                                                          

   Rows after final cleaning: 4,913,710

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 14:49:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14543153122 011512  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14543153122 011512  0                                                                          


Final row count: 4,913,710
Rows retained: 59.56%

Saving to: processed_data//climate_2020.parquet


25/12/31 14:52:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14543153122 011512  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, VISIB, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, VISIB, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250661 14543153122 011512  0                                                                      

✓ Year 2020 successfully processed and saved!


Processing year: 2021

Reading from: /home/alumno/reposirotio/Grupo3/2021.tar/*


                                                                                

Initial row count: 8,065,374

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,488,234

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,473,274

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 14:58:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000247355 14543153201 011516  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, VISIB, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, VISIB, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000247355 14543153201 011516  0                                                                      

   Rows after final cleaning: 4,816,032

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 15:01:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000247355 14543153201 011516  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 01001099999.csv                                                                                     0100644



25/12/31 15:04:25 WARN TaskMemoryManager: Failed to allocate a page (4194288 bytes), try again.
                                                                                


Final row count: 4,816,032
Rows retained: 59.71%

Saving to: processed_data//climate_2021.parquet


25/12/31 15:05:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000247355 14543153201 011516  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, MXSPD, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, MXSPD, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000247355 14543153201 011516  0                                                                      

✓ Year 2021 successfully processed and saved!


Processing year: 2022

Reading from: /home/alumno/reposirotio/Grupo3/2022.tar/*


                                                                                

Initial row count: 8,001,716

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,531,194

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,516,744

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 15:11:01 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000144501 14543153252 011511  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000144501 14543153252 011511  0                                                                          

   Rows after final cleaning: 4,850,846

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 15:14:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000144501 14543153252 011511  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000144501 14543153252 011511  0                                                                          


Final row count: 4,850,846
Rows retained: 60.62%

Saving to: processed_data//climate_2022.parquet


25/12/31 15:18:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000144501 14543153252 011511  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 01001099999.csv                                                                                     0100644

✓ Year 2022 successfully processed and saved!


Processing year: 2023

Reading from: /home/alumno/reposirotio/Grupo3/2023.tar/*


25/12/31 15:22:58 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /home/alumno/reposirotio/Grupo3/2023.tar/*.
java.io.FileNotFoundException: File /home/alumno/reposirotio/Grupo3/2023.tar/* does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.ap

Initial row count: 8,089,804

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,736,090

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,719,706

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 15:24:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250305 14546266114 011516  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 01001099999.csv                                                                                     0100644



                                                                                

   Rows after final cleaning: 4,993,172

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 15:29:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250305 14546266114 011516  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, DEWP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, DEWP, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250305 14546266114 011516  0                                                                        


Final row count: 4,993,172
Rows retained: 61.72%

Saving to: processed_data//climate_2023.parquet


25/12/31 15:33:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000250305 14546266114 011516  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 01001099999.csv                                                                                     0100644



25/12/31 15:36:10 WARN TaskMemoryManager: Failed to allocate a page (8388608 bytes), try again.
25/12/31 15:36:10 WARN TaskMemoryManager: Failed to allocate a page (8388608 bytes), try again.
25/12/31 15:36:10 WARN TaskMemoryManager: Failed to allocate a page (8388608 bytes), try again.
25/12/31 15:36:10 WARN TaskMemoryManager: Failed to allocate a page (8388608 bytes), try again.
25/12/31 15:36:21 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.




                                                                                

✓ Year 2023 successfully processed and saved!


Processing year: 2024

Reading from: /home/alumno/reposirotio/Grupo3/2024.tar/*


25/12/31 15:37:48 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /home/alumno/reposirotio/Grupo3/2024.tar/*.
java.io.FileNotFoundException: File /home/alumno/reposirotio/Grupo3/2024.tar/* does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.ap

Initial row count: 7,874,996

[1/7] Cleaning SNDP (snow depth)...


                                                                                

   Rows after SNDP cleaning: 5,634,506

[2/7] Dropping attribute columns and GUST...

[3/7] Dropping rows with missing critical columns...


                                                                                

   Rows after dropping critical missing: 5,619,738

[4/7] Imputing missing values...
   - Station-wise mean imputation...
   - Grid-wise mean imputation...

[5/7] Dropping remaining rows with missing values...


25/12/31 15:39:16 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000244171 14744005554 011521  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000244171 14744005554 011521  0                                                                          

   Rows after final cleaning: 4,956,096

[6/7] Applying data transformations...

[7/7] One-hot encoding weather events...


25/12/31 15:42:13 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000244171 14744005554 011521  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
 Schema: STATION, LATITUDE, LONGITUDE, ELEVATION, NAME, STP, MIN, SNDP
Expected: STATION but found: 01001099999.csv                                                                                     0100644 0000000 0000000 00000244171 14744005554 011521  0                                                                          



25/12/31 15:45:11 WARN TaskMemoryManager: Failed to allocate a page (4194288 bytes), try again.
                                                                                


Final row count: 4,956,096
Rows retained: 62.93%

Saving to: processed_data//climate_2024.parquet


25/12/31 15:45:40 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 01001099999.csv                                                                                     0100644 0000000 0000000 00000244171 14744005554 011521  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION", DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
 Schema: STATION, DATE, LATITUDE, LONGITUDE, ELEVATION, NAME, TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, MAX, MIN, PRCP, SNDP, FRSHTT
Expected: STATION but found: 01001099999.csv                                                                                     0100644

✓ Year 2024 successfully processed and saved!


ALL YEARS PROCESSED!
Total rows processed: 66,714,860
