# Analysis

In [2]:
from pyspark.sql import SparkSession
from pathlib import Path

# Start the Spark session used by the rest of the notebook (an already
# running session is reused instead of creating a second one).
spark = (
    SparkSession.builder
    .appName("NOAA GSOD Analysis")
    .getOrCreate()
)

# Glob pattern matching every CSV file under the "1975.tar" entry of OUTPUT_DIR.
OUTPUT_DIR = Path("/home/alumno/reposirotio/Grupo3/")
year_1975_path = str(OUTPUT_DIR.joinpath("1975.tar", "*.csv"))

# Load all matching CSV files into a single DataFrame. The first line of each
# file is treated as the header and column types are inferred from the data.
df_1975 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(year_1975_path)
)

# Number of distinct values in the STATION column.
station_ids = df_1975.select("STATION").distinct()
unique_stations = station_ids.count()
print(f"Number of unique stations in 1975: {unique_stations}")

# Preview the first rows without truncating wide columns.
print("\nSample data:")
df_1975.show(n=5, truncate=False)

# Column names and inferred types.
print("\nSchema:")
df_1975.printSchema()

# Total number of rows loaded.
total_records = df_1975.count()
print(f"\nTotal records in 1975: {total_records}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/18 13:39:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/18 13:39:20 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /home/alumno/reposirotio/Grupo3/1975.tar/*.csv.
java.io.FileNotFoundException: File /home/alumno/reposirotio/Grupo3/1975.tar/*.csv does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.

Number of unique stations in 1975: 8425

Sample data:
+-----------+----------+--------+---------+---------+-----------------------------------------------------------------+----+---------------+----+---------------+------+--------------+-----+--------------+-----+----------------+----+---------------+-----+-----+----+--------------+----+--------------+-----+---------------+-----+------+
|STATION    |DATE      |LATITUDE|LONGITUDE|ELEVATION|NAME                                                             |TEMP|TEMP_ATTRIBUTES|DEWP|DEWP_ATTRIBUTES|SLP   |SLP_ATTRIBUTES|STP  |STP_ATTRIBUTES|VISIB|VISIB_ATTRIBUTES|WDSP|WDSP_ATTRIBUTES|MXSPD|GUST |MAX |MAX_ATTRIBUTES|MIN |MIN_ATTRIBUTES|PRCP |PRCP_ATTRIBUTES|SNDP |FRSHTT|
+-----------+----------+--------+---------+---------+-----------------------------------------------------------------+----+---------------+----+---------------+------+--------------+-----+--------------+-----+----------------+----+---------------+-----+-----+----+---------




Total records in 1975: 2190974


                                                                                

In [9]:
import pyspark.sql.functions as sql_f

# Distinct names of the 1975 stations whose NAME carries the ", SP" suffix.
# NOTE(review): assumes NAME has the form "<station>, <country code>" and that
# "SP" is the code for Spain — confirm against the dataset documentation.
# The suffix is matched at the END of the right-trimmed name rather than
# anywhere inside it, so a name that merely contains ", SP" in the middle
# (e.g. "..., SPRINGS ...") is not picked up by mistake.
spanish_stations = (
    df_1975
    .filter(sql_f.rtrim(sql_f.col("NAME")).endswith(", SP"))
    .select("NAME")
    .distinct()
)

# Export the result as CSV with a header row, replacing any previous export.
# Spark writes a directory of part files at this path, which is relative to
# the current working directory.
spanish_stations.write.csv(
    "spanish_stations_1975",
    mode="overwrite",
    header=True,
)

                                                                                