In [45]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, TimestampType
from pyspark.sql.functions import col, year, sum, to_timestamp, count, expr, max, count_distinct
from pyspark.sql import Window
from pyspark.sql.dataframe import DataFrame
import numpy as np

# Obtain the Spark entry point. getOrCreate() hands back the session that is
# already active when there is one, and only builds a new one otherwise.
ss: SparkSession = (
    SparkSession.builder
    .appName("PoliSalesAnalysis")
    .getOrCreate()
)

# Input/output locations.
# `jupyter` switches between the absolute /user/... folders (True) and the
# folders relative to the current working directory (False).
jupyter = False

input_prefix, output_prefix = (
    ("/user/s339450/esami/20240912/", "/user/s339450/esami/20240912/out/")
    if jupyter
    else (".\\data\\", ".\\out\\")
)

# One input file per table.
NEOBjects_path = input_prefix + "NEOBjects.txt"
observations_path = input_prefix + "Observations.txt"
observatories_path = input_prefix + "Observatories.txt"

# One output folder per exercise point.
output_folder_1 = output_prefix + "1/"
output_folder_2 = output_prefix + "2/"

In [46]:
# Observatories table: every column is declared non-nullable; "Amateur" is kept
# as a string rather than parsed into a boolean.
observatories_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in [
        ("ObservatoryID", StringType()),
        ("Name", StringType()),
        ("Lat", DoubleType()),
        ("Lon", DoubleType()),
        ("Country", StringType()),
        ("Continent", StringType()),
        ("Amateur", StringType()),
    ]
])

# Header-less, comma-separated file read with the explicit schema above.
observatories: DataFrame = (
    ss.read
    .format("csv")
    .schema(observatories_schema)
    .options(header=False, sep=",")
    .load(observatories_path)
)

observatories.show()

# Near-Earth objects table: two integer measures plus an "alreadyFallen" flag
# that is kept as a string; every column is declared non-nullable.
NEObjects_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in [
        ("NEOID", StringType()),
        ("Dimension", IntegerType()),
        ("MaterialStrength", IntegerType()),
        ("alreadyFallen", StringType()),
    ]
])

# Header-less, comma-separated file read with the explicit schema above.
NEObjects: DataFrame = (
    ss.read
    .format("csv")
    .schema(NEObjects_schema)
    .options(header=False, sep=",")
    .load(NEOBjects_path)
)

NEObjects.show()

# Observations table: "ObsDateTime" is read as a plain string and converted to
# a timestamp right after loading; every column is declared non-nullable.
observations_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in [
        ("NEOID", StringType()),
        ("ObservatoryID", StringType()),
        ("ObsDateTime", StringType()),
        ("EclipticLat", DoubleType()),
        ("EclipticLon", DoubleType()),
        ("EstimatedDistance", DoubleType()),
    ]
])

observations: DataFrame = (
    ss.read
    .format("csv")
    .schema(observations_schema)
    .options(header=False, sep=",")
    .load(observations_path)
    # Replace the string column with a real timestamp; adjust the pattern if
    # the input file uses a different date-time layout.
    .withColumn(
        "ObsDateTime",
        to_timestamp(col("ObsDateTime"), "yyyy-MM-dd HH:mm:ss"),
    )
)

observations.show()

+-------------+-------------------+--------+---------+---------+-------------+-------+
|ObservatoryID|               Name|     Lat|      Lon|  Country|    Continent|Amateur|
+-------------+-------------------+--------+---------+---------+-------------+-------+
|        OB101|   Keck Observatory| 19.8283|-155.4783|      USA|North America|   True|
|        OB202|Galileo Observatory| 41.9028|  12.4964|    Italy|       Europe|  False|
|        OB303|Amateur Skywatchers| 35.6895| 139.6917|    Japan|         Asia|   True|
|        OB404|      Cerro Paranal| 24.6272| -70.4042|    Chile|South America|  False|
|        OB505| Sydney Observatory|-33.8675|  151.207|Australia|      Oceania|  False|
|        OB606|  Trump Observatory| 17.8283|-155.4783|      USA|North America|   True|
+-------------+-------------------+--------+---------+---------+-------------+-------+

+------+---------+----------------+-------------+
| NEOID|Dimension|MaterialStrength|alreadyFallen|
+------+---------+-----------

# Punto 1

In [47]:
# Mean of "Dimension" over the whole NEObjects table (groupBy() with no keys),
# pulled to the driver as a plain Python value.
avg_dimension = (
    NEObjects
    .groupBy()
    .avg("Dimension")
    .withColumnRenamed("avg(Dimension)", "AvgDimension")
    .select("AvgDimension")
    .first()["AvgDimension"]
)

# NEOs larger than the average that are not flagged as already fallen
# ("alreadyFallen" is a string column, hence the comparison with "False").
is_above_average = col("Dimension") > avg_dimension
has_not_fallen = col("alreadyFallen") == "False"
most_reveant_neos = NEObjects.filter(is_above_average & has_not_fallen).select("NEOID")

# Driver-side list of the selected NEOIDs. In the cells visible here it is only
# needed by the alternative formulation
#   observations.filter((year(col("ObsDateTime")) >= 2023) & col("NEOID").isin(neo_ids_list))
# which the semi join below replaces.
neo_ids_list = [neo_row["NEOID"] for neo_row in most_reveant_neos.collect()]

# Observations from 2023 onwards, restricted (semi join on NEOID) to the NEOs
# selected above; only the observation columns are kept.
observations_since_2023 = observations.filter(year(col("ObsDateTime")) >= 2023)
response2 = observations_since_2023.join(most_reveant_neos, on="NEOID", how="semi")

response2.show()


+------+-------------+-------------------+-----------+-----------+-----------------+
| NEOID|ObservatoryID|        ObsDateTime|EclipticLat|EclipticLon|EstimatedDistance|
+------+-------------+-------------------+-----------+-----------+-----------------+
|NEO202|        OB202|2023-07-15 04:30:00|    38.9072|    -77.037|              1.4|
+------+-------------+-------------------+-----------+-----------+-----------------+



# Punto 2

In [54]:
# Column holding the number of distinct observatories per NEO. The name is set
# explicitly through alias() instead of relying on the label Spark generates
# for an un-aliased aggregate, so the lookups below cannot silently break if
# that generated label differs. The string matches the label shown in this
# notebook's output, so the resulting schema is unchanged.
distinct_obs_col = "count(DISTINCT ObservatoryID)"

# For every NEO observed from 2023 onwards, count the distinct observatories
# that observed it and keep only the NEOs seen by fewer than 10 of them.
most_reveant_neos = (
    observations
    .filter(year(col("ObsDateTime")) >= 2023)
    .groupBy(col("NEOID"))
    .agg(count_distinct("ObservatoryID").alias(distinct_obs_col))
    .filter(col(distinct_obs_col) < 10)
)

# String copy of the count, so that a textual marker can be substituted in it.
most_reveant_neos = most_reveant_neos.withColumn(
    "tmp",
    col(distinct_obs_col).cast("string"),
)

# In column "tmp" only, turn the value "1" into the marker "NONE".
most_reveant_neos = most_reveant_neos.replace(["1"], ["NONE"], "tmp")

most_reveant_neos.show()

+------+-----------------------------+---+
| NEOID|count(DISTINCT ObservatoryID)|tmp|
+------+-----------------------------+---+
|NEO202|                            1|  1|
|NEO101|                            1|  1|
+------+-----------------------------+---+

