In [1]:
import sys
from pathlib import Path
from typing import List, Dict

from loguru import logger
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import udf, col, when

In [2]:
CONF_LOG_PREFIX = 'CONFLOG'
FLST_LOG_PREFIX = 'FLSTLOG'
GEO_LOG_PREFIX = 'GEOLOG'
LOS_LOG_PREFIX = 'LOSLOG'
REG_LOG_PREFIX = 'REGLOG'
LOADING_PATH = '../output'
DATAFRAMES_NAMES = [CONF_LOG_PREFIX, FLST_LOG_PREFIX, GEO_LOG_PREFIX, LOS_LOG_PREFIX, REG_LOG_PREFIX]

Give access to the constants that defines

In [3]:
sys.path.append(str(Path(Path().absolute().parent, 'platform_code')))
from schemas.tables_attributes import *

In [4]:
def load_dataframes(files_names: List[str], loading_path: str, spark: SparkSession) -> Dict[str, DataFrame]:
    """ Loads the dataframes which macht the file names passed by arguments.
    The method read from the config the path were to read the files, which
    matches the folder where the files are saved in `save_dataframes_dict()`.

    :param files_names: list of the names of the files.
    :param loading_path: path were the files are saved.
    :param spark: spark session.
    :return: dictionary with the dataframes loaded from the files, with the
     file name as key.
    """
    dataframes = dict()

    for file_name in files_names:
        file_path = Path(loading_path, f'{file_name.lower()}.parquet')
        logger.info('Loading dataframe from `{}`.', file_path)
        df = spark.read.parquet(str(file_path))
        dataframes[file_name] = df

    return dataframes

In [5]:
spark = SparkSession.builder.appName('Notebook').getOrCreate()

In [6]:
input_dataframes = load_dataframes(DATAFRAMES_NAMES, LOADING_PATH, spark)

2022-03-28 14:21:38.233 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `..\output\conflog.parquet`.
2022-03-28 14:21:47.192 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `..\output\flstlog.parquet`.
2022-03-28 14:21:47.460 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `..\output\geolog.parquet`.
2022-03-28 14:21:47.684 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `..\output\loslog.parquet`.
2022-03-28 14:21:47.869 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `..\output\reglog.parquet`.


In [None]:
input_dataframes[REG_LOG_PREFIX].show()

In [None]:
#ENV-1

In [None]:
#ENV-2
import geopy.distance
from pyspark.sql.functions import *
from pyspark.sql import Window
import pyspark.sql.functions as F

In [None]:
@udf
def get_coordinates_distance(origin_latitude: float, origin_longitude: float,
                             destination_latitude: float, destination_longitude: float) -> float:
    """ Calculates the distance in meters between two world coordinates.

    :param origin_latitude: origin latitude point.
    :param origin_longitude: origin longitude point.
    :param destination_latitude: destination latitude point.
    :param destination_longitude: destination longitude point.
    :return: distance in meters.
    """
    origin_tuple = (origin_latitude, origin_longitude)
    destination_tuple = (destination_latitude, destination_longitude)
    # TODO: direct distance calculation (in meters) between two points, is this approach correct?
    return geopy.distance.distance(origin_tuple, destination_tuple).m

In [None]:
dataframe = input_dataframes[REG_LOG_PREFIX]

# compute next latitude and longitude foreach row 
window = Window.partitionBy(SCENARIO_NAME).orderBy(SIMULATION_TIME)    
dataframe = dataframe.withColumn("NEXT_LATITUDE",lag(LATITUDE, -1).over(window)).withColumn("NEXT_LONGITUDE",lag(LONGITUDE, -1).over(window))


In [None]:
#Remove rows with NEXT_LATITUDE and NEXT_LONGITUDE null (they are the rows of separation between scenarios)
dataframe = dataframe.na.drop(subset=["NEXT_LATITUDE","NEXT_LONGITUDE"])
#Check it
dataframe.filter(col("NEXT_LATITUDE").isNull() | col("NEXT_LATITUDE").isNull()).show()

In [None]:
dataframe = dataframe.withColumn("DIST_NEXT_POINT", get_coordinates_distance(LATITUDE, LONGITUDE, "NEXT_LATITUDE", "NEXT_LONGITUDE"))
dataframe = dataframe.withColumn("WEIGHT_SEGMENT",  col(ALTITUDE)*col("DIST_NEXT_POINT"))
df = dataframe.groupby(SCENARIO_NAME, ACID).agg(F.sum(col("WEIGHT_SEGMENT")).alias("FP_ENV2"))
df.show()

In [None]:
#Calculate average per scenario
df = df.groupBy(SCENARIO_NAME).agg(mean("FP_ENV2").alias(ENV2))


In [None]:
df.show()

In [None]:
#ENV4
dataframe4 = input_dataframes[REG_LOG_PREFIX]
dataframe4 = dataframe4.groupby(SCENARIO_NAME, ACID).agg(F.max(ALTITUDE).alias("MAX_ALTITUDE"), F.min(ALTITUDE).alias("MIN_ALTITUDE"))
dataframe4 = dataframe4.withColumn("DIFF_ALTITUDE", col("MAX_ALTITUDE") - col("MIN_ALTITUDE"))
avg_delay = dataframe4.select(mean("DIFF_ALTITUDE").alias("MEAN_DIFF_ALTITUDE"))
dataframe4 = dataframe4.join(avg_delay, how='outer')
dataframe4.show()

dataframe4 = dataframe4.withColumn(ENV4, col("DIFF_ALTITUDE")/col("MEAN_DIFF_ALTITUDE"))
dataframe4 = dataframe4.select(SCENARIO_NAME, ACID, ENV4)
dataframe4.show()
# dataframe5 = dataframe4.groupby(SCENARIO_NAME).agg(F.max('MAX_ALTITUDE').alias("MAX_ALTITUDE_SCN"), F.min('MIN_ALTITUDE').alias("MIN_ALTITUDE_SCN"))
# dataframe5.show()


In [None]:
from pyspark.sql.functions import udf
def get_coordinates_distance(origin_latitude: float, origin_longitude: float,
                             destination_latitude: float, destination_longitude: float) -> float:
    """ Calculates the distance in meters between two world coordinates.

    :param origin_latitude: origin latitude point.
    :param origin_longitude: origin longitude point.
    :param destination_latitude: destination latitude point.
    :param destination_longitude: destination longitude point.
    :return: distance in meters.
    """
    origin_tuple = (origin_latitude, origin_longitude)
    destination_tuple = (destination_latitude, destination_longitude)
    # TODO: direct distance calculation (in meters) between two points, is this approach correct?
    return geopy.distance.distance(origin_tuple, destination_tuple).m


@udf
def in_circle (x_center, y_center, x, y, radius):
    return get_coordinates_distance(x_center, y_center, x, y) <= radius

In [8]:
from geopy.distance import great_circle
from pyspark.sql.functions import udf

@udf("double")
def great_circle_udf(x, y):
    return great_circle(x, y).kilometers

In [14]:
#ENV3
from pyspark.sql.functions import lit, struct
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import pow, col, log
from pyspark.sql.functions import *
from pyspark.sql import Window
import pyspark.sql.functions as F


x_center = 48.15636078 #lat
y_center = 16.32453111 #lon
radius_roi = 10 #meters
altitude_roi = 100 #meters
time_roi = 120 #seconds


dataframe3 = input_dataframes[REG_LOG_PREFIX]
dataframe3.printSchema() 
dataframe3.show()

point = struct(lit(x_center), lit(y_center))
udf_func = udf(great_circle_udf,DoubleType()) #Creating a 'User Defined Function' to calculate distance between two points.
dataframe3 = dataframe3.withColumn("distance", udf_func(point, struct(col(LATITUDE), col(LONGITUDE)))).filter((col("distance") <= radius_roi) & (col(ALTITUDE)<= altitude_roi) & (col(SIMULATION_TIME) == time_roi)) #Creating column "distance" based on function 'get_distance'
#dataframe3 = dataframe3.withColumn("noise_level", 10*log(1/pow(col("distance"),2)))
dataframe3 = dataframe3.withColumn("noise_level", log10(1/pow(col("distance"),2)))
dataframe3.show()

root
 |-- REG_ID: integer (nullable = true)
 |-- Scenario: string (nullable = true)
 |-- Simulation_time: double (nullable = true)
 |-- ACID: string (nullable = true)
 |-- ALT: double (nullable = true)
 |-- LAT: double (nullable = true)
 |-- LON: double (nullable = true)

+------+------------------+---------------+-----+------------------+-----------+-----------+
|REG_ID|          Scenario|Simulation_time| ACID|               ALT|        LAT|        LON|
+------+------------------+---------------+-----+------------------+-----------+-----------+
| 76800|1_very_low_40_8_R2|         2880.0|D2320|             45.72|48.15636078|16.32453111|
| 76801|1_very_low_40_8_R2|         2880.0|D2340|128.01600000000002|48.17504097|16.36041054|
| 76802|1_very_low_40_8_R2|         2880.0|D2337|            73.152|48.23478639|16.42459103|
| 76803|1_very_low_40_8_R2|         2880.0|D2342|  136.185390000024|48.19298318|16.39227946|
| 76804|1_very_low_40_8_R2|         2880.0|D2343|          31.83255|48.21440

In [12]:
dataframe3.groupBy(SCENARIO_NAME).agg(sum("noise_level")).show()

+------------------+-------------------+
|          Scenario|   sum(noise_level)|
+------------------+-------------------+
|1_very_low_40_8_W1|-188.96845944379098|
|1_very_low_40_8_R2|-188.96845944379098|
|2_very_low_40_8_W1|-188.96845944379098|
|3_very_low_40_8_W1|-188.96845944379098|
|3_very_low_40_8_R2|-188.96845944379098|
|2_very_low_40_8_R2|-188.96845944379098|
+------------------+-------------------+



In [None]:
dataframe3.show()