In [1]:
import sys
from pathlib import Path
from typing import List, Dict

from loguru import logger
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import udf, col, when

In [2]:
CONF_LOG_PREFIX = 'CONFLOG'
FLST_LOG_PREFIX = 'FLSTLOG'
GEO_LOG_PREFIX = 'GEOLOG'
LOS_LOG_PREFIX = 'LOSLOG'
REG_LOG_PREFIX = 'REGLOG'
LOADING_PATH = '../output'
DATAFRAMES_NAMES = [CONF_LOG_PREFIX, FLST_LOG_PREFIX, GEO_LOG_PREFIX, LOS_LOG_PREFIX, REG_LOG_PREFIX]

Give access to the constants that defines

In [130]:
sys.path.append(str(Path(Path().absolute().parent, 'platform_code')))
from schemas.tables_attributes import *

In [4]:
def load_dataframes(files_names: List[str], loading_path: str, spark: SparkSession) -> Dict[str, DataFrame]:
    """ Loads the dataframes which macht the file names passed by arguments.
    The method read from the config the path were to read the files, which
    matches the folder where the files are saved in `save_dataframes_dict()`.

    :param files_names: list of the names of the files.
    :param loading_path: path were the files are saved.
    :param spark: spark session.
    :return: dictionary with the dataframes loaded from the files, with the
     file name as key.
    """
    dataframes = dict()

    for file_name in files_names:
        file_path = Path(loading_path, f'{file_name.lower()}.parquet')
        logger.info('Loading dataframe from `{}`.', file_path)
        df = spark.read.parquet(str(file_path))
        dataframes[file_name] = df

    return dataframes

In [5]:
spark = SparkSession.builder.appName('Notebook').getOrCreate()

In [6]:
input_dataframes = load_dataframes(DATAFRAMES_NAMES, LOADING_PATH, spark)

2022-03-25 11:05:40.840 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `..\output\conflog.parquet`.
2022-03-25 11:05:46.134 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `..\output\flstlog.parquet`.
2022-03-25 11:05:46.418 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `..\output\geolog.parquet`.
2022-03-25 11:05:46.590 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `..\output\loslog.parquet`.
2022-03-25 11:05:46.781 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `..\output\reglog.parquet`.


In [7]:
input_dataframes[REG_LOG_PREFIX].show()

+------+------------------+---------------+-----+------------------+-----------+-----------+
|REG_ID|          Scenario|Simulation_time| ACID|               ALT|        LAT|        LON|
+------+------------------+---------------+-----+------------------+-----------+-----------+
| 76800|1_very_low_40_8_R2|         2880.0|D2320|             45.72|48.15636078|16.32453111|
| 76801|1_very_low_40_8_R2|         2880.0|D2340|128.01600000000002|48.17504097|16.36041054|
| 76802|1_very_low_40_8_R2|         2880.0|D2337|            73.152|48.23478639|16.42459103|
| 76803|1_very_low_40_8_R2|         2880.0|D2342|  136.185390000024|48.19298318|16.39227946|
| 76804|1_very_low_40_8_R2|         2880.0|D2343|          31.83255|48.21440571|16.35687219|
| 76805|1_very_low_40_8_R2|         2880.0|D2346|            73.152|48.19509474|16.39687844|
| 76806|1_very_low_40_8_R2|         2880.0|D2358|            9.8679|48.20259996|16.30276413|
| 76807|1_very_low_40_8_R2|         2880.0|D2364|            36.576|48

In [8]:
#ENV-1

In [16]:
#ENV-2
import geopy.distance
from pyspark.sql.functions import *
from pyspark.sql import Window
import pyspark.sql.functions as F

In [10]:
@udf
def get_coordinates_distance(origin_latitude: float, origin_longitude: float,
                             destination_latitude: float, destination_longitude: float) -> float:
    """ Calculates the distance in meters between two world coordinates.

    :param origin_latitude: origin latitude point.
    :param origin_longitude: origin longitude point.
    :param destination_latitude: destination latitude point.
    :param destination_longitude: destination longitude point.
    :return: distance in meters.
    """
    origin_tuple = (origin_latitude, origin_longitude)
    destination_tuple = (destination_latitude, destination_longitude)
    # TODO: direct distance calculation (in meters) between two points, is this approach correct?
    return geopy.distance.distance(origin_tuple, destination_tuple).m

In [11]:
dataframe = input_dataframes[REG_LOG_PREFIX].groupby(SCENARIO_NAME, ACID).count()
dataframe.show()

+------------------+-----+-----+
|          Scenario| ACID|count|
+------------------+-----+-----+
|1_very_low_40_8_R2|D2366|   59|
|1_very_low_40_8_R2|D3665|   28|
|1_very_low_40_8_R2|D3746|   11|
|1_very_low_40_8_R2|D3840|   13|
|1_very_low_40_8_R2|D4165|    9|
|1_very_low_40_8_R2|D4192|   14|
|1_very_low_40_8_R2|D4233|   14|
|1_very_low_40_8_R2|D4531|    6|
|1_very_low_40_8_W1|D3157|   17|
|1_very_low_40_8_W1|D4130|   16|
|1_very_low_40_8_W1|D4261|   12|
|1_very_low_40_8_W1|D4388|    6|
|1_very_low_40_8_W1|D4401|   10|
|3_very_low_40_8_R2|D3389|   17|
|3_very_low_40_8_R2|D3440|    8|
|3_very_low_40_8_R2|D3546|   12|
|3_very_low_40_8_R2|D3571|   18|
|3_very_low_40_8_R2|D3623|   30|
|3_very_low_40_8_R2|D3635|   14|
|3_very_low_40_8_R2|D3731|   23|
+------------------+-----+-----+
only showing top 20 rows



In [12]:
dataframe = input_dataframes[REG_LOG_PREFIX]

# compute next latitude and longitude foreach row 
window = Window.partitionBy(SCENARIO_NAME).orderBy(SIMULATION_TIME)    
dataframe = dataframe.withColumn("NEXT_LATITUDE",lag(LATITUDE, -1).over(window)).withColumn("NEXT_LONGITUDE",lag(LONGITUDE, -1).over(window))


In [13]:
#Remove rows with NEXT_LATITUDE and NEXT_LONGITUDE null (they are the rows of separation between scenarios)
dataframe = dataframe.na.drop(subset=["NEXT_LATITUDE","NEXT_LONGITUDE"])
#Check it
dataframe.filter(col("NEXT_LATITUDE").isNull() | col("NEXT_LATITUDE").isNull()).show()

+------+--------+---------------+----+---+---+---+-------------+--------------+
|REG_ID|Scenario|Simulation_time|ACID|ALT|LAT|LON|NEXT_LATITUDE|NEXT_LONGITUDE|
+------+--------+---------------+----+---+---+---+-------------+--------------+
+------+--------+---------------+----+---+---+---+-------------+--------------+



In [28]:
dataframe = dataframe.withColumn("DIST_NEXT_POINT", get_coordinates_distance(LATITUDE, LONGITUDE, "NEXT_LATITUDE", "NEXT_LONGITUDE"))
dataframe = dataframe.withColumn("WEIGHT_SEGMENT",  col(ALTITUDE)*col("DIST_NEXT_POINT"))
df = dataframe.groupby(SCENARIO_NAME, ACID).agg(F.sum(col("WEIGHT_SEGMENT")).alias("FP_ENV2"))
df.show()

+------------------+-----+--------------------+
|          Scenario| ACID|             FP_ENV2|
+------------------+-----+--------------------+
|1_very_low_40_8_W1| D297|   8758230.592084356|
|1_very_low_40_8_W1| D760|   8027879.154410325|
|1_very_low_40_8_W1|D1305|  4644231.4278680375|
|1_very_low_40_8_W1|D1454|   284918.9770182947|
|1_very_low_40_8_W1|D1467|   323733.6400512988|
|1_very_low_40_8_W1|D1607|  6047048.1692083515|
|1_very_low_40_8_W1|D1782|  4105609.9031290486|
|1_very_low_40_8_W1|D1775|   949347.6831496685|
|1_very_low_40_8_W1|D1914|  2135758.1258949274|
|1_very_low_40_8_W1|D1923|  3011369.0843983875|
|1_very_low_40_8_W1|D2195|  3265810.3652293948|
|1_very_low_40_8_W1|D2411|  3866343.2139310758|
|1_very_low_40_8_W1|D2503|3.0518177728980314E7|
|1_very_low_40_8_W1|D2853|   975331.3151750166|
|1_very_low_40_8_W1|D3023|   2678757.668442695|
|1_very_low_40_8_W1|D3157|   1177752.059597437|
|1_very_low_40_8_W1|D3196|   6314961.186097211|
|1_very_low_40_8_W1|D3343|  1680877.8918

In [29]:
#Calculate average per scenario
df = df.groupBy(SCENARIO_NAME).agg(mean("FP_ENV2").alias(ENV2))


In [30]:
df.show()

+------------------+-----------------+
|          Scenario|             ENV2|
+------------------+-----------------+
|1_very_low_40_8_W1|5074994.109194892|
|1_very_low_40_8_R2|5074994.109194907|
|2_very_low_40_8_W1|5074994.109194913|
|3_very_low_40_8_W1|5074994.109194895|
|3_very_low_40_8_R2|5074994.109194884|
|2_very_low_40_8_R2|5074994.109194905|
+------------------+-----------------+



In [60]:
#ENV4
dataframe4 = input_dataframes[REG_LOG_PREFIX]
dataframe4 = dataframe4.groupby(SCENARIO_NAME, ACID).agg(F.max(ALTITUDE).alias("MAX_ALTITUDE"), F.min(ALTITUDE).alias("MIN_ALTITUDE"))
dataframe4 = dataframe4.withColumn("DIFF_ALTITUDE", col("MAX_ALTITUDE") - col("MIN_ALTITUDE"))
avg_delay = dataframe4.select(mean("DIFF_ALTITUDE").alias("MEAN_DIFF_ALTITUDE"))
dataframe4 = dataframe4.join(avg_delay, how='outer')
dataframe4.show()

dataframe4 = dataframe4.withColumn(ENV4, col("DIFF_ALTITUDE")/col("MEAN_DIFF_ALTITUDE"))
dataframe4 = dataframe4.select(SCENARIO_NAME, ACID, ENV4)
dataframe4.show()
# dataframe5 = dataframe4.groupby(SCENARIO_NAME).agg(F.max('MAX_ALTITUDE').alias("MAX_ALTITUDE_SCN"), F.min('MIN_ALTITUDE').alias("MIN_ALTITUDE_SCN"))
# dataframe5.show()


+------------------+-----+------------------+------------------+------------------+------------------+
|          Scenario| ACID|      MAX_ALTITUDE|      MIN_ALTITUDE|     DIFF_ALTITUDE|MEAN_DIFF_ALTITUDE|
+------------------+-----+------------------+------------------+------------------+------------------+
|1_very_low_40_8_R2|D2366|           146.304|1.3893899994720003|  144.914610000528| 78.24832099400818|
|1_very_low_40_8_R2|D3665|            73.152|             9.144|            64.008| 78.24832099400818|
|1_very_low_40_8_R2|D3746|            82.296|             9.144|            73.152| 78.24832099400818|
|1_very_low_40_8_R2|D3840|           146.304|3.0333900002640006|  143.270609999736| 78.24832099400818|
|1_very_low_40_8_R2|D4165|           51.8922|             9.144|42.748200000000004| 78.24832099400818|
|1_very_low_40_8_R2|D4192|            82.296|             9.144|            73.152| 78.24832099400818|
|1_very_low_40_8_R2|D4233|54.864000000000004|             45.72| 9.144000

In [143]:
#ENV3
lat_roi = "(48.15636078, 48.16)"
lon_roi = "(16.32453111, 16.33)"
alt_roi = "(0, 50)"
time_roi = 120

lat_roi = eval(lat_roi)
lon_roi = eval(lon_roi)
alt_roi = eval(alt_roi)

dataframe3 = input_dataframes[REG_LOG_PREFIX]
dataframe3.show()


dataframe3 = dataframe3.withColumn("reached_roi", when((alt_roi[0] <= col(ALTITUDE)) & (col(ALTITUDE) < alt_roi[1]) &
                                                       (lat_roi[0] <= col(LATITUDE)) & (col(LATITUDE) < lat_roi[1]) &
                                                       (lon_roi[0] <= col(LONGITUDE)) & (col(LONGITUDE) < lon_roi[1]),
                                                       True).otherwise(False))

dataframe3.show()
dataframe3 = dataframe3.select(SCENARIO_NAME, col("reached_roi")).where(col("reached_roi") == True).groupby(SCENARIO_NAME).count().withColumnRenamed("count", "ENV3")
dataframe3.show()

+------+------------------+---------------+-----+------------------+-----------+-----------+
|REG_ID|          Scenario|Simulation_time| ACID|               ALT|        LAT|        LON|
+------+------------------+---------------+-----+------------------+-----------+-----------+
| 76800|1_very_low_40_8_R2|         2880.0|D2320|             45.72|48.15636078|16.32453111|
| 76801|1_very_low_40_8_R2|         2880.0|D2340|128.01600000000002|48.17504097|16.36041054|
| 76802|1_very_low_40_8_R2|         2880.0|D2337|            73.152|48.23478639|16.42459103|
| 76803|1_very_low_40_8_R2|         2880.0|D2342|  136.185390000024|48.19298318|16.39227946|
| 76804|1_very_low_40_8_R2|         2880.0|D2343|          31.83255|48.21440571|16.35687219|
| 76805|1_very_low_40_8_R2|         2880.0|D2346|            73.152|48.19509474|16.39687844|
| 76806|1_very_low_40_8_R2|         2880.0|D2358|            9.8679|48.20259996|16.30276413|
| 76807|1_very_low_40_8_R2|         2880.0|D2364|            36.576|48

In [None]:
dataframe1 = input_dataframes[REG_LOG_PREFIX]
