In [1]:
from pathlib import Path
from typing import List, Dict

from loguru import logger
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col, when
import sys

In [2]:
CONF_LOG_PREFIX = 'CONFLOG'
FLST_LOG_PREFIX = 'FLSTLOG'
GEO_LOG_PREFIX = 'GEOLOG'
LOS_LOG_PREFIX = 'LOSLOG'
REG_LOG_PREFIX = 'REGLOG'
LOADING_PATH = 'C:/Users/jpedrero/SpyderProjects/M2_data_analysis_platform/output'
DATAFRAMES_NAMES = [CONF_LOG_PREFIX, FLST_LOG_PREFIX, GEO_LOG_PREFIX, LOS_LOG_PREFIX, REG_LOG_PREFIX]

In [3]:
sys.path.append(str(Path(Path().absolute().parent, 'platform_code')))
from schemas.tables_attributes import *

In [4]:
def load_dataframes(files_names: List[str], loading_path: str, spark: SparkSession) -> Dict[str, DataFrame]:
    """ Loads the dataframes which macht the file names passed by arguments.
    The method read from the config the path were to read the files, which
    matches the folder where the files are saved in `save_dataframes_dict()`.

    :param files_names: list of the names of the files.
    :param loading_path: path were the files are saved.
    :param spark: spark session.
    :return: dictionary with the dataframes loaded from the files, with the
     file name as key.
    """
    dataframes = dict()

    for file_name in files_names:
        file_path = Path(loading_path, f'{file_name.lower()}.parquet')
        logger.info('Loading dataframe from `{}`.', file_path)
        df = spark.read.parquet(str(file_path))
        dataframes[file_name] = df

    return dataframes

In [5]:
spark = SparkSession.builder.appName('Notebook').getOrCreate()

In [6]:
input_dataframes = load_dataframes(DATAFRAMES_NAMES, LOADING_PATH, spark)

2022-03-24 16:48:18.151 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `C:\Users\jpedrero\SpyderProjects\M2_data_analysis_platform\output\conflog.parquet`.
2022-03-24 16:48:23.910 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `C:\Users\jpedrero\SpyderProjects\M2_data_analysis_platform\output\flstlog.parquet`.
2022-03-24 16:48:24.140 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `C:\Users\jpedrero\SpyderProjects\M2_data_analysis_platform\output\geolog.parquet`.
2022-03-24 16:48:24.308 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `C:\Users\jpedrero\SpyderProjects\M2_data_analysis_platform\output\loslog.parquet`.
2022-03-24 16:48:24.482 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `C:\Users\jpedrero\SpyderProjects\M2_data_analysis_platform\output\reglog.parquet`.


In [7]:
input_dataframes

{'CONFLOG': DataFrame[CONF_ID: bigint, Scenario: string, CONF_detected_time: double, CPALAT: double, CPALON: double],
 'FLSTLOG': DataFrame[Flight_id: bigint, Scenario: string, ACID: string, Vehicle: string, Origin_LAT: string, Origin_LON: string, Destination_LAT: string, Destination_LON: string, Baseline_departure_time: int, cruising_speed: double, Vertical_Speed: double, Priority: int, loitering: boolean, Baseline_2D_distance: string, Baseline_vertical_distance: double, Baseline_ascending_distance: double, Baseline_3D_distance: double, Baseline_flight_time: double, Baseline_arrival_time: double, Deletion_Time: double, Spawn_Time: double, Flight_time: string, Distance_2D: string, Distance_3D: double, Distance_ALT: double, Deletion_LAT: double, Deletion_LON: double, Deletion_ALT: double, Distance_ascend: double, Work_Done: double],
 'GEOLOG': DataFrame[GEO_id: bigint, Scenario: string, Deletion_Time: double, Geofence_name: string, Max_intrusion: double, Violation_severity: boolean, Ope

In [8]:
input_dataframes

{'CONFLOG': DataFrame[CONF_ID: bigint, Scenario: string, CONF_detected_time: double, CPALAT: double, CPALON: double],
 'FLSTLOG': DataFrame[Flight_id: bigint, Scenario: string, ACID: string, Vehicle: string, Origin_LAT: string, Origin_LON: string, Destination_LAT: string, Destination_LON: string, Baseline_departure_time: int, cruising_speed: double, Vertical_Speed: double, Priority: int, loitering: boolean, Baseline_2D_distance: string, Baseline_vertical_distance: double, Baseline_ascending_distance: double, Baseline_3D_distance: double, Baseline_flight_time: double, Baseline_arrival_time: double, Deletion_Time: double, Spawn_Time: double, Flight_time: string, Distance_2D: string, Distance_3D: double, Distance_ALT: double, Deletion_LAT: double, Deletion_LON: double, Deletion_ALT: double, Distance_ascend: double, Work_Done: double],
 'GEOLOG': DataFrame[GEO_id: bigint, Scenario: string, Deletion_Time: double, Geofence_name: string, Max_intrusion: double, Violation_severity: boolean, Ope

In [9]:
from config import settings
print(settings.thresholds.emergency_mission_delay)

300


In [10]:
dataframe = input_dataframes[FLST_LOG_PREFIX].select(SCENARIO_NAME, PRIORITY, LOITERING, BASELINE_ARRIVAL_TIME, DEL_TIME)

In [11]:
dataframe = dataframe.withColumn("delay", col(DEL_TIME)-col(BASELINE_ARRIVAL_TIME))

In [12]:
dataframe = dataframe.withColumn("cancelation_limit", 
                                             when(col(PRIORITY) == 4, settings.thresholds.emergency_mission_delay)
                                            .otherwise(when(col(LOITERING), settings.thresholds.loitering_mission_delay)
                                            .otherwise(settings.thresholds.delivery_mission_delay)))

In [13]:
dataframe = dataframe.select(SCENARIO_NAME, PRIORITY, LOITERING, "delay","cancelation_limit").withColumn("cancelled_demand", col("delay") >= col("cancelation_limit"))

In [14]:
dataframe = dataframe.select(SCENARIO_NAME, "cancelled_demand").where(col("cancelled_demand")).groupby(SCENARIO_NAME).count().withColumnRenamed("count", "AEQ1")
dataframe.show()

+------------------+----+
|          Scenario|AEQ1|
+------------------+----+
|1_very_low_40_8_W1|  90|
|1_very_low_40_8_R2|  90|
|3_very_low_40_8_W1|  90|
|3_very_low_40_8_R2|  90|
|2_very_low_40_8_W1|  90|
|2_very_low_40_8_R2|  90|
+------------------+----+



In [15]:
dataframe.select(AEQ1).show()

+----+
|AEQ1|
+----+
|  90|
|  90|
|  90|
|  90|
|  90|
|  90|
+----+



In [16]:
dataframe2 = input_dataframes[FLST_LOG_PREFIX].select(SCENARIO_NAME, ACID)
dataframe2 = dataframe2.groupby(SCENARIO_NAME).count().withColumnRenamed("count", "Num_Acids")
dataframe2 = dataframe2.join(dataframe, on=[SCENARIO_NAME], how='outer')
dataframe2 = dataframe2.withColumn(AEQ2, (col(AEQ1) / col("Num_Acids")) * 100).select(SCENARIO_NAME, AEQ2)
dataframe2.show()

+------------------+-----------------+
|          Scenario|             AEQ2|
+------------------+-----------------+
|1_very_low_40_8_R2|2.393617021276596|
|1_very_low_40_8_W1|2.393617021276596|
|2_very_low_40_8_R2|2.393617021276596|
|2_very_low_40_8_W1|2.393617021276596|
|3_very_low_40_8_R2|2.393617021276596|
|3_very_low_40_8_W1|2.393617021276596|
+------------------+-----------------+



In [17]:
df3 = df1.join(df2, on=[SCENARIO_NAME], how='outer')
df3 = df3.withColumn("AEQ2", (col("AEQ1")/col("Num_Acids"))*100).select(SCENARIO_NAME, AEQ2)
df3.show()

NameError: name 'df1' is not defined

In [None]:
input_dataframes[FLST_LOG_PREFIX].select(SCENARIO_NAME, ACID).groupby(SCENARIO_NAME).count().show()

In [None]:
input_dataframes[FLST_LOG_PREFIX].select(SCENARIO_NAME, PRIORITY, LOITERING, "delay", PRIORITY, LOITERING).((col(PRIORITY)==4) | (col(LOITERING))).show(30)

In [19]:
dataframe3 = input_dataframes[FLST_LOG_PREFIX].select(SCENARIO_NAME, ACID, FLIGHT_TIME, VEHICLE)
dataframe3.show()

+------------------+----+------------+-------+
|          Scenario|ACID| Flight_time|Vehicle|
+------------------+----+------------+-------+
|1_very_low_40_8_R2| D10|128.00000000|   MP30|
|1_very_low_40_8_R2| D54|199.00000000|   MP30|
|1_very_low_40_8_R2| D71|206.50000000|   MP30|
|1_very_low_40_8_R2| D86|221.50000000|   MP30|
|1_very_low_40_8_R2| D35|245.50000000|   MP20|
|1_very_low_40_8_R2|D151|250.50000000|   MP20|
|1_very_low_40_8_R2|D103|252.50000000|   MP30|
|1_very_low_40_8_R2|  D5|263.00000000|   MP20|
|1_very_low_40_8_R2| D85|273.00000000|   MP30|
|1_very_low_40_8_R2|D158|284.00000000|   MP30|
|1_very_low_40_8_R2|  D9|301.00000000|   MP20|
|1_very_low_40_8_R2| D95|301.50000000|   MP30|
|1_very_low_40_8_R2| D13|304.50000000|   MP30|
|1_very_low_40_8_R2|D165|314.50000000|   MP30|
|1_very_low_40_8_R2| D51|318.00000000|   MP20|
|1_very_low_40_8_R2|D111|325.00000000|   MP20|
|1_very_low_40_8_R2| D80|326.00000000|   MP30|
|1_very_low_40_8_R2|D243|326.50000000|   MP20|
|1_very_low_4

In [23]:
dataframe3 = input_dataframes[FLST_LOG_PREFIX].withColumn("autonomy", when(col(VEHICLE) == "MP20", settings.MP20.autonomy)
                                                         .otherwise(settings.MP30.autonomy))

dataframe3 = dataframe3.select(SCENARIO_NAME, ACID, FLIGHT_TIME, VEHICLE, "autonomy")
dataframe3.show()

+------------------+----+------------+-------+--------+
|          Scenario|ACID| Flight_time|Vehicle|autonomy|
+------------------+----+------------+-------+--------+
|1_very_low_40_8_R2| D10|128.00000000|   MP30|    1800|
|1_very_low_40_8_R2| D54|199.00000000|   MP30|    1800|
|1_very_low_40_8_R2| D71|206.50000000|   MP30|    1800|
|1_very_low_40_8_R2| D86|221.50000000|   MP30|    1800|
|1_very_low_40_8_R2| D35|245.50000000|   MP20|    1800|
|1_very_low_40_8_R2|D151|250.50000000|   MP20|    1800|
|1_very_low_40_8_R2|D103|252.50000000|   MP30|    1800|
|1_very_low_40_8_R2|  D5|263.00000000|   MP20|    1800|
|1_very_low_40_8_R2| D85|273.00000000|   MP30|    1800|
|1_very_low_40_8_R2|D158|284.00000000|   MP30|    1800|
|1_very_low_40_8_R2|  D9|301.00000000|   MP20|    1800|
|1_very_low_40_8_R2| D95|301.50000000|   MP30|    1800|
|1_very_low_40_8_R2| D13|304.50000000|   MP30|    1800|
|1_very_low_40_8_R2|D165|314.50000000|   MP30|    1800|
|1_very_low_40_8_R2| D51|318.00000000|   MP20|  

In [31]:
dataframe3 = dataframe3.withColumn("inoperative", when(col(FLIGHT_TIME)>= col("autonomy"), True).otherwise(False))
dataframe3 = dataframe3.select(SCENARIO_NAME, col("inoperative")).where(col("inoperative") == True).groupby(SCENARIO_NAME).count().withColumnRenamed("count", AEQ2)
dataframe3.show()

+------------------+----+
|          Scenario|AEQ2|
+------------------+----+
|1_very_low_40_8_W1|2347|
|1_very_low_40_8_R2|2347|
|3_very_low_40_8_W1|2347|
|3_very_low_40_8_R2|2347|
|2_very_low_40_8_W1|2347|
|2_very_low_40_8_R2|2347|
+------------------+----+



In [75]:
from pyspark.sql.functions import mean
from pyspark.sql.functions import abs

dataframe4 = input_dataframes[FLST_LOG_PREFIX].select(SCENARIO_NAME, BASELINE_ARRIVAL_TIME, DEL_TIME)
dataframe4 = dataframe4.withColumn("delay", (col(DEL_TIME) - col(BASELINE_ARRIVAL_TIME)))
avg_delay= dataframe4.select(mean("delay").alias("avg_delay"))
avg_delay.show()
dataframe4 = dataframe4.join(avg_delay, how='outer')
dataframe4 = dataframe4.withColumn("delay_increment", abs(col("delay")-col("avg_delay")))
dataframe4.show()
dataframe4 = dataframe4.groupby(SCENARIO_NAME).agg(F.max("delay_increment").alias(AEQ4))
dataframe4.show()

+------------------+
|         avg_delay|
+------------------+
|174.88071918752135|
+------------------+

+------------------+---------------------+-------------+-------------------+------------------+------------------+
|          Scenario|Baseline_arrival_time|Deletion_Time|              delay|         avg_delay|   delay_increment|
+------------------+---------------------+-------------+-------------------+------------------+------------------+
|1_very_low_40_8_R2|    93.58550517846484|        128.0|  34.41449482153516|174.88071918752135| 140.4662243659862|
|1_very_low_40_8_R2|   234.92153759118554|        199.0| -35.92153759118554|174.88071918752135| 210.8022567787069|
|1_very_low_40_8_R2|   194.43859065760762|        206.5|  12.06140934239238|174.88071918752135|162.81930984512897|
|1_very_low_40_8_R2|   153.96310445561087|        221.5|  67.53689554438913|174.88071918752135|107.34382364313223|
|1_very_low_40_8_R2|   191.42273558168245|        245.5| 54.077264418317554|174.880719187

In [98]:
dataframe5 = input_dataframes[FLST_LOG_PREFIX].select(SCENARIO_NAME, ACID, BASELINE_ARRIVAL_TIME, DEL_TIME)
dataframe5 = dataframe5.withColumn("delay", (col(DEL_TIME) - col(BASELINE_ARRIVAL_TIME)))
avg_delay= dataframe5.select(mean("delay").alias("avg_delay"))
avg_delay.show()
dataframe5 = dataframe5.join(avg_delay, how='outer')
dataframe5.show()
dataframe5.select(SCENARIO_NAME, ACID).where((col("delay") > col("avg_delay")+5) | (col("delay") < col("avg_delay")-5)).groupby(SCENARIO_NAME).count().withColumnRenamed("count", AEQ5).show()


+------------------+
|         avg_delay|
+------------------+
|174.88071918752135|
+------------------+

+------------------+----+---------------------+-------------+-------------------+------------------+
|          Scenario|ACID|Baseline_arrival_time|Deletion_Time|              delay|         avg_delay|
+------------------+----+---------------------+-------------+-------------------+------------------+
|1_very_low_40_8_R2| D10|    93.58550517846484|        128.0|  34.41449482153516|174.88071918752135|
|1_very_low_40_8_R2| D54|   234.92153759118554|        199.0| -35.92153759118554|174.88071918752135|
|1_very_low_40_8_R2| D71|   194.43859065760762|        206.5|  12.06140934239238|174.88071918752135|
|1_very_low_40_8_R2| D86|   153.96310445561087|        221.5|  67.53689554438913|174.88071918752135|
|1_very_low_40_8_R2| D35|   191.42273558168245|        245.5| 54.077264418317554|174.88071918752135|
|1_very_low_40_8_R2|D151|   207.18630252338767|        250.5|  43.31369747661233|174.8

In [105]:
from pyspark.sql.functions import stddev
dataframe6 = input_dataframes[FLST_LOG_PREFIX].select(SCENARIO_NAME, ACID, BASELINE_ARRIVAL_TIME, DEL_TIME)
dataframe6 = dataframe6.groupby(SCENARIO_NAME).agg(stddev(col(DEL_TIME) - col(BASELINE_ARRIVAL_TIME)).alias(AEQ3))
dataframe6.show()

+------------------+------------------+
|          Scenario|              AEQ3|
+------------------+------------------+
|1_very_low_40_8_W1|174.56794296252107|
|1_very_low_40_8_R2|174.56794296252107|
|3_very_low_40_8_W1|174.56794296252107|
|3_very_low_40_8_R2|174.56794296252107|
|2_very_low_40_8_W1|174.56794296252107|
|2_very_low_40_8_R2|174.56794296252107|
+------------------+------------------+

