In [7]:
import sys
from pathlib import Path
from typing import List, Dict

import pyspark.sql.functions as F
from loguru import logger
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, when

from config import settings

In [8]:
# One prefix per log type. `load_dataframes` lower-cases each prefix and
# appends `.parquet` to build the file name it reads.
CONF_LOG_PREFIX = 'CONFLOG'
FLST_LOG_PREFIX = 'FLSTLOG'
GEO_LOG_PREFIX = 'GEOLOG'
LOS_LOG_PREFIX = 'LOSLOG'
REG_LOG_PREFIX = 'REGLOG'

# Directory that contains the parquet files.
# NOTE(review): absolute, machine-specific path — confirm it should not come from `settings`.
LOADING_PATH = '/mnt/shared/repos/metropolis/M2_data_analysis_platform/output'

# Every log that is loaded at start-up, in loading order.
DATAFRAMES_NAMES = [
    CONF_LOG_PREFIX,
    FLST_LOG_PREFIX,
    GEO_LOG_PREFIX,
    LOS_LOG_PREFIX,
    REG_LOG_PREFIX,
]

In [9]:
# Make the `platform_code` directory that sits next to the parent of the
# current working directory importable, so `schemas` can be resolved below.
sys.path.append(str(Path(Path().absolute().parent, 'platform_code')))
# NOTE(review): star import — the column-name constants used in the cells below
# (SCENARIO_NAME, PRIORITY, DEL_TIME, AEQ1, ...) presumably come from this
# module; confirm and consider importing them explicitly.
from schemas.tables_attributes import *

In [10]:
def load_dataframes(files_names: List[str], loading_path: str, spark: SparkSession) -> Dict[str, DataFrame]:
    """ Reads one parquet file per requested name and returns them by name.

    Each name is lower-cased and suffixed with `.parquet` to build the file
    location inside `loading_path`.

    :param files_names: names of the files to load, without extension.
    :param loading_path: directory that contains the parquet files.
    :param spark: spark session used to read the files.
    :return: dictionary that maps every name, exactly as it was passed in,
     to the dataframe read from its parquet file.
    """
    loaded = {}

    for name in files_names:
        parquet_path = Path(loading_path) / f'{name.lower()}.parquet'
        logger.info('Loading dataframe from `{}`.', parquet_path)
        loaded[name] = spark.read.parquet(str(parquet_path))

    return loaded

In [11]:
# `getOrCreate()` hands back the session that is already running, if any,
# instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Notebook')
    .getOrCreate()
)

In [12]:
# Dictionary of dataframes keyed by log prefix (e.g. FLST_LOG_PREFIX).
input_dataframes = load_dataframes(
    files_names=DATAFRAMES_NAMES,
    loading_path=LOADING_PATH,
    spark=spark,
)

2022-03-25 12:06:50.225 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `/mnt/shared/repos/metropolis/M2_data_analysis_platform/output/conflog.parquet`.
2022-03-25 12:06:55.596 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `/mnt/shared/repos/metropolis/M2_data_analysis_platform/output/flstlog.parquet`.
2022-03-25 12:06:55.881 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `/mnt/shared/repos/metropolis/M2_data_analysis_platform/output/geolog.parquet`.
2022-03-25 12:06:56.081 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `/mnt/shared/repos/metropolis/M2_data_analysis_platform/output/loslog.parquet`.
2022-03-25 12:06:56.284 | INFO     | __main__:load_dataframes:16 - Loading dataframe from `/mnt/shared/repos/metropolis/M2_data_analysis_platform/output/reglog.parquet`.


In [13]:
# Keep only the flight-log columns that the delay computation below needs.
dataframe = input_dataframes[FLST_LOG_PREFIX].select(
    SCENARIO_NAME,
    PRIORITY,
    LOITERING,
    BASELINE_ARRIVAL_TIME,
    DEL_TIME,
)

In [14]:
# delay = deletion time - baseline arrival time; the value is negative when
# the deletion time is smaller than the baseline arrival time.
dataframe = dataframe.withColumn(
    "delay",
    col(DEL_TIME) - col(BASELINE_ARRIVAL_TIME),
)

In [15]:
# Preview the selected columns together with the computed `delay`.
dataframe.show()

                                                                                

+------------------+--------+---------+---------------------+-------------+-------------------+
|          Scenario|Priority|loitering|Baseline_arrival_time|Deletion_Time|              delay|
+------------------+--------+---------+---------------------+-------------+-------------------+
|1_very_low_40_8_W1|       3|    false|    93.58550517846484|        128.0|  34.41449482153516|
|1_very_low_40_8_W1|       2|    false|   234.92153759118554|        199.0| -35.92153759118554|
|1_very_low_40_8_W1|       3|    false|   194.43859065760762|        206.5|  12.06140934239238|
|1_very_low_40_8_W1|       2|    false|   153.96310445561087|        221.5|  67.53689554438913|
|1_very_low_40_8_W1|       3|    false|   191.42273558168245|        245.5| 54.077264418317554|
|1_very_low_40_8_W1|       1|    false|   207.18630252338767|        250.5|  43.31369747661233|
|1_very_low_40_8_W1|       3|    false|    184.8503295284063|        252.5|  67.64967047159371|
|1_very_low_40_8_W1|       1|    false| 

In [None]:
# Pick the delay threshold per flight. The branches are evaluated in order:
#   1. priority 4      -> emergency mission threshold
#   2. loitering flag  -> loitering mission threshold
#   3. everything else -> delivery mission threshold
dataframe = dataframe.withColumn(
    "cancelation_limit",
    when(col(PRIORITY) == 4, settings.thresholds.emergency_mission_delay)
    .when(col(LOITERING), settings.thresholds.loitering_mission_delay)
    .otherwise(settings.thresholds.delivery_mission_delay),
)

In [None]:
# A flight is flagged as cancelled demand when its delay reaches (>=) the
# threshold chosen for it. The final projection drops the raw time columns.
dataframe = (
    dataframe
    .withColumn("cancelled_demand", col("delay") >= col("cancelation_limit"))
    .select(SCENARIO_NAME, PRIORITY, LOITERING, "delay", "cancelation_limit", "cancelled_demand")
)

In [None]:
# AEQ1: number of flights flagged as cancelled demand, per scenario.
# The result column is named through the AEQ1 constant (not a string literal)
# because the following cells read it back with `col(AEQ1)` / `select(AEQ1)`.
dataframe = (
    dataframe
    .where(col("cancelled_demand"))
    .groupby(SCENARIO_NAME)
    .count()
    .withColumnRenamed("count", AEQ1)
)
dataframe.show()

In [None]:
# Show only the metric column. This works only if the value of the AEQ1
# constant matches the name given to the count column in the previous cell.
dataframe.select(AEQ1).show()

In [None]:
# Number of flight-log rows per scenario.
# NOTE(review): this counts rows, not distinct ACIDs — confirm one row per ACID.
dataframe2 = (
    input_dataframes[FLST_LOG_PREFIX]
    .select(SCENARIO_NAME, ACID)
    .groupby(SCENARIO_NAME)
    .count()
    .withColumnRenamed("count", "Num_Acids")
)
# The outer join keeps scenarios that have no row in `dataframe` (no flight
# flagged as cancelled); their AEQ1 is null after the join.
dataframe2 = dataframe2.join(dataframe, on=[SCENARIO_NAME], how='outer')
# AEQ2: AEQ1 as a percentage of the rows of the scenario. A missing AEQ1 is
# treated as zero so that those scenarios report 0 instead of null.
dataframe2 = dataframe2.withColumn(
    AEQ2,
    (F.coalesce(col(AEQ1), F.lit(0)) / col("Num_Acids")) * 100,
).select(SCENARIO_NAME, AEQ2)
dataframe2.show()

In [None]:
# NOTE(review): `df1` and `df2` are not defined in any visible cell, so this
# cell raises NameError on a fresh kernel. It repeats the join / percentage
# computation of the previous cell — confirm whether it can be removed.
df3 = df1.join(df2, on=[SCENARIO_NAME], how='outer')
# Uses the string literals "AEQ2"/"AEQ1" for the computation but the AEQ2
# constant for the final projection; both must name the same column.
df3 = df3.withColumn("AEQ2", (col("AEQ1") / col("Num_Acids")) * 100).select(SCENARIO_NAME, AEQ2)
df3.show()

In [None]:
# Sanity check: number of flight-log rows per scenario.
(
    input_dataframes[FLST_LOG_PREFIX]
    .select(SCENARIO_NAME, ACID)
    .groupby(SCENARIO_NAME)
    .count()
    .show()
)

In [None]:
# Inspect the delay of the flights that get a non-default threshold, i.e.
# priority 4 or loitering flights.
# `delay` is not selected from the raw flight log elsewhere in this notebook,
# it is always derived, so it is derived here as well.
(
    input_dataframes[FLST_LOG_PREFIX]
    .withColumn("delay", col(DEL_TIME) - col(BASELINE_ARRIVAL_TIME))
    .select(SCENARIO_NAME, PRIORITY, LOITERING, "delay")
    .where((col(PRIORITY) == 4) | col(LOITERING))
    .show(30)
)

In [None]:
# Columns needed to compare the flight time of each aircraft with the
# autonomy of its vehicle type.
dataframe3 = input_dataframes[FLST_LOG_PREFIX].select(
    SCENARIO_NAME,
    ACID,
    FLIGHT_TIME,
    VEHICLE,
)
dataframe3.show()

In [None]:
# Attach the autonomy of the vehicle to every flight: rows whose vehicle is
# "MP20" get `settings.MP20.autonomy`, every other row gets
# `settings.MP30.autonomy`.
dataframe3 = (
    input_dataframes[FLST_LOG_PREFIX]
    .withColumn(
        "autonomy",
        when(col(VEHICLE) == "MP20", settings.MP20.autonomy).otherwise(settings.MP30.autonomy),
    )
    .select(SCENARIO_NAME, ACID, FLIGHT_TIME, VEHICLE, "autonomy")
)
dataframe3.show()

In [None]:
# A flight is inoperative when its flight time reaches (>=) the autonomy of
# its vehicle; `otherwise(False)` keeps the flag non-null.
dataframe3 = dataframe3.withColumn(
    "inoperative",
    when(col(FLIGHT_TIME) >= col("autonomy"), True).otherwise(False),
)
# Count the inoperative flights per scenario.
# NOTE(review): the result is stored under AEQ2, the same constant an earlier
# cell uses for the cancelled-demand percentage — confirm the metric name.
dataframe3 = (
    dataframe3
    .select(SCENARIO_NAME, "inoperative")
    .filter(col("inoperative"))
    .groupby(SCENARIO_NAME)
    .count()
    .withColumnRenamed("count", AEQ2)
)
dataframe3.show()

In [17]:
from pyspark.sql.functions import col, mean

# Mean of (deletion time - baseline arrival time) for every scenario.
avg_delay = (
    input_dataframes[FLST_LOG_PREFIX]
    .select(SCENARIO_NAME, BASELINE_ARRIVAL_TIME, DEL_TIME)
    .groupby(SCENARIO_NAME)
    .agg(mean(col(DEL_TIME) - col(BASELINE_ARRIVAL_TIME)).alias("avg_delay"))
)
# `show()` returns None, so it is called separately: chaining it onto the
# assignment above would bind `avg_delay` to None instead of the dataframe.
avg_delay.show()

+------------------+------------------+
|          Scenario|         avg_delay|
+------------------+------------------+
|1_very_low_40_8_W1|174.88071918752135|
|3_very_low_40_8_W1|174.88071918752135|
|2_very_low_40_8_W1|174.88071918752135|
|3_very_low_40_8_R2|174.88071918752135|
|1_very_low_40_8_R2|174.88071918752135|
|2_very_low_40_8_R2|174.88071918752135|
+------------------+------------------+



In [16]:
from pyspark.sql.functions import mean

# Per-flight delay = deletion time - baseline arrival time.
dataframe4 = input_dataframes[FLST_LOG_PREFIX].select(SCENARIO_NAME, BASELINE_ARRIVAL_TIME, DEL_TIME)
dataframe4 = dataframe4.withColumn("delay", col(DEL_TIME) - col(BASELINE_ARRIVAL_TIME))

# One-row dataframe with the mean delay over ALL rows (not per scenario).
avg_delay = dataframe4.select(mean("delay").alias("avg_delay"))
avg_delay.show()

# Attach that single average to every flight row. This is a cartesian product
# with a one-row frame, so `crossJoin` states the intent explicitly instead of
# an outer join without join keys.
dataframe4 = dataframe4.crossJoin(avg_delay)

# `F.abs` is used rather than `from pyspark.sql.functions import abs`, which
# would shadow the builtin `abs` for the rest of the notebook.
dataframe4 = dataframe4.withColumn("delay_increment", F.abs(col("delay") - col("avg_delay")))
dataframe4.show()

# AEQ4: largest absolute deviation from the overall mean delay, per scenario.
dataframe4 = dataframe4.groupby(SCENARIO_NAME).agg(F.max("delay_increment").alias(AEQ4))
dataframe4.show()

+------------------+
|         avg_delay|
+------------------+
|174.88071918752118|
+------------------+

+------------------+---------------------+-------------+-------------------+------------------+------------------+
|          Scenario|Baseline_arrival_time|Deletion_Time|              delay|         avg_delay|   delay_increment|
+------------------+---------------------+-------------+-------------------+------------------+------------------+
|1_very_low_40_8_W1|    93.58550517846484|        128.0|  34.41449482153516|174.88071918752118|140.46622436598602|
|1_very_low_40_8_W1|   234.92153759118554|        199.0| -35.92153759118554|174.88071918752118|210.80225677870672|
|1_very_low_40_8_W1|   194.43859065760762|        206.5|  12.06140934239238|174.88071918752118| 162.8193098451288|
|1_very_low_40_8_W1|   153.96310445561087|        221.5|  67.53689554438913|174.88071918752118|107.34382364313205|
|1_very_low_40_8_W1|   191.42273558168245|        245.5| 54.077264418317554|174.880719187

In [None]:
# Per-flight delay = deletion time - baseline arrival time.
dataframe5 = input_dataframes[FLST_LOG_PREFIX].select(SCENARIO_NAME, ACID, BASELINE_ARRIVAL_TIME, DEL_TIME)
dataframe5 = dataframe5.withColumn("delay", col(DEL_TIME) - col(BASELINE_ARRIVAL_TIME))

# One-row dataframe with the mean delay over ALL rows (not per scenario).
avg_delay = dataframe5.select(F.mean("delay").alias("avg_delay"))
avg_delay.show()

# Attach that single average to every flight row (cartesian product with a
# one-row frame).
dataframe5 = dataframe5.crossJoin(avg_delay)
dataframe5.show()

# AEQ5: flights per scenario whose delay lies more than 5 away from the
# overall mean delay, in either direction.
# NOTE(review): 5 is a hard-coded tolerance, presumably in the same unit as
# the time columns — confirm and consider moving it to `settings`.
# The filter runs BEFORE the projection because it needs `delay` and
# `avg_delay`, which the projection drops.
(
    dataframe5
    .where((col("delay") > col("avg_delay") + 5) | (col("delay") < col("avg_delay") - 5))
    .select(SCENARIO_NAME, ACID)
    .groupby(SCENARIO_NAME)
    .count()
    .withColumnRenamed("count", AEQ5)
    .show()
)


In [None]:
from pyspark.sql.functions import stddev

# AEQ3: standard deviation of (deletion time - baseline arrival time) for
# every scenario.
dataframe6 = (
    input_dataframes[FLST_LOG_PREFIX]
    .select(SCENARIO_NAME, ACID, BASELINE_ARRIVAL_TIME, DEL_TIME)
    .groupby(SCENARIO_NAME)
    .agg(stddev(col(DEL_TIME) - col(BASELINE_ARRIVAL_TIME)).alias(AEQ3))
)
dataframe6.show()