In [25]:
import pyspark
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F

In [26]:
# Storage account credentials used for SharedKey authentication against ADLS Gen2.
# NOTE(review): credentials are embedded in the notebook; consider loading them
# from the environment or a secrets manager before sharing or committing.
account_name = "REDACTED"
account_key = "REDACTED"

# Local Spark session (all cores, 4g driver memory) configured so that
# abfss:// paths on the storage account authenticate with the account key.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Ingestion")
    .config("spark.driver.memory", "4g")
    .config(f"fs.azure.account.auth.type.{account_name}.dfs.core.windows.net", "SharedKey")
    .config(f"fs.azure.account.key.{account_name}.dfs.core.windows.net", account_key)
    .getOrCreate()
)

# Handle on the underlying SparkContext of the session.
sc = spark.sparkContext

In [27]:
# Source location of the master-data tables read by readDataframeFromAdls.
container_name = "data"
path_to_table = "/masterdata/"

def readDataframeFromAdls(spark_session, container_name, path_to_table, table_name):
    """Read a Parquet table from the ADLS Gen2 storage account.

    Args:
        spark_session: Session whose ``read.parquet`` is used to load the table.
        container_name: Storage container holding the table.
        path_to_table: Directory prefix inside the container; it is concatenated
            as-is, so it must carry its own leading and trailing slashes.
        table_name: Name of the table directory under ``path_to_table``.

    Returns:
        Whatever ``spark_session.read.parquet`` returns for the assembled
        ``abfss://`` location.
    """
    source_uri = f"abfss://{container_name}@REDACTED.dfs.core.windows.net{path_to_table}{table_name}"
    reader = spark_session.read
    return reader.parquet(source_uri)

In [28]:
# File table: only the file key is kept.
df_qb_file = readDataframeFromAdls(
    spark_session=spark,
    container_name=container_name,
    path_to_table=path_to_table,
    table_name='df_qb_file',
).select('ID_FILE')

# State table: the central table every other state_* table is joined onto.
df_qd_state = (
    readDataframeFromAdls(
        spark_session=spark,
        container_name=container_name,
        path_to_table=path_to_table,
        table_name='df_qd_state',
    )
    .select(
        'ID', 'ID_FILE',
        'DAY', 'MONTH', 'YEAR',
        'STIME', 'STYPE',
        'ACTIVITY_STATE', 'ACTIVITY_CONFID',
    )
    # STIME / 1000 rounded to an integer (presumably milliseconds -> seconds; confirm).
    .withColumn('STIME_SEC', F.round(F.col('STIME') / 1000).cast('int'))
    # Rename the primary key so it matches the ID_STATE key of the other tables.
    .withColumnRenamed('ID', 'ID_STATE')
)

# GPS readings per state, keyed by (ID_STATE, ID_FILE).
df_qd_state_gps = (
    readDataframeFromAdls(
        spark_session=spark,
        container_name=container_name,
        path_to_table=path_to_table,
        table_name='df_qd_state_gps',
    )
    .select(
        'ID_STATE', 'ID_FILE',
        'LTIME', 'SOURCE', 'SPEED', 'ACCURACY',
        'LON', 'LAT',
    )
    # LTIME / 1000 rounded to an integer (presumably milliseconds -> seconds; confirm).
    .withColumn('LTIME_SEC', F.round(F.col('LTIME') / 1000).cast('int'))
)

# Cell readings per state; SLOT is used as a filter in the join condition.
df_qd_state_cell = readDataframeFromAdls(
    spark_session=spark,
    container_name=container_name,
    path_to_table=path_to_table,
    table_name='df_qd_state_cell',
).select('ID_STATE', 'ID_FILE', 'TYPE', 'LEVEL', 'QUAL', 'SLOT')

# Wifi rows per state. WIFI_CONNECTED starts as a copy of ID_STATE so that,
# after a left join, it is null exactly for states with no matching wifi row.
df_qd_state_wifi = (
    readDataframeFromAdls(
        spark_session=spark,
        container_name=container_name,
        path_to_table=path_to_table,
        table_name='df_qd_state_wifi',
    )
    .select('ID_STATE', 'ID_FILE')
    .withColumn('WIFI_CONNECTED', F.col('ID_STATE'))
)

# Sensor readings per state (light, magnetometer axes, proximity).
# Each magnetometer axis is kept only when its own absolute value is below
# 100000; otherwise it is replaced by null.
# Fix: MAGNET_Y and MAGNET_Z were previously tested against, and overwritten
# with, MAGNET_X, so all three axes ended up holding the X reading.
# NOTE(review): the 100000 cut-off presumably removes sensor glitches - confirm.
# NOTE(review): the table is read as 'df_qd_state_sens' (no trailing "e") - verify
# this matches the stored table name.
df_qd_state_sense = (
    readDataframeFromAdls(spark, container_name, path_to_table, 'df_qd_state_sens')
        .select(['ID_STATE', 'ID_FILE', 'LIGHT', 'MAGNET_X', 'MAGNET_Y', 'MAGNET_Z', 'PROXIMITY'])
        .withColumn('MAGNET_X', F.when(
            (F.abs(F.col('MAGNET_X')) < 100000), F.col('MAGNET_X'))
            .otherwise(F.lit(None)))
        .withColumn('MAGNET_Y', F.when(
            (F.abs(F.col('MAGNET_Y')) < 100000), F.col('MAGNET_Y'))
            .otherwise(F.lit(None)))
        .withColumn('MAGNET_Z', F.when(
            (F.abs(F.col('MAGNET_Z')) < 100000), F.col('MAGNET_Z'))
            .otherwise(F.lit(None)))
)

# Battery charge per state, keyed by (ID_STATE, ID_FILE).
df_qd_state_batt = readDataframeFromAdls(
    spark_session=spark,
    container_name=container_name,
    path_to_table=path_to_table,
    table_name='df_qd_state_batt',
).select('ID_STATE', 'ID_FILE', 'BATT_CHARGE')

In [29]:
# Join keys / predicates used to attach each table to df_qd_state.

# State -> file: plain equi-join on the shared column name.
condition_qdState_qbFile = ['ID_FILE']

# State -> GPS: match on both keys and keep only GPS rows whose coordinates
# lie inside the valid longitude / latitude ranges.
condition_qdState_qdStateGps = [
    (df_qd_state['ID_STATE'] == df_qd_state_gps['ID_STATE'])
    & (df_qd_state['ID_FILE'] == df_qd_state_gps['ID_FILE'])
    & df_qd_state_gps['LON'].between(-180, 180)
    & df_qd_state_gps['LAT'].between(-90, 90)
]

# State -> cell: match on both keys, restricted to cell rows with SLOT == 0.
condition_qdState_qdStateCell = [
    (df_qd_state['ID_STATE'] == df_qd_state_cell['ID_STATE'])
    & (df_qd_state['ID_FILE'] == df_qd_state_cell['ID_FILE'])
    & (df_qd_state_cell['SLOT'] == 0)
]

# Remaining tables: equi-joins on the shared (ID_FILE, ID_STATE) column names.
condition_qdState_qdStateWifi = ['ID_FILE', 'ID_STATE']
condition_qdState_qdStateSense = ['ID_FILE', 'ID_STATE']
condition_qdState_qdStateBatt = ['ID_FILE', 'ID_STATE']

In [30]:
# Assemble the wide extract: states restricted to known files and valid GPS
# fixes (inner joins), then enriched with cell, wifi, sensor and battery data
# (left joins, so states without those readings are kept).
# NOTE(review): every right-hand table carries a broadcast hint; confirm each
# one is small enough for the 4g driver configured for this session.
df_join = (
    df_qd_state
    .join(F.broadcast(df_qb_file), condition_qdState_qbFile, 'inner')
    .join(F.broadcast(df_qd_state_gps), condition_qdState_qdStateGps, 'inner')
    # Expression joins keep both sides' key columns; drop the GPS copies.
    .drop(df_qd_state_gps.ID_FILE)
    .drop(df_qd_state_gps.ID_STATE)
    .join(F.broadcast(df_qd_state_cell), condition_qdState_qdStateCell, 'left')
    # Same for the cell-side key columns.
    .drop(df_qd_state_cell.ID_FILE)
    .drop(df_qd_state_cell.ID_STATE)
    # Name-based joins below merge the key columns, so no drop is needed.
    .join(F.broadcast(df_qd_state_wifi), condition_qdState_qdStateWifi, 'left')
    .join(F.broadcast(df_qd_state_sense), condition_qdState_qdStateSense, 'left')
    .join(F.broadcast(df_qd_state_batt), condition_qdState_qdStateBatt, 'left')
    .cache()
)

In [31]:
# Turn WIFI_CONNECTED into a boolean flag: after the left join it holds the
# wifi-side copy of ID_STATE, i.e. it is null exactly when no wifi row matched.
# `isNotNull()` already yields a non-null boolean, so wrapping it in
# when(..., True).otherwise(False) was redundant.
df_wifi = df_join.withColumn(
    'WIFI_CONNECTED',
    F.col('WIFI_CONNECTED').isNotNull(),
)

In [32]:
# Target location for the extract written by writeDataframeToAdls.
# NOTE(review): this rebinds the same names the read cells use; re-running a
# read cell after this one would look under "/extraction/" instead.
container_name = "data"
path_to_table = "/extraction/"

def writeDataframeToAdls(dataframe, container_name, path_to_table, table_name, mode='overwrite'):
    """Write a DataFrame as Parquet to the ADLS Gen2 storage account.

    Args:
        dataframe: DataFrame whose ``write`` interface is used.
        container_name: Storage container to write into.
        path_to_table: Directory prefix inside the container; it is concatenated
            as-is, so it must carry its own leading and trailing slashes.
        table_name: Name of the table directory created under ``path_to_table``.
        mode: Save mode passed to the writer (defaults to ``'overwrite'``).

    Returns:
        None. The write is performed as a side effect.
    """
    target_uri = f"abfss://{container_name}@REDACTED.dfs.core.windows.net{path_to_table}{table_name}"
    # `.parquet()` selects the Parquet format itself. The earlier
    # `.format("table_name")` call passed a literal, non-existent format name
    # that was immediately overridden, so it has been removed.
    dataframe.write.mode(mode).parquet(target_uri)

In [33]:
# Persist the final extract (default mode: overwrite) as table "df_extract".
writeDataframeToAdls(
    dataframe=df_wifi,
    container_name=container_name,
    path_to_table=path_to_table,
    table_name="df_extract",
)