In [1]:
### Notebook ingests exploded VNET logs into ADLS & Kusto by listing files recursively at source

StatementMeta(, , -1, SessionStarting, , SessionStarting)

# Imports

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
import subprocess
from notebookutils import mssparkutils
import pyspark.sql.functions as F

StatementMeta(, , , Waiting, , Waiting)

In [6]:
%run ADFJobs/CommonUtilFunctions/LinkedService_UtilFunctions

# Parameters

In [None]:
# Notebook parameters. Every name below is read by later cells, either directly
# or as a default argument of ADLSConnector / SIPWriter.
WRITE = True  # When False, ADLSConnector.write and SIPWriter.write log the target but skip the actual write (dry run)
DEBUG = True  # When True, log_debug() prints its messages; when False it is silent

ETL_DATE = "yesterday"  # "today", "yesterday" or a YYYY-MM-DD string; selects the ETL_DATE=<date> folder read from ADLS
REGION = "wcus"  # Region folder (Region=<REGION>) to read; only one region is supported per run
CONTAINER = "prod"  # ADLS container name; only the prod container is supported at the moment
DATA_PATH = "feeds/logs"  # Base directory inside the ADLS container
DOMAIN = "sipvnetlogsadlsgen2.dfs.core.windows.net"  # Fully qualified ADLS Gen2 (dfs) endpoint of the storage account
# Alternative test storage account (disabled):
#DOMAIN = "testadlsgen20.dfs.core.windows.net"  # ADLS domain service location
# Previous ADLS linked service name (disabled):
#LINKED_SERVICE_NAME = "sipvnetlogsadls_connector"  # Synapse Managed Identity Linked Service Name for ADLS domain
LINKED_SERVICE_NAME = "AzureDataLakeStorage_sipvnetlogsadlsgen2_LinkedService"  # Synapse Linked Service used to authenticate against DOMAIN
# NOTE(review): the database name ends in "_uat" while the Kusto linked service below
# is named "...prod..." - confirm this combination is intended.
SIP_DATABASE = "vnetlogs_uat"  # Kusto database that receives the aggregated rows
SIP_TABLE = "logs_aggregated"  # Kusto table name
# Previous Kusto linked service name (disabled):
#SIP_LINKED_SERVICE_NAME = "VNetFlowLogsSynapse_Connector"  # Synapse Managed Identity for writing to Kusto
SIP_LINKED_SERVICE_NAME = "AzureDataExplorer_Sipvnetlogsprod_LinkedService"  # Synapse Linked Service used for the Kusto write

StatementMeta(, , , Waiting, , Waiting)

# Source Configuration

In [8]:
from datetime import datetime, timedelta, timezone
import re
 
from pyspark.sql import DataFrame
from pyspark.sql import functions
from pyspark.sql.types import StringType
 
 
# Captures the path segment that follows "/resourcegroups/" in an Azure resource ID
# (compiled case-insensitively, so "/resourceGroups/" matches as well).
# The segment is matched as "anything but a slash": resource group names may contain
# periods and parentheses, which the previous class [\w|\-|\_] rejected (and the "|"
# inside that class was a literal pipe, not an alternation).
# The leading greedy ".*" means the LAST "/resourcegroups/<name>/" occurrence wins,
# and a trailing "/" after the name is required.
RESOURCE_GROUP_PATTERN_STR = r"^.*\/resourcegroups\/([^\/]+)\/.*$"
RESOURCE_GROUP_PATTERN = re.compile(RESOURCE_GROUP_PATTERN_STR, re.I)

# Captures a "/<uuid>/" path segment where the UUID has version nibble 4 and variant
# nibble 8/9/a/b; hyphens are optional. Because of the leading greedy ".*" the last
# such segment in the string is the one captured.
# NOTE(review): identifiers that are not version-4 UUIDs will not match - confirm that
# every subscription ID seen in the logs satisfies this.
UUID_PATTERN_STR = (
    r"^.*\/([a-f0-9]{8}-?[a-f0-9]{4}-?4"
    r"[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12})\/.*?$"
)
UUID_PATTERN = re.compile(UUID_PATTERN_STR, re.I)
 
 
def convert_ts(ts: int) -> datetime:
    """Convert a unix timestamp in milliseconds to a timezone-aware UTC datetime.

    Args:
        ts (int): a unix timestamp in milliseconds format.

    Returns:
        datetime: timezone-aware datetime (UTC) representing the same instant.
    """
    # Pass tz explicitly. A bare fromtimestamp() returns the host's *local*
    # wall-clock time; stamping that with tzinfo=UTC afterwards would shift the
    # instant by the host's UTC offset on any machine not running in UTC.
    return datetime.fromtimestamp(ts / 1e3, tz=timezone.utc)
 
 
def convert_dt(dt: datetime) -> int:
    """Turn a python datetime into a unix timestamp expressed in milliseconds.

    Args:
        dt (datetime): the datetime to convert.

    Returns:
        int: milliseconds since the unix epoch, truncated towards zero.
    """
    epoch_seconds = dt.timestamp()
    epoch_millis = epoch_seconds * 1000
    return int(epoch_millis)
 
 
def log_debug(debug_str: str, *args):
    """Print a formatted debug message when the DEBUG parameter is truthy.

    Args:
        debug_str (str): ``str.format`` template for the message.
        args (any): positional values for the template. A callable argument is
            invoked (with no arguments) and its result is interpolated, so
            costly values are only computed when debugging is enabled.
    """
    if not DEBUG:
        return

    resolved = []
    for arg in args:
        resolved.append(arg() if callable(arg) else arg)
    print(debug_str.format(*resolved))
 
 
class ADLSConnector:
    """Access to one ETL-date/region folder of an ADLS Gen2 account via a Synapse Linked Service.

    Builds the ``abfss://`` folder path from container, domain, base path, ETL
    date and region, reads the parquet files under it into a pyspark DataFrame,
    derives hourly window boundaries from the data's ``Timestamp`` column and
    can write DataFrames back below the same folder.
    """

    # Explicit date layouts accepted by ``_str_to_date``, tried in this order.
    _DATE_FORMATS = ("%Y-%m-%d", "%m/%d/%Y", "%m-%d-%Y")

    def __init__(
        self,
        date_str: str = ETL_DATE,
        region: str = REGION,
        container: str = CONTAINER,
        base_data_path: str = DATA_PATH,
        domain: str = DOMAIN,
        linked_service_name: str = LINKED_SERVICE_NAME,
    ):
        """Stores the connection settings and normalises the ETL date.

        Args:
            date_str (str, optional): "today", "yesterday" or a date string selecting
                the folder to read. Defaults to PARAMETER:ETL_DATE.
            region (str, optional): ADLS path based region string. Defaults to PARAMETER:REGION.
            container (str, optional): ADLS container name. Defaults to PARAMETER:CONTAINER.
            base_data_path (str, optional): ADLS base folder path. Defaults to PARAMETER:DATA_PATH.
            domain (str, optional): ADLS service domain name. Defaults to PARAMETER:DOMAIN.
            linked_service_name (str, optional): Name of the ADLS Linked Service.
                Defaults to PARAMETER:LINKED_SERVICE_NAME.
        """
        self._etl_date_str = self._str_to_date(date_str)
        self._region = region
        self._container = container
        self._data_path = base_data_path
        self._adls_domain = domain
        self._linked_service_name = linked_service_name

    def _str_to_date(self, date_str: str) -> str:
        """Normalises a date string to the ``YYYY-MM-DD`` form used in the ADLS folder name.

        Accepts "today" / "yesterday" (case-insensitive, evaluated in UTC) and the
        layouts YYYY-MM-DD, MM/DD/YYYY and MM-DD-YYYY. Anything else falls back to
        yesterday (UTC).

        Args:
            date_str (str): Some date string to be converted.

        Returns:
            str: date string in YYYY-MM-DD format.
        """
        # Take "now" in UTC directly; using the local clock and merely labelling it
        # UTC can select the wrong calendar day on hosts that are not in UTC.
        today = datetime.now(timezone.utc).replace(
            hour=0, minute=0, second=0, microsecond=0
        )
        yesterday = today - timedelta(days=1)

        text = date_str.strip()
        keyword = text.lower()
        if keyword == "today":
            return today.strftime("%Y-%m-%d")
        if keyword == "yesterday":
            return yesterday.strftime("%Y-%m-%d")

        for fmt in self._DATE_FORMATS:
            try:
                # Every layout returns its parsed value (the MM/DD/YYYY branch used
                # to parse the string and then drop the result).
                return datetime.strptime(text, fmt).strftime("%Y-%m-%d")
            except ValueError:
                continue

        # Deliberate best-effort default kept from the original behaviour.
        log_debug("Unrecognised date string [{}], falling back to yesterday", date_str)
        return yesterday.strftime("%Y-%m-%d")

    @property
    def data_path(self) -> str:
        """Folder path inside the container: ``<base>/ETL_DATE=<date>/Region=<region>``."""
        return "/".join(
            (
                self._data_path,
                self.etl_date,
                self.region,
            )
        )

    @property
    def domain(self) -> str:
        """ADLS service domain name."""
        return self._adls_domain

    @property
    def linked_service_name(self) -> str:
        """Name of the Synapse Linked Service used for ADLS access."""
        return self._linked_service_name

    @property
    def container(self) -> str:
        """ADLS container name."""
        return self._container

    @property
    def region(self) -> str:
        """Region path segment, ``Region=<region>``."""
        return f"Region={self._region}"

    @property
    def etl_date(self) -> str:
        """ETL date path segment, ``ETL_DATE=<YYYY-MM-DD>``."""
        return f"ETL_DATE={self._etl_date_str}"

    @property
    def abfss_domain(self) -> str:
        """Authority part of the abfss URI, ``<container>@<domain>``."""
        return f"{self.container}@{self.domain}"

    @property
    def parquet_read_path(self) -> str:
        """Full abfss URI of the folder read by ``query``."""
        return f"abfss://{self.abfss_domain}/{self.data_path}/"

    @property
    def abfss_tmp_path(self) -> str:
        """Full abfss URI of the ``tmpStreamingDir`` folder in the container."""
        return f"abfss://{self.abfss_domain}/tmpStreamingDir/"

    def get_hourly_timestamps(
        self, first_hour_ts: int, last_hour_ts: int, window_hours: "int | None" = 1
    ) -> list:
        """Returns hour boundaries, as millisecond timestamps, ending at the hour of ``last_hour_ts``.

        The final boundary is ``last_hour_ts`` truncated to the hour. The first
        boundary is ``window_hours`` hours earlier (default 1, i.e. exactly two
        boundaries delimiting one hour). With ``window_hours=None`` the first
        boundary is ``first_hour_ts`` truncated to the hour instead.

        Args:
            first_hour_ts (int): earliest timestamp (ms); only used when
                ``window_hours`` is None.
            last_hour_ts (int): latest timestamp (ms).
            window_hours (int | None, optional): number of hours to look back
                from the last hour. Defaults to 1.

        Returns:
            list: ascending millisecond timestamps spaced one hour apart.

        Raises:
            ValueError: if ``window_hours`` is negative.
        """
        last_dt = convert_ts(last_hour_ts).replace(minute=0, second=0, microsecond=0)

        if window_hours is None:
            start_time = convert_ts(first_hour_ts).replace(
                minute=0, second=0, microsecond=0
            )
        else:
            if window_hours < 0:
                raise ValueError("window_hours must be >= 0")
            start_time = last_dt - timedelta(hours=window_hours)

        hourly_ts = [convert_dt(start_time)]
        while start_time < last_dt:
            start_time += timedelta(hours=1)
            hourly_ts.append(convert_dt(start_time))

        return hourly_ts

    @staticmethod
    def get_min_ts(df: DataFrame) -> int:
        """Smallest value of the ``Timestamp`` column as an int.

        Runs a Spark aggregation. ``int(None)`` raises TypeError when the
        aggregate is null (e.g. no rows).
        """
        return int(df.agg(functions.min(df.Timestamp)).head(1)[0][0])

    @staticmethod
    def get_max_ts(df: DataFrame) -> int:
        """Largest value of the ``Timestamp`` column as an int.

        Runs a Spark aggregation. ``int(None)`` raises TypeError when the
        aggregate is null (e.g. no rows).
        """
        return int(df.agg(functions.max(df.Timestamp)).head(1)[0][0])

    def _configure_storage_auth(self) -> None:
        """Points the Spark session at this connector's Linked Service for ADLS access.

        The per-account keys are built from ``self.domain``; they were previously
        hard-coded to the documentation placeholder account
        ``teststorage.dfs.core.windows.net`` and therefore never applied to the
        account actually being read.
        """
        spark.conf.set(
            "spark.storage.synapse.linkedServiceName", self.linked_service_name
        )
        spark.conf.set(
            f"spark.storage.synapse.{self.domain}.linkedServiceName",
            self.linked_service_name,
        )
        # Fully qualified provider class name, the same one SIPWriter.spark_conf sets.
        # NOTE(review): the value used here before lacked the leading "com." - confirm
        # against the Synapse TokenLibrary documentation for the runtime in use.
        spark.conf.set(
            f"spark.hadoop.fs.azure.account.oauth.provider.type.{self.domain}",
            "com.microsoft.azure.synapse.tokenlibrary.LinkedServiceBasedTokenProvider",
        )

    def query(self) -> DataFrame:
        """Reads the parquet files under ``parquet_read_path``.

        Returns:
            DataFrame: pySpark.DataFrame rows read from storage path.
        """
        self._configure_storage_auth()

        # Pass the path as an argument, not as the format template, so a brace in
        # the path cannot break str.format inside log_debug.
        log_debug("{}", self.parquet_read_path)
        return spark.read.parquet(self.parquet_read_path)

    def write(self, df: DataFrame, hourly_path: str) -> None:
        """Writes a DataFrame as parquet below this connector's data folder.

        The write is skipped (only the target path is logged) when the WRITE
        parameter is falsy.

        Args:
            df (DataFrame): DataFrame rows to write to file.
            hourly_path (str): sub-folder name appended to ``data_path``.

        Returns:
            None
        """
        self._configure_storage_auth()

        write_path = f"abfss://{self.abfss_domain}/{self.data_path}/{hourly_path}/"
        log_debug("{}", write_path)
        if WRITE:
            return df.write.parquet(write_path)
 

# ADLS Connector

In [9]:
# Connect with the notebook parameters and load the selected ETL_DATE folder.
adls = ADLSConnector()
log_debug("Querying for {}", adls.etl_date)
df = adls.query()

# df.count is handed over uncalled: log_debug only invokes it when DEBUG is on,
# so the row count job is skipped otherwise.
log_debug("Total Rows [{}] returned", df.count)
log_debug("Columns [{}] returned", df.columns)

# Earliest and latest Timestamp in the loaded data ...
first_hour_ts = adls.get_min_ts(df)
last_hour_ts = adls.get_max_ts(df)

# ... and the hour boundaries (unix ms) derived from them for hourly processing.
daily_ts = adls.get_hourly_timestamps(first_hour_ts, last_hour_ts)
log_debug("TimeStamps Created for Hourly Processing [{}]", daily_ts)

In [8]:
# Re-derive the hourly boundaries from first_hour_ts / last_hour_ts, which the
# previous cell already computed from df. This cell used to repeat both min/max
# Spark aggregations over the full dataset; only the cheap, pure-Python boundary
# calculation is kept so the cell can be re-run after tweaking the time window.
daily_ts = adls.get_hourly_timestamps(first_hour_ts, last_hour_ts)
log_debug("TimeStamps Created for Hourly Processing [{}]", daily_ts)

In [8]:
class SIPWriter:
    """Summarises hourly snapshots and appends them to a Kusto table via a Synapse Linked Service."""

    def __init__(self,
                 database: str = SIP_DATABASE,
                 table: str = SIP_TABLE,
                 region: str = REGION,
                 linked_service_name: str = SIP_LINKED_SERVICE_NAME):
        """Stores the Kusto target settings and applies the Spark session configuration.

        Args:
            database (str, optional): Kusto Database name. Defaults to PARAMETERS:SIP_DATABASE.
            table (str, optional): Kusto Table name. Defaults to PARAMETERS:SIP_TABLE.
            region (str, optional): Region data was read from. Defaults to PARAMETERS:REGION.
            linked_service_name (str, optional): Synapse Linked Service. Defaults to PARAMETERS:SIP_LINKED_SERVICE_NAME.
        """
        self._database = database
        self._table = table
        self._region = region
        self._linked_service_name = linked_service_name
        # Must run after the attributes are set: spark_conf() reads the linked service name.
        self.spark_conf()

    @property
    def database(self) -> str:
        """Kusto database name."""
        return self._database

    @property
    def region(self) -> str:
        """Region string added to every written row."""
        return self._region

    @property
    def table(self) -> str:
        """Kusto table name."""
        return self._table

    @property
    def linked_service_name(self) -> str:
        """Synapse Linked Service used for the Kusto write."""
        return self._linked_service_name

    @staticmethod
    def summarize(df: DataFrame) -> DataFrame:
        """Counts rows per distinct flow tuple.

        Args:
            df (DataFrame): rows of one hourly snapshot.

        Returns:
            DataFrame: one row per combination of the grouping columns below plus
            a ``count`` column.
        """
        group_columns = [
            df.Source_IP, df.Destination_IP, df.TrafficFlow,
            df.Flow_State, df.macAddress, df.flowLogResourceID,
            df.targetResourceID, df.Source_Port, df.Destination_Port,
        ]
        return df.groupBy(group_columns).count()

    def resource_groups_and_summarized_dt(self, df: DataFrame, dt: datetime) -> DataFrame:
        """Adds subscription / resource-group columns, the summary datetime and the region.

        Adds ``flowLogSubscriptionID`` (from ``flowLogResourceID``),
        ``targetSubscriptionID`` and ``targetResourceGroup`` (from
        ``targetResourceID``), casts both port columns to int and adds the literal
        columns ``summarizedDT`` and ``region``.

        Args:
            df (DataFrame): pyspark.DataFrame rows
            dt (datetime): hourly timestamp from dataframe

        Returns:
            DataFrame: input rows with the additional / converted columns.
        """
        def match_subscription_fn(resource_str: str) -> str:
            # Null or empty resource IDs yield "" (same as "no match") instead of
            # failing the Spark task with a TypeError from re.match(None).
            if not resource_str:
                return ""
            found = UUID_PATTERN.match(resource_str)
            return found.group(1) if found else ""

        def match_resource_group_fn(resource_str: str) -> str:
            if not resource_str:
                return ""
            found = RESOURCE_GROUP_PATTERN.match(resource_str)
            return found.group(1) if found else ""

        match_subscription_udf = functions.udf(match_subscription_fn, StringType())
        match_resource_group_udf = functions.udf(match_resource_group_fn, StringType())

        return (
            df.withColumn('flowLogSubscriptionID', match_subscription_udf(df.flowLogResourceID))
              .withColumn("targetSubscriptionID", match_subscription_udf(df.targetResourceID))
              .withColumn("targetResourceGroup", match_resource_group_udf(df.targetResourceID))
              .withColumn("Source_Port", df["Source_Port"].cast('int'))
              .withColumn("Destination_Port", df["Destination_Port"].cast('int'))
              .withColumn('summarizedDT', functions.lit(dt))
              .withColumn('region', functions.lit(self.region))
        )

    @classmethod
    def create_table_column_map(cls, df: DataFrame) -> str:
        """Builds a JSON-like column mapping string from the DataFrame schema.

        Each column becomes ``{"Name":"<column>","<data type>":<position>}``.

        NOTE(review): the key carrying the position is the Spark data type's string
        form rather than a fixed key - confirm this is the mapping format the
        consumer expects. Output is unchanged from the previous implementation.

        Args:
            df (DataFrame): pyspark.DataFrame rows

        Returns:
            str: bracketed, comma separated column mappings.
        """
        entries = [
            f"""{{"Name":"{column_name}","{df.schema[column_name].dataType}":{ordinal}}}"""
            for ordinal, column_name in enumerate(df.columns)
        ]
        return "[" + ",".join(entries) + "]"

    def spark_conf(self) -> None:
        """Sets session-wide Spark configuration for Linked Service authorization."""
        spark.conf.set('fs.adl.oauth2.access.token.provider.type', 'ClientCredential')
        spark.conf.set('fs.adl.oauth2.refresh.url',
                       'https://login.microsoftonline.com/72f988bf-86f1-41af-91ab-2d7cd011db47/oauth2/token')
        # Use the configured linked service; this was hard-coded to a connector name
        # that no longer matches the SIP_LINKED_SERVICE_NAME parameter.
        spark.conf.set('spark.synapse.linkedService', self.linked_service_name)
        spark.conf.set('fs.azure.account.oauth.provider.type',
                       'com.microsoft.azure.synapse.tokenlibrary.LinkedServiceBasedTokenProvider')
        spark.conf.set('spark.sql.streaming.checkpointLocation', '/localWriteCheckpointFolder')
        spark.conf.set('spark.sql.codegen.wholeStage', 'false')

    def write(self, df: DataFrame) -> None:
        """Appends DataFrame rows to the Kusto table (created if it does not exist).

        The write is skipped (only the target is logged) when the WRITE parameter
        is falsy.

        Args:
            df (DataFrame): DataFrame rows to write to table.

        Returns:
            None
        """
        log_debug("Writing {}.{}", self.database, self.table)
        if not WRITE:
            return None

        kusto_writer = (
            df.write
            .format('com.microsoft.kusto.spark.synapse.datasource')
            .option('spark.synapse.linkedService', self.linked_service_name)
            .option('kustoDatabase', self.database)
            .option('kustoTable', self.table)
            .option('authType', 'LS')
            .option('tableCreateOptions', 'CreateIfNotExist')
            .mode('Append')
        )
        return kusto_writer.save()

In [9]:
log_debug("# Processing Hourly Aggregations")
sip_writer = SIPWriter()
total_summarized_rows = 0

# daily_ts holds N+1 hour boundaries, i.e. N consecutive [start, end) windows.
# Walk every adjacent pair: the former range(1, len(daily_ts) - 1) skipped the
# first window and was empty whenever daily_ts had only two entries, so nothing
# was aggregated or written.
for window_start, window_end in zip(daily_ts, daily_ts[1:]):
    window_start_dt = convert_ts(window_start)
    hourly_snapshot = df.filter((df.Timestamp >= window_start) & (df.Timestamp < window_end))
    # hourly_snapshot.count is passed uncalled so the count job only runs when DEBUG is on.
    log_debug("- From [{}] to [{}]\n - Rows [{}]",
              window_start_dt,
              convert_ts(window_end),
              hourly_snapshot.count)
    hourly_snapshot_summary = sip_writer.summarize(hourly_snapshot)
    # Add resource columns, summarized_dt and change port columns to int
    ext_hourly_summary = sip_writer.resource_groups_and_summarized_dt(hourly_snapshot_summary,
                                                                      window_start_dt)
    log_debug("- Extended Columns [{}]", ext_hourly_summary.columns)
    if DEBUG:
        # Extra Spark job; the total is only reported by the debug log below.
        total_summarized_rows += ext_hourly_summary.count()

    sip_writer.write(ext_hourly_summary)
    log_debug("=" * 88)

log_debug("# Hourlying Aggregation Finished with rows [{}] appended to [{}.{}]",
          total_summarized_rows, sip_writer.database, sip_writer.table)