In [1]:
#Prep data here

# dbutils.widgets.text("SliceStart", "", "")
# sliceStart = dbutils.widgets.get(arg)
#sliceStart = "07-22-2019"

# %run ../Utility/gtm_utils
# %run ../Utility/read_raw_data
# %run ../Utility/preprocess_data
# %run ../Age/get_route_age_data_elements $company_code="AGT"
# %run ../Age/get_route_age_data_groups

In [2]:
#All variables needed for this test go here
# Location of the pipe-delimited CSV that lists the tests to run (one row per test).
configCSVLocation = '/mnt/turingdata/raw/solutions/turing/integrationDataTest/compareDataTest/dataAgeTests/route/config/routeAgeDataConfig.csv'
# Destination the accumulated report is written to once all tests have run.
reportLocation = "/mnt/turingdata/raw/solutions/turing/integrationDataTest/compareDataTest/dataAgeTests/route/config/report.html"
# Report buffer; addToReport() appends HTML fragments to this string.
overAllLogs = ""

In [3]:
from pyspark.sql.functions import col
from pyspark.sql.types import *
from datetime import datetime
from decimal import Decimal
from  pyspark.sql.functions import abs
import os

In [4]:
def dfToHTML(df, style=None, random_id=None):
    """Render a DataFrame as an HTML table preceded by a <style> block scoped to it.

    Args:
        df: Object exposing ``toPandas()`` (e.g. a Spark DataFrame); the resulting
            pandas frame is rendered with ``to_html()``.
        style: Optional CSS. Surrounding ``<style>`` tags are stripped, and every
            line is treated as one rule: lines that do not already start with
            ``table`` are prefixed with it so the rule only targets this table.
            When omitted, a default rule colouring the table blue is used.
        random_id: Optional id for the ``<table>`` element. When omitted an id of
            the form ``id<number>`` with a random number below 1,000,000 is used.

    Returns:
        str: the ``<style>`` block followed by the table markup.
    """
    import numpy as np
    import re

    df_html = df.toPandas().to_html()

    if random_id is None:
        # Draw one integer from the same 0..999999 range without first building a
        # million-element array just to pick a single value from it.
        random_id = 'id%d' % np.random.randint(1000000)

    if style is None:
        style = """
        <style>
            table#{random_id} {{color: blue}}
        </style>
        """.format(random_id=random_id)
    else:
        new_style = []
        s = re.sub(r'</?style>', '', style).strip()
        for line in s.split('\n'):
            line = line.strip()
            if not line.startswith('table'):
                line = 'table ' + line
            new_style.append(line)
        new_style = ['<style>'] + new_style + ['</style>']

        # Only rewrite the stand-alone `table` selector. The lookarounds keep
        # identifiers that merely contain the word (`table-layout`, `.mytable`)
        # intact; they used to be turned into `table#<id>-layout` etc.
        # A callable replacement keeps the id literal (no backslash-escape processing).
        # NOTE(review): a bare `table` used as a property value (`display: table`)
        # is still rewritten, exactly as before.
        style = re.sub(r'(?<![\w-])table(?![\w-])(#\S+)?',
                       lambda match: 'table#%s' % random_id,
                       '\n'.join(new_style))

    # Tag the rendered table with the id the style block refers to.
    df_html = df_html.replace('<table', '<table id=%s ' % random_id)

    return style + df_html

In [5]:
#Debug function can redirect logs to various output formats

def addToReport(content):
  """Append ``content`` (converted with str()) plus a newline to the global report buffer."""
  global overAllLogs
  overAllLogs = overAllLogs + str(content) + '\n'
  
def debug(logs):
  """Print a log message and add it to the report, wrapped in level-specific HTML.

  The level is the text before the first ':' in the message. WARN, ERROR,
  SUCCESS and FAILURE get coloured markup; anything else becomes a plain
  paragraph. A blank message is not printed and only adds a line break.
  """
  message = str(logs)
  if not message.strip():
    addToReport("<br>")
    return

  print(message)

  # (opening, closing) markup per recognised level.
  wrappers = {
    "WARN": ("<p style=\"color: orange\"> ", "</p>"),
    "ERROR": ("<p style=\"color: red\"> ", "</p>"),
    "SUCCESS": ("<h4 style=\"color: green\"> ", "</h4>"),
    "FAILURE": ("<h4 style=\"color: red\"> ", "</h4>"),
  }
  level = message.partition(':')[0]
  opening, closing = wrappers.get(level, ("<p>", "</p>"))
  addToReport(opening + message + closing)

In [6]:
# NOTE(review): mount_storage is not defined anywhere in this notebook; presumably it is
# provided by one of the utility notebooks named in the commented-out %run lines of the
# first cell - confirm those have been run before executing from a fresh kernel.
mount_storage()

In [7]:
#Load config CSV here
# Reset first so that a failed load can never leave a stale `tests` list from an
# earlier run of this cell for the test-execution cell to pick up.
tests = []
try:
  debug("DEBUG: Loading test config file")
  configCSV = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='|', inferschema='true').load(configCSVLocation)
  # NOTE(review): nothing is actually printed after this message; a configCSV.show()
  # may have been intended here.
  debug("DEBUG: Printing top 20 tests")

  #Collect all tests (one Row per configured test)
  tests = configCSV.collect()
  debug("INFO: Detected " + str(len(tests)) + " tests in the config")

except Exception as e:
  debug(str(e))
  debug("ERROR: Something went wrong loading config CSV")
  # Without the config there is nothing to run. Stop here with the real cause instead
  # of letting the next cell fail on an undefined `tests`.
  raise
  

In [8]:
#Function that takes a dataframe and the names of the two columns to be compared
#Returns the rows within the permitted deviation and the rows outside of it
def checkIfSuccess(df, targetColumn, calculatedColumn, difference):
  """Split ``df`` into rows that are within ``difference`` and rows that are not.

  A ``difference`` column holding ``abs(targetColumn - calculatedColumn)`` is
  added to both results.

  Args:
    df: DataFrame containing both columns.
    targetColumn: name of the first column to compare.
    calculatedColumn: name of the second column to compare.
    difference: largest absolute difference that still counts as a success.

  Returns:
    tuple: ``(successDf, failureDf)``. ``failureDf`` is the exact complement of
    ``successDf``, so every input row ends up in one of the two.
  """
  # `abs` is the Column function imported from pyspark.sql.functions at the top of
  # the notebook, not the Python builtin.
  temp = df.withColumn('difference', abs(df[targetColumn] - df[calculatedColumn]))
  withinTolerance = temp['difference'] <= difference
  successDf = temp.filter(withinTolerance)
  # When either value is null the comparison evaluates to null, and filter() drops
  # such rows. They used to be missing from BOTH results, so the caller never saw
  # them as failures. A comparison that cannot be evaluated is treated as a failure.
  failureDf = temp.filter(withinTolerance.isNull() | ~withinTolerance)
  return (successDf, failureDf)

In [9]:
#Function to handle/load input data
def loadData(type, location):
    """Load the data at ``location`` into a DataFrame according to ``type``.

    Args:
        type: one of "csv" (pipe-delimited, with header), "json", "parquet",
            "databricks" (``location`` is a table name) or "sqlServer"
            (``location`` is handed to read_from_SQL).
        location: path, table name or SQL Server source, depending on ``type``.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: if ``type`` is not one of the supported values. Previously
            this case silently returned None and only failed later at the caller.
    """
    if type == "csv":
        return sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='|', inferschema='true').load(location)
    if type == "json":
        return spark.read.json(location)
    if type == "parquet":
        return spark.read.parquet(location)
    if type == "databricks":
        # NOTE(review): `location` is concatenated into the SQL text as-is; it must
        # come from a trusted config.
        return spark.sql("SELECT * FROM " + location)
    if type == "sqlServer":
        return read_from_SQL(location)

    raise ValueError(
        "Unsupported data type '" + str(type) + "' for location '" + str(location) + "'. "
        "Expected one of: csv, json, parquet, databricks, sqlServer")
      

In [10]:
#Execute the tests here
# For every configured test: load (or reuse) the two inputs, join them on the configured
# keys, compare the two configured columns and log the outcome. The report is written
# once after all tests have run.
# NOTE(review): `i` is never read in this cell; kept only in case something outside the
# visible cells relies on it.
i = 0
# Locations of the data currently held in df1/df2, so that consecutive tests on the same
# inputs reuse the loaded data instead of reading it again.
newDataLocation1 =''
newDataLocation2 =''
for test in tests:
  try:
    testName = str(test['testName'])

    testSuccessThreshold = test['testSuccessThreshold']

    dataLocation1 = test['sourceDataLocation']
    data1ColumnName = test['sourceColumnToCompare']

    dataLocation2 = test['targetDataLocation']
    data2ColumnName = test['targetColumnToCompare']

    differenceValue = test['permitedDeviation']

    type1 = test['sourceDataType']
    type2 = test['targetDataType']

    debug("INFO: Runnign test " + testName )
    debug("INFO: Checking " + type1 + " " + dataLocation1 + " with " + type2 + " " +  dataLocation2)

    keys2 = test['targetKeys'].split(',')
    keys1 = test['sourceKeys'].split(',')

    if(len(keys2) != len(keys1)):
      debug("ERROR: Number of keys in both inputs need to be same. This test will be skipped.")
      debug('<hr>')
      debug("")
      continue

    if (dataLocation1 != newDataLocation1 or dataLocation2 != newDataLocation2):
      try:
        debug("DEBUG: Loading test data into Spark")
        # Load into temporaries first. Assigning df1/df2 directly meant that a failure
        # on the second load left df1 replaced while the remembered locations still
        # named the previous inputs, so a later test could "reuse" the wrong data.
        loaded1 = loadData(type1, dataLocation1)
        loaded1.cache()
        loaded2 = loadData(type2, dataLocation2)
        loaded2.cache()

        debug("DEBUG: Data loaded successfully")
      except Exception as e:
        debug("ERROR: Something went wrong while trying to load data. This test will be skipped <br>" +str(e))
        debug('<hr>')
        debug("")
        continue

      # Prefix every column so both sides stay distinguishable after the join.
      for columnName in loaded1.columns:
        loaded1 = loaded1.withColumnRenamed(columnName , "expected_" + columnName)

      for columnName in loaded2.columns:
        loaded2 = loaded2.withColumnRenamed(columnName , "calculated_" + columnName)

      # Commit only once both inputs are fully prepared.
      df1 = loaded1
      df2 = loaded2
      newDataLocation1 = dataLocation1
      newDataLocation2 = dataLocation2

    else:
      debug("DEBUG: Using data from memory")

    keys1_withSuffix = ["expected_" + key for key in keys1]
    keys2_withSuffix = ["calculated_" + key for key in keys2]

    joinDf = df1.join(df2, [col(f) == col(s) for (f, s) in zip(keys1_withSuffix, keys2_withSuffix)],how='inner')
    totalTestSize = joinDf.count()

    # An empty join leaves nothing to compare and would otherwise only surface as a
    # "division by zero" when the success percentage is computed.
    if totalTestSize == 0:
      debug("ERROR: Join of the two inputs produced no rows, so there is nothing to compare. This test will be skipped.")
      debug('<hr>')
      debug("")
      continue

    # TODO: report rows that found no join partner (earlier attempt kept for reference):
#     joinFailure1 = spark.sql("""Select * from df1 one LEFT JOIN joinedDF joined on one.routename = joined.routename where joined.routename = null""")

    dataFormat1 = joinDf.schema["calculated_" + data2ColumnName].dataType
    dataFormat2 = joinDf.schema["expected_" + data1ColumnName].dataType

    debug("INFO: Data type for " + "calculated_" + data2ColumnName + " is " + str(dataFormat1))
    debug("INFO: Data type for " + "expected_" + data1ColumnName + " is " + str(dataFormat2))

    if dataFormat2 != dataFormat1:
      debug("WARN: Data columns to compare don't have same types")

    # Check the type itself rather than its string form, which depends on how the
    # installed PySpark renders data types. StringType comes from the wildcard
    # pyspark.sql.types import at the top of the notebook.
    if isinstance(dataFormat2, StringType) or isinstance(dataFormat1, StringType):
      debug("ERROR: One of the data columns to compare is of type String. This test will be skipped.")
      debug('<hr>')
      debug("")
      continue


    (successfulEntries, failedEntries) = checkIfSuccess(joinDf, "expected_" + data1ColumnName, "calculated_" + data2ColumnName, differenceValue)
    successCount = successfulEntries.count()
    failureCount = failedEntries.count()
    successPercentage = (totalTestSize - failureCount) / totalTestSize * 100

    if testSuccessThreshold <= successPercentage:
      testSuccessful = "Passed"
    else:
      testSuccessful = "Failed"

    debug("INFO: Total entries in data 1: " + str(df1.count()))
    debug("INFO: Total entries in data 2: " + str(df2.count()))
    # Reuse the count taken above; joinDf is not cached, so counting it again would
    # recompute the whole join.
    debug("INFO: Total entries in join: " + str(totalTestSize))

    debug("INFO: Test passed for " + str(successCount) + " rows")

    if testSuccessful == "Passed":
      debug('SUCCESS: Test ' + testSuccessful)
      debug('SUCCESS: Success percentage for test is ' + str(successPercentage))
    else:
      debug('FAILURE: Test ' + testSuccessful)
      debug('FAILURE: Success percentage for test is ' + str(successPercentage))

      debug("Data that failed the tests is: ")
      columnsToShow = keys1_withSuffix + ["expected_" + data1ColumnName, "calculated_" + data2ColumnName, "difference"]
      failedEntriesToPrint = failedEntries.select(columnsToShow)
      failedEntriesToPrint.show()
      addToReport(dfToHTML(failedEntriesToPrint))

    debug('<hr>')
    debug("")

  except Exception as e:
    # One broken test must not stop the remaining ones; log it and move on.
    debug("ERROR: Encountered Error during executing test <br>" + str(e))
    debug('<hr>')
    debug("")


write_to_blob_storage(overAllLogs, reportLocation)

In [11]:
# Echo the accumulated report text (the same string passed to write_to_blob_storage) to the cell output.
print(overAllLogs)