In [2]:
# mount_storage()
#%run ../../Utility/gtm_utils

In [3]:
# Test configuration: all variables needed for this test go here.
# Path of the pipe-delimited CSV that lists the tests to run (one row per test).
configCSVLocation = '/mnt/turingdata/raw/solutions/turing/integrationDataTest/compareDataTest/formatTest/formatTesting.csv'
# Path for the HTML report. NOTE(review): the code that writes the report is not
# in this part of the notebook -- confirm how this path is consumed.
reportLocation = "/mnt/turingdata/raw/solutions/turing/integrationDataTest/compareDataTest/formatTest/report.html"

# Do not delete this initialization: addToReport() appends to this global string
# and fails with a NameError if it does not exist.
overAllLogs = ""

In [4]:
from pyspark.sql.functions import col
from pyspark.sql.types import *
from datetime import datetime
from decimal import Decimal
from  pyspark.sql.functions import abs
import os

In [5]:
def dfToHTML(df, style=None, random_id=None):
    """Render a DataFrame as an HTML table with CSS scoped to that table.

    Args:
        df: Object exposing ``toPandas()`` (e.g. a Spark DataFrame); the
            returned pandas frame is rendered with ``to_html()``.
        style: Optional CSS, one rule per line, with or without surrounding
            ``<style>`` tags. Every rule is re-scoped so that it only applies
            to the generated table. When omitted, a default rule colouring
            the table text blue is used.
        random_id: Optional id for the ``<table>`` element. When omitted an
            id of the form ``id<N>`` with a random ``N`` below 1000000 is used.

    Returns:
        str: The ``<style>`` block followed by the table markup.
    """
    import numpy as np
    import re

    df_html = df.toPandas().to_html()

    if random_id is None:
        # Draw one integer directly; no need to materialise a 1M-element array.
        random_id = 'id%d' % np.random.randint(1000000)

    if style is None:
        style = """
        <style>
            table#{random_id} {{color: blue}}
        </style>
        """.format(random_id=random_id)
    else:
        scoped_selector = 'table#%s' % random_id
        scoped_rules = []
        css = re.sub(r'</?style>', '', style).strip()
        for line in css.split('\n'):
            line = line.strip()
            if not line:
                # A blank line would otherwise turn into a dangling
                # "table#id " selector that merges with the next rule.
                continue
            if not re.match(r'^table', line):
                line = 'table ' + line
            # Only the leading selector is re-scoped, so property values such
            # as "display: table-cell" are left intact. A callable replacement
            # inserts the id literally (no backslash interpretation).
            line = re.sub(r'^table(#\S+)?', lambda _match: scoped_selector, line)
            scoped_rules.append(line)
        style = '\n'.join(['<style>'] + scoped_rules + ['</style>'])

    df_html = re.sub(r'<table', r'<table id=%s ' % random_id, df_html)

    return style + df_html

In [6]:
#Debug function can redirect logs to various output formats

def addToReport(content):
  """Append the string form of `content`, followed by a newline, to the global report buffer."""
  global overAllLogs
  overAllLogs = overAllLogs + str(content) + '\n'
  
def debug(logs):
  """Print a log message and append an HTML-formatted copy to the report.

  The text before the first ':' selects the formatting: WARN (orange
  paragraph), ERROR (red paragraph), SUCCESS (green heading), FAILURE (red
  heading); anything else becomes a plain paragraph. A message that is empty
  or whitespace-only is not printed and adds a "<br>" to the report instead.
  """
  message = str(logs)

  if not message.strip():
    addToReport("<br>")
    return

  print(message)

  # (opening tag, closing tag) for each recognised prefix.
  wrappers = {
    "WARN": ("<p style=\"color: orange\"> ", "</p>"),
    "ERROR": ("<p style=\"color: red\"> ", "</p>"),
    "SUCCESS": ("<h4 style=\"color: green\"> ", "</h4>"),
    "FAILURE": ("<h4 style=\"color: red\"> ", "</h4>"),
  }
  prefix = message.split(':')[0]
  opening, closing = wrappers.get(prefix, ("<p>", "</p>"))
  addToReport(opening + message + closing)

In [7]:
#Load the test config CSV. On success `tests` holds one Row per configured test;
#on failure the error is logged and `tests` is left empty so later cells neither
#hit a NameError nor silently reuse results from an earlier run.
try:
  debug("DEBUG: Loading test config file")
  configCSV = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='|', inferschema='true').load(configCSVLocation)
  debug("DEBUG: Printing top 20 tests")
  # show() prints to stdout and returns None, so passing its result to debug()
  # would only add the text "None" to the report.
  configCSV.show()

  #Collect all tests
  tests = configCSV.collect()
  debug("INFO: Detected " + str(len(tests)) + " tests in the config")

except Exception as e:
  debug("ERROR: Something went wrong loading config CSV <br>" + str(e))
  # No usable config: run zero tests instead of failing later on an undefined name.
  tests = []
  

In [8]:
#Function to handle/load input data 
def loadData(type, location):
    """Load the data for one test from the given source.

    Args:
        type: Source kind: "csv" (pipe-delimited, with header), "json",
            "parquet", "databricks" (a table name queried through Spark SQL)
            or "sqlServer" (delegated to read_from_SQL, which is defined
            outside this part of the notebook).
        location: Path, table name or other locator understood by the source.

    Returns:
        Whatever the selected reader returns for `location`.

    Raises:
        ValueError: If `type` is not one of the supported source kinds.
            Previously this case returned None without any message and only
            failed later, when the caller used the result.
    """
    if type == "csv":
        return sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='|', inferschema='true').load(location)
    elif type == "json":
        return spark.read.json(location)
    elif type == "parquet":
        return spark.read.parquet(location)
    elif type == "databricks":
        # NOTE(review): location is concatenated into the SQL text; this is only
        # safe while the config file is trusted.
        return spark.sql("SELECT * FROM " + location)
    elif type == "sqlServer":
        return read_from_SQL(location)

    raise ValueError(
        "Unsupported source data type %r; expected one of: "
        "csv, json, parquet, databricks, sqlServer" % (type,)
    )
      

In [9]:
def checkCondition(df, condition, value, dataColumnName):
  """Split `df` into rows that satisfy the test condition and rows that do not.

  Only the "greaterThan" condition is implemented: rows whose
  `dataColumnName` is greater than `value` pass, rows where it is less than
  or equal to `value` fail. For any other condition an error is logged and
  (None, None) is returned.

  Returns:
      Tuple (successfulEntries, failedEntries).
  """
  if condition != "greaterThan":
    debug("ERROR: The testing criteria is invalid")
    return (None, None)

  comparedColumn = df[dataColumnName]
  passingRows = df.where(comparedColumn > value)
  failingRows = df.where(comparedColumn <= value)
  return (passingRows, failingRows)
  
  

In [10]:
# Location and source type of the data currently held in `df`. Consecutive
# tests that target the same source reuse the cached DataFrame.
newDataLocation = ""
newDataSourceType = ""
df = None

for test in tests:
  try:
    testName = str(test['testName'])

    testSuccessThreshold = test['testSuccessThreshold']

    dataLocation = test['sourceDataLocation']
    dataColumnName = test['sourceColumnToCompare']

    sourceType = test['sourceDataType']

    debug("INFO: Running test " + testName)
    debug("INFO: Checking " + sourceType + " " + dataLocation)

    condition = test['condition']
    value = test['value']

    # Reload when EITHER the source type or the location differs. Requiring
    # both to differ (as before) made a test with the same source type but a
    # different location silently run against the previous test's data.
    if df is None or newDataSourceType != sourceType or newDataLocation != dataLocation:
      try:
        debug("DEBUG: Loading test data into Spark")
        loadedDf = loadData(sourceType, dataLocation)
        loadedDf.cache()

        # Release the previously cached data only once its replacement exists.
        if df is not None:
          df.unpersist()
        df = loadedDf

        newDataLocation = dataLocation
        newDataSourceType = sourceType

        debug("DEBUG: Data loaded successfully")

      except Exception as e:
        debug("ERROR: Something went wrong while trying to load data. This test will be skipped <br>" +str(e))
        debug('<hr>')
        continue

    else:
      debug("DEBUG: Using data from memory")

    (successfulEntries, failedEntries) = checkCondition(df, condition, value, dataColumnName)

    # checkCondition has already logged the reason when it returns None.
    if successfulEntries is None:
      debug('<hr>')
      continue

    totalTestSize = df.count()
    if totalTestSize == 0:
      # Avoid a bare "division by zero" message in the report.
      debug("ERROR: Source data contains no rows, so a success percentage cannot be computed. This test will be skipped")
      debug('<hr>')
      continue

    successCount = successfulEntries.count()
    failureCount = failedEntries.count()
    # NOTE(review): the percentage is derived from failureCount, so rows that
    # are in neither set (presumably nulls in the compared column) count as
    # successes here -- confirm this is intended.
    successPercentage = (totalTestSize - failureCount) / totalTestSize * 100

    if testSuccessThreshold <= successPercentage:
      testSuccessful = "Passed"
    else:
      testSuccessful = "Failed"

    debug("INFO: Test passed for " + str(successCount) + " rows")

    if testSuccessful == "Passed":
      debug('SUCCESS: Test ' + testSuccessful)
      debug('SUCCESS: Success percentage for test is ' + str(successPercentage))
    else:
      debug('FAILURE: Test ' + testSuccessful)
      debug('FAILURE: Success percentage for test is ' + str(successPercentage))

      debug("Data that failed the tests is: ")

      failedEntriesToPrint = failedEntries.select(dataColumnName)
      failedEntriesToPrint.show()
      addToReport(dfToHTML(failedEntriesToPrint))

    debug('<hr>')

  except Exception as e:
    # Keep running the remaining tests; the failure is recorded in the report.
    debug("ERROR: Encountered Error during executing test <br>" + str(e))
    debug('<hr>')