# Data validation

In [1]:
# Import libraries
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql import functions as F

#os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"

# Load AWS credentials from the local dl.cfg file and expose them through the
# environment variables set below.
config = configparser.ConfigParser()
# Use a context manager so the file handle is closed as soon as parsing is
# done (the bare open() call previously left it open for the kernel lifetime).
with open('dl.cfg') as config_file:
    config.read_file(config_file)

os.environ["AWS_ACCESS_KEY_ID"] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [2]:
# Create spark session
def create_spark_session():
    """Build (or reuse) the SparkSession used by this notebook.

    The session is configured with the hadoop-aws package and the S3A
    filesystem implementation, and the Hadoop file output committer
    algorithm version is set to "2" on the underlying Hadoop configuration.

    Returns:
        The configured SparkSession.
    """
    builder = SparkSession.builder
    builder = builder.config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    builder = builder.config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    session = builder.getOrCreate()

    hadoop_conf = session.sparkContext._jsc.hadoopConfiguration()
    hadoop_conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")
    # Optional regional endpoint override, currently disabled:
    # hadoop_conf.set("fs.s3a.endpoint", "s3.eu-west-3.amazonaws.com")
    return session

In [3]:
# Single session shared by every read and check in the cells below.
spark = create_spark_session()

### 1 - Read data from lake storage

In [4]:
# Root of the lake storage; each dataset is read from its own sub-folder.
path = "s3a://lakestorage/"

# Load the six parquet datasets in a fixed order, one DataFrame per name.
bodyStyle, car, color, country, makeModelYear, plate = (
    spark.read.parquet(f"{path}{folder}/")
    for folder in ("bodyStyle", "car", "color", "country", "makeModelYear", "plate")
)

### 2 - Check the data model for each dataset 

In [5]:
def dataModelCheck(df):
    """Print the schema of ``df``.

    Args:
        df: object exposing ``printSchema()``.

    Returns:
        Whatever ``df.printSchema()`` returns.
    """
    schema_result = df.printSchema()
    return schema_result

In [6]:
# Print the schema of every dataset, in the order the datasets were loaded.
for _dataset in (bodyStyle, car, color, country, makeModelYear, plate):
    dataModelCheck(_dataset)

root
 |-- bodyStyle_id: long (nullable = true)
 |-- frame_id: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Confidence: double (nullable = true)
 |-- createDateTime: timestamp (nullable = true)

root
 |-- car_id: long (nullable = true)
 |-- frame_id: integer (nullable = true)
 |-- color: string (nullable = true)
 |-- bodyStyle: string (nullable = true)
 |-- plateText: string (nullable = true)
 |-- make: string (nullable = true)
 |-- warpedBoxV1: double (nullable = true)
 |-- warpedBoxV2: double (nullable = true)
 |-- warpedBoxV3: double (nullable = true)
 |-- warpedBoxV4: double (nullable = true)
 |-- createDateTime: timestamp (nullable = true)

root
 |-- color_id: long (nullable = true)
 |-- frame_id: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Confidence: double (nullable = true)
 |-- createDateTime: timestamp (nullable = true)

root
 |-- country_id: long (nullable = true)
 |-- frame_id: integer (nullable = true)
 |-- country: string (nulla

### 3 - Check that rows exist

In [7]:
def rowsExistCheck(df, description):
    """Print how many rows ``df`` contains.

    Args:
        df: object exposing ``count()``.
        description: label used to identify the dataset in the printed message.

    Returns:
        None. Exactly one line is printed.
    """
    row_count = df.count()
    if row_count == 0:
        # Name the dataset so an empty one can be told apart when several
        # datasets are checked back to back.
        print(f"No records in the data set {description}")
    else:
        print(f"There are {row_count} records in the data set {description}")
    

In [8]:
# Report the row count of every dataset, in the order the datasets were loaded.
for _dataset, _label in (
    (bodyStyle, "bodyStyle"),
    (car, "car"),
    (color, "color"),
    (country, "country"),
    (makeModelYear, "makeModelYear"),
    (plate, "plate"),
):
    rowsExistCheck(_dataset, _label)

There are 266440 records in the data set bodyStyle
There are 253282 records in the data set car
There are 266440 records in the data set color
There are 266440 records in the data set country
There are 266440 records in the data set makeModelYear
There are 245155 records in the data set plate


### 4 - Check the integrity

In [13]:
# A car row is considered matched when a bodyStyle row has the same name and
# the same createDateTime.
_car_matches_body_style = (car.bodyStyle == bodyStyle.Name) & (car.createDateTime == bodyStyle.createDateTime)

# The left anti join keeps only the car rows that have no such match.
_unmatched_cars = car.join(bodyStyle, _car_matches_body_style, "left_anti")

# The check passes only when no unmatched car row remains.
carIntegrity = _unmatched_cars.count() == 0

In [14]:
# Show the check result: True only when the anti join above returned no rows.
carIntegrity

False