In [23]:
import csv
import os
import snowflake.connector
import sys
import json
import shutil
import logging
from datetime import datetime, timedelta
import pandas as pd


def check_schema(file_name,data):
    """
    Check the header of a CSV file against the schema declared for the process.

    The column names are read from the header row of the file located in
    ``data["SourcePath"]`` and compared with ``data["Schema"]``. The comparison
    is an exact list comparison, so column order and letter case both matter.
    When the two lists are equal the file is handed to ``load_to_stg``;
    otherwise an error is logged and nothing else happens to the file.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data["SourcePath"]``.
    data : dict
        Process configuration (already loaded from the process JSON). The keys
        read here are ``"Process_Name"``, ``"Schema"`` and ``"SourcePath"``.

    Returns
    -------
    None
    """
    # Use the configuration handed in by the caller; re-reading the JSON file
    # through module-level globals would discard the argument.
    expected_columns=data["Schema"]
    logging.info("Column list for process '{}' is {}".format(data["Process_Name"],expected_columns) )

    # nrows=0 parses only the header row, which is all that is needed here.
    header=pd.read_csv(os.path.join(data["SourcePath"],file_name),nrows=0)
    col_list=header.columns.tolist()
    logging.info("COlumn list available in csv file {} is {}".format(file_name,col_list))

    if expected_columns==col_list:
        logging.info("Schema is matching, process for data loading")
        load_to_stg(file_name,data)
    else:
        # A mismatch stops the load for this file, so report it as an error.
        logging.error("Please check the schema in the csv file {} as it is not in sync with the proces json".format(file_name))
        
def json_process(process_name,base_path=None):
    """
    Load the JSON configuration of a process and start file processing.

    The configuration is read from ``<base_path>/Process/<process_name>.json``.
    If the content is not a JSON object or one of the required keys is absent,
    an error is logged and the function returns without processing any file.
    Otherwise the loaded dictionary is passed to ``file_process``.

    Parameters
    ----------
    process_name : str
        Name of the process; also the base name of its JSON file.
    base_path : str, optional
        Directory that contains the ``Process`` folder. When omitted, the
        module-level ``path`` variable is used.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the JSON file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    if base_path is None:
        base_path=path  # module-level base directory set by the entry-point code

    logging.info("Details of process extraction started for process '{}'".format(process_name))

    json_file=os.path.join(base_path,"Process",process_name+".json")
    # Close the file as soon as it is parsed instead of keeping it open for
    # the whole ingestion run.
    with open(json_file,'r') as w:
        data=json.load(w)

    required_keys=("Process_Name","File_Format","TargetDB","StageTbl","TargetTbl","SourcePath")
    if not isinstance(data,dict):
        logging.error("Please check the JSON for proper values: expected a JSON object in {}".format(json_file))
        return
    missing_keys=[key for key in required_keys if key not in data]
    if missing_keys:
        # Stop here: continuing with an incomplete configuration would only
        # fail later with a less helpful error.
        logging.error("Please check the JSON for proper values, missing keys: {}".format(missing_keys))
        return

    logging.info("Json file is successfully read for process: {}".format(process_name))
    file_process(data)
        
def load_to_stg(file_name,data):
    """
    Stage-layer step of the pipeline (Snowflake is the intended staging DB).

    At present this step only writes a log entry for ``file_name`` and then
    forwards ``file_name`` and ``data`` to ``load_from_stg_to_core``.
    """
    start_message="Load the CSV file into SF stage layer started for file: {}".format(file_name)
    logging.info(start_message)
    load_from_stg_to_core(file_name,data)

def load_from_stg_to_core(file_name,data):
    """
    Core-layer step of the pipeline (Snowflake is the intended staging/core DB).

    Logs the start of the step, logs the number of rows reported for
    ``data["TargetTbl"]`` and passes that number, together with ``file_name``
    and ``data``, to ``post_processing``.
    """
    logging.info("Load the from stage to core layer started for file: {}".format(file_name))

    # Hard-coded placeholder; the value is meant to be fetched from the DB.
    loaded_rows=5

    row_message="No of rows loaded into Table: {} is {}".format(data["TargetTbl"],loaded_rows)
    logging.info(row_message)

    post_processing(loaded_rows,file_name,data)
    
def post_processing(Tbl_row_cnt,file_name,data):
    """
    Compare the source file's row count with the table row count and archive.

    The CSV file ``file_name`` in ``data["SourcePath"]`` is read, its first
    rows are logged as a sample, and its total number of data rows is compared
    with ``Tbl_row_cnt``. When the counts are equal the file is moved to
    ``data["ArchivePath"]``; otherwise an error is logged and the file stays
    in the source folder.

    Parameters
    ----------
    Tbl_row_cnt : int
        Number of rows reported for the target table.
    file_name : str
        Name of the CSV file inside ``data["SourcePath"]``.
    data : dict
        Process configuration; ``"SourcePath"`` and ``"ArchivePath"`` are read.

    Returns
    -------
    None
    """
    logging.info("Move the loaded file into Archive folder for file: {}".format(file_name))

    source_path=os.path.join(data["SourcePath"],file_name)

    # Read the whole file: the row count must reflect every row, not just the
    # few rows that are logged as a sample.
    df=pd.read_csv(source_path)
    logging.info("Sample rows for file :{} is \n {}".format(file_name,df.head(5)))

    row_count=len(df)
    logging.info("NO of rows available in file:{} is {}".format(file_name,row_count))

    if Tbl_row_cnt==row_count:
        logging.info("Successfully loaded data into SF for filename: {}".format(file_name))

        target_path=os.path.join(data["ArchivePath"],file_name)
        shutil.move(source_path,target_path)

        logging.info ("File :{} is moved to archive folder: {}".format(file_name,data["ArchivePath"]))
    else:
        logging.error("Source to Target row count is not matching, Please check file: {}".format(file_name))

    
def file_process(data):
    
    """
    Function to process the files to load to DB
    """

    # Remove files from archive which are older than 7 days;
    files=os.listdir(data["ArchivePath"])
    for file in files:
        timestamp = datetime.fromtimestamp(os.path.getmtime(data["ArchivePath"]+"\\"+file))
        if datetime.now()-timestamp>timedelta(days=7):
                logging.info("{} is Older than 7 days, Removing it from Archive".format(file))
                os.remove(data["ArchivePath"]+"\\"+file)
            
    logging.info ("File pattern for process:'{}' is '{}'*.csv \n".format(data["Process_Name"],data["File_Format"]))
    
    files=os.listdir(data["SourcePath"])
    #print(files)
    to_be_processed=[]
    
    for file in files:
        """
        Identify if only file with ProductWiseSales name is present or name
        Ignore all other files
        """
        if data["File_Format"] in file:
            logging.info("List of files to be processed {} \n".format(file))
            to_be_processed.append(file)
    #print (file) 
    #print(to_be_processed)
    if not to_be_processed:
        logging.error("No file is there to process for Process:{}".format(data["Process_Name"]))
    else:
        for file_name in to_be_processed:
            
            check_schema(file_name,data)
            
#Process starts from here
if __name__ == "__main__":
    # `path` stays a module-level name: json_process reads it as a global.
    path=r"C:\Users\neela\Downloads\PythonDemo\DataIngestion"
    logPath=path+"\\log\\"

    process_list=["Product","Employee"]

    # Strip surrounding whitespace so "Product " is still recognised.
    process_name=input("Enter the process name you want to trigger").strip()

    # basicConfig fails if the log directory is missing, so create it first.
    os.makedirs(logPath,exist_ok=True)
    logging.basicConfig(
                filename=logPath+process_name+'_'+datetime.now().strftime("%Y%m%d%H%M%S")+'.log',
                level=logging.INFO,           # minimum log level to record
                format='%(asctime)s - %(levelname)s - %(message)s'
            )
    try:
        if process_name in process_list:
            logging.info("Processes available are {}\n".format(process_list))
            logging.info("Data Ingestion Process started at {}".format(datetime.now().strftime("%Y%m%d%H%M%S")))
            logging.info("'{}' process will be triggered \n".format(process_name))
            # Point at the log files of the process that was actually chosen.
            print("'{}' will be triggered, Please check the log path '{}' for details".format(process_name,logPath+process_name+"*"))
            json_process(process_name)
        else:
            logging.error("Please enter a valid process name \n")
            print("Please enter a valid Process Name from list {}".format(process_list))
    finally:
        # Always release the log file, even when the pipeline raises, so a
        # re-run in the same kernel can configure logging again.
        for handler in logging.root.handlers[:]:
            handler.close()
            logging.root.removeHandler(handler)
    

Enter the process name you want to trigger Product


'Product' will be triggered, Please check the log path 'C:\Users\neela\Downloads\PythonDemo\DataIngestion\log\product*' for details


In [16]:
# Scratch check: list equality compares strings exactly, so two column lists
# that differ only in letter case ('Product_Id' vs 'Product_id') are not equal.
l1=['Product_Id', 'Qty', 'Amt', 'Sales_dt']
l2=['Product_id', 'Qty', 'Amt', 'Sales_dt']
print("Yes" if l1==l2 else "No")

No
