In [None]:
#This notebook compares pairs of files that have the same name in two input folders.
#It is meant for data whose text columns can be concatenated into a unique row identifier,
#while the pandas DataFrame.diff method is applied to the numerical columns.
#Each row of the resulting output contains the identifier and the numerical diffs for each column.
#In short, it returns only the rows and columns where numerical differences are observed.
#Very useful for csv report regression testing!
#The second output is extra_records_df, which contains the rows that have no match in the other file; see the "Exist" column.

In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import time

In [None]:
#Name of the input folder, relative to the working directory.
#The comparison loop looks for the files to compare in its "Path1" and "Path2" subfolders.
input_folder = "test_folder"

In [None]:
#Create the output folder. The timestamp suffix gives every run its own folder,
#so the results of a previous run are never overwritten.
#The folder name is prefixed with the input folder name so outputs can be traced back to their inputs.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
out_folder = f'{input_folder}_{timestamp}'
out_folder_path = os.path.join('.', out_folder)
os.makedirs(out_folder_path)

In [None]:
#Create functions to add new column as a unique identifier by concatenating text columns in the dataframe.
#Create a sorting column function if required so that the text columns come first.
def create_unique_key_2_columns(row):
    """Build a row identifier from the first two unnamed columns.

    Args:
        row: Mapping-like row (for example a pandas Series) that provides
            the 'Unnamed: 0' and 'Unnamed: 1' entries.

    Returns:
        str: Both values formatted as text and joined with an underscore.

    Raises:
        KeyError: If either entry is missing from ``row``.
    """
    return f"{row['Unnamed: 0']}_{row['Unnamed: 1']}"

def create_unique_key_3_columns(row):
    """Build a row identifier from the first three unnamed columns.

    Args:
        row: Mapping-like row (for example a pandas Series) that provides
            the 'Unnamed: 0', 'Unnamed: 1' and 'Unnamed: 2' entries.

    Returns:
        str: The three values formatted as text and joined with underscores.

    Raises:
        KeyError: If any of the three entries is missing from ``row``.
    """
    return f"{row['Unnamed: 0']}_{row['Unnamed: 1']}_{row['Unnamed: 2']}"

#Build the unique identifier of a row from all of its non-numerical values, except "Exist".
def create_unique_key(row):
    """Concatenate the non-numerical values of a row into an identifier.

    Args:
        row: Mapping-like row (for example a pandas Series) exposing ``items()``.

    Returns:
        str: The text values joined with underscores, without the last one.
        An empty string is returned when the row holds at most one text value.
    """
    text_parts = []
    for _, cell in row.items():
        # Values whose Python/numpy type is numeric are not part of the identifier.
        if pd.api.types.is_numeric_dtype(type(cell)):
            continue
        text_parts.append(str(cell))
    # The last text value is dropped by position; this is meant to remove the
    # "Exist" indicator, assuming it is the last non-numerical column of the row.
    del text_parts[-1:]
    return '_'.join(text_parts)

In [None]:
#Load the list of file names to compare from a text file, one entry per line.
#Line endings are kept here; they are stripped later when the entries are processed.
report_list = 'input_file.txt'
with open(report_list, 'r') as file:
    rows = list(file)

In [None]:
def _normalise_numeric_text(frame):
    """Return a copy of ``frame`` whose numeric-looking text columns are real numbers.

    In every text column "," is replaced with "." (decimal separator). The column
    is then converted to a numerical dtype only if ALL of its values parse as
    numbers; otherwise it is kept as text so it can still be used in the row key.
    Non-text columns (numbers, the categorical "Exist" indicator) are left untouched.
    """
    result = frame.copy()
    for column, dtype in frame.dtypes.items():
        is_text = pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
        if not is_text:
            continue
        cleaned = result[column].replace(',', '.', regex=True)
        try:
            cleaned = pd.to_numeric(cleaned)
        except (ValueError, TypeError):
            # At least one value is genuine text: keep the column as an identifier column.
            pass
        result[column] = cleaned
    return result


def _write_sheet(workbook_path, frame, sheet_name):
    """Write ``frame`` to ``sheet_name``, appending to the workbook when it already exists."""
    if os.path.exists(workbook_path):
        writer = pd.ExcelWriter(workbook_path, engine='openpyxl', mode='a')
    else:
        writer = pd.ExcelWriter(workbook_path)
    with writer:
        frame.to_excel(writer, sheet_name=sheet_name)


#Record the start time
start_time = time.time()

#Create a counter
files_compared = 0

#Loop through the list of files
for row in rows:
    #Each line of the input list holds one file name
    file = row.strip()

    #Skip blank lines and lines commented out in the input file
    if not file or file.startswith('#'):
        continue

    #Locate the file in both folders (checked once, reused below)
    path1 = os.path.join('.', input_folder, 'Path1', file)
    path2 = os.path.join('.', input_folder, 'Path2', file)
    in_path1 = os.path.exists(path1)
    in_path2 = os.path.exists(path2)

    if not in_path1 and not in_path2:
        print(f'{file} does NOT exist')
        continue
    if not in_path2:
        print(f'File {file} is in Path1 but not in Path2')
        continue
    if not in_path1:
        print(f'File {file} is in Path2 but not in Path1')
        continue

    #Read both versions of the file
    df1 = pd.read_csv(path1, sep = ';')
    df2 = pd.read_csv(path2, sep = ';')

    #Increase counter for compared files
    files_compared += 1

    #Merge dataframes on all common columns and add indicator column to show row origin
    df_diff = pd.merge(df1, df2, how = 'outer', indicator = 'Exist')

    #Keep only rows that are not identical in both files (differences or extra rows)
    df_diff = df_diff[df_diff['Exist'] != 'both'].copy()

    if df_diff.empty:
        print(f'No diffs for {file}')
        continue

    print(f'DIFFS for {file}')

    #Replace "," decimal separators with "." and convert numbers stored as text to numerical values
    df_diff = _normalise_numeric_text(df_diff)

    #Insert new unique identifier column built from the text columns
    df_diff.insert(0, 'Key', df_diff.apply(create_unique_key, axis = 1))

    #Sort by Key, then by origin, so that within a pair the left (Path1) row always
    #precedes the right (Path2) row and every diff is computed as Path2 - Path1
    df_diff = df_diff.sort_values(['Key', 'Exist'])

    #Create a list of all "Key" values without a pair
    key_counts = df_diff['Key'].value_counts()
    single_keys = key_counts[key_counts == 1].index.tolist()
    is_unpaired = df_diff['Key'].isin(single_keys)

    #All sheets for this file go to one workbook; the tab name will contain
    #at most 20 characters from the end of the file name
    workbook_path = os.path.join('.', out_folder, f'{file}_diffs.xlsx')
    tab_name = file[-20:]

    #Generate the list of extra records in both files. Check column "Exist"
    extra_records_df = df_diff[is_unpaired]
    if extra_records_df.empty:
        print(f'No extra records for {file}')
    else:
        print(f'EXTRA records for {file}')
        _write_sheet(workbook_path, extra_records_df, f'{tab_name}++records')

    #Generate the records with "Key" pairs
    df_diff = df_diff[~is_unpaired]
    if df_diff.empty:
        #Every differing row was an extra record, so there is nothing to diff
        print(f'No paired records to diff for {file}')
        continue

    #Keep "Key" plus all numerical columns
    numeric_columns = list(df_diff.select_dtypes(include = np.number).columns)
    df_diff = df_diff[['Key'] + numeric_columns]

    #Drop columns that are entirely NaN
    df_diff = df_diff.dropna(axis = 1, how = 'all')

    #Drop columns that are entirely 0
    df_diff = df_diff.loc[:, (df_diff != 0).any(axis = 0)]

    #Generate the diffs between consecutive rows that share the same "Key".
    #Grouping by "Key" guarantees a diff never mixes rows of different keys.
    value_columns = [col for col in df_diff.columns if col != 'Key']
    if value_columns:
        diff_df2 = df_diff.groupby('Key', sort = False)[value_columns].diff()
    else:
        diff_df2 = pd.DataFrame(index = df_diff.index)

    #Put the "Key" column in front of the computed diffs
    diff_df2.insert(0, 'Key', df_diff['Key'])

    #The first row of every key has no predecessor to diff against: drop it
    is_first_of_key = df_diff.groupby('Key', sort = False).cumcount() == 0
    diff_df3 = diff_df2[~is_first_of_key]

    #Expose the merged-frame row label as an "index" column
    diff_df3 = diff_df3.reset_index(drop = False)

    #Drop columns where no diff was observed
    diff_df3 = diff_df3.loc[:, (diff_df3 != 0).any(axis = 0)]

    #Save diffs to excel
    _write_sheet(workbook_path, diff_df3, f'{tab_name}_diffs')
                        
#Announce completion in bold magenta, then reset the text style so later output is not affected
print('\n' + '\033[1m' + '\033[95m' + ' Regression analysis COMPLETE' + '\033[0m')

#Record the end time and display processing duration
end_time = time.time()
processing_duration = end_time - start_time
minutes, seconds = divmod(processing_duration, 60)
print(f'Processing duration: {int(minutes)} m {seconds:.2f} s')
print(f'Files compared: {files_compared}')