'''
Author: Ngawang Gurung
Date: 2024/07/24
'''

### Data Validation

- Check whether the values for each shared key match between two tables

### Importing Libraries

In [1]:
import pandas as pd

### Loading Datasets

In [2]:
# Read the developer and QA extracts from the local dataset folder.
dev_df, qa_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/developer_data.csv', './dataset/qa_data.csv')
)

In [9]:
print(dev_df)

   account    amount
0  3127143    3786.5
1  4413248  135718.0
2  4942722    8494.0
3  5519726   19297.0
4  9197174   11784.0
5  9907844    1000.0
6  9907848   56565.0


In [10]:
print(qa_df)

   account   amount
0  3127143   3786.5
1  4413248  13518.0
2  4942722   8494.0
3  5519726  19297.0
4  9197174   1194.0
5  9907844   1000.0


### Data Validation Function

In [7]:
def validation(df_1: pd.DataFrame, df_2: pd.DataFrame, primary_col: str, sec_col: str) -> pd.DataFrame:
    """
    Validates two DataFrames by comparing the values of a secondary column after merging them on a primary column.

    Parameters:
    df_1 (pd.DataFrame): The first DataFrame.
    df_2 (pd.DataFrame): The second DataFrame.
    primary_col (str): The column name to merge on.
    sec_col (str): The column name to compare.

    Returns:
    pd.DataFrame: A merged DataFrame with a 'status' column indicating "Pass" if values match in sec_col, 
    otherwise "Fail".
    """
    
    df = pd.merge(df_1, df_2, how='outer', on=primary_col)
    df['status'] = df.apply(lambda x: "Pass" if x[f'{sec_col}_x'] == x[f'{sec_col}_y'] else "Fail", axis=1)
    return df


In [8]:
validation(dev_df, qa_df, 'account', 'amount')

Unnamed: 0,account,amount_x,amount_y,status
0,3127143,3786.5,3786.5,Pass
1,4413248,135718.0,13518.0,Fail
2,4942722,8494.0,8494.0,Pass
3,5519726,19297.0,19297.0,Pass
4,9197174,11784.0,1194.0,Fail
5,9907844,1000.0,1000.0,Pass
6,9907848,56565.0,,Fail
