In [None]:
import pandas as pd

# Cross Field Validation

Cross field validation is **the use of multiple fields in a dataset to sanity-check data integrity**.

What should we do when we catch inconsistencies?

-	Dropping them

-	Imputing them with appropriate value(s)

-	Applying rules from domain knowledge

	-	e.g., If you know that in a specific context (like healthcare or finance), certain values should always fall within a particular range, you can create rules to flag or correct values that fall outside this range.

-	...

In [None]:
"""Dataset description.

Each row holds the amount of money stored in an account (acct_amount), the amount
invested (inv_amount), the account opening date (account_opened), and the last
transaction date (last_transaction). The records were consolidated from American
and European branches.

Note: the original description also lists an account currency column (acct_cur),
but no such column appears among the loaded columns.
"""
banking_url = "https://raw.githubusercontent.com/MohamedMostafa259/Pandas-Notes/refs/heads/main/Data/banking_dirty.csv"

# The first CSV column is used as the row index rather than as a data column.
banking = pd.read_csv(banking_url, index_col=0)

# Preview the first rows to confirm the load and see the available columns.
banking.head()

Unnamed: 0,cust_id,birth_date,Age,acct_amount,inv_amount,fund_A,fund_B,fund_C,fund_D,account_opened,last_transaction
0,870A9281,1962-06-09,58,63523.31,51295,30105.0,4138.0,1420.0,15632.0,02-09-18,22-02-19
1,166B05B0,1962-12-16,58,38175.46,15050,4995.0,938.0,6696.0,2421.0,28-02-19,31-10-18
2,BFC13E88,1990-09-12,34,59863.77,24567,10323.0,4590.0,8469.0,1185.0,25-04-18,02-04-18
3,F2158F66,1985-11-03,35,84132.1,23712,3908.0,492.0,6482.0,12830.0,07-11-17,08-11-18
4,7A73F334,1990-05-17,30,120512.0,93230,12158.4,51281.0,13434.0,18383.0,14-05-18,19-07-18


In [None]:
# Cross-field check: the four fund columns of a row should add up to its `inv_amount`.
# `sum(axis=1)` adds across the selected columns, producing a Series with one total per row.
fund_cols = ['fund_A', 'fund_B', 'fund_C', 'fund_D']
fund_total = banking[fund_cols].sum(axis=1)

# The fund columns hold floats, so the totals are compared within a small absolute
# tolerance instead of with `==`: exact equality can flag a row only because of
# floating-point rounding in the sum. The amounts shown carry at most two decimals,
# so half a cent is smaller than any genuine discrepancy.
valid_inv = (fund_total - banking['inv_amount']).abs() < 0.005
inconsistent_inv = banking[~valid_inv]

print("Number of inconsistent investments:", inconsistent_inv.shape[0])
inconsistent_inv

Number of inconsistent investments: 8


Unnamed: 0,cust_id,birth_date,Age,acct_amount,inv_amount,fund_A,fund_B,fund_C,fund_D,account_opened,last_transaction
4,7A73F334,1990-05-17,30,120512.0,93230,12158.4,51281.0,13434.0,18383.0,14-05-18,19-07-18
12,EEBD980F,1990-11-20,34,57838.49,50812,18314.0,1477.0,29049.48,5539.0,08-12-18,04-01-20
22,96525DA6,1992-11-23,28,82511.24,33927,8206.0,15019.0,5559.6,6182.0,23-07-18,07-08-18
43,38B8CD9C,1970-06-25,50,28834.71,27531,314.0,6072.28,14163.0,7908.0,17-09-18,05-02-20
47,68C55974,1962-07-08,58,95038.14,66796,33764.0,5042.0,10659.0,19237.41,03-04-18,25-09-18
65,0A9BA907,1966-09-21,54,90469.53,70171,28615.0,21720.05,11906.0,10763.0,15-06-18,28-08-18
89,C580AE41,1968-06-01,52,96673.37,68466,8489.36,28592.0,2439.0,30419.0,28-09-18,17-09-18
92,A07D5C92,1990-09-20,30,99577.36,60407,6467.0,20861.0,9861.0,26004.16,17-11-17,16-01-20


In [None]:
import datetime as dt

# Reference date for the age check. Ideally this would be dt.date.today(), but the
# data was presumably collected in 2020, so the date is pinned to keep results stable.
today = pd.to_datetime("01-01-2020")

# Age implied by each birth date: reference year minus birth year.
# Only calendar years are compared; month and day are ignored.
birth_years = pd.to_datetime(banking['birth_date']).dt.year
ages_manual = today.year - birth_years

# A row is consistent when its recorded `Age` matches the recomputed one.
valid_age = banking['Age'].eq(ages_manual)
inconsistent_age = banking.loc[~valid_age]

print("Number of inconsistent ages:", len(inconsistent_age))
inconsistent_age

Number of inconsistent ages: 8


Unnamed: 0,cust_id,birth_date,Age,acct_amount,inv_amount,fund_A,fund_B,fund_C,fund_D,account_opened,last_transaction
2,BFC13E88,1990-09-12,34,59863.77,24567,10323.0,4590.0,8469.0,1185.0,25-04-18,02-04-18
8,E52D4C7F,1975-06-05,49,61795.89,49385,12939.0,7757.0,12569.0,16120.0,22-05-17,24-10-19
12,EEBD980F,1990-11-20,34,57838.49,50812,18314.0,1477.0,29049.48,5539.0,08-12-18,04-01-20
23,A1815565,1968-09-27,56,82996.04,30897,16092.0,5491.0,5098.0,4216.0,07-11-17,30-09-19
32,8D08495A,1961-08-14,63,89138.52,60795,53880.0,1325.0,2105.0,3485.0,08-08-18,05-02-19
54,2F4F99C1,1988-12-19,36,82058.48,35758,6129.0,16840.0,10397.0,2392.0,30-12-18,11-08-18
61,45F31C81,1975-01-12,49,120675300.0,94608,15416.0,18845.0,20325.0,40022.0,05-11-18,25-12-19
85,7539C3B7,1974-05-14,50,1077557.0,91190,32692.0,30405.0,14728.0,13365.0,23-08-17,07-06-19
