In [3]:
%pip install fairlearn

Collecting fairlearn
  Downloading fairlearn-0.13.0-py3-none-any.whl.metadata (7.3 kB)
Collecting narwhals>=1.14.0 (from fairlearn)
  Downloading narwhals-2.17.0-py3-none-any.whl.metadata (14 kB)
Collecting scikit-learn>=1.2.1 (from fairlearn)
  Downloading scikit_learn-1.8.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy<1.16.0,>=1.9.3 (from fairlearn)
  Downloading scipy-1.15.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn>=1.2.1->fairlearn)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn>=1.2.1->fairlearn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading fairlearn-0.13.0-py3-none-any.whl (251 kB)
Downloading narwhals-2.17.0-py3-none-any.whl (444 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-win_amd64.whl (8.0 MB)
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   ---------------------------------------- 8.0/8.0 M

  You can safely remove it manually.
  You can safely remove it manually.


In [8]:
import json

# Data manipulation 
import pandas as pd 
import numpy as np 

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns 

# Fairness 
from fairlearn.metrics import demographic_parity_difference 

# MongoDB 
from pymongo import MongoClient

# Data Quality Analysis: NovaCred Credit Applications
**Task Force:** Team DEGO
**Objective:** Evaluate, quantify, and remediate data quality issues in the `raw_credit_applications.json` dataset across 4 dimensions: Completeness, Consistency, Validity, and Accuracy.

## Phase 1: Data Ingestion & Flattening
The original dataset is provided in a nested JSON format (e.g., `applicant_info`, `financials`). To perform an effective Exploratory Data Analysis (EDA) using Pandas, our first step is to "flatten" this structure, transforming the nested keys into standard tabular columns. We will use the `pd.json_normalize()` function.

In [9]:
file_path = '../data/raw/raw_credit_applications.json'

# Uploading the JSON file
with open(file_path, 'r') as file:
    raw_data = json.load(file)

# Flattening: let's extract the nested dictionaries in single columns
df = pd.json_normalize(raw_data)

print("Phase 1 Completed: Data successfully loaded and flattened.")
print(f"Dataset Shape: {df.shape[0]} records (rows) and {df.shape[1]} attributes (columns).")

display(df.head(5))

Phase 1 Completed: Data successfully loaded and flattened.
Dataset Shape: 502 records (rows) and 21 attributes (columns).


Unnamed: 0,_id,spending_behavior,processing_timestamp,applicant_info.full_name,applicant_info.email,applicant_info.ssn,applicant_info.ip_address,applicant_info.gender,applicant_info.date_of_birth,applicant_info.zip_code,...,financials.credit_history_months,financials.debt_to_income,financials.savings_balance,decision.loan_approved,decision.rejection_reason,loan_purpose,decision.interest_rate,decision.approved_amount,financials.annual_salary,notes
0,app_200,"[{'category': 'Shopping', 'amount': 480}, {'ca...",2024-01-15T00:00:00Z,Jerry Smith,jerry.smith17@hotmail.com,596-64-4340,192.168.48.155,Male,2001-03-09,10036,...,23,0.2,31212,False,algorithm_risk_score,,,,,
1,app_037,"[{'category': 'Rent', 'amount': 608}, {'catego...",,Brandon Walker,brandon.walker2@yahoo.com,425-69-4784,10.1.102.112,M,1992-03-31,10032,...,51,0.18,17915,False,algorithm_risk_score,,,,,
2,app_215,"[{'category': 'Rent', 'amount': 109}]",,Scott Moore,scott.moore94@mail.com,370-78-5178,10.240.193.250,Male,1989-10-24,10075,...,41,0.21,37909,True,,vacation,3.7,59000.0,,
3,app_024,"[{'category': 'Fitness', 'amount': 575}]",,Thomas Lee,thomas.lee6@protonmail.com,194-35-1833,192.168.175.67,Male,1983-04-25,10077,...,70,0.35,0,True,,,4.3,34000.0,,
4,app_184,"[{'category': 'Entertainment', 'amount': 463}]",2024-01-15T00:00:00Z,Brian Rodriguez,brian.rodriguez86@aol.com,480-41-2475,172.29.125.105,M,1999-05-21,10080,...,14,0.23,31763,False,algorithm_risk_score,,,,,


## Phase 2: Systematic Data Profiling (Discovery)
In the real world, we cannot assume we know the data's flaws. We must build systematic checks to discover issues across the four dimensions of Data Quality.

1. **Completeness Profiler**: Standard `.isnull()` checks, plus a scanner for "hidden" nulls (empty strings, whitespace, 'N/A').
2. **Consistency Profiler**: Checking data types against expected schemas and analyzing unique values in categorical fields to spot variations (e.g., 'M' vs 'Male').
3. **Validity & Accuracy Profiler**: Using statistical summaries to find impossible values (e.g., negative ages) and checking for logical duplicates (e.g., same SSN for different users).

In [10]:
print("=== 1. COMPLETENESS DISCOVERY ===")
# Standard nulls
standard_nulls = df.isnull().sum()

# Scanner for hidden nulls (strings that are just spaces or empty)
hidden_nulls = df.map(lambda x: str(x).strip() == '').sum()

completeness_df = pd.DataFrame({
    'Standard Nulls': standard_nulls,
    'Hidden Nulls (Empty Strings)': hidden_nulls,
    'Total Missing': standard_nulls + hidden_nulls
})
print("Columns with missing data detected:")
display(completeness_df[completeness_df['Total Missing'] > 0].sort_values(by='Total Missing', ascending=False))


print("\n=== 2. CONSISTENCY DISCOVERY ===")
# Check data types to find mismatches (e.g., numbers stored as strings)
print("Data Types Overview:")
print(df.dtypes[df.dtypes == 'object']) # Focusing on object/string columns

# Check unique values for categorical columns to spot formatting inconsistencies
categorical_cols = ['applicant_info.gender', 'decision.rejection_reason']
for col in categorical_cols:
    if col in df.columns:
        print(f"\nUnique values in '{col}':")
        print(df[col].dropna().unique())


print("\n=== 3. VALIDITY & ACCURACY DISCOVERY ===")
# Statistical summary to spot impossible min/max values (e.g., negatives)
numeric_cols = df.select_dtypes(include=[np.number]).columns
print("Statistical Summary for Numeric Columns (Look at min/max):")
display(df[numeric_cols].describe().T[['min', 'max', 'mean']])

# Logical Duplicates Check (SSN is a primary identifier, it should be unique)
if 'applicant_info.ssn' in df.columns:
    duplicate_ssns = df[df.duplicated(subset=['applicant_info.ssn'], keep=False)]
    print(f"\nFound {duplicate_ssns['applicant_info.ssn'].nunique()} unique SSNs that are shared across {len(duplicate_ssns)} different records!")

=== 1. COMPLETENESS DISCOVERY ===
Columns with missing data detected:


Unnamed: 0,Standard Nulls,Hidden Nulls (Empty Strings),Total Missing
notes,500,0,500
financials.annual_salary,497,0,497
loan_purpose,452,0,452
processing_timestamp,440,0,440
decision.rejection_reason,292,0,292
decision.approved_amount,210,0,210
decision.interest_rate,210,0,210
applicant_info.email,0,7,7
applicant_info.ip_address,5,0,5
applicant_info.ssn,5,0,5



=== 2. CONSISTENCY DISCOVERY ===
Data Types Overview:
spending_behavior           object
financials.annual_income    object
dtype: object

Unique values in 'applicant_info.gender':
<StringArray>
['Male', 'M', 'F', 'Female', '']
Length: 5, dtype: str

Unique values in 'decision.rejection_reason':
<StringArray>
[       'algorithm_risk_score', 'insufficient_credit_history',
              'high_dti_ratio',                  'low_income']
Length: 4, dtype: str

=== 3. VALIDITY & ACCURACY DISCOVERY ===
Statistical Summary for Numeric Columns (Look at min/max):


Unnamed: 0,min,max,mean
financials.credit_history_months,-10.0,133.0,50.40239
financials.debt_to_income,0.05,1.85,0.246195
financials.savings_balance,-5000.0,88078.0,29493.503984
decision.interest_rate,2.5,6.5,4.564726
decision.approved_amount,15000.0,80000.0,47845.890411
financials.annual_salary,45000.0,94000.0,69200.0



Found 3 unique SSNs that are shared across 11 different records!
