In [0]:
%sql
-- First and last posting date per account.
-- GROUP BY ALL groups by the non-aggregate select columns (account_name,
-- account_number), so every output row is already unique and a DISTINCT on
-- top of it would be redundant. The aggregates are aliased so the result
-- columns have stable, readable names.
SELECT
  account_name,
  account_number,
  MIN(gl_date) AS first_gl_date,
  MAX(gl_date) AS last_gl_date
FROM fin_demo.acct.fact_gl_entries
GROUP BY ALL

In [0]:
%sql
-- Per account: earliest gl_date, latest gl_date, and the number of entries
-- whose gl_date is NULL. MIN/MAX skip NULLs, so the FILTERed count is what
-- surfaces entries with a missing date.
SELECT
    account_name
  , account_number
  , MIN(gl_date)
  , MAX(gl_date)
  , COUNT(*) FILTER (WHERE gl_date IS NULL)
FROM
  fin_demo.acct.fact_gl_entries
GROUP BY
    account_name
  , account_number
;



In [0]:

# Tests for the overall dataset build: print the section banner
# (blank line, 80-character rule, title, 80-character rule).
section_rule = "=" * 80
print("\n" + section_rule)
print("DATA QUALITY CHECKS")
print(section_rule)

# Check 1: every transaction must balance (total debits == total credits).
# The query returns only the offending (transaction_id, transaction_type)
# groups, i.e. those whose absolute debit/credit variance exceeds 0.01.
print("\n1. Checking if debits equal credits for each transaction...")

# The variance is built from COALESCE'd sums. SUM() yields NULL when every
# amount on one side of a group is NULL; a NULL variance would make the HAVING
# predicate evaluate to NULL and silently drop a one-sided transaction from the
# result, producing a false PASS. (Only matters if the amount columns are
# nullable -- the COALESCE is a no-op otherwise.)
# total_debits / total_credits are intentionally left un-coalesced so an
# all-NULL side remains visible in the displayed rows.
df_balance_check = spark.sql("""
    SELECT 
        transaction_id,
        transaction_type,
        SUM(debit_amount) AS total_debits,
        SUM(credit_amount) AS total_credits,
        COALESCE(SUM(debit_amount), 0) - COALESCE(SUM(credit_amount), 0) AS variance
    FROM fin_demo.acct.fact_gl_entries
    GROUP BY transaction_id, transaction_type
    HAVING ABS(COALESCE(SUM(debit_amount), 0) - COALESCE(SUM(credit_amount), 0)) > 0.01
    ORDER BY variance DESC
""")

# An empty result means no group exceeded the tolerance.
unbalanced_count = df_balance_check.count()
if unbalanced_count == 0:
    print("   ✓ PASS: All transactions are balanced!")
else:
    print(f"   ✗ FAIL: {unbalanced_count} transactions are not balanced")
    # Show the offending transactions, largest positive variance first.
    display(df_balance_check)

# Check 2: trial balance per account -- entry count, debit total, credit
# total and net (debits minus credits) for each account_number/account_name
# pair, ordered by account_number.
# NOTE(review): net_balance is NULL whenever either SUM is NULL (e.g. an
# account whose amounts on one side are all NULL) -- confirm the amount
# columns are never NULL.
print("\n2. Account Summary (Trial Balance)...")

account_summary_sql = """
    SELECT 
        account_number,
        account_name,
        COUNT(*) AS entry_count,
        SUM(debit_amount) AS total_debits,
        SUM(credit_amount) AS total_credits,
        SUM(debit_amount) - SUM(credit_amount) AS net_balance
    FROM fin_demo.acct.fact_gl_entries
    GROUP BY account_number, account_name
    ORDER BY account_number
"""
df_account_summary = spark.sql(account_summary_sql)

display(df_account_summary)

# Check 3: ledger-wide balance -- total debits across the whole table must
# equal total credits (within a 0.01 tolerance).
print("\n3. Overall Double-Entry Balance Check...")

# The variance uses COALESCE'd sums so it is never NULL: SUM() over zero rows
# (or over only-NULL amounts) is NULL, and a NULL variance would reach Python
# as None and crash abs(). The two totals are left raw so the "nothing to sum"
# case can still be detected below.
df_overall = spark.sql("""
    SELECT 
        SUM(debit_amount) AS total_debits,
        SUM(credit_amount) AS total_credits,
        COALESCE(SUM(debit_amount), 0) - COALESCE(SUM(credit_amount), 0) AS variance
    FROM fin_demo.acct.fact_gl_entries
""")

# An aggregate without GROUP BY always returns exactly one row.
overall_result = df_overall.collect()[0]
if overall_result['total_debits'] is None and overall_result['total_credits'] is None:
    # Both sums NULL: the table is empty or holds no amounts at all. Report it
    # explicitly instead of passing vacuously with a variance of 0.
    print("   ✗ FAIL: No debit or credit amounts found (table is empty or all amounts are NULL)")
elif abs(overall_result['variance']) < 0.01:
    print("   ✓ PASS: Overall debits equal credits!")
else:
    print(f"   ✗ FAIL: Overall variance = {overall_result['variance']}")

display(df_overall)

# Check 4: roll-up per transaction_type -- number of distinct transactions,
# number of GL entry rows, and debit/credit totals, ordered by type.
print("\n4. Summary by Transaction Type...")

type_summary_sql = """
    SELECT 
        transaction_type,
        COUNT(DISTINCT transaction_id) AS transaction_count,
        COUNT(*) AS gl_entry_count,
        SUM(debit_amount) AS total_debits,
        SUM(credit_amount) AS total_credits
    FROM fin_demo.acct.fact_gl_entries
    GROUP BY transaction_type
    ORDER BY transaction_type
"""
df_type_summary = spark.sql(type_summary_sql)

display(df_type_summary)

# COMMAND ----------
# ============================================================================
# SAMPLE QUERIES FOR ANALYSIS
# ============================================================================

# Section banner: blank line, 80-character rule, title, 80-character rule.
section_rule = "=" * 80
print("\n" + section_rule)
print("SAMPLE ANALYTICAL QUERIES")
print(section_rule)

# Monthly cash flow for account_number 1000 (treated here as the cash
# account): debits are reported as cash_in, credits as cash_out, and their
# difference as net_cash_flow, bucketed by DATE_TRUNC('month', gl_date).
# NOTE(review): rows with a NULL gl_date would fall into a NULL month bucket.
print("\n1. Cash Flow Summary...")
cash_flow_sql = """
    SELECT 
        DATE_TRUNC('month', gl_date) AS month,
        SUM(debit_amount) AS cash_in,
        SUM(credit_amount) AS cash_out,
        SUM(debit_amount) - SUM(credit_amount) AS net_cash_flow
    FROM fin_demo.acct.fact_gl_entries
    WHERE account_number = 1000
    GROUP BY DATE_TRUNC('month', gl_date)
    ORDER BY month
"""
df_cash_flow = spark.sql(cash_flow_sql)
display(df_cash_flow)

# Accounts receivable summary for account_number 1100: debits count as AR
# increases, credits as AR decreases, and debits minus credits as the
# outstanding balance. This is a single-row total, not an aging breakdown.
# NOTE(review): if no rows match (or one side is entirely NULL) the affected
# columns come back NULL rather than 0.
print("\n2. Accounts Receivable Summary...")
ar_summary_sql = """
    SELECT 
        SUM(debit_amount) AS ar_increases,
        SUM(credit_amount) AS ar_decreases,
        SUM(debit_amount) - SUM(credit_amount) AS outstanding_ar
    FROM fin_demo.acct.fact_gl_entries
    WHERE account_number = 1100
"""
df_ar_summary = spark.sql(ar_summary_sql)
display(df_ar_summary)

# Accounts payable summary for account_number 2000: credits count as AP
# increases, debits as AP decreases, and credits minus debits as the
# outstanding balance. This is a single-row total, not an aging breakdown.
# NOTE(review): if no rows match (or one side is entirely NULL) the affected
# columns come back NULL rather than 0.
print("\n3. Accounts Payable Summary...")
ap_summary_sql = """
    SELECT 
        SUM(credit_amount) AS ap_increases,
        SUM(debit_amount) AS ap_decreases,
        SUM(credit_amount) - SUM(debit_amount) AS outstanding_ap
    FROM fin_demo.acct.fact_gl_entries
    WHERE account_number = 2000
"""
df_ap_summary = spark.sql(ap_summary_sql)
display(df_ap_summary)

# Revenue per category for account_number 4000: sums credit_amount only and
# lists categories from highest to lowest total.
# NOTE(review): debits posted to this account (e.g. reversals) are not netted
# out of total_revenue -- confirm that is intended.
print("\n4. Revenue by Category...")
revenue_category_sql = """
    SELECT 
        category,
        SUM(credit_amount) AS total_revenue
    FROM fin_demo.acct.fact_gl_entries
    WHERE account_number = 4000
    GROUP BY category
    ORDER BY total_revenue DESC
"""
df_revenue_category = spark.sql(revenue_category_sql)
display(df_revenue_category)

# Closing status line for the cell.
print("\n✓ GL Entries generation complete!")