In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind


In [2]:
# Load the cleaned transactions file and parse TransactionDate into pandas
# datetimes in the same expression.
df = (
    pd.read_csv("../data/cleaned_transactions.csv")
    .assign(TransactionDate=lambda frame: pd.to_datetime(frame['TransactionDate']))
)
df.head()


Unnamed: 0,TransactionID,CustomerID,AccountID,AccountType,TransactionType,Product,Firm,Region,Manager,TransactionDate,TransactionAmount,AccountBalance,RiskScore,CreditRating,TenureMonths,Year,Month,YearMonth
0,33,CUST6549,ACC12334,credit,withdrawal,Savings Account,Firm C,Central,Manager 1,2023-10-21,87480.05448,74008.4331,0.729101,319,200,2023,10,2023-10
1,177,CUST2942,ACC52650,credit,withdrawal,Home Loan,Firm A,East,Manager 3,2023-06-20,20315.74505,22715.8359,0.472424,692,47,2023,6,2023-06
2,178,CUST6776,ACC45101,current,deposit,Personal Loan,Firm C,South,Manager 3,2023-01-02,10484.57165,42706.0921,0.648784,543,109,2023,1,2023-01
3,173,CUST2539,ACC88252,current,withdrawal,Mutual Fund,Firm A,Central,Manager 2,2023-07-25,45122.27373,114176.5687,0.734832,430,103,2023,7,2023-07
4,67,CUST2626,ACC21878,savings,withdrawal,Home Loan,Firm C,Central,Manager 4,2023-07-25,42360.79878,17863.02644,0.289304,468,234,2023,7,2023-07


In [None]:
# Hypothesis 1: high-volume accounts have higher average balances than
# low-volume accounts.
# Build one row per AccountID with the number of non-null TransactionID values
# and the mean AccountBalance for that account.
per_account = df.groupby('AccountID')
customer_summary = per_account.agg(
    Total_Transactions=('TransactionID', 'count'),
    Avg_Balance=('AccountBalance', 'mean'),
).reset_index()

customer_summary.head()


Unnamed: 0,AccountID,Total_Transactions,Avg_Balance
0,ACC10117,4,70107.007957
1,ACC10996,5,43568.008084
2,ACC11062,2,38137.13261
3,ACC11188,5,69652.151044
4,ACC11285,3,97401.34856


In [4]:
# Median number of transactions per account; serves as the cut-off between
# "high-volume" and "low-volume" accounts.
median_txn = customer_summary.loc[:, 'Total_Transactions'].median()


In [5]:
# Split the per-account mean balances at the median transaction count:
# strictly above the median is "high volume", at or below it is "low volume".
above_median = customer_summary['Total_Transactions'] > median_txn
at_or_below_median = customer_summary['Total_Transactions'] <= median_txn

high_volume_balances = customer_summary.loc[above_median, 'Avg_Balance']
low_volume_balances = customer_summary.loc[at_or_below_median, 'Avg_Balance']


In [6]:
# Welch's t-test (equal_var=False) comparing mean balances of the two volume
# groups. No `alternative` is passed, so the test is two-sided: it checks for
# any difference, not specifically for "higher" as the hypothesis is worded.
volume_test = ttest_ind(high_volume_balances, low_volume_balances, equal_var=False)
t_stat_1 = volume_test.statistic
p_value_1 = volume_test.pvalue

t_stat_1, p_value_1


(np.float64(0.31506769364079873), np.float64(0.7530534149746722))

In [7]:
# Turn the p-value into a decision and a plain-language conclusion at the
# 0.05 significance threshold.
reject_null_1 = p_value_1 < 0.05
decision_1 = (
    "Reject Null Hypothesis"
    if reject_null_1
    else "Fail to Reject Null Hypothesis"
)
conclusion_1 = (
    "High-volume transaction accounts have significantly different average balances."
    if reject_null_1
    else "No significant difference in average balances."
)

decision_1, conclusion_1


('Fail to Reject Null Hypothesis',
 'No significant difference in average balances.')

In [8]:
# Hypothesis 2: balances differ across risk-based customer segments.
# Bin RiskScore into three quantile-based buckets and store the label of each
# row's bucket in a new Risk_Segment column on df.
risk_labels = ['Low Risk', 'Medium Risk', 'High Risk']
df['Risk_Segment'] = pd.qcut(df['RiskScore'], q=3, labels=risk_labels)


In [9]:
# Account balances for the two extreme risk segments; 'Medium Risk' rows are
# not used in the comparison.
# NOTE(review): df appears to hold one row per transaction, so an account with
# several transactions contributes several balances — confirm this is intended
# for an independent-samples test.
in_high_risk = df['Risk_Segment'] == 'High Risk'
in_low_risk = df['Risk_Segment'] == 'Low Risk'

high_risk_balances = df.loc[in_high_risk, 'AccountBalance']
low_risk_balances = df.loc[in_low_risk, 'AccountBalance']


In [10]:
# Welch's t-test (equal_var=False) comparing balances of the high-risk and
# low-risk segments; two-sided because no `alternative` is passed.
risk_test = ttest_ind(high_risk_balances, low_risk_balances, equal_var=False)
t_stat_2 = risk_test.statistic
p_value_2 = risk_test.pvalue

t_stat_2, p_value_2


(np.float64(0.8801582436767966), np.float64(0.37917226509606283))

In [11]:
# Turn the p-value into a decision and a plain-language conclusion at the
# 0.05 significance threshold.
reject_null_2 = p_value_2 < 0.05
decision_2 = (
    "Reject Null Hypothesis"
    if reject_null_2
    else "Fail to Reject Null Hypothesis"
)
conclusion_2 = (
    "Account balances significantly differ between high-risk and low-risk segments."
    if reject_null_2
    else "No significant balance difference across risk segments."
)

decision_2, conclusion_2


('Fail to Reject Null Hypothesis',
 'No significant balance difference across risk segments.')

In [12]:
# Collect both test outcomes into one summary table, one row per hypothesis.
hypothesis_results = pd.DataFrame([
    {
        'Hypothesis': 'High vs Low Transaction Volume',
        'T-Statistic': t_stat_1,
        'P-Value': p_value_1,
        'Decision': decision_1,
        'Conclusion': conclusion_1,
    },
    {
        'Hypothesis': 'High Risk vs Low Risk Segments',
        'T-Statistic': t_stat_2,
        'P-Value': p_value_2,
        'Decision': decision_2,
        'Conclusion': conclusion_2,
    },
])

hypothesis_results


Unnamed: 0,Hypothesis,T-Statistic,P-Value,Decision,Conclusion
0,High vs Low Transaction Volume,0.315068,0.753053,Fail to Reject Null Hypothesis,No significant difference in average balances.
1,High Risk vs Low Risk Segments,0.880158,0.379172,Fail to Reject Null Hypothesis,No significant balance difference across risk ...


In [13]:
# Write the hypothesis-test summary table to an Excel workbook.
# The output folder is created first so this cell also succeeds on a fresh
# checkout where ../excel_outputs does not exist yet.
output_path = Path("../excel_outputs/Financial_Risk_Analysis_Task6_FINAL.xlsx")
output_path.parent.mkdir(parents=True, exist_ok=True)

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    hypothesis_results.to_excel(
        writer,
        sheet_name="Hypothesis_Test_Results",
        index=False,  # the default integer index carries no information
    )

# Derive the message from the path so it cannot drift from the file written.
print(f"✅ {output_path.name} created successfully")


✅ Financial_Risk_Analysis_Task6_FINAL.xlsx created successfully


## Insights
- Statistical testing showed no significant difference in average balances between high- and low-volume transaction accounts (p ≈ 0.75).
- Hypothesis testing reinforced the importance of data-driven decision-making rather than assumptions.
- Risk segmentation analysis likewise found no significant difference in account balances between the high-risk and low-risk segments (p ≈ 0.38), so the expected behavioral difference across these customer groups was not validated.