In [1]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
import data_loader

df = data_loader.load_bank_data('bank_data.csv')
print("\n--- First 5 Rows of the Corrected Dataset ---")
print(df.head())
    
print("\n--- Corrected Dataset Information ---")
print(df.info())

Dataset loaded successfully!

--- First 5 Rows of the Corrected Dataset ---
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexiste

In [2]:
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact']
alpha = 0.05

print("--- Chi-Squared Test Results ---")

for col in categorical_cols:
    contingency_table = pd.crosstab(df[col], df['y'])
    
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    
    print(f"\nFeature: {col.upper()}")
    print(f"P-value: {p_value:.10f}")
    
    if p_value < alpha:
        print(f"Result: SIGNIFICANT - The relationship between {col} and subscription is likely not due to random chance.")
    else:
        print(f"Result: NOT SIGNIFICANT - The relationship between {col} and subscription could be due to random chance.")

--- Chi-Squared Test Results ---

Feature: JOB
P-value: 0.0000000000
Result: SIGNIFICANT - The relationship between job and subscription is likely not due to random chance.

Feature: MARITAL
P-value: 0.0000000000
Result: SIGNIFICANT - The relationship between marital and subscription is likely not due to random chance.

Feature: EDUCATION
P-value: 0.0000000000
Result: SIGNIFICANT - The relationship between education and subscription is likely not due to random chance.

Feature: DEFAULT
P-value: 0.0000000000
Result: SIGNIFICANT - The relationship between default and subscription is likely not due to random chance.

Feature: HOUSING
P-value: 0.0582944767
Result: NOT SIGNIFICANT - The relationship between housing and subscription could be due to random chance.

Feature: LOAN
P-value: 0.5786752870
Result: NOT SIGNIFICANT - The relationship between loan and subscription could be due to random chance.

Feature: CONTACT
P-value: 0.0000000000
Result: SIGNIFICANT - The relationship between cont

In [3]:
def cramers_v(contingency_table):
    """ Calculate Cramér's V for a given contingency table """
    chi2 = chi2_contingency(contingency_table)[0]
    n = contingency_table.sum().sum()
    phi2 = chi2 / n
    r, k = contingency_table.shape
    phi2_corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    r_corr = r - ((r-1)**2)/(n-1)
    k_corr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2_corr / min((k_corr-1), (r_corr-1)))

categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact']
cramers_results = {}

print("--- Cramér's V Association Strength ---")
for col in categorical_cols:
    contingency = pd.crosstab(df[col], df['y'])
    v = cramers_v(contingency)
    cramers_results[col] = v
    print(f"Feature: {col.upper():<10} | Cramér's V: {v:.4f}")

print("\n--- Features Ranked by Association Strength ---")
ranked_results = sorted(cramers_results.items(), key=lambda item: item[1], reverse=True)
for feature, score in ranked_results:
    print(f"{feature.upper():<10}: {score:.4f}")

--- Cramér's V Association Strength ---
Feature: JOB        | Cramér's V: 0.1519
Feature: MARITAL    | Cramér's V: 0.0539
Feature: EDUCATION  | Cramér's V: 0.0672
Feature: DEFAULT    | Cramér's V: 0.0991
Feature: HOUSING    | Cramér's V: 0.0095
Feature: LOAN       | Cramér's V: 0.0000
Feature: CONTACT    | Cramér's V: 0.1446

--- Features Ranked by Association Strength ---
JOB       : 0.1519
CONTACT   : 0.1446
DEFAULT   : 0.0991
EDUCATION : 0.0672
MARITAL   : 0.0539
HOUSING   : 0.0095
LOAN      : 0.0000


In [4]:
numerical_cols = ['age', 'duration', 'campaign', 'pdays', 'previous']
alpha = 0.05

subscribed_yes = df[df['y'] == 'yes']
subscribed_no = df[df['y'] == 'no']

print("--- Mann-Whitney U Test for Numerical Features ---")
print("Comparing clients who subscribed vs. those who did not\n")

for col in numerical_cols:

    stat, p_value = mannwhitneyu(subscribed_yes[col], subscribed_no[col])
    
    print(f"Feature: {col.upper()}")
    print(f"P-value: {p_value:.10f}")
    
    if p_value < alpha:
        print(f"Result: SIGNIFICANT - The distribution of '{col}' is significantly different for subscribers and non-subscribers.")
    else:
        print(f"Result: NOT SIGNIFICANT - There is no significant difference in the distribution of '{col}'.")
    print("-" * 40)

--- Mann-Whitney U Test for Numerical Features ---
Comparing clients who subscribed vs. those who did not

Feature: AGE
P-value: 0.0160805383
Result: SIGNIFICANT - The distribution of 'age' is significantly different for subscribers and non-subscribers.
----------------------------------------
Feature: DURATION
P-value: 0.0000000000
Result: SIGNIFICANT - The distribution of 'duration' is significantly different for subscribers and non-subscribers.
----------------------------------------
Feature: CAMPAIGN
P-value: 0.0000000000
Result: SIGNIFICANT - The distribution of 'campaign' is significantly different for subscribers and non-subscribers.
----------------------------------------
Feature: PDAYS
P-value: 0.0000000000
Result: SIGNIFICANT - The distribution of 'pdays' is significantly different for subscribers and non-subscribers.
----------------------------------------
Feature: PREVIOUS
P-value: 0.0000000000
Result: SIGNIFICANT - The distribution of 'previous' is significantly differe