# Statistical Testing

## 1 - Setup

In [4]:
import pandas as pd
from scipy import stats
import numpy as np

# Read the three cleaned CSV exports, in order, into their own DataFrames.
hr, superstore, retail = map(
    pd.read_csv,
    (
        "data/cleaned/hr_clean.csv",
        "data/cleaned/superstore_clean.csv",
        "data/cleaned/retail_clean.csv",
    ),
)

# Print each frame's (rows, columns) as a quick check that the load worked.
print(
    "HR:", hr.shape,
    "Superstore:", superstore.shape,
    "Retail:", retail.shape,
)


HR: (1470, 36) Superstore: (9983, 21) Retail: (392692, 8)


## 2 - Correlation (numeric vs numeric)

In [7]:
# Pearson correlation between MonthlyIncome and YearsAtCompany in the HR data.
# The test only runs when both columns are present; otherwise a warning is
# printed and nothing is computed.

if not all(name in hr.columns for name in ("MonthlyIncome", "YearsAtCompany")):
    print("⚠️ Columns not found in HR dataset. Check column names.")
else:
    x, y = hr['MonthlyIncome'], hr['YearsAtCompany']
    # pearsonr returns the coefficient r and its two-sided p-value.
    corr, pval = stats.pearsonr(x, y)
    print(f"Correlation (MonthlyIncome vs YearsAtCompany): r={corr:.3f}, p={pval:.3f}")


Correlation (MonthlyIncome vs YearsAtCompany): r=0.514, p=0.000


## 3 - t-test (compare two groups)

In [10]:
# Two-sample t-test on MonthlyIncome: rows with AttritionFlag == 1 versus rows
# with AttritionFlag == 0. Skipped with a warning if either column is missing.

if not ("AttritionFlag" in hr.columns and "MonthlyIncome" in hr.columns):
    print("⚠️ Columns not found in HR dataset.")
else:
    attrited_mask = hr['AttritionFlag'] == 1
    stayed_mask = hr['AttritionFlag'] == 0
    left = hr.loc[attrited_mask, 'MonthlyIncome']
    stay = hr.loc[stayed_mask, 'MonthlyIncome']
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    tstat, pval = stats.ttest_ind(left, stay, equal_var=False)
    print(f"T-test (Income attrited vs stayed): t={tstat:.3f}, p={pval:.3f}")



T-test (Income attrited vs stayed): t=-7.483, p=0.000


## 4 - Chi-Square Test (categorical vs categorical)

In [13]:
# Chi-square test of independence between Department and AttritionFlag.
# Skipped with a warning if either column is missing.

if not all(name in hr.columns for name in ("Department", "AttritionFlag")):
    print("⚠️ Department or AttritionFlag column not found.")
else:
    # Rows are departments, columns are AttritionFlag values, cells are counts.
    contingency = pd.crosstab(index=hr['Department'], columns=hr['AttritionFlag'])
    # chi2_contingency also returns the degrees of freedom and the expected
    # frequencies; only the statistic and p-value are printed.
    chi2, p, dof, expected = stats.chi2_contingency(contingency)
    print("Chi-square Test (Department vs Attrition)")
    print("chi2 =", chi2, "p-value =", p)


Chi-square Test (Department vs Attrition)
chi2 = 10.79600732241067 p-value = 0.004525606574479633


## 5 - Superstore: Discount vs Sales

In [16]:
# Pearson correlation between Discount and Sales in the Superstore data.
# Skipped with a warning if either column is missing.

superstore_has_columns = (
    "Discount" in superstore.columns and "Sales" in superstore.columns
)
if superstore_has_columns:
    corr, pval = stats.pearsonr(superstore['Discount'], superstore['Sales'])
    print(f"Correlation (Discount vs Sales): r={corr:.3f}, p={pval:.3f}")
else:
    print("⚠️ Columns not found in Superstore dataset.")


Correlation (Discount vs Sales): r=-0.027, p=0.006


## 6 - Retail: Quantity vs UnitPrice

In [20]:
# Question: are higher-priced items sold in smaller quantities?
# Answered with a Pearson correlation between Quantity and UnitPrice in the
# Retail data; skipped with a warning if either column is missing.

if not all(name in retail.columns for name in ("Quantity", "UnitPrice")):
    print("⚠️ Columns not found in Retail dataset.")
else:
    corr, pval = stats.pearsonr(retail['Quantity'], retail['UnitPrice'])
    print(f"Correlation (Quantity vs UnitPrice): r={corr:.3f}, p={pval:.3f}")


Correlation (Quantity vs UnitPrice): r=-0.005, p=0.004


## 7 - Save Results

In [23]:
import csv
from pathlib import Path

# Results table: a header row followed by one row per statistical test.
results = [
    ["Test", "Variable 1", "Variable 2", "Statistic", "p-value", "Interpretation"]
]

# Recompute the HR correlation here instead of reusing the notebook-level
# `corr` / `pval`. Those names are reassigned by every correlation cell, so by
# the time this cell runs they hold the most recent test (Retail Quantity vs
# UnitPrice), and reusing them would store the wrong numbers under the
# MonthlyIncome / YearsAtCompany label. A missing column raises KeyError here
# rather than silently writing a stale value.
income_tenure_r, income_tenure_p = stats.pearsonr(
    hr["MonthlyIncome"], hr["YearsAtCompany"]
)
results.append([
    "Correlation",
    "MonthlyIncome",
    "YearsAtCompany",
    round(float(income_tenure_r), 3),
    round(float(income_tenure_p), 3),
    # Significance is judged on the unrounded p-value at the 0.05 level.
    "Significant" if income_tenure_p < 0.05 else "Not Significant",
])

# save results
out_path = Path("data/analysis/stat_tests.csv")
# Create the output directory on first run so open() does not fail on a
# fresh checkout.
out_path.parent.mkdir(parents=True, exist_ok=True)
with open(out_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(results)

print("Statistical test results saved to data/analysis/stat_tests.csv")


Statistical test results saved to data/analysis/stat_tests.csv
