## ALMUZAHIM NASEEF MUHAMAD S24B38/006 B30296

In [1]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

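Performs a two-sample hypothesis test with input validation and an automatic non-parametric fallback: Welch's t-test when both samples pass the Shapiro-Wilk normality check, otherwise the Mann-Whitney U test.
    
    Parameters:
    - sample1, sample2: Arrays of sample data (NaN values are removed before testing)
    - alpha: Significance level (default 0.05)
    - normality_threshold: P-value threshold for the Shapiro-Wilk normality test (default 0.05)
    - min_sample_size: Minimum size of each sample required to proceed (default 5)
    
    Returns:
    - dict with test results and diagnostics, or a dict with a single "error" key if validation fails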

In [8]:
import numpy as np
from scipy import stats
import warnings

def safe_hypothesis_test(sample1, sample2, alpha=0.05, normality_threshold=0.05, min_sample_size=5):
    """Compare two independent samples with a two-sided hypothesis test.

    Both samples are converted to float arrays and NaN entries are dropped.
    Each sample is checked for normality with the Shapiro-Wilk test; if
    either one fails, the Mann-Whitney U test is used, otherwise Welch's
    t-test (unequal variances) is used.

    This function does not raise: every failure is reported through the
    returned dict so callers can treat the result uniformly.

    Parameters:
    - sample1, sample2: Array-likes of numeric sample data.
    - alpha: Significance level, strictly between 0 and 1 (default 0.05).
    - normality_threshold: Shapiro-Wilk p-value below which a sample is
      treated as non-normal (default 0.05).
    - min_sample_size: Minimum number of non-NaN observations required in
      each sample (default 5).

    Returns:
    - On success, a dict with keys "test_type", "statistic", "p_value",
      "alpha", "reject_null" (a plain bool), "sample_sizes" and
      "normality_p_values".
    - On failure, a dict with the single key "error" describing the problem.
    """
    try:
        if not 0 < alpha < 1:
            raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha}.")

        # Convert inputs to numpy arrays (non-numeric entries raise
        # ValueError here) and remove NaNs.
        sample1 = np.array(sample1, dtype=float)
        sample2 = np.array(sample2, dtype=float)
        sample1 = sample1[~np.isnan(sample1)]
        sample2 = sample2[~np.isnan(sample2)]

        # Infinite values are not removed by the NaN filter and would turn
        # every downstream statistic into NaN, so reject them explicitly.
        if not (np.isfinite(sample1).all() and np.isfinite(sample2).all()):
            raise ValueError("Samples contain infinite values. Test not meaningful.")

        n1, n2 = len(sample1), len(sample2)
        if n1 < min_sample_size or n2 < min_sample_size:
            raise ValueError(f"Sample sizes too small: {n1} and {n2}. Need at least {min_sample_size}.")

        # Check for zero variance. Comparing min and max is exact, whereas
        # np.var(...) == 0 can miss a constant sample because rounding in
        # the mean leaves a tiny nonzero variance.
        if sample1.min() == sample1.max() or sample2.min() == sample2.max():
            raise ValueError("One or both samples have zero variance. T-test not meaningful.")

        # Test normality (Shapiro-Wilk test)
        norm1 = stats.shapiro(sample1)
        norm2 = stats.shapiro(sample2)
        normality_flag = (norm1.pvalue < normality_threshold) or (norm2.pvalue < normality_threshold)

        if normality_flag:
            # stacklevel=2 attributes the warning to the caller's line.
            warnings.warn("Data may not be normally distributed.", stacklevel=2)
            # Fallback to Mann-Whitney U test if normality is violated
            stat, p_value = stats.mannwhitneyu(sample1, sample2, alternative='two-sided')
            test_type = "Mann-Whitney U (non-parametric)"
        else:
            # Welch's variant does not assume equal population variances.
            stat, p_value = stats.ttest_ind(sample1, sample2, equal_var=False)
            test_type = "Welch's t-test"

        result = {
            "test_type": test_type,
            "statistic": stat,
            "p_value": p_value,
            "alpha": alpha,
            # Plain bool rather than numpy.bool_ so the flag behaves like a
            # builtin (identity checks, JSON serialization).
            "reject_null": bool(p_value < alpha),
            "sample_sizes": (n1, n2),
            "normality_p_values": (norm1.pvalue, norm2.pvalue)
        }

        return result

    except ValueError as ve:
        return {"error": str(ve)}
    except Exception as e:
        # Deliberate catch-all: the function's contract is to report every
        # failure through the returned dict instead of raising.
        return {"error": f"Unexpected error: {str(e)}"}


if __name__ == "__main__":
    # Demonstration: one well-behaved comparison, then one malformed input.

    # Seed the global RNG so the two draws below are reproducible.
    np.random.seed(42)
    sample1 = np.random.normal(100, 10, 20)  # 20 draws, mean 100, sd 10
    sample2 = np.random.normal(105, 15, 18)  # 18 draws, mean 105, sd 15

    # Happy path: both samples are clean numeric arrays.
    result = safe_hypothesis_test(sample1, sample2)
    print("Good data result:")
    for key, value in result.items():
        print(f"{key}: {value}")

    # Failure path: the string entry cannot be converted to float, so the
    # function is expected to hand back an error dict.
    sample_bad = [1, 2, np.nan, "not_a_number"]
    result_bad = safe_hypothesis_test(sample_bad, sample2)
    print("\nBad data result:")
    print(result_bad)

Good data result:
test_type: Welch's t-test
statistic: -0.7886831166680487
p_value: 0.4367642496965398
alpha: 0.05
reject_null: False
sample_sizes: (20, 18)
normality_p_values: (np.float64(0.8478800044350894), np.float64(0.880836820325634))

Bad data result:
{'error': "could not convert string to float: 'not_a_number'"}
