In [2]:
import pandas as pd
import numpy as np
import scipy.stats as ss
from statsmodels.stats.weightstats import ztest


## Z-test for a population mean (variance known)

Object
To investigate the significance of the difference between an assumed population mean
µ0 and a sample mean $\overline{X}$.

$Z = \frac{\bar{X} - \mu}{\frac{\sigma}{\sqrt{n}}}$


In [9]:
population_mean = 100
population_std = 3  
x = ss.norm.rvs(100,9, size=20)
# Performing Z-test
z_statistic, p_value = ztest(x, value=population_mean, alternative='two-sided')
print("Z-statistic:", z_statistic)
print("P-value:", p_value)


Z Statistic: 0.5741692517632145
P-value: 0.5816333668955778
Reject null hypothesis: False


## Test 2 Z-test for two population means (variances known and equal)

To investigate the significance of the difference between the means of two populations

$Z = \frac{{\bar{x}_1 - \bar{x}_2}-(\mu_1-\mu_2)}{{\sigma\sqrt{\frac{{1}}{{n_1}} + \frac{{1}}{{n_2}}}}}$


In [10]:
from scipy import stats

# Example data
sample1 = [25, 30, 35, 40, 45]
sample2 = [20, 28, 32, 38, 42]

# Define significance level
alpha = 0.05

# Perform Z-test for two population means
z_statistic, p_value = stats.ttest_ind(sample1, sample2)

# Determine if the null hypothesis should be rejected
reject_null = p_value < alpha

print("Z Statistic:", z_statistic)
print("P-value:", p_value)
print("Reject null hypothesis:", reject_null)

Z Statistic: 0.5741692517632145
P-value: 0.5816333668955778
Reject null hypothesis: False


In [13]:
def z_test_two_means(sample_mean1, sample_mean2, population_std, n1, n2):
    se = population_std * ((1 / n1) + (1 / n2))**0.5
    z = (sample_mean1 - sample_mean2) / se
    p_value = 2 * ss.norm.cdf(-abs(z))  # two-tailed test
    
    return z, p_value

# Example data
sample_mean1 = 110 
sample_mean2 = 100 
population_std = 15
n1 = 30  
n2 = 20  

# Perform Z-test
z_score, p_value = z_test_two_means(sample_mean1, sample_mean2, population_std, n1, n2)
print("Z-score:", z_score)
print("P-value:", p_value)


Z-score: 2.309401076758503
P-value: 0.02092133533779403


## Test 3 Z-test for two population means (variancesknown and unequal)

Object
To investigate the significance of the difference between the means of two populations.

$Z = \frac{{\bar{X}_1 - \bar{X}_2}}{{\sqrt{\frac{{\sigma_1^2}}{{n_1}} + \frac{{\sigma_2^2}}{{n_2}}}}}$


In [11]:
from scipy import stats

# Example data
sample1 = [25, 30, 35, 40, 45]
sample2 = [20, 28, 32, 38, 42]

# Perform Z-test assuming variances are known and unequal
t_statistic, p_value = stats.ttest_ind(sample1, sample2, equal_var=False)

alpha = 0.05

print("T Statistic:", t_statistic)
print("P-value:", p_value)

if p_value < alpha:
    print("Reject null hypothesis: There is a significant difference between the means of two populations.")
else:
    print("Fail to reject null hypothesis: There is no significant difference between the means of two populations.")

T Statistic: 0.5741692517632145
P-value: 0.5817425944182096
Fail to reject null hypothesis: There is no significant difference between the means of two populations.


In [16]:
group1 = np.array([75, 80, 85, 90, 95]) 
group2 = np.array([65, 70, 75, 80, 85, 90])  
mean1 = np.mean(group1)  
mean2 = np.mean(group2)  
std1 = np.std(group1, ddof=1)  
std2 = np.std(group2, ddof=1)  
n1 = len(group1)  
n2 = len(group2) 

z_score = (mean1 - mean2) / np.sqrt((std1**2 / n1) + (std2**2 / n2))

p_value = 2 * (1 - ss.norm.cdf(abs(z_score)))

print("Z-score:", z_score)
print("P-value:", p_value)


Z-score: 1.441153384245784
P-value: 0.14954135458461515


## Test 4 Z-test for a proportion (binomial distribution)

Object
To investigate the significance of the difference between an assumed proportion p0 and an observed proportion p

$Z = \frac{{p - p_0}}{{\sqrt{\frac{{p_0(1 - p_0)}}{{n}}}}}$


In [14]:
from statsmodels.stats.proportion import proportions_ztest

# Sample data
successes_sample1 = 25
total_sample1 = 100
successes_sample2 = 20
total_sample2 = 100

# Perform Z-test for proportions
count = np.array([successes_sample1, successes_sample2])
nobs = np.array([total_sample1, total_sample2])

# Assuming null hypothesis of equal proportions
z_stat, p_value = proportions_ztest(count, nobs)

# Output results
print("Z Statistic:", z_stat)
print("P-value:", p_value)

# Determine if the null hypothesis should be rejected
alpha = 0.05
reject_null = p_value < alpha
print("Reject null hypothesis:", reject_null)

Z Statistic: 0.8466675133346031
P-value: 0.3971804712199202
Reject null hypothesis: False


## Test 5 Z-test for the equality of two proportions(binomial distribution)

Object
To investigate the assumption that the proportions π1 and π2 of elements from two populations are equal, based on two samples, one from each population.

$Z = \frac{(\hat{p}_1 - \hat{p}_2)}{\sqrt{\hat{p}(1-\hat{p})\left(\frac{1}{n_1} + \frac{1}{n_2}\right)}}$

In [15]:
from statsmodels.stats.proportion import proportions_ztest

# Example data
successes1 = 40  # Number of successes in sample 1
trials1 = 100    # Number of trials in sample 1

successes2 = 30  # Number of successes in sample 2
trials2 = 100    # Number of trials in sample 2

# Perform Z-test for two proportions
z_stat, p_value = proportions_ztest([successes1, successes2], [trials1, trials2])

# Print results
print("Z Statistic:", z_stat)
print("P-value:", p_value)

# Interpret results
alpha = 0.05
if p_value < alpha:
    print("Reject null hypothesis: There is a significant difference between the proportions.")
else:
    print("Fail to reject null hypothesis: There is no significant difference between the proportions.")

Z Statistic: 1.4824986333222028
P-value: 0.1382076669740257
Fail to reject null hypothesis: There is no significant difference between the proportions.


In [2]:
import statsmodels.api as sm
successes_group1 = 45  # Number of successes in group 1
nobs_group1 = 100     # Total number of observations in group 1

successes_group2 = 60  # Number of successes in group 2
nobs_group2 = 100     # Total number of observations in group 2
count = np.array([successes_group1, successes_group2])
nobs = np.array([nobs_group1, nobs_group2])
zstat, pval = sm.stats.proportions_ztest(count, nobs)

# Output the test statistic and p-value
print("Z-test statistic:", zstat)
print("P-value:", pval)


Z-test statistic: -2.1239769762143657
P-value: 0.033672068856345855


## Test 6: Z-test for comparing two counts (Poisson distribution)

Object
To investigate the significance of the difference between two counts

Let n1 and n2 be the two counts taken over times t1 and t2, respectively. Then the two
average frequencies are R1 = n1/t1 and R2 = n2/t2. To test the assumption of equal
average frequencies we use the test statistic
$Z = \frac{\bar{R}_1 - \bar{R}_2}{\sqrt{\frac{R_1}{t_1} + \frac{R_2}{t_2}}}$




In [12]:
from scipy import stats
import numpy as np

# Define the counts for two samples
count_sample1 = 50
count_sample2 = 60

# Calculate the means and standard deviations of the Poisson distributions
mean1 = count_sample1
mean2 = count_sample2
std_dev1 = np.sqrt(count_sample1)
std_dev2 = np.sqrt(count_sample2)

# Calculate the pooled standard error
pooled_std_error = np.sqrt((std_dev1**2 / count_sample1) + (std_dev2**2 / count_sample2))

# Calculate the Z-score
z_score = (mean1 - mean2) / pooled_std_error

# Calculate the p-value
p_value = stats.norm.cdf(z_score)

# Two-tailed test, so double the p-value
p_value *= 2

print("Z-score:", z_score)
print("P-value:", p_value)



Z-score: -7.071067811865475
P-value: 1.5374597944280347e-12


## Test 7 t-test for a population mean (variance unknown)
Object
To investigate the significance of the difference between an assumed population mean µ0 and a sample mean $\overline{x}$

$t = \frac{\bar{x} - \mu}{s/\sqrt{n}}$

In [6]:
# Generate some sample data
np.random.seed(42)  # For reproducibility
sample_data = np.random.normal(loc=5, scale=2, size=30)  # Sample data with mean 5 and std deviation 2

# Define the population mean to test against
pop_mean = 4.5

# Perform t-test
t_statistic, p_value = ss.ttest_1samp(sample_data, pop_mean)

# Print results
print("T-statistic:", t_statistic)
print("P-value:", p_value)

# Determine significance
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis. There is significant evidence to suggest that the population mean is not", pop_mean)
else:
    print("Fail to reject the null hypothesis. There is not enough evidence to suggest that the population mean is not", pop_mean)


T-statistic: 0.3764233161298853
P-value: 0.7093423019666218
Fail to reject the null hypothesis. There is not enough evidence to suggest that the population mean is not 4.5


## Test 8 t-test for two population means (variancesunknown but equal)
Object:- To investigate the significance of the difference between the means of two populations

$s_p^2 = \frac{(n_1 - 1)s_1^2 + (n_2 - 1)s_2^2}{n_1 + n_2 - 2}$


$t = \frac{\bar{X} - \bar{Y}}{s_p \sqrt{\frac{1}{n_1} + \frac{1}{n_2}}}$

In [11]:

from scipy import stats

# Sample data for two populations
population1 = [17, 21, 26, 20, 19, 22, 18, 23, 25, 24]
population2 = [16, 19, 23, 18, 17, 20, 15, 21, 22, 20]

# Perform t-test assuming equal variances
t_statistic, p_value = stats.ttest_ind(population1, population2, equal_var=True)

# Print results
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis. There is a significant difference between the means.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference between the means.")


T-Statistic: 1.9013318388714453
P-Value: 0.07338999818575455
Fail to reject the null hypothesis. There is no significant difference between the means.


## Test 9 t-test for two population means (variances unknown and unequal)

Object:- To investigate the significance of the difference between the means of two populations.

$t = \frac{\bar{X} - \bar{Y}}{\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}}$

In [9]:

# Example usage
x1 = [85, 90, 95, 100, 105]  # Sample data for population 1
x2 = [75, 80, 85, 90, 95]     # Sample data for population 2

t_statistic, p_value = stats.ttest_ind(x1, x2, equal_var=False)
print("t-statistic:", t_stat)
print("p-value:", p_value)


t-statistic: 2.0
p-value: 0.08051623795726257


## Test 10 t-test for two population means (method of paired comparisons)
Object:-

To investigate the significance of the difference between two population means, µ1 and µ2. No assumption is made about the population variances

$t = \frac{\bar{d} - \mu_0}{\frac{s_d}{\sqrt{n}}}$

In [10]:
from scipy import stats

# Sample data for two groups
group1 = [15, 16, 18, 20, 22]
group2 = [14, 16, 17, 19, 21]

# Perform paired t-test
t_statistic, p_value = stats.ttest_rel(group1, group2)

# Print the results
print("T-statistic:", t_statistic)
print("P-value:", p_value)

# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis. There is a significant difference between the means.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference between the means.")


T-statistic: 3.9999999999999996
P-value: 0.016130089900092532
Reject the null hypothesis. There is a significant difference between the means.


## Test 11 t-test of a regression coefficient
Object
To investigate the significance of the regression coefficient of y on x

$t = \frac{\hat{\beta} - \beta_0}{\text{SE}(\hat{\beta})}$

In [13]:
import numpy as np
from scipy import stats

# Example data
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 3, 4, 5, 6])

# Perform linear regression to get the coefficients
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

# Calculate standard error of the slope
n = len(x)
df = n - 2  # degrees of freedom
residuals = y - (slope * x + intercept)
residual_std_err = np.sqrt(np.sum(residuals ** 2) / df)
slope_std_err = residual_std_err / np.sqrt(np.sum((x - np.mean(x)) ** 2))

# Calculate t-statistic
t_statistic = slope / slope_std_err

# Calculate p-value
p_value = stats.t.sf(np.abs(t_statistic), df) * 2  # two-sided test

print("t-statistic:", t_statistic)
print("p-value:", p_value)


t-statistic: inf
p-value: 0.0


  t_statistic = slope / slope_std_err


## Test 12 t-test of a correlation coefficient
Object
To investigate whether the difference between the sample correlation coefficient and zero is statistically significant

$t = \frac{r \sqrt{n - 2}}{\sqrt{1 - r^2}}$

where $r = \frac{{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}}{{\sqrt{\sum_{i=1}^{n} (x_i - \bar{x})^2 \sum_{i=1}^{n} (y_i - \bar{y})^2}}}$

In [14]:
import numpy as np
from scipy.stats import t

def correlation_t_test(r, n):
    df = n - 2  # degrees of freedom
    t_value = r * np.sqrt(df) / np.sqrt(1 - r**2)
    p_value = 2 * (1 - t.cdf(abs(t_value), df))
    return t_value, p_value

# Example usage:
r = 0.6  # correlation coefficient
n = 50   # number of samples
t_value, p_value = correlation_t_test(r, n)
print("t-value:", t_value)
print("p-value:", p_value)


t-value: 5.196152422706631
p-value: 4.120215931102678e-06


## Test 13 Z-test of a correlation coefficient
Object
To investigate the significance of the difference between a correlation coefficient and a specified value ρ0

$Z = \frac{1}{2} \ln\left(\frac{{1+r}}{{1-r}}\right)$

In [17]:
import numpy as np
from scipy.stats import pearsonr

def z_test_correlation(r, n):

    # Calculate the Z-score using Fisher's transformation
    z_score = 0.5 * np.log((1 + r) / (1 - r)) * np.sqrt(n - 3)

    # Calculate the p-value
    p_value = 2 * (1 - ss.norm.cdf(abs(z_score)))

    return z_score, p_value

# Example usage:
# Sample correlation coefficient
r = 0.75
# Sample size
n = 50

# Perform Z-test
z_score, p_value = z_test_correlation(r, n)

print("Z-score:", z_score)
print("P-value:", p_value)


Z-score: 6.670243932669069
P-value: 2.55377941016377e-11


## Test 15 χ2-test for a population variance


In [20]:
import numpy as np
from scipy.stats import chi2

def chi_square_test_population_variance(sample, sigma_squared, alpha):
    n = len(sample)
    sample_variance = np.var(sample, ddof=1)  # Sample variance
    
    # Calculate the test statistic
    chi2_statistic = (n - 1) * sample_variance / sigma_squared
    
    # Calculate the critical value
    critical_value = chi2.ppf(1 - alpha, df=n - 1)
    
    # Determine if the null hypothesis should be rejected
    reject_null = chi2_statistic > critical_value
    
    return chi2_statistic, critical_value, reject_null

# Example usage:
sample = [3.2, 4.5, 2.8, 3.9, 4.1]
sigma_squared = 1.2
alpha = 0.05
chi2_statistic, critical_value, reject_null = chi_square_test_population_variance(sample, sigma_squared, alpha)

print("Chi-square Statistic:", chi2_statistic)
print("Critical Value:", critical_value)
print("Reject null hypothesis:", reject_null)


Chi-square Statistic: 1.583333333333333
Critical Value: 9.487729036781154
Reject null hypothesis: False


## Test 16 F-test for two population variances (variance ratio test)

In [21]:
from scipy import stats

# Example datasets
sample1 = [25, 30, 35, 40, 45]
sample2 = [20, 28, 32, 38, 42]

# Perform F-test
f_statistic, p_value = stats.f_oneway(sample1, sample2)

# Print results
print("F Statistic:", f_statistic)
print("P-value:", p_value)


F Statistic: 0.32967032967032966
P-value: 0.5816333668955771


## Test 17 F-test for two population variances (with correlated observations)