Statistics Part **2**

1. Write a Python program to perform a Z-test for comparing a sample mean to a known population mean and
interpret the results.

In [None]:
import numpy as np
from scipy import stats

# Inputs for a one-sample Z-test with a known population standard deviation.
population_mean = 24.0  # mean claimed under the null hypothesis
population_std = 3.5  # known population standard deviation
sample_mean = 25.5  # observed sample mean
sample_size = 36  # number of observations behind the sample mean

# Standard error of the mean, then the standardised distance from the null.
std_error = population_std / np.sqrt(sample_size)
z_score = (sample_mean - population_mean) / std_error

# Two-tailed p-value: twice the upper-tail area beyond |z|.
upper_tail = 1 - stats.norm.cdf(abs(z_score))
p_value = 2 * upper_tail

print(f"Z-score: {z_score:.4f}")
print(f"P-value: {p_value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
conclusion = (
    "Reject the null hypothesis. The sample mean is significantly different from the population mean."
    if p_value < alpha
    else "Fail to reject the null hypothesis. The sample mean is not significantly different from the population mean."
)
print(conclusion)


2. Simulate random data to perform hypothesis testing and calculate the corresponding P-value using Python.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Fix the generator state so the simulated draws are repeatable.
np.random.seed(0)

# Both groups are drawn from the same N(20, 3) population, so the null
# hypothesis of equal means is true by construction.
group1 = np.random.normal(loc=20, scale=3, size=100)  # Treatment group
group2 = np.random.normal(loc=20, scale=3, size=100)  # Control group

# Independent two-sample t-test on the simulated groups.
t_stat, p_value = stats.ttest_ind(group1, group2)

print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
if not p_value < alpha:
    print("Fail to reject the null hypothesis. The means of the two groups are not significantly different.")
else:
    print("Reject the null hypothesis. The means of the two groups are significantly different.")

# Overlaid histograms of the two samples.
for values, caption in ((group1, 'Group 1'), (group2, 'Group 2')):
    plt.hist(values, alpha=0.5, label=caption)
plt.legend()
plt.show()



3. Implement a one-sample Z-test using Python to compare the sample mean with the population mean.

In [None]:
import numpy as np
from scipy import stats

# Observed data plus the population parameters assumed known under H0.
sample = np.array([23, 21, 19, 24, 20, 22, 18, 25, 19, 21])  # Sample data
population_mean = 20.5  # Known population mean
population_std = 2.5  # Known population standard deviation

# Summary statistics of the sample.
sample_size = len(sample)
sample_mean = np.mean(sample)

# Standardise the sample mean using the known population spread.
std_error = population_std / np.sqrt(sample_size)
z_score = (sample_mean - population_mean) / std_error

# Two-tailed p-value from the standard normal distribution.
tail_area = 1 - stats.norm.cdf(abs(z_score))
p_value = 2 * tail_area

for caption, value in (("Sample Mean", sample_mean), ("Z-score", z_score), ("P-value", p_value)):
    print(f"{caption}: {value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
if not p_value < alpha:
    print("Fail to reject the null hypothesis. The sample mean is not significantly different from the population mean.")
else:
    print("Reject the null hypothesis. The sample mean is significantly different from the population mean.")


4. Perform a two-tailed Z-test using Python and visualize the decision region on a plot.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Test inputs: summary statistics, known sigma, and the significance level.
alpha = 0.05  # Significance level
population_mean = 24.0  # Known population mean
population_std = 3.5  # Known population standard deviation
sample_mean = 25.5  # Sample mean
sample_size = 36  # Sample size

# Observed test statistic.
std_error = population_std / np.sqrt(sample_size)
z_score = (sample_mean - population_mean) / std_error

# Positive critical value; the two-tailed rejection region is |z| > critical_z.
critical_z = stats.norm.ppf(1 - alpha / 2)

# Two-tailed p-value.
p_value = 2 * (1 - stats.norm.cdf(abs(z_score)))

print(f"Z-score: {z_score:.4f}")
print(f"P-value: {p_value:.4f}")

message = (
    "Reject the null hypothesis. The sample mean is significantly different from the population mean."
    if p_value < alpha
    else "Fail to reject the null hypothesis. The sample mean is not significantly different from the population mean."
)
print(message)

# Standard normal curve with both rejection tails shaded and the observed
# statistic marked.
x = np.linspace(-3, 3, 100)
y = stats.norm.pdf(x)
in_rejection_region = np.abs(x) > critical_z

plt.plot(x, y)
plt.fill_between(x, y, where=in_rejection_region, alpha=0.5, label='Rejection Region')
plt.axvline(x=-critical_z, color='red', linestyle='--', label='Critical Value')
plt.axvline(x=critical_z, color='red', linestyle='--')
plt.axvline(x=z_score, color='green', label='Observed Z-score')
plt.legend()
plt.show()




5. Create a Python function that calculates and visualizes Type 1 and Type 2 errors during hypothesis testing.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def calculate_errors(alpha, beta, mu0, mu1, sigma, n):
    """Return the Type I and Type II error rates of a one-sided Z-test on a mean.

    The rejection region points towards the alternative: the test is
    right-tailed when ``mu1 >= mu0`` and left-tailed when ``mu1 < mu0``.

    Args:
        alpha: Significance level (probability of a Type I error).
        beta: Not used in the computation; kept so existing callers keep
            working. The Type II error is derived from the other arguments.
        mu0: Mean under the null hypothesis.
        mu1: Mean under the alternative hypothesis.
        sigma: Known population standard deviation.
        n: Sample size.

    Returns:
        Tuple ``(type1_error, type2_error)``.
    """
    std_error = sigma / np.sqrt(n)

    if mu1 >= mu0:
        # Right-tailed test: reject when the sample mean exceeds the cut-off,
        # so a Type II error is the alternative's mass below it.
        critical_value = stats.norm.ppf(1 - alpha, loc=mu0, scale=std_error)
        type2_error = stats.norm.cdf(critical_value, loc=mu1, scale=std_error)
    else:
        # Left-tailed test: reject when the sample mean falls below the
        # cut-off, so a Type II error is the alternative's mass above it.
        critical_value = stats.norm.ppf(alpha, loc=mu0, scale=std_error)
        type2_error = stats.norm.sf(critical_value, loc=mu1, scale=std_error)

    # The Type I error rate is the significance level by construction.
    type1_error = alpha

    return type1_error, type2_error

def visualize_errors(alpha, beta, mu0, mu1, sigma, n):
    """Plot the null and alternative sampling distributions with error regions.

    The test is one-sided towards the alternative: right-tailed when
    ``mu1 >= mu0`` and left-tailed when ``mu1 < mu0``. The Type I region is
    shaded under the null curve and the Type II region under the alternative.

    Args:
        alpha: Significance level.
        beta: Not used for the plot; kept for signature compatibility.
        mu0: Mean under the null hypothesis.
        mu1: Mean under the alternative hypothesis.
        sigma: Known population standard deviation.
        n: Sample size.
    """
    std_error = sigma / np.sqrt(n)
    right_tailed = mu1 >= mu0

    # Cut-off on the sample-mean scale, on the side facing the alternative.
    tail_probability = 1 - alpha if right_tailed else alpha
    critical_value = stats.norm.ppf(tail_probability, loc=mu0, scale=std_error)

    # Cover three standard errors beyond both means, whichever one is smaller.
    x = np.linspace(min(mu0, mu1) - 3 * std_error, max(mu0, mu1) + 3 * std_error, 100)

    # Plot null hypothesis distribution
    y0 = stats.norm.pdf(x, loc=mu0, scale=std_error)
    plt.plot(x, y0, label='Null Hypothesis')

    # Plot alternative hypothesis distribution
    y1 = stats.norm.pdf(x, loc=mu1, scale=std_error)
    plt.plot(x, y1, label='Alternative Hypothesis')

    if right_tailed:
        type1_region = x > critical_value
        type2_region = x < critical_value
    else:
        type1_region = x < critical_value
        type2_region = x > critical_value

    # Shade the rejection region under H0 and the acceptance region under H1.
    plt.fill_between(x, y0, where=type1_region, alpha=0.5, label='Type 1 Error')
    plt.fill_between(x, y1, where=type2_region, alpha=0.5, label='Type 2 Error')

    plt.axvline(x=critical_value, color='red', linestyle='--', label='Critical Value')
    plt.legend()
    plt.show()

# Example usage: error rates for H0: mu = 0 against H1: mu = 1.
alpha = 0.05  # Significance level (Type I error rate)
beta = 0.2  # Nominal Type II error rate (power would be 1 - beta); passed through to both functions
mu0 = 0  # Mean under null hypothesis
mu1 = 1  # Mean under alternative hypothesis
sigma = 1  # Standard deviation
n = 100  # Sample size

# Compute both error rates and report them to four decimals.
type1_error, type2_error = calculate_errors(alpha, beta, mu0, mu1, sigma, n)
print(f"Type 1 Error: {type1_error:.4f}")
print(f"Type 2 Error: {type2_error:.4f}")

# Draw the two sampling distributions with the error regions shaded.
visualize_errors(alpha, beta, mu0, mu1, sigma, n)



6. Write a Python program to perform an independent T-test and interpret the results.

In [None]:
import numpy as np
from scipy import stats

# Two independent samples of ten observations each.
group1 = np.array([23, 21, 19, 24, 20, 22, 18, 25, 19, 21])
group2 = np.array([20, 18, 16, 22, 17, 19, 15, 21, 16, 18])

# Independent two-sample t-test with scipy's default settings.
result = stats.ttest_ind(group1, group2)
t_stat, p_value = result

for caption, value in (("T-statistic", t_stat), ("P-value", p_value)):
    print(f"{caption}: {value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
outcome = (
    "Reject the null hypothesis. The means of the two groups are significantly different."
    if p_value < alpha
    else "Fail to reject the null hypothesis. The means of the two groups are not significantly different."
)
print(outcome)


7. Perform a paired sample T-test using Python and visualize the comparison results.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Paired measurements on the same ten subjects.
before = np.array([23, 21, 19, 24, 20, 22, 18, 25, 19, 21])
after = np.array([20, 18, 16, 22, 17, 19, 15, 21, 16, 18])

# Paired-sample t-test on the before/after measurements.
t_stat, p_value = stats.ttest_rel(before, after)

print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
if not p_value < alpha:
    print("Fail to reject the null hypothesis. The means are not significantly different.")
else:
    print("Reject the null hypothesis. The means are significantly different.")

# Figure 1: per-subject values, with a dashed connector between each pair.
plt.figure(figsize=(10, 6))
plt.plot(before, label='Before', marker='o')
plt.plot(after, label='After', marker='o')

for i, (value_before, value_after) in enumerate(zip(before, after)):
    plt.plot([i, i], [value_before, value_after], color='gray', linestyle='--')

plt.xlabel('Subject')
plt.ylabel('Value')
plt.title('Before and After Treatment Comparison')
plt.legend()
plt.show()

# Figure 2: bar chart of the two condition means.
mean_before = np.mean(before)
mean_after = np.mean(after)
plt.bar(['Before', 'After'], [mean_before, mean_after])
plt.xlabel('Group')
plt.ylabel('Mean Value')
plt.title('Mean Comparison')
plt.show()



8. Simulate data and perform both Z-test and T-test, then compare the results using Python.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Set seed for reproducibility
np.random.seed(0)

# Simulate one sample from a population whose mean and standard deviation
# are known, so both tests evaluate a null hypothesis that is actually true.
population_mean = 20
population_std = 3
sample_size = 30
sample = np.random.normal(loc=population_mean, scale=population_std, size=sample_size)

# Sample mean and sample standard deviation. ddof=1 applies Bessel's
# correction, which is the estimate the one-sample t-test is built on.
sample_mean = np.mean(sample)
sample_std = np.std(sample, ddof=1)

# Z-test: standardise with the known population standard deviation.
z_score = (sample_mean - population_mean) / (population_std / np.sqrt(sample_size))
p_value_z = 2 * (1 - stats.norm.cdf(abs(z_score)))

# T-test: scipy estimates the spread from the sample itself.
t_stat, p_value_t = stats.ttest_1samp(sample, population_mean)

# Print the results
print(f"Sample Mean: {sample_mean:.4f}")
print(f"Z-test: Z-score = {z_score:.4f}, P-value = {p_value_z:.4f}")
print(f"T-test: T-statistic = {t_stat:.4f}, P-value = {p_value_t:.4f}")

# Compare the two decisions at the same significance level.
alpha = 0.05
if p_value_z < alpha:
    print("Z-test: Reject the null hypothesis.")
else:
    print("Z-test: Fail to reject the null hypothesis.")

if p_value_t < alpha:
    print("T-test: Reject the null hypothesis.")
else:
    print("T-test: Fail to reject the null hypothesis.")

# Histogram of the sample with the population and sample means marked.
plt.hist(sample, alpha=0.5, label='Sample Data')
for position, colour, caption in (
    (population_mean, 'red', 'Population Mean'),
    (sample_mean, 'green', 'Sample Mean'),
):
    plt.axvline(x=position, color=colour, label=caption)
plt.legend()
plt.show()




9. Write a Python function to calculate the confidence interval for a sample mean and explain its significance.

In [None]:
import numpy as np
from scipy import stats

def calculate_confidence_interval(sample, confidence_level=0.95):
    """Return a two-sided t-based confidence interval for the mean of *sample*.

    Args:
        sample: Sequence or array of observations.
        confidence_level: Coverage of the interval, e.g. 0.95 for 95%.

    Returns:
        Tuple ``(lower_bound, upper_bound)``.
    """
    n = len(sample)
    centre = np.mean(sample)
    # ddof=1 gives the sample (Bessel-corrected) standard deviation.
    spread = np.std(sample, ddof=1)

    # Two-sided critical value of Student's t with n - 1 degrees of freedom.
    t_critical = stats.t.ppf((1 + confidence_level) / 2, df=n - 1)

    # Half-width of the interval: critical value times the standard error.
    half_width = t_critical * (spread / np.sqrt(n))

    return centre - half_width, centre + half_width

# Example usage: 95% interval for the mean of ten observations.
confidence_level = 0.95
sample = np.array([23, 21, 19, 24, 20, 22, 18, 25, 19, 21])
interval = calculate_confidence_interval(sample, confidence_level)
lower_bound, upper_bound = interval
print(f"{confidence_level*100}% Confidence Interval: ({lower_bound:.4f}, {upper_bound:.4f})")


10. Write a Python program to calculate the margin of error for a given confidence level using sample data.

In [None]:
import numpy as np
from scipy import stats

def calculate_margin_of_error(sample, confidence_level=0.95):
    """Return the t-based margin of error for the mean of *sample*.

    Args:
        sample: Sequence or array of observations.
        confidence_level: Confidence level of the interval, e.g. 0.95.

    Returns:
        The half-width of the two-sided confidence interval.
    """
    n = len(sample)
    degrees_of_freedom = n - 1

    # Standard error from the Bessel-corrected sample standard deviation.
    std_error = np.std(sample, ddof=1) / np.sqrt(n)

    # Upper quantile that leaves (1 - confidence_level) / 2 in each tail.
    upper_quantile = (1 + confidence_level) / 2
    critical_value = stats.t.ppf(upper_quantile, df=degrees_of_freedom)

    return critical_value * std_error

# Example usage: margin of error at 95% confidence for ten observations.
confidence_level = 0.95
sample = np.array([23, 21, 19, 24, 20, 22, 18, 25, 19, 21])
margin_of_error = calculate_margin_of_error(sample, confidence_level=confidence_level)
print(f"Margin of Error at {confidence_level*100}% confidence level: {margin_of_error:.4f}")


11. Implement a Bayesian inference method using Bayes' Theorem in Python and explain the process.

In [None]:
import numpy as np

def bayes_theorem(prior_A, prob_B_given_A, prob_B):
    """Return the posterior P(A | B) from Bayes' Theorem.

    Computes ``P(B | A) * P(A) / P(B)``. The inputs are used as given; no
    range or consistency checks are performed.

    Args:
        prior_A: Prior probability P(A).
        prob_B_given_A: Likelihood P(B | A).
        prob_B: Marginal probability of the evidence, P(B).

    Returns:
        The posterior probability P(A | B).
    """
    joint_probability = prior_A * prob_B_given_A  # P(A and B)
    return joint_probability / prob_B

# Example usage: probability of disease given a positive test result.
prior_A = 0.1  # P(A): prior probability of the hypothesis (e.g., disease prevalence)
prob_B_given_A = 0.9  # P(B | A): probability of the evidence when A holds (e.g., test sensitivity)
prob_B = 0.15  # P(B): overall probability of the evidence (e.g., share of positive tests)

posterior_A_given_B = bayes_theorem(prior_A=prior_A, prob_B_given_A=prob_B_given_A, prob_B=prob_B)
print(f"Posterior probability of A given B: {posterior_A_given_B:.4f}")


12. Perform a Chi-square test for independence between two categorical variables in Python.

In [None]:
import numpy as np
from scipy import stats

# 2x2 contingency table: rows are the levels of one categorical variable,
# columns the levels of the other.
observed = np.array([[20, 15], [10, 25]])

# Chi-square test of independence with scipy's default settings.
result = stats.chi2_contingency(observed)
chi2_stat, p_value, dof, expected = result

print(f"Chi-square statistic: {chi2_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Degrees of freedom: {dof}")

# Decision at the 5% significance level.
alpha = 0.05
if not p_value < alpha:
    print("Fail to reject the null hypothesis. The variables are likely independent.")
else:
    print("Reject the null hypothesis. The variables are likely dependent.")


13. Write a Python program to calculate the expected frequencies for a Chi-square test based on observed
data.

In [None]:
import numpy as np

def calculate_expected_frequencies(observed):
    """Return the expected cell counts of a contingency table under independence.

    Each expected count is (row total * column total) / grand total.

    Args:
        observed: 2-D table of observed counts.

    Returns:
        Array of expected counts with the same shape as the table.
    """
    grand_total = np.sum(observed)
    # Outer product of the row and column margins, scaled by the overall count.
    margins_product = np.outer(np.sum(observed, axis=1), np.sum(observed, axis=0))
    return margins_product / grand_total

# Example usage: expected counts for a 2x2 contingency table.
observed = np.array([[20, 15], [10, 25]])
expected = calculate_expected_frequencies(observed)
print("Expected Frequencies:", expected, sep="\n")



14. Perform a goodness-of-fit test using Python to compare the observed data to an expected distribution.

In [None]:
import numpy as np
from scipy import stats

# Observed counts for six categories.
observed = np.array([16, 18, 16, 14, 12, 12])

# Null model: every category is equally likely.
expected_probabilities = np.full(6, 1/6)

# Scale the probabilities to counts with the same total as the observations.
total_observations = np.sum(observed)
expected_frequencies = total_observations * expected_probabilities

# Chi-square goodness-of-fit test.
chi2_stat, p_value = stats.chisquare(observed, expected_frequencies)

for caption, value in (("Chi-square statistic", chi2_stat), ("P-value", p_value)):
    print(f"{caption}: {value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
verdict = (
    "Reject the null hypothesis. The observed data does not fit the expected distribution."
    if p_value < alpha
    else "Fail to reject the null hypothesis. The observed data fits the expected distribution."
)
print(verdict)



15. Create a Python script to simulate and visualize the Chi-square distribution and discuss its characteristics.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Fix the generator state so the simulated draws are repeatable.
np.random.seed(0)

# Degrees of freedom shared by the simulation and the theoretical curve.
df = 5

# Draw 1000 values from the chi-square distribution.
chi2_samples = np.random.chisquare(df, size=1000)

# Density-normalised histogram of the draws with the exact PDF overlaid.
plt.hist(chi2_samples, bins=30, density=True, alpha=0.5, label='Simulated Chi-square')

x = np.linspace(0, 15, 100)
y = stats.chi2.pdf(x, df)
plt.plot(x, y, 'r-', label=f'Chi-square Distribution (df={df})')

plt.xlabel('Value')
plt.ylabel('Probability Density')
plt.title('Chi-square Distribution Simulation')
plt.legend()
plt.show()

# Empirical moments of the draws (theory: mean = df, variance = 2 * df).
mean = np.mean(chi2_samples)
variance = np.var(chi2_samples)
for caption, value in (("Mean", mean), ("Variance", variance)):
    print(f"{caption}: {value:.4f}")


16. Implement an F-test using Python to compare the variances of two random samples.

In [None]:
import numpy as np
from scipy import stats

def variance_f_test(a, b):
    """Two-tailed F-test for equality of the variances of two samples.

    The statistic is the larger sample variance divided by the smaller one,
    so it is always >= 1. The numerator degrees of freedom therefore belong
    to whichever sample has the larger variance.

    Args:
        a: First sample (sequence or array of numbers).
        b: Second sample (sequence or array of numbers).

    Returns:
        Tuple ``(f_stat, p_value)``.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    var_a = np.var(a, ddof=1)
    var_b = np.var(b, ddof=1)

    if var_a >= var_b:
        f_stat, dfn, dfd = var_a / var_b, a.size - 1, b.size - 1
    else:
        f_stat, dfn, dfd = var_b / var_a, b.size - 1, a.size - 1

    # Double the upper tail for a two-tailed test. With unequal degrees of
    # freedom the doubled tail can exceed 1 near F = 1, so cap it.
    p_value = min(1.0, 2 * stats.f.sf(f_stat, dfn, dfd))
    return f_stat, p_value


# Sample data
sample1 = np.array([23, 21, 19, 24, 20, 22, 18, 25, 19, 21])
sample2 = np.array([20, 18, 16, 22, 17, 19, 15, 21, 16, 18])

# Sample variances (Bessel-corrected), kept for inspection.
var1 = np.var(sample1, ddof=1)
var2 = np.var(sample2, ddof=1)

# F-test on the variances. stats.f_oneway is deliberately not used here:
# it is a one-way ANOVA that compares means, not variances.
f_stat_manual, p_value_manual = variance_f_test(sample1, sample2)

# Print the results
print(f"F-statistic (manual calculation): {f_stat_manual:.4f}")
print(f"P-value (manual calculation): {p_value_manual:.4f}")

# Interpret the results
alpha = 0.05
if p_value_manual < alpha:
    print("Reject the null hypothesis. The variances are significantly different.")
else:
    print("Fail to reject the null hypothesis. The variances are not significantly different.")


17. Write a Python program to perform an ANOVA test to compare means between multiple groups and
interpret the results.

In [None]:
import numpy as np
from scipy import stats

# Three independent groups of five observations each.
group1 = np.array([23, 21, 19, 24, 20])
group2 = np.array([20, 18, 16, 22, 17])
group3 = np.array([18, 15, 13, 19, 16])

# One-way ANOVA: the null hypothesis is that all group means are equal.
groups = (group1, group2, group3)
f_stat, p_value = stats.f_oneway(*groups)

for caption, value in (("F-statistic", f_stat), ("P-value", p_value)):
    print(f"{caption}: {value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
if not p_value < alpha:
    print("Fail to reject the null hypothesis. The means are not significantly different.")
else:
    print("Reject the null hypothesis. The means are significantly different.")



18. Perform a one-way ANOVA test using Python to compare the means of different groups and plot the results.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Simulate three groups of ten observations with different true means.
np.random.seed(0)
group1 = np.random.normal(loc=20, scale=3, size=10)
group2 = np.random.normal(loc=22, scale=3, size=10)
group3 = np.random.normal(loc=18, scale=3, size=10)

# One-way ANOVA across the three groups.
f_stat, p_value = stats.f_oneway(group1, group2, group3)

print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
if not p_value < alpha:
    print("Fail to reject the null hypothesis. The means are not significantly different.")
else:
    print("Reject the null hypothesis. The means are significantly different.")

# Figure 1: distribution of each group as a boxplot.
plt.figure(figsize=(8, 6))
plt.boxplot([group1, group2, group3], labels=['Group 1', 'Group 2', 'Group 3'])
plt.title('Boxplot of Group Means')
plt.ylabel('Value')
plt.show()

# Figure 2: bar chart of the group means.
means = [np.mean(group) for group in (group1, group2, group3)]
plt.bar(['Group 1', 'Group 2', 'Group 3'], means)
plt.title('Mean Comparison')
plt.ylabel('Mean Value')
plt.show()



19. Write a Python function to check the assumptions (normality, independence, and equal variance) for ANOVA.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def check_anova_assumptions(data):
    """Print diagnostics for the ANOVA assumptions and draw a boxplot.

    Runs a Shapiro-Wilk normality test on every group and Levene's test for
    equal variances across groups, each judged at the 0.05 level.
    Independence is only reported as a reminder because it depends on the
    study design rather than on the data.

    Args:
        data: Sequence of groups, each a sequence or array of observations.
    """
    # Normality assumption: test each group separately.
    for i, group in enumerate(data):
        stat, p = stats.shapiro(group)
        print(f"Group {i+1} Normality Test: Shapiro-Wilk statistic = {stat:.4f}, p-value = {p:.4f}")
        if p < 0.05:
            print(f"Group {i+1} does not appear to be normally distributed.")
        else:
            print(f"Group {i+1} appears to be normally distributed.")

    # Equal variance assumption: one Levene test across all groups.
    stat, p = stats.levene(*data)
    print(f"\nLevene's Test for Equal Variances: statistic = {stat:.4f}, p-value = {p:.4f}")
    if p < 0.05:
        print("The groups do not appear to have equal variances.")
    else:
        print("The groups appear to have equal variances.")

    # Independence assumption
    # Note: Independence is often assumed based on the study design and cannot be tested statistically.
    print("\nIndependence assumption: This assumption is typically assessed based on the study design and data collection process.")

    # Visualize the data
    plt.figure(figsize=(8, 6))
    plt.boxplot(data, labels=[f"Group {i+1}" for i in range(len(data))])
    plt.title('Boxplot of Groups')
    plt.show()

# Example usage
# Seed the generator so the diagnostics printed below are reproducible,
# as in the other simulated examples.
np.random.seed(0)
group1 = np.random.normal(loc=20, scale=3, size=10)
group2 = np.random.normal(loc=22, scale=3, size=10)
group3 = np.random.normal(loc=18, scale=3, size=10)
data = [group1, group2, group3]
check_anova_assumptions(data)


20. Perform a two-way ANOVA test using Python to study the interaction between two factors and visualize the
results.

In [None]:
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Simulate a balanced 2 x 3 design with 10 observations per cell.
np.random.seed(0)
factor1 = np.repeat(['A', 'B'], 30)
factor2 = np.tile(['X', 'Y', 'Z'], 20)

# Response: common noise plus additive effects for level A and level Y.
noise = np.random.normal(loc=20, scale=5, size=60)
values = noise + np.where(factor1 == 'A', 2, 0) + np.where(factor2 == 'Y', 3, 0)

# Long-format table, one row per observation.
data = pd.DataFrame({'Factor1': factor1, 'Factor2': factor2, 'Values': values})

# Two-way ANOVA with both main effects and their interaction.
formula = 'Values ~ C(Factor1) + C(Factor2) + C(Factor1):C(Factor2)'
model = ols(formula, data=data).fit()
anova_table = anova_lm(model, typ=2)

print(anova_table)

# Figure 1: boxplots of the response for every factor combination.
plt.figure(figsize=(8, 6))
sns.boxplot(x='Factor2', y='Values', hue='Factor1', data=data)
plt.title('Interaction Plot')
plt.show()

# Figure 2: cell means joined by lines, drawn without error bars.
plt.figure(figsize=(8, 6))
sns.pointplot(x='Factor2', y='Values', hue='Factor1', data=data, ci=None)
plt.title('Interaction Plot')
plt.show()


21. Write a Python program to visualize the F-distribution and discuss its use in hypothesis testing.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Numerator and denominator degrees of freedom of the F-distribution.
df1, df2 = 5, 10

# Evaluate the density on an evenly spaced grid over [0, 5].
x = np.linspace(0, 5, 100)
y = stats.f.pdf(x, df1, df2)

# Draw the density curve.
plt.plot(x, y, label=f'F-distribution (df1={df1}, df2={df2})')
plt.xlabel('Value')
plt.ylabel('Probability Density')
plt.title('F-Distribution')
plt.legend()
plt.show()

# Upper-tail critical value: an F statistic above it is significant at alpha.
alpha = 0.05
upper_quantile = 1 - alpha
critical_value = stats.f.ppf(upper_quantile, df1, df2)
print(f"Critical value for alpha={alpha}: {critical_value:.4f}")


22. Perform a one-way ANOVA test in Python and visualize the results with boxplots to compare group means.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Simulate three groups of ten observations with different true means.
np.random.seed(0)
group1 = np.random.normal(loc=20, scale=3, size=10)
group2 = np.random.normal(loc=22, scale=3, size=10)
group3 = np.random.normal(loc=18, scale=3, size=10)

# One-way ANOVA across the three groups.
f_stat, p_value = stats.f_oneway(group1, group2, group3)

for caption, value in (("F-statistic", f_stat), ("P-value", p_value)):
    print(f"{caption}: {value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
decision = (
    "Reject the null hypothesis. The means are significantly different."
    if p_value < alpha
    else "Fail to reject the null hypothesis. The means are not significantly different."
)
print(decision)

# Side-by-side boxplots of the three groups.
plt.figure(figsize=(8, 6))
plt.boxplot([group1, group2, group3], labels=['Group 1', 'Group 2', 'Group 3'])
plt.title('Boxplot of Group Means')
plt.ylabel('Value')
plt.show()


23. Simulate random data from a normal distribution, then perform hypothesis testing to evaluate the means.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Fix the generator state so the simulated draws are repeatable.
np.random.seed(0)

# First sample: 30 draws from N(20, 3).
mean1, std_dev1, sample_size1 = 20, 3, 30
data1 = np.random.normal(mean1, std_dev1, sample_size1)

# Second sample: 30 draws from N(22, 3), i.e. a true mean shift of 2.
mean2, std_dev2, sample_size2 = 22, 3, 30
data2 = np.random.normal(mean2, std_dev2, sample_size2)

# Independent two-sample t-test on the simulated data.
t_stat, p_value = stats.ttest_ind(data1, data2)

print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
if not p_value < alpha:
    print("Fail to reject the null hypothesis. The means are not significantly different.")
else:
    print("Reject the null hypothesis. The means are significantly different.")

# Overlaid histograms of the two samples.
plt.figure(figsize=(8, 6))
for values, caption in ((data1, 'Data 1'), (data2, 'Data 2')):
    plt.hist(values, alpha=0.5, label=caption, bins=10)
plt.legend()
plt.title('Histogram of Simulated Data')
plt.show()


24. Perform a hypothesis test for population variance using a Chi-square distribution and interpret the results.

In [None]:
import numpy as np
from scipy import stats

# Simulate 30 observations from N(20, 3), so the true variance is 9.
np.random.seed(0)
data = np.random.normal(loc=20, scale=3, size=30)

# Null hypothesis: the population variance equals 9 (sigma^2 = 9).
sigma0_squared = 9

# Bessel-corrected sample variance.
sample_variance = np.var(data, ddof=1)

# Test statistic (n - 1) * s^2 / sigma0^2, referred to chi-square with
# n - 1 degrees of freedom.
n = len(data)
chi2_stat = (n - 1) * sample_variance / sigma0_squared

# Two-tailed p-value: twice the smaller of the two tail areas.
lower_tail = stats.chi2.cdf(chi2_stat, n - 1)
p_value = 2 * min(lower_tail, 1 - lower_tail)

for caption, value in (("Chi-square statistic", chi2_stat), ("P-value", p_value)):
    print(f"{caption}: {value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
if not p_value < alpha:
    print("Fail to reject the null hypothesis. The population variance is not significantly different from 9.")
else:
    print("Reject the null hypothesis. The population variance is significantly different from 9.")


25. Write a Python script to perform a Z-test for comparing proportions between two datasets or groups.

In [None]:
import numpy as np
from scipy import stats

# Success counts and sample sizes for the two groups.
successes1, sample_size1 = 120, 200  # group 1
successes2, sample_size2 = 100, 250  # group 2

# Observed success proportion in each group.
proportion1 = successes1 / sample_size1
proportion2 = successes2 / sample_size2

# Under H0 both groups share one proportion, estimated by pooling the data.
pooled_proportion = (successes1 + successes2) / (sample_size1 + sample_size2)

# Standard error of the difference in proportions under H0.
standard_error = np.sqrt(pooled_proportion * (1 - pooled_proportion) * (1/sample_size1 + 1/sample_size2))

# Standardised difference between the two proportions.
z_statistic = (proportion1 - proportion2) / standard_error

# Two-tailed p-value from the standard normal distribution.
tail_area = 1 - stats.norm.cdf(abs(z_statistic))
p_value = 2 * tail_area

print(f"Z-statistic: {z_statistic:.4f}")
print(f"P-value: {p_value:.4f}")

# Decision at the 5% significance level.
alpha = 0.05
decision = (
    "Reject the null hypothesis. The proportions are significantly different."
    if p_value < alpha
    else "Fail to reject the null hypothesis. The proportions are not significantly different."
)
print(decision)


26. Implement an F-test for comparing the variances of two datasets, then interpret and visualize the results.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def variance_f_test(a, b):
    """Two-tailed F-test for equality of the variances of two samples.

    The statistic is the larger sample variance divided by the smaller one,
    so it is always >= 1. The numerator degrees of freedom therefore belong
    to whichever sample has the larger variance.

    Args:
        a: First sample (sequence or array of numbers).
        b: Second sample (sequence or array of numbers).

    Returns:
        Tuple ``(f_stat, p_value)``.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    var_a = np.var(a, ddof=1)
    var_b = np.var(b, ddof=1)

    if var_a >= var_b:
        f_stat, dfn, dfd = var_a / var_b, a.size - 1, b.size - 1
    else:
        f_stat, dfn, dfd = var_b / var_a, b.size - 1, a.size - 1

    # Double the upper tail for a two-tailed test. With unequal degrees of
    # freedom the doubled tail can exceed 1 near F = 1, so cap it.
    p_value = min(1.0, 2 * stats.f.sf(f_stat, dfn, dfd))
    return f_stat, p_value


# Sample data: two simulated samples with different true spreads (3 vs 4).
np.random.seed(0)
data1 = np.random.normal(loc=20, scale=3, size=30)
data2 = np.random.normal(loc=22, scale=4, size=30)

# Sample variances (Bessel-corrected), kept for inspection.
var1 = np.var(data1, ddof=1)
var2 = np.var(data2, ddof=1)

# F-test on the variances. stats.f_oneway is deliberately not used here:
# it is a one-way ANOVA that compares means, not variances.
f_statistic_manual, p_value_manual = variance_f_test(data1, data2)

# Print the results
print(f"F-statistic (manual calculation): {f_statistic_manual:.4f}")
print(f"P-value (manual calculation): {p_value_manual:.4f}")

# Interpret the results
alpha = 0.05
if p_value_manual < alpha:
    print("Reject the null hypothesis. The variances are significantly different.")
else:
    print("Fail to reject the null hypothesis. The variances are not significantly different.")

# Figure 1: overlaid histograms of the two samples.
plt.figure(figsize=(8, 6))
for values, caption in ((data1, 'Data 1'), (data2, 'Data 2')):
    plt.hist(values, alpha=0.5, label=caption, bins=10)
plt.legend()
plt.title('Histogram of Data')
plt.show()

# Figure 2: side-by-side boxplots to compare the spreads.
plt.figure(figsize=(8, 6))
plt.boxplot([data1, data2], labels=['Data 1', 'Data 2'])
plt.title('Boxplot of Data')
plt.show()



27. Perform a Chi-square test for goodness of fit with simulated data and analyze the results.

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Set seed for reproducibility
np.random.seed(0)

# Simulate data from a normal distribution
mean = 20
std_dev = 3
sample_size = 100
data = np.random.normal(mean, std_dev, sample_size)

# Five equal-width bins covering [10, 30].
bins = np.linspace(10, 30, 6)

# Observed count in each bin.
observed_frequencies, _ = np.histogram(data, bins=bins)

# Probability of each bin under N(mean, std_dev).
bin_probabilities = np.diff(stats.norm.cdf(bins, mean, std_dev))

# The bins do not cover the whole real line, so the raw probabilities sum to
# slightly less than 1. Rescale them so the expected counts add up to the
# observed total, which stats.chisquare requires.
expected_frequencies = bin_probabilities / bin_probabilities.sum() * observed_frequencies.sum()

# NOTE(review): the two outer bins have expected counts of roughly 2, below
# the usual rule of thumb of 5; consider merging bins before relying on the
# p-value.

# Perform Chi-square test
chi2_statistic, p_value = stats.chisquare(observed_frequencies, f_exp=expected_frequencies)

# Print the results
print(f"Chi-square statistic: {chi2_statistic:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis. The data does not fit the normal distribution.")
else:
    print("Fail to reject the null hypothesis. The data fits the normal distribution.")

# Observed histogram with the expected counts drawn at the bin centres.
bin_centres = (bins[:-1] + bins[1:]) / 2

plt.figure(figsize=(8, 6))
plt.hist(data, bins=bins, alpha=0.5, label='Observed')
plt.plot(bin_centres, expected_frequencies, 'ro-', label='Expected')
plt.legend()
plt.title('Observed vs Expected Frequencies')
plt.show()

