In [2]:
import numpy as np
import scipy.stats as stats


# Reproducible synthetic salaries: 50 draws per department. The draw order
# (sales, marketing, IT) is part of the result because all three share one
# seeded global random stream.
np.random.seed(123)
sales_salary = np.random.normal(50000, 10000, 50)
marketing_salary = np.random.normal(55000, 8000, 50)
it_salary = np.random.normal(60000, 12000, 50)

# Significance level used for the decision below.
alpha = 0.05

# One-way ANOVA across the three department samples.
department_samples = (sales_salary, marketing_salary, it_salary)
f_value, p_value = stats.f_oneway(*department_samples)

print("F-value: ", f_value)
print("P-value: ", p_value)

# Choose the two-line conclusion from the p-value, then print it.
if p_value < alpha:
    verdict = (
        "We reject the null hypothesis.",
        "There is a significant difference in mean salary between the three departments.",
    )
else:
    verdict = (
        "We fail to reject the null hypothesis.",
        "There is no significant difference in mean salary between the three departments.",
    )
print(*verdict, sep="\n")


F-value:  13.055870477967321
P-value:  6.037047433812819e-06
We reject the null hypothesis.
There is a significant difference in mean salary between the three departments.


In [4]:
import numpy as np
import scipy.stats as stats

# Reproducible synthetic recovery times: 50 draws per drug. The draw order
# (drug1, drug2, drug3) is part of the result because all three share one
# seeded global random stream.
np.random.seed(123)
drug1_recovery = np.random.normal(10, 2, 50)
drug2_recovery = np.random.normal(12, 3, 50)
drug3_recovery = np.random.normal(15, 4, 50)

# Significance level used for the decision below.
alpha = 0.05

# One-way ANOVA across the three drug samples.
drug_samples = (drug1_recovery, drug2_recovery, drug3_recovery)
f_value, p_value = stats.f_oneway(*drug_samples)

# Report the test statistic and its p-value.
print("F-value: ", f_value)
print("P-value: ", p_value)

# Choose the two-line conclusion from the p-value, then print it.
if p_value < alpha:
    verdict = (
        "We reject the null hypothesis.",
        "There is a significant difference in mean recovery time between the three drugs.",
    )
else:
    verdict = (
        "We fail to reject the null hypothesis.",
        "There is no significant difference in mean recovery time between the three drugs.",
    )
print(*verdict, sep="\n")


F-value:  34.638294743713885
P-value:  4.727428152898501e-13
We reject the null hypothesis.
There is a significant difference in mean recovery time between the three drugs.


In [11]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Synthetic plant-growth data: 100 rows with randomly chosen fertilizer and
# watering labels and a normally distributed growth value.
np.random.seed(123)
fertilizer = ["A", "B", "C"]
watering = ["low", "medium", "high"]

# Draw the columns one at a time in the order fertilizer -> watering -> growth;
# the order matters because all three consume the same seeded random stream.
n_plants = 100
fertilizer_col = np.random.choice(fertilizer, size=n_plants)
watering_col = np.random.choice(watering, size=n_plants)
growth_col = np.random.normal(loc=10, scale=2, size=n_plants)
data = pd.DataFrame(
    {
        "fertilizer": fertilizer_col,
        "watering": watering_col,
        "growth": growth_col,
    }
)

# Two-way ANOVA (type II sums of squares) with both main effects and their
# interaction.
growth_formula = "growth ~ C(fertilizer) + C(watering) + C(fertilizer):C(watering)"
model = ols(growth_formula, data=data).fit()
table = sm.stats.anova_lm(model, typ=2)

print(table)

# Significance level for the decision below.
alpha = 0.05

# The decision is based on the p-value of the interaction row only.
interaction_p = table.loc["C(fertilizer):C(watering)", "PR(>F)"]
if interaction_p < alpha:
    verdict = (
        "We reject the null hypothesis.",
        "There is a significant difference in mean plant growth between different levels of fertilizer and watering.",
    )
else:
    verdict = (
        "We fail to reject the null hypothesis.",
        "There is no significant difference in mean plant growth between different levels of fertilizer and watering.",
    )
print(*verdict, sep="\n")


                               sum_sq    df         F    PR(>F)
C(fertilizer)                7.823388   2.0  0.603026  0.549325
C(watering)                 10.885574   2.0  0.839059  0.435432
C(fertilizer):C(watering)    5.235495   4.0  0.201776  0.936789
Residual                   590.296737  91.0       NaN       NaN
We fail to reject the null hypothesis.
There is no significant difference in mean plant growth between different levels of fertilizer and watering.


In [4]:
# Category labels for the two factors of the customer-spending example.
age_group = ["18-30", "31-50", "51+"]
gender = ["male", "female"]

# Build 120 synthetic customers. Columns are drawn in the order
# age_group -> gender -> spending, which is the order the random stream is
# consumed in. No seed is set in this cell, so the values depend on the
# current state of NumPy's global generator.
n_customers = 120
age_group_col = np.random.choice(age_group, size=n_customers)
gender_col = np.random.choice(gender, size=n_customers)
spending_col = np.random.normal(loc=100, scale=20, size=n_customers)
data = pd.DataFrame(
    {
        "age_group": age_group_col,
        "gender": gender_col,
        "spending": spending_col,
    }
)

In [5]:
# Show the current `data` frame via the notebook's rich display (bare last expression).
data

Unnamed: 0,age_group,gender,spending
0,18-30,male,91.342210
1,18-30,female,79.014858
2,18-30,male,100.584277
3,18-30,female,122.580852
4,31-50,male,104.026799
...,...,...,...
115,31-50,female,101.521856
116,31-50,female,89.303586
117,51+,female,100.476618
118,18-30,female,107.076953


In [12]:
# NOTE(review): the output recorded for this cell has fertilizer/watering/growth
# columns, but in file order `data` was last rebound to the
# age_group/gender/spending frame. On a top-to-bottom run this cell would show
# the spending frame instead -- confirm which frame is meant to be displayed.
data

Unnamed: 0,fertilizer,watering,growth
0,C,low,9.908664
1,B,high,8.488322
2,C,high,4.686976
3,C,medium,10.602103
4,A,medium,8.634847
...,...,...,...
95,B,high,5.969240
96,C,medium,10.429768
97,C,medium,7.898348
98,B,high,13.789352


In [8]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Crop-yield observations: 27 plots in total.
data = {'Yield':[24,28,30,27,31,30,32,34,33,29,31,36,34,37,35,39,38,41,36,40,39,41,43,44,42,45,46]}
df = pd.DataFrame(data)

# Each fertilizer covers 9 consecutive rows while watering cycles
# low/medium/high, which gives 3 observations per fertilizer x watering cell.
df['fertilizer'] = ["A"]*9 + [ "B"]*9 + ["C"]*9
df['watering'] = ["low", "medium", "high"]*9

# Two-way ANOVA (type II sums of squares) with both main effects and their
# interaction.
model = ols("Yield ~ C(fertilizer) + C(watering) + C(fertilizer):C(watering)", data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)
print('\n')
alpha = 0.05

# Look each effect up by its row label rather than by integer position:
# integer keys on this string-indexed Series rely on a deprecated positional
# fallback in pandas, and would silently read the wrong row if the formula's
# term order ever changed.
p_values = anova_table["PR(>F)"]

# Main effect of fertilizer.
if p_values.loc["C(fertilizer)"] < alpha:
    print("There is a significant main effect of fertilizer on crop yield.")
else:
    print("There is no significant main effect of fertilizer on crop yield.")

# Main effect of watering.
if p_values.loc["C(watering)"] < alpha:
    print("There is a significant main effect of water on crop yield.")
else:
    print("There is no significant main effect of water on crop yield.")

# Fertilizer x watering interaction.
if p_values.loc["C(fertilizer):C(watering)"] < alpha:
    print("We reject the null hypothesis.")
    print("There is a significant interaction effect between fertilizer and water on crop yield.")
else:
    print("We fail to reject the null hypothesis.")
    print("There is no significant interaction effect between fertilizer and water on crop yield.")


                               sum_sq    df          F    PR(>F)
C(fertilizer)              636.518519   2.0  26.603715  0.000004
C(watering)                 54.740741   2.0   2.287926  0.130213
C(fertilizer):C(watering)    4.592593   4.0   0.095975  0.982446
Residual                   215.333333  18.0        NaN       NaN


There is a significant main effect of fertilizer on crop yield.
There is no significant main effect of water on crop yield.
We fail to reject the null hypothesis.
There is no significant interaction effect between fertilizer and water on crop yield.


In [16]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Synthetic weight-loss data: 100 rows with randomly chosen diet and exercise
# labels and a normally distributed weight-loss value. The dict values are
# evaluated in order (diet, exercise, weight_loss), which is the order the
# seeded random stream is consumed in.
np.random.seed(123)
diet = ["low-carb", "low-fat"]
exercise = ["low", "medium", "high"]
data = pd.DataFrame({"diet": np.random.choice(diet, size=100),"exercise": np.random.choice(exercise, size=100),
    "weight_loss": np.random.normal(loc=5, scale=2, size=100)})

# Two-way ANOVA (type II sums of squares) with both main effects and their
# interaction.
model = ols("weight_loss ~ C(diet) + C(exercise) + C(diet):C(exercise)", data=data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)
print('\n')

alpha = 0.05

# Read the interaction p-value from THIS cell's result (`anova_table`).
# The previous code read `table`, a name this cell never defines: on a fresh
# kernel that is a NameError, and with a leftover `table` from another
# analysis it is a KeyError or a decision based on the wrong model.
interaction_p = anova_table.loc["C(diet):C(exercise)", "PR(>F)"]

if interaction_p < alpha:
    print("We reject the null hypothesis.")
    print("There is a significant difference in mean weight loss between different diets and levels of exercise.")
else:
    print("We fail to reject the null hypothesis.")
    print("There is no significant difference in mean weight loss between different diets and levels of exercise.")


                         sum_sq    df         F    PR(>F)
C(diet)                0.866991   1.0  0.234061  0.629654
C(exercise)            5.546258   2.0  0.748661  0.475798
C(diet):C(exercise)    0.700569   2.0  0.094566  0.909854
Residual             348.187125  94.0       NaN       NaN


We fail to reject the null hypothesis.
There is no significant difference in mean weight loss between different diets and levels of exercise.


In [1]:
import numpy as np
from scipy.stats import t

# Sample of 50 salaries.
salaries = [25000, 35000, 40000, 50000, 55000, 60000, 65000, 70000, 75000, 80000,
            85000, 90000, 95000, 100000, 105000, 110000, 115000, 120000, 125000, 130000,
            135000, 140000, 145000, 150000, 155000, 160000, 165000, 170000, 175000, 180000,
            185000, 190000, 195000, 200000, 205000, 210000, 215000, 220000, 225000, 230000,
            235000, 240000, 245000, 250000, 255000, 260000, 265000, 270000, 275000, 280000]

# Sample size, sample mean, and sample standard deviation
# (ddof=1 gives the n-1 denominator).
salary_values = np.asarray(salaries)
n = len(salaries)
x_bar = salary_values.mean()
s = salary_values.std(ddof=1)

# Standard error of the mean.
se = s / np.sqrt(n)

# Two-sided 95% critical value from Student's t with n-1 degrees of freedom.
t_value = t.ppf(0.975, df=n - 1)

# Margin of error and the resulting interval bounds.
me = t_value * se
ci_lower, ci_upper = x_bar - me, x_bar + me

print(f"Point estimate: {x_bar}")
print(f"95% Confidence Interval: [{ci_lower}, {ci_upper}]")


Point estimate: 157100.0
95% Confidence Interval: [136192.03943806508, 178007.96056193492]


In [2]:
import numpy as np


# Sample of 100 height measurements.
heights = [175, 180, 182, 178, 185, 177, 183, 179, 181, 176,
           184, 182, 177, 180, 185, 183, 179, 181, 178, 176,
           183, 178, 181, 180, 175, 182, 179, 184, 178, 177,
           180, 182, 176, 183, 179, 184, 177, 181, 175, 178,
           183, 180, 181, 178, 176, 184, 182, 177, 179, 185,
           180, 181, 179, 176, 184, 182, 177, 183, 180, 178,
           181, 179, 176, 184, 182, 178, 180, 185, 177, 183,
           179, 181, 178, 176, 183, 178, 181, 180, 175, 182,
           179, 184, 178, 177, 180, 182, 176, 183, 179, 184,
           177, 181, 175, 178, 183, 180, 181, 178, 176, 184]

# Sample size and sample mean; the mean is the point estimate that gets printed.
height_values = np.asarray(heights)
n = len(heights)
x_bar = height_values.mean()

print(f"Point estimate: {x_bar}")


Point estimate: 179.91
