In [None]:
import math

import matplotlib as plt
import numpy as np
import pandas as pd

# Values of z for Common Levels of Confidence

In [None]:
import plotly.graph_objects as go
# z values for the common confidence levels (levels given in percent).
Confidence_Level = [90, 95, 98, 99]
z_Value = [1.645, 1.96, 2.33, 2.575]

# Show both lists side by side as a two-column table.
fig = go.Figure(
    data=[
        go.Table(
            header={'values': ['Confidence Level', 'z Value']},
            cells={'values': [Confidence_Level, z_Value]},
        )
    ]
)
fig.show()

# ESTIMATING THE POPULATION MEAN USING THE z STATISTIC (VARIANCE KNOWN)

In [None]:
def zmean_interval(mean, table, std, n):
    """Return a z confidence interval for a population mean (variance known).

    The bounds are ``mean -/+ table * std / sqrt(n)``.

    Args:
        mean: Sample mean.
        table: Critical z value read from the table for the chosen
            confidence level (e.g. 1.645 for 90%).
        std: Population standard deviation.
        n: Sample size; must be positive.

    Returns:
        A ``(lower, upper)`` tuple.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        # n**0.5 is complex for a negative n, which would silently yield a
        # meaningless complex "interval" instead of an error.
        raise ValueError("n must be a positive sample size")
    margin = table * (std / n**0.5)
    return (mean - margin, mean + margin)

**EXAMPLE 1**

A survey was taken of U.S. companies that do business with firms in India. One of
the questions on the survey was: Approximately how many years has your company
been trading with firms in India? A random sample of 44 responses to this question
yielded a mean of 10.455 years. Suppose the population standard deviation for this
question is 7.7 years. Using this information, construct a 90% confidence interval for
the mean number of years that a company has been trading in India for the population
of U.S. companies trading with firms in India.

**Solution**

n= 44    sample mean = 10.455 standard deviation = 7.7 alpha = 0.10
table value: 0.1/2 = 0.05   1-0.05= 0.95 check area to left of curve z =  1.645

In [None]:
# Example 1: 90% interval for the mean number of years trading with India.
zmean_interval(mean=10.455, table=1.645, std=7.7, n=44)

(8.545453276952879, 12.364546723047122)

# Finite Correction Factor

In [None]:
def fin_factor(mean, table, N, n, std):
    """Return a z confidence interval for a mean from a finite population.

    The usual margin ``table * std / sqrt(n)`` is multiplied by the finite
    correction factor ``sqrt((N - n) / (N - 1))``.

    Args:
        mean: Sample mean.
        table: Critical z value for the chosen confidence level.
        N: Population size; must be greater than 1.
        n: Sample size; must satisfy ``0 < n <= N``.
        std: Population standard deviation.

    Returns:
        A ``(lower, upper)`` tuple.

    Raises:
        ValueError: If ``N <= 1`` or ``n`` is outside ``(0, N]``.
    """
    if N <= 1 or not 0 < n <= N:
        # Outside this range a square root below receives a negative (or
        # undefined) argument and the result would be complex or an error.
        raise ValueError("require N > 1 and 0 < n <= N")
    margin = table * (std / n**0.5) * ((N - n) / (N - 1))**0.5
    return (mean - margin, mean + margin)


**Example**

A study is conducted in a company that employs 800 engineers. A random sample
of 50 engineers reveals that the average sample age is 34.3 years. Historically, the population standard deviation of the age of the company’s engineers is approximately 8 years. Construct a 98% confidence interval to estimate the average age of all the engineers in this company.

**Solution**

This problem has a finite population. The sample size, 50, is greater than 5% of the population, so the finite correction factor may be helpful.

N = 800 n = 50 mean = 34.3 std = 8 alpha = 0.02 
table value: z(0.01) = 2.33


In [None]:
# 98% interval for the average engineer age, with the finite correction.
fin_factor(mean=34.3, table=2.33, N=800, n=50, std=8)

(31.746016142978302, 36.85398385702169)

# ESTIMATING THE POPULATION MEAN USING THE t STATISTIC (VARIANCE UNKNOWN)

In [None]:
def tmean_interval(mean, table, std, n):
    """Return a t confidence interval for a population mean (variance unknown).

    The bounds are ``mean -/+ table * std / sqrt(n)``.

    Args:
        mean: Sample mean.
        table: Critical t value for the chosen confidence level and
            ``n - 1`` degrees of freedom.
        std: Sample standard deviation.
        n: Sample size; must be positive.

    Returns:
        A ``(lower, upper)`` tuple.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        # n**0.5 is complex for a negative n, which would silently yield a
        # meaningless complex "interval" instead of an error.
        raise ValueError("n must be a positive sample size")
    margin = table * (std / n**0.5)
    return (mean - margin, mean + margin)

**Example 2**

The owner of a large equipment rental company wants to make a
rather quick estimate of the average number of days a piece of
ditchdigging equipment is rented out per person per time. The company
has records of all rentals, but the amount of time required to
conduct an audit of all accounts would be prohibitive. The owner decides to take a random sample of rental invoices. Fourteen different rentals of
ditchdiggers are selected randomly from the files, yielding the following data. She uses these data to construct a 99% confidence interval to estimate the average number of days that a ditchdigger is rented and assumes that the number of days per rental is normally distributed in the population.
3 1 3 2 5 1 2 1 4 2 1 3 1 1

In [None]:
# Rental lengths (days) of the 14 sampled ditchdigger invoices.
data = np.array([3, 1, 3, 2, 5, 1, 2, 1, 4, 2, 1, 3, 1, 1])
mean = np.mean(data)
# ddof=1 gives the sample standard deviation (n - 1 denominator), which the
# t interval requires; np.std defaults to the population formula (ddof=0).
std = np.std(data, ddof=1)
print(mean)
print(std)
# Prints approximately 2.1429 and 1.2924.


**Solution**
 
n = 14 mean = 2.1429 sample std = 1.2924 (n - 1 denominator) alpha = 0.01 df = 13
table value: 0.01/2=0.005  t(0.005,13) = 3.012

In [None]:
# Example 2: 99% t interval using the sample mean, t(0.005, 13), the sample
# standard deviation and n = 14 from the rental data.
tmean_interval(2.1429, 3.012, 1.2924, 14)
# Evaluates to approximately (1.10, 3.18).

# ESTIMATING THE POPULATION PROPORTION

In [None]:
def prop_interval(p, q, table, n):
    """Return a z confidence interval for a population proportion.

    The bounds are ``p -/+ table * sqrt(p * q / n)``.

    Args:
        p: Sample proportion.
        q: Complement of the sample proportion (``1 - p``).
        table: Critical z value for the chosen confidence level.
        n: Sample size; must be positive.

    Returns:
        A ``(lower, upper)`` tuple.

    Raises:
        ValueError: If ``n`` is not positive or ``p * q`` is negative.
    """
    if n <= 0 or p * q < 0:
        # Either condition puts a negative (or undefined) value under the
        # square root, which would otherwise produce a complex result.
        raise ValueError("require n > 0 and p * q >= 0")
    margin = table * ((p * q) / n)**0.5
    return (p - margin, p + margin)

**Example 3**

Coopers & Lybrand surveyed 210 chief executives of fast-growing small companies.
Only 51% of these executives had a management succession plan in place. A
spokesperson for Cooper & Lybrand said that many companies do not worry about
management succession unless it is an immediate problem. However, the unexpected
exit of a corporate leader can disrupt and unfocus a company for long enough
to cause it to lose its momentum. Use the data given to compute a 92% confidence interval to estimate the proportion of all fast-growing small companies that have a management succession plan.

**Solution**

p = 0.51  q= 0.49 n = 210 alpha = 0.08 
table value = z(0.08/2) = z(0.04) = 1.75 

In [None]:
# Example 3: 92% interval for the share of firms with a succession plan.
prop_interval(p=0.51, q=0.49, table=1.75, n=210)

(0.4496313409126888, 0.5703686590873112)

# ESTIMATING THE POPULATION VARIANCE

In [None]:
def var_interval(n, s, table1, table2):
    """Return a chi-square confidence interval for a population variance.

    The bounds are ``(n - 1) * s**2 / table1`` and ``(n - 1) * s**2 / table2``.

    Args:
        n: Sample size.
        s: Sample standard deviation.
        table1: Chi-square table value that divides the lower bound (the
            larger value, e.g. 39.36 for a 95% interval with 24 df).
        table2: Chi-square table value that divides the upper bound (the
            smaller value, e.g. 12.40 for a 95% interval with 24 df).

    Returns:
        A ``(lower, upper)`` tuple, in that order.
    """
    # Shared numerator of both bounds: degrees of freedom times s squared.
    scaled_variance = (n - 1) * s**2
    return (scaled_variance / table1, scaled_variance / table2)

**Example 4**

The U.S. Bureau of Labor Statistics publishes data on the hourly compensation
costs for production workers in manufacturing for various countries. The latest
figures published for Greece show that the average hourly wage for a production worker in manufacturing is 16.10. Suppose the business council of Greece wants
to know how consistent this figure is. They randomly select 25 production workers in manufacturing from across the country and determine that the standard deviation of hourly wages for such workers is 1.12. Use this information to develop a 95% confidence interval to estimate the population variance for the hourly wages of production workers in manufacturing in Greece. Assume that the hourly wages for production workers across the country in manufacturing are normally distributed.

**Solution**

n = 25 df = 24 alpha = 0.05 sample std = 1.12 
table1 value: X(0.025, 24) = 39.36
table2 value: X(0.975, 24) = 12.40

In [None]:
# Example 4: 95% interval for the variance of hourly wages (24 df).
var_interval(n=25, s=1.12, table1=39.36, table2=12.40)

(0.7648780487804879, 2.4278709677419354)

# ESTIMATING SAMPLE SIZE

In [None]:
def sample_size(table, var, err, *, round_up=False):
    """Return the sample size needed to estimate a mean within ``err``.

    Computes ``n = table**2 * var**2 / err**2``.

    Args:
        table: Critical z value for the chosen confidence level.
        var: Despite its name this value is squared by the formula, so it
            must be the population standard deviation, not the variance.
        err: Tolerated error of estimation.
        round_up: When true, return the smallest integer that is not less
            than the computed size, since a sample size must be whole.

    Returns:
        The computed size as is, or as an ``int`` when ``round_up`` is true.
    """
    n = (table**2 * var**2) / err**2
    return math.ceil(n) if round_up else n

**Example 5:**
**Sample Size when Estimating mean**

Suppose you want to estimate the average age of all Boeing 737-300 airplanes
now in active domestic U.S. service. You want to be 95% confident, and you want
your estimate to be within one year of the actual figure. The 737-300 was first
placed in service about 24 years ago, but you believe that no active 737-300s in
the U.S. domestic fleet are more than 20 years old. How large of a sample should
you take?

**Solution**

alpha = 0.05  E = 1 std = 1/4 * range = 1/4 * (20) = 5
table = z(0.025) = 1.96

In [None]:
# Example 5: 95% confidence, std estimated as range / 4 = 5, error of 1 year.
sample_size(table=1.96, var=5, err=1)

96.03999999999999

**Example 6: Determining Sample Size when Estimating p**

Hewitt Associates conducted a national survey to determine the extent to which
employers are promoting health and fitness among their employees. One of the
questions asked was, Does your company offer on-site exercise classes? Suppose it was estimated before the study that no more than 40% of the companies would
answer Yes. How large a sample would Hewitt Associates have to take in estimating the population proportion to ensure a 98% confidence in the results and to be within .03 of the true population proportion?

In [None]:
def sample_size_p(p, q, table, err, *, round_up=False):
    """Return the sample size needed to estimate a proportion within ``err``.

    Computes ``n = table**2 * p * q / err**2``.

    Args:
        p: Anticipated population proportion.
        q: Complement of ``p`` (``1 - p``).
        table: Critical z value for the chosen confidence level.
        err: Tolerated error of estimation.
        round_up: When true, return the smallest integer that is not less
            than the computed size, since a sample size must be whole.

    Returns:
        The computed size as is, or as an ``int`` when ``round_up`` is true.
    """
    n = (table**2 * p * q) / err**2
    return math.ceil(n) if round_up else n


**Solution**

alpha = 0.02 err = 0.03 q = 0.60 p = 0.40 
table value: z(0.01) = 2.33

In [None]:
# Example 6: 98% confidence, p = 0.40, within 0.03 of the true proportion.
sample_size_p(p=0.4, q=0.6, table=2.33, err=0.03)

1447.7066666666667

# TESTING HYPOTHESES ABOUT A POPULATION MEAN USING THE z STATISTIC (VARIANCE KNOWN)

In [None]:
def var_known(mean, mu, std, n):
    """Return the z statistic for a test about a mean (variance known).

    ``z = (mean - mu) / (std / sqrt(n))`` where ``mean`` is the sample mean,
    ``mu`` the hypothesized mean, ``std`` the population standard deviation
    and ``n`` the sample size.
    """
    standard_error = std / n**0.5
    difference = mean - mu
    return difference / standard_error

**Example**

A survey of CPAs across the United States found that the average net income for sole proprietor CPAs is 74,914. Because this survey is now more than ten years old, an accounting researcher wants to test this figure by taking a random sample of 112 sole proprietor accountants in the United States to determine whether the net income figure changed. The researcher could use the eight steps of hypothesis testing to do so. Assume the population standard deviation of net incomes for sole proprietor CPAs is 14,530. Suppose the 112 CPAs who respond produce a sample mean of 78,695.

**Solution**

H0: mu = 74,914   Ha: mu != 74,914   n = 112 std = 14530 mean = 78695

In [None]:
# Observed z for the CPA net income test.
var_known(mean=78695, mu=74914, std=14530, n=112)

2.753912101069571

Compare the observed value of 2.75 with the critical value for a two-tailed test at the 5% significance level (95% confidence).
z = 1.96
2.75 > 1.96, hence we reject the null hypothesis.

# Testing the Mean with a Finite Population

In [None]:
def fin_pop(mean, mu, N, n, std):
    """Return the z statistic for a test about a mean of a finite population.

    The standard error ``std / sqrt(n)`` is scaled by the finite correction
    factor ``sqrt((N - n) / (N - 1))`` before dividing ``mean - mu`` by it.
    """
    standard_error = std / n**0.5
    correction = ((N - n) / (N - 1))**0.5
    return (mean - mu) / (standard_error * correction)

**Example**

In the CPA net income example, suppose only 600 sole proprietor CPAs practice in the United States. A sample of 112 CPAs taken from a population of only 600 CPAs is 18.67% of the population and therefore is much more likely to be representative of the population than a sample of 112 CPAs taken from a population of 20,000 CPAs (.56% of the population).

**Solution**

N = 600 n = 112 mean = 78695 mu = 74914 std = 14530

In [None]:
# CPA example again, now with only N = 600 CPAs in the population.
fin_pop(mean=78695, mu=74914, N=600, n=112, std=14530)

3.0510798559992653

# TESTING HYPOTHESES ABOUT A POPULATION MEAN USING THE t STATISTIC (VARIANCE UNKNOWN)

In [None]:
def var_unknown(mean, mu, std, n):
    """Return the t statistic for a test about a mean (variance unknown).

    ``t = (mean - mu) / (std / sqrt(n))`` where ``std`` is the sample
    standard deviation and ``n`` the sample size.
    """
    estimated_standard_error = std / n**0.5
    return (mean - mu) / estimated_standard_error

**Example**

The U.S. Farmers’ Production Company builds large harvesters. For a harvester to be properly balanced when operating, a 25-pound plate is installed on its side. The machine that produces these plates is set to yield plates that average 25 pounds. The distribution of plates produced from the machine is normal. However, the shop supervisor is worried that the machine is out of adjustment and is producing plates that do not average 25 pounds. To test this concern, he randomly selects 20 of the plates produced the day before and
weighs them. sample mean = 25.51, sample std = 2.1933

**Solution**

H0: mu = 25 pounds   n = 20 df = 19

In [None]:
# Observed t for the 25-pound plate test (20 plates sampled).
var_unknown(mean=25.51, mu=25, std=2.1933, n=20)

1.0398893617151292

Compare the observed value 1.0399 with the critical table value t(0.025, 19) = 2.093

1.0399 < 2.093, hence we don't reject the null hypothesis

# TESTING HYPOTHESES ABOUT A PROPORTION

In [None]:
def prop_test(prop, p, q, n):
    """Return the z statistic for a test about a population proportion.

    ``z = (prop - p) / sqrt(p * q / n)`` where ``prop`` is the sample
    proportion, ``p`` the hypothesized proportion and ``q`` its complement.
    """
    standard_error = (p * q / n)**0.5
    return (prop - p) / standard_error

**Example**

A manufacturer believes exactly 8% of its products contain at least one minor flaw. The business researcher randomly selects a sample of 200 products, inspects each item for flaws, and determines that 33 items have at least
one minor flaw. 
H0: p = .08 

**Solution**

prop = 33/200 = 0.165
p = 0.08 q = 0.92 n = 200 alpha = 0.05
table value = 1.96



In [None]:
# Observed z for the 8% flaw-rate test (33 flawed items out of 200).
prop_test(prop=0.165, p=0.08, q=0.92, n=200)

4.430931298712839

4.4309 > 1.96, hence we reject the null hypothesis

# TESTING HYPOTHESES ABOUT A VARIANCE

In [None]:
def var_test(n, std, sigma):
    """Return the chi-square statistic for a test about a variance.

    ``chi = (n - 1) * std**2 / sigma**2``. Both ``std`` (sample) and
    ``sigma`` (hypothesized) are squared here, so they are standard
    deviations rather than variances.
    """
    degrees_of_freedom = n - 1
    sample_variance = std**2
    hypothesized_variance = sigma**2
    return degrees_of_freedom * sample_variance / hypothesized_variance

**Example**

A manufacturing firm has been working diligently to implement a
just-in-time inventory system for its production line. The final product requires the installation of a pneumatic tube at a particular station on the assembly line. With the just-in-time inventory system, the company’s goal is to minimize the number of pneumatic tubes that are piled up at the station waiting to be installed. Ideally, the tubes would arrive just as the operator needs them. However, because of the supplier and the variables involved in getting the tubes to the line, most of the time there will be some buildup of
tube inventory. The company expects that, on the average, about 20 pneumatic tubes will be at the station. However, the production superintendent does not want the variance of this inventory to be greater than 4. On a given day, the number of pneumatic tubes piled up at the workstation is determined eight different times and the following number of tubes are recorded.
23 17 20 29 21 14 19 24

H0: variance = 4

**Solution**

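n = 8 df = 7 sample variance = 20.9821 (std = 4.5806) hypothesized variance = 4 (sigma = 2)
chi-square = (7 * 20.9821) / 4 = 36.72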

# HYPOTHESIS TESTING AND CONFIDENCE INTERVALS ABOUT THE DIFFERENCE IN TWO MEANS USING THE z STATISTIC (POPULATION VARIANCES KNOWN)

In [None]:
def two_means(mean1, mean2, var1, var2, n1, n2):
    """Return the z statistic for the difference in two means.

    ``z = (mean1 - mean2) / sqrt(var1 / n1 + var2 / n2)`` where ``var1`` and
    ``var2`` are the population variances and ``n1``, ``n2`` the sample sizes.
    """
    standard_error = (var1 / n1 + var2 / n2)**0.5
    return (mean1 - mean2) / standard_error

**Example**

A sample of 87 professional working women showed that the average amount paid
annually into a private pension fund per person was 3352. The population standard deviation is 1100. A sample of 76 professional working men showed that the average amount paid annually into a private pension fund per person was  5727, with a population standard deviation of 1700. A women’s activist group wants to “prove” that women do not pay as much per year as men into private pension funds. If they use alpha = .001 and these sample data, will they be able to reject a null hypothesis that women annually pay the same as or more than men into private pension funds? Use the eight-step hypothesis-testing process.

**Solution**

mean1 = 3352 mean2 = 5727 n1 = 87 n2=76 var1 = (1100)^2 var2 = (1700)^2 
alpha =0.001

In [None]:
# Observed z for the pension example: women (sample 1) versus men (sample 2).
two_means(mean1=3352, mean2=5727, var1=1100**2, var2=1700**2, n1=87, n2=76)

-10.42164353961526

# HYPOTHESIS TESTING AND CONFIDENCE INTERVALS ABOUT THE DIFFERENCE IN TWO MEANS: INDEPENDENT SAMPLES AND POPULATION VARIANCES UNKNOWN

In [None]:
def two_means_unknown(mean1, mean2, var1, var2, n1, n2):
    """Return the pooled-variance t statistic for the difference in two means.

    The sample variances ``var1`` and ``var2`` are pooled with weights
    ``n1 - 1`` and ``n2 - 1``; the difference in means is divided by the
    pooled standard deviation times ``sqrt(1 / n1 + 1 / n2)``.
    """
    pooled_variance = (var1 * (n1 - 1) + var2 * (n2 - 1)) / (n1 + n2 - 2)
    pooled_std = pooled_variance**0.5
    size_term = (1 / n1 + 1 / n2)**0.5
    return (mean1 - mean2) / (pooled_std * size_term)

**Example**

mean1 = 47.73   mean2 = 56.5 var1 = 19.495 var2 = 18.273 n1 = 15 n2 = 12


In [None]:
# Observed t for the two-sample example with pooled variance.
two_means_unknown(mean1=47.73, mean2=56.5, var1=19.495, var2=18.273, n1=15, n2=12)

-5.200744198869119

# STATISTICAL INFERENCES ABOUT TWO POPULATION PROPORTIONS, p1-p2

In [None]:
def two_ps(p1, p2, n1, n2):
    """Return the z statistic for the difference in two proportions.

    The sample proportions are pooled, weighted by their sample sizes, and
    the pooled value is used for the standard error of ``p1 - p2``.
    """
    pooled_p = (n1 * p1 + n2 * p2) / (n1 + n2)
    pooled_q = 1 - pooled_p
    standard_error = ((pooled_p * pooled_q) * (1 / n1 + 1 / n2))**0.5
    return (p1 - p2) / standard_error

**Example**

A study of female entrepreneurs was conducted to determine their definition of success. The women were offered optional choices such as happiness/self-fulfillment, sales/profit, and achievement/challenge. The women were divided into groups according to the gross sales of their businesses. A significantly higher proportion of female entrepreneurs in the 100,000 to 500,000 category than in the less than 100,000 category seemed to rate sales/profit as a definition of success. Suppose you decide to test this result by taking a survey of your own and identify female entrepreneurs by gross sales. You interview 100 female entrepreneurs with gross sales of less than 100,000, and 24 of them define sales/profit as success. You then interview 95 female entrepreneurs with gross sales of 100,000 to 500,000, and 39 cite sales/profit as a definition of success. Use this information to test to determine whether there is a significant difference in the proportions of the two groups that define success as sales/profit. Use alpha = .01.

**Solution**

n1 = 100 n2 = 95 p1 = 0.24 p2 = 0.41 

In [None]:
# Observed z for the difference between the two sales-category proportions.
two_ps(p1=0.24, p2=0.41, n1=100, n2=95)

-2.537820841887076

# TESTING HYPOTHESES ABOUT TWO POPULATION VARIANCES

In [None]:
def two_var(var1, var2):
    """Return the F statistic ``var1 / var2`` for comparing two variances."""
    return var1 / var2

# ONE WAY ANOVA

In [None]:
import plotly.graph_objects as go
# Observations for the three groups, one list per location.
group1 = [29, 27, 30, 27, 28]
group2 = [32, 33, 31, 34, 30]
group3 = [25, 24, 24, 25, 26]

# Show the groups side by side, one table column per group.
fig = go.Figure(
    data=[
        go.Table(
            header={'values': ['Group 1 - Location 1', 'Group 2 - Location 2', 'Group 3 - Location 3']},
            cells={'values': [group1, group2, group3]},
        )
    ]
)
fig.show()

In [None]:
# Mean of the observations in each group.
group1_mean = np.mean(group1)
group2_mean = np.mean(group2)
group3_mean = np.mean(group3)

In [None]:
group_means = [group1_mean, group2_mean, group3_mean]
# Grand mean taken as the mean of the group means.
grand_mean = np.mean(group_means)

In [None]:
# k: number of groups; n: observations per group (read from group1);
# N: total number of observations.
k = len(group_means)
n = len(group1)
N = k * n
for i in group_means:
    print('Group mean ', i)
print("k = " + str(k), "n = " + str(n), "N = " + str(N), sep="\n")
grand_mean

Group mean  28.2
Group mean  32.0
Group mean  24.8
k = 3
n = 5
N = 15


28.333333333333332

Sum of Squares of treatments(SST) 
It measures the variation of group means around the grand mean.
We will calculate the SST of our data in the cell below

In [None]:
def find_SST(group_means):
    """Return the treatment sum of squares for the given group means.

    Each squared deviation from the module-level ``grand_mean`` is weighted
    by the module-level group size ``n`` before summing.
    """
    return sum(n * (group_mean - grand_mean)**2 for group_mean in group_means)


SST = find_SST(group_means)
SST

129.73333333333332

Residual Sum of Squares(SSE) measures variation of all scores around their respective group means.

In [None]:
from statistics import mean
def find_SSR(list1, list2, list3):
    """Return the residual (within-group) sum of squares of three groups.

    Every score is compared with the mean of its own group and the squared
    deviations are summed over all three groups.

    Args:
        list1: Scores of the first group.
        list2: Scores of the second group.
        list3: Scores of the third group.

    Returns:
        The sum of squared deviations; an empty group contributes nothing.
    """
    squared_residuals = []
    for group in (list1, list2, list3):
        if len(group) == 0:
            # Nothing to add, and mean() would reject an empty group.
            continue
        # Compute the group mean once rather than once per score.
        center = mean(group)
        squared_residuals.extend((score - center)**2 for score in group)
    return sum(squared_residuals)
# Within-group sum of squares for the three location groups.
SSR = find_SSR(list1=group1, list2=group2, list3=group3)
SSR

19.599999999999998

Total sum of squares (TSS) measures variation of all scores around the grand mean

In [None]:
def find_TSS(list1, list2, list3):
    """Return the total sum of squares of three groups.

    Every score of every group is compared with the module-level
    ``grand_mean`` and the squared deviations are summed.
    """
    return sum(
        (score - grand_mean)**2
        for group in (list1, list2, list3)
        for score in group
    )


TSS = find_TSS(group1, group2, group3)
TSS

149.33333333333334

Treatment Mean of Squares(MST), is the average between variation. SST/(k-1)

In [None]:
def find_MST(SST):
    """Return the treatment mean square: ``SST`` over ``k - 1``.

    ``k`` is the module-level number of groups.
    """
    return SST / (k - 1)


MST = find_MST(SST)
MST

64.86666666666666

Error Mean Square(MSE); The average within variation. SSE/(N-k)

In [None]:
def find_MSE(SSR):
    """Return the error mean square: ``SSR`` over ``N - k``.

    ``N`` (total observations) and ``k`` (number of groups) are module-level.
    """
    return SSR / (N - k)


MSE = find_MSE(SSR)
MSE

1.633333333333333

Calculate F value to compare against critical value. Decision rule: if Fcal > Ftab reject Ho.
Ftab at 0.05 level of significance = F0.05,(k-1),(N-k)

In [None]:
# Observed F statistic: treatment mean square divided by error mean square.
F = MST/MSE
F

39.714285714285715