In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2, norm, t, f

In [14]:
# Seed NumPy's global RNG so that this draw -- and the np.random.choice
# resampling done later in the notebook, which reads the same global state --
# is reproducible under Restart Kernel -> Run All.
# Outputs saved before the seed was introduced will differ until re-run.
SEED = 42
np.random.seed(SEED)

# 100 draws from a normal distribution with mean 100 and variance 10
# (scale is the standard deviation, hence the square root).
xs = norm.rvs(loc=100, scale=np.sqrt(10), size=100)
xs[:50]

array([101.3851234 ,  98.26952221,  97.34091663, 104.90049346,
       102.13685397,  98.04963906,  97.40314782, 102.88063658,
        98.15681654,  97.8206282 , 102.3588493 ,  98.24807846,
        97.54054961, 102.50967417, 101.4346519 , 100.02895253,
       100.15365785, 100.76314369, 100.62923257,  99.44811184,
        98.97394101, 103.55470989, 104.93475536,  97.4282409 ,
        96.5066892 , 102.36552153, 107.194614  ,  97.7527665 ,
       104.29696041,  94.21556938, 101.91569611,  97.24412035,
        97.73473161, 102.04095297, 104.47170525,  97.13889558,
       100.20864521, 103.72560212, 109.59061577,  99.71158077,
       104.30750545, 103.59629231,  98.8617036 ,  97.44713043,
       104.71595164, 109.15664147, 100.68746374,  99.96354029,
        94.84275652, 102.34745653])

In [15]:
# Summary statistics of the full sample (ddof=1 -> sample standard deviation).
n = xs.size
xs_mean = xs.mean()
xs_std = xs.std(ddof=1)

print(f"mean : {xs_mean} std : {xs_std}")

# 95% confidence interval for the mean, using the standard normal quantile
# and the estimated standard error s / sqrt(n).
z_crit = norm.ppf(0.975)
margin = z_crit * xs_std / np.sqrt(n)
L1 = xs_mean - margin
U1 = xs_mean + margin

print(f"95% 신뢰구간 {np.round(L1,4)} ~ {np.round(U1,4)}")

mean : 100.75418849109487 std : 3.284293078790612
95% 신뢰구간 100.1105 ~ 101.3979


In [16]:
# 작은 사이즈(t검정) 
small_xs = np.random.choice(xs,size=25,replace=True)
small_xs

array([103.59629231, 103.55470989,  97.44713043, 101.91569611,
       101.74214228,  99.58689606, 101.91569611, 107.194614  ,
        97.0434734 , 100.21646934, 104.56082033,  97.13889558,
        99.16766529, 100.42639285, 102.54887798, 102.36552153,
        94.84275652,  97.34091663,  97.15676311, 100.42639285,
       100.20864521,  98.97394101, 105.21936365,  99.96354029,
       102.93891103])

In [18]:
# Sample size, degrees of freedom and summary statistics of the small sample.
n = small_xs.size
df = n - 1
small_mean = small_xs.mean()
small_std = small_xs.std(ddof=1)

# Two-sided 95% interval: mean +/- t(0.975; df) * s / sqrt(n).
t_crit = t.ppf(0.975, df)
margin = t_crit * small_std / np.sqrt(n)
L1 = small_mean - margin
U1 = small_mean + margin

print(f"95% 신뢰구간 {np.round(L1,4)} ~ {np.round(U1,4)}")

95% 신뢰구간 99.4734 ~ 101.926


## Problem 1

A company randomly selects 250 customers and sends them a promotion code for a new product.

Among them, 70 customers bought the product using the promotion code.

What is the estimated proportion of customers who buy the product,

and what is the 90% confidence interval for that proportion?

In [20]:
n = 250  # sample size
p_hat = 70/250  # sample proportion
alpha = 0.1  # significance level -> 90% confidence

# Derive the two-sided critical value from alpha instead of hard-coding the
# quantile, so the interval follows alpha if it is changed.
z_crit = norm.ppf(1 - alpha/2)

# Normal-approximation interval: p_hat +/- z * sqrt(p_hat * (1 - p_hat) / n).
margin = z_crit * np.sqrt(p_hat*(1-p_hat)/n)
L1 = p_hat - margin
U1 = p_hat + margin

print(f"표본확률 측정값 : {p_hat}")
print(f"90% 신뢰구간 {np.round(L1,4)} ~ {np.round(U1,4)}")

표본확률 측정값 : 0.28
90% 신뢰구간 0.2333 ~ 0.3267


## Problem 2

Find the 95% confidence interval for the variance using the 'can data'.

여기선 can_data 대신 xs로 실습 

In [23]:
# Sample size, degrees of freedom and sample variance (ddof=1) of xs.
n = xs.size
df = n - 1
s2 = xs.var(ddof=1)
chi_dist = chi2(df)

# Chi-square interval for the variance: dividing (n-1)*s2 by the upper
# quantile yields the lower bound, and by the lower quantile the upper bound.
scaled_s2 = df * s2
l1 = scaled_s2 / chi_dist.ppf(0.975)
u1 = scaled_s2 / chi_dist.ppf(0.025)

print(f"95% 신뢰구간 {np.round(l1,4)} ~ {np.round(u1,4)}")

95% 신뢰구간 8.3153 ~ 14.5564


## Problem 3

Calculate the 95% confidence interval of the difference of the means.

In [25]:
# Resample two groups (sizes 30 and 25) from xs with replacement.
A_group = np.random.choice(xs, size=30, replace=True)
B_group = np.random.choice(xs, size=25, replace=True)

# Show each group under its own header line.
print("---A group---", A_group, sep="\n")
print("---B group---", B_group, sep="\n")

---A group---
[ 97.73473161 100.77800811 102.3588493  100.69625077 104.47170525
 102.60866991 100.02895253 103.55470989  98.15681654  94.75004809
 102.36552153 103.5359495   97.16010536 104.56082033 107.194614
 101.66300084 105.85163763  99.58689606  99.96354029 100.21897401
  97.34091663 106.77012297 102.08776821  98.97394101 107.194614
  95.31567936 103.72560212  95.31567936 100.02895253 104.89673053]
---B group---
[101.66300084 103.5359495  102.08776821  99.81668988 101.73588343
  97.13889558 104.30750545  97.44713043 104.30750545  97.73473161
  94.84275652 102.36552153 103.59629231  99.81668988 100.42639285
 105.21936365 103.55470989  99.96354029 103.59629231 105.21936365
 102.54887798 104.22600175  97.34091663 104.89673053 102.13685397]


In [26]:
n1, n2 = len(A_group), len(B_group)

# Pooled variance: the two sample variances (ddof=1) weighted by their
# degrees of freedom, divided by the combined degrees of freedom.
var_a = np.var(A_group, ddof=1)
var_b = np.var(B_group, ddof=1)
pooled_s2 = ((n1-1)*var_a + (n2-1)*var_b) / (n1+n2-2)
print("Pooled estimator of the variance is ", np.round(pooled_s2,4))

Pooled estimator of the variance is  10.6046


In [28]:
# Group means and the degrees of freedom of the pooled two-sample t interval.
a_mean = A_group.mean()
b_mean = B_group.mean()
df = n1 + n2 - 2

# 95% interval: (mean_A - mean_B) +/- t(0.975; df) * s_pooled * sqrt(1/n1 + 1/n2).
mean_diff = a_mean - b_mean
t_crit = t.ppf(0.975, df)
margin = t_crit * np.sqrt(pooled_s2) * np.sqrt(1/n1 + 1/n2)
l1 = mean_diff - margin
u1 = mean_diff + margin

print(f"The point estimator of the difference of the mean is {np.round(mean_diff,4)}")
print(f"95% confidence interval of the mean difference is {np.round(l1,4)} and {np.round(u1,4)}")

The point estimator of the difference of the mean is -0.2847
95% confidence interval of the mean difference is -2.0535 and 1.4841


## Problem 4
About the difference of two proportions

A survey company asked 100 males and 100 females whether they are married.

62 males and 29 females responded that they are married.

Compute the 90% confidence interval of the difference of the two proportions.

In [29]:
# Survey sizes and the observed married proportions in each group.
n1 = n2 = 100
p_male = 62/n1
p_female = 29/n2

# 90% normal-approximation interval for the difference of two proportions:
# (p1 - p2) +/- z(0.95) * sqrt(p1(1-p1)/n1 + p2(1-p2)/n2).
prop_diff = p_male - p_female
z_crit = norm.ppf(0.95)
std_err = np.sqrt(p_male*(1-p_male)/n1 + p_female*(1-p_female)/n2)
l1 = prop_diff - z_crit*std_err
u1 = prop_diff + z_crit*std_err

print(f"difference of two proportions : {prop_diff}")
print(f"90% confidence interval of the propportion difference is {np.round(l1,4)} and {np.round(u1,4)}")

difference of two proportions : 0.33
90% confidence interval of the propportion difference is 0.2207 and 0.4393
