<제3유형 : 통계기반 가설검정>

# 표본평균 검정

## 한 독립표본 검정

In [2]:
import numpy as np
import pandas as pd
from scipy import stats

In [11]:
mtcars = pd.read_csv("../Dataset/mtcars.csv")

mpg_mean = np.round(np.mean(mtcars['mpg']), 4)
print(f"mpg_mean = {mpg_mean}")

mpg_var = np.round(np.var(mtcars['mpg']), 4)
print(f"mpg_var = {mpg_var}")

t_stat, p_value = stats.ttest_1samp(mtcars['mpg'], 20)
print(f"p_value = {round(p_value, 4)}")

confint = stats.t.interval(0.95, len(mtcars['mpg']) - 1, loc = np.mean(mtcars['mpg']), scale = stats.sem(mtcars['mpg']))
scale = stats.sem(mtcars['mpg'])
print(f"95% 신뢰구간 : {round(confint[0], 4)} ~ {round(confint[1], 4)}")

mpg_mean = 20.0906
mpg_var = 35.189
p_value = 0.9328
95% 신뢰구간 : 17.9177 ~ 22.2636


## 두 독립표본 검정

In [13]:
from scipy import stats

data1 = [117, 108, 105, 89]
data2 = [121, 101, 102, 114]

tstat1, pval1 = stats.shapiro(data1)
tstat2, pval2 = stats.shapiro(data2)

print(f"pval1 = {pval1}, pval2 = {pval2}")

pval1 = 0.7383338595748989, pval2 = 0.34643378327710317


## 두 대응표본 검정

In [14]:
from scipy import stats

data1 = [117, 108, 105, 89]
data2 = [121, 101, 102, 114]

tstat, pval = stats.ttest_rel(data1, data2)
print(f"pval = {pval}")

pval = 0.5525407609240425


# 표본분산 검정

## 한 독립표본

In [19]:
from scipy import stats
import numpy as np

data = np.array([14.2, 15.1, 14.8, 15.4, 15.8])
population_var = 1

df = len(data) - 1
chi2_stat = df * np.var(data) / population_var

pval = 1 - stats.chi2.cdf(chi2_stat, df)
print(f"pval = {pval}")

pval = 0.8817726825828268


## 두 모분산 비

In [20]:
import numpy as np
np.random.seed(123)

sample1 = np.random.normal(loc=5, scale=2, size=50)
sample2 = np.random.normal(loc=5, scale=3, size=50)

var1 = np.var(sample1, ddof=1)
var2 = np.var(sample2, ddof=1)

fstat = var1 / var2
df1 = len(sample1) - 1
df2 = len(sample2) - 1
pval = 1 - stats.f.cdf(fstat, df1, df2)

print(f"pval = {pval}")

pval = 0.9786532588205227


## 독립성 검정

In [27]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

titanic = pd.read_csv("../Dataset/titanic.csv")

cross_table = pd.crosstab(titanic['Sex'], titanic['Survived'])

chi2, p, df, expected = chi2_contingency(cross_table)

print(f"{chi2}\n{p}\n{df}\n{expected}")

260.71702016732104
1.197357062775565e-58
1
[[193.47474747 120.52525253]
 [355.52525253 221.47474747]]


In [30]:
def categorize_age(age):
    if age < 20: return '10대'
    elif age < 30: return '20대'
    elif age < 40: return '30대'
    elif age < 50: return '40대'
    elif age < 60: return '50대'
    else: return '60대'

avr_age = titanic['Age'].mean()
titanic['Age'] = titanic['Age'].apply(lambda x: avr_age if np.isnan(x) else x)

titanic['AgeGroup'] = titanic['Age'].apply(categorize_age)

print(titanic[['Age', 'AgeGroup']])

table = pd.crosstab(titanic['Age'], titanic['AgeGroup'])
chi2, p, df, expected = chi2_contingency(table)
print(f"chi2 = {chi2}\n p={p}\n df={df}\n expected={expected}")

           Age AgeGroup
0    22.000000      20대
1    38.000000      30대
2    26.000000      20대
3    35.000000      30대
4    35.000000      30대
..         ...      ...
886  27.000000      20대
887  19.000000      10대
888  29.699118      20대
889  26.000000      20대
890  32.000000      30대

[891 rows x 2 columns]
chi2 = 4455.0
 p=0.0
 df=440
 expected=[[1.84062851e-01 4.45566779e-01 1.87429854e-01 9.98877666e-02
  5.38720539e-02 2.91806958e-02]
 [1.84062851e-01 4.45566779e-01 1.87429854e-01 9.98877666e-02
  5.38720539e-02 2.91806958e-02]
 [3.68125701e-01 8.91133558e-01 3.74859708e-01 1.99775533e-01
  1.07744108e-01 5.83613917e-02]
 [3.68125701e-01 8.91133558e-01 3.74859708e-01 1.99775533e-01
  1.07744108e-01 5.83613917e-02]
 [1.84062851e-01 4.45566779e-01 1.87429854e-01 9.98877666e-02
  5.38720539e-02 2.91806958e-02]
 [1.28843996e+00 3.11896745e+00 1.31200898e+00 6.99214366e-01
  3.77104377e-01 2.04264871e-01]
 [1.84062851e+00 4.45566779e+00 1.87429854e+00 9.98877666e-01
  5.38720539e-01 