In [33]:
import seaborn as sns

In [34]:
# Load seaborn's "penguins" example dataset and preview its first five rows.
pg = sns.load_dataset(name="penguins")
pg.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [46]:
# Scatter-plot matrix of the four numeric measurements; dropna=True asks
# seaborn to drop missing values before plotting.
measure_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
g = sns.pairplot(pg, vars=measure_cols, kind='scatter', dropna=True)
g

<seaborn.axisgrid.PairGrid at 0x232f255c1f0>

In [47]:
# Pair grid of the four numeric measurements, coloured by species:
# scatter plots off the diagonal, histograms on the diagonal.
g = sns.PairGrid(data=pg, hue='species',
                 vars=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'],
                 dropna=True)
# map_offdiag covers both the upper and the lower triangle in one call.
g.map_offdiag(sns.scatterplot)
g.map_diag(sns.histplot)
# PairGrid does not draw a legend on its own; without it the hue colours
# cannot be matched to species.
g.add_legend()
g

<seaborn.axisgrid.PairGrid at 0x232f5e664c0>

In [37]:
# Q. Compute the correlation coefficients of the variables shown in the scatter plots and test them at α = 0.05

import pandas as pd
import numpy as np
from scipy.stats import pearsonr

In [38]:
# Pearson correlation matrix of the numeric measurements.
# The string columns (species, island, sex) are excluded explicitly:
# pandas >= 2.0 no longer drops non-numeric columns silently in corr()
# and raises on them instead. select_dtypes works on every pandas version.
pg.select_dtypes(include='number').corr(method='pearson')

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
bill_length_mm,1.0,-0.235053,0.656181,0.59511
bill_depth_mm,-0.235053,1.0,-0.583851,-0.471916
flipper_length_mm,0.656181,-0.583851,1.0,0.871202
body_mass_g,0.59511,-0.471916,0.871202,1.0


In [39]:
# DataFrame.dropna(axis=0/1, how='any'/'all', subset=[col1, col2, ...], inplace=True/False)
#
#   axis    : 0 or 'index'   -> drop rows that contain NaN
#             1 or 'columns' -> drop columns that contain NaN
#   how     : 'any' -> drop the row/column if it has at least one NaN
#             'all' -> drop the row/column only if every value is NaN
#   inplace : True  -> modify the DataFrame itself
#             False -> leave it untouched and return a new DataFrame
#   subset  : omitted -> every column is checked for NaN
#             given   -> only the listed columns are checked

ALPHA = 0.05  # significance level for the correlation test

# Every column is checked here, so a row with NaN in any column is removed.
# NOTE(review): this also drops rows that are only missing `sex`, so the
# sample used below differs from the pairwise-complete one behind
# pg.corr() -- confirm that this is intended.
pg2 = pg.dropna(axis='index', how='any')

# pearsonr returns the correlation coefficient and the two-sided p-value
# for the null hypothesis of no correlation.
r2, pval = pearsonr(pg2.bill_length_mm, pg2.bill_depth_mm)

print("상관계수 : ", r2)
print("P-Value : ", pval)

# H0 is kept when the p-value exceeds the significance level, rejected otherwise.
print("H0 Accept" if pval > ALPHA else "H0 Reject")

상관계수 :  -0.2286256359130291
P-Value :  2.5282897209444827e-05
H0 Reject


In [40]:
# Q. In 'penguins', find the regression coefficients of 'bill_length' regressed on 'bill_depth'

import statsmodels.api as sm

# Design matrix: an intercept column ('const') in front of the bill_depth_mm predictor.
x = sm.add_constant(pg2['bill_depth_mm'], prepend=True)

# Ordinary least squares with bill_length_mm as the response; sm.OLS takes (y, X).
ols_model = sm.OLS(endog=pg2['bill_length_mm'], exog=x)
pg_fit = ols_model.fit()

# .params holds the estimated regression coefficients (intercept and slope).
p = pg_fit.params
print(p)

const            54.890854
bill_depth_mm    -0.634905
dtype: float64


In [41]:
# Significance tests for the intercept (alpha) and the slope (beta) at the 0.05 level.
# t_test takes a contrast vector over (const, bill_depth_mm) and returns the
# test result for that linear combination of the coefficients.
alpha_test = pg_fit.t_test([1, 0])
beta_test = pg_fit.t_test([0, 1])

# Two blank lines separate the two result tables.
print("Alpha: ", alpha_test, end="\n\n\n")
print("Beta: ", beta_test)

Alpha:                               Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0            54.8909      2.567     21.380      0.000      49.840      59.941


Beta:                               Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0            -0.6349      0.149     -4.273      0.000      -0.927      -0.343


In [42]:
# Predictions for the design matrix x; summary_frame adds the standard error
# plus 95% intervals for the mean and for individual observations.
ypred = pg_fit.get_prediction(exog=x)
pred_frame = ypred.summary_frame(alpha=0.05)
result = pred_frame.round(decimals=4)
result.head(n=5)

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,43.0181,0.3707,42.2889,43.7473,32.5042,53.5321
1,43.8435,0.2943,43.2646,44.4224,33.3389,54.3481
2,43.4626,0.3174,42.8381,44.087,32.9554,53.9697
4,42.6372,0.4313,41.7887,43.4857,32.1143,53.1601
5,41.8118,0.5882,40.6548,42.9688,31.2596,52.364


In [43]:
# Scatter plot of bill length against bill depth with the fitted regression
# line and its 95% confidence band.
# TODO: look into show() for this plot.
sns.lmplot(data=pg2, x='bill_depth_mm', y='bill_length_mm', ci=95)

<seaborn.axisgrid.FacetGrid at 0x232f55eb3a0>

In [44]:
# Full OLS report: fit statistics, coefficient table with 95% intervals,
# and residual diagnostics.
pg_fit.summary(alpha=0.05)

0,1,2,3
Dep. Variable:,bill_length_mm,R-squared:,0.052
Model:,OLS,Adj. R-squared:,0.049
Method:,Least Squares,F-statistic:,18.26
Date:,"Tue, 13 Dec 2022",Prob (F-statistic):,2.53e-05
Time:,19:30:41,Log-Likelihood:,-1028.8
No. Observations:,333,AIC:,2062.0
Df Residuals:,331,BIC:,2069.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,54.8909,2.567,21.380,0.000,49.840,59.941
bill_depth_mm,-0.6349,0.149,-4.273,0.000,-0.927,-0.343

0,1,2,3
Omnibus:,8.999,Durbin-Watson:,1.148
Prob(Omnibus):,0.011,Jarque-Bera (JB):,6.946
Skew:,0.251,Prob(JB):,0.031
Kurtosis:,2.501,Cond. No.,152.0
