# 상관분석

## 01 상관 분석 개요

### 다양한 상관 분석

- 두 변수의 선형관계를 확인하기 위해서 상관분석을 실시
- 두 수치형 변수의 비교는 Pearson's Correlation coefficient를 확인하며 그 외 다양한 상관계수 존재
- 상관계수가 0에 가까울수록 선형관계가 약하며 절대값이 1에 가까울수록 선형관계가 강함

## 02 주요 함수 및 메서드 소개

### pandas - corr()

- pandas에서 상관분석을 실시(상관계수를 계산)하는 데이터프레임 전용 메서드
- method 인자에 "pearson", "kendall", "spearman" 중 하나를 지정하면 해당 상관계수를 계산 (기본값은 "pearson")

### scipy - pearsonr()

- Pearson 상관분석을 실시하는 scipy의 함수
- 입력은 두 일차원 벡터를 넣고 출력은 상관계수와 p-value가 차례대로 출력

### scipy - spearmanr()

- Spearman 상관분석을 실시하는 scipy의 함수
- 입력은 두 일차원 벡터를 넣고 출력은 상관계수와 p-value가 차례대로 출력

### scipy - kendalltau()

- Kendall 상관분석을 실시하는 scipy의 함수
- 입력은 두 일차원 벡터를 넣고 출력은 상관계수와 p-value가 차례대로 출력

In [1]:
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kendalltau

In [4]:
# Load the bike-sharing practice dataset and preview its last five rows.
bike_csv_path = "강의자료/실습파일/bike.csv"
df = pd.read_csv(bike_csv_path)
df.tail(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.91,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129
10885,2012-12-19 23:00:00,4,0,1,1,13.12,16.665,66,8.9981,4,84,88


In [5]:
# Pairwise Pearson correlation of every numeric column.
# `numeric_only=True` explicitly skips the string-typed `datetime` column;
# without it, pandas >= 2.0 tries to convert that column to float and raises.
df.corr(numeric_only=True)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
season,1.0,0.029368,-0.008126,0.008879,0.258689,0.264744,0.19061,-0.147121,0.096758,0.164011,0.163439
holiday,0.029368,1.0,-0.250491,-0.007074,0.000295,-0.005215,0.001929,0.008409,0.043799,-0.020956,-0.005393
workingday,-0.008126,-0.250491,1.0,0.033772,0.029966,0.02466,-0.01088,0.013373,-0.319111,0.11946,0.011594
weather,0.008879,-0.007074,0.033772,1.0,-0.055035,-0.055376,0.406244,0.007261,-0.135918,-0.10934,-0.128655
temp,0.258689,0.000295,0.029966,-0.055035,1.0,0.984948,-0.064949,-0.017852,0.467097,0.318571,0.394454
atemp,0.264744,-0.005215,0.02466,-0.055376,0.984948,1.0,-0.043536,-0.057473,0.462067,0.314635,0.389784
humidity,0.19061,0.001929,-0.01088,0.406244,-0.064949,-0.043536,1.0,-0.318607,-0.348187,-0.265458,-0.317371
windspeed,-0.147121,0.008409,0.013373,0.007261,-0.017852,-0.057473,-0.318607,1.0,0.092276,0.091052,0.101369
casual,0.096758,0.043799,-0.319111,-0.135918,0.467097,0.462067,-0.348187,0.092276,1.0,0.49725,0.690414
registered,0.164011,-0.020956,0.11946,-0.10934,0.318571,0.314635,-0.265458,0.091052,0.49725,1.0,0.970948


In [12]:
# Pearson correlation matrix restricted to the two rental-count columns.
rental_cols = ["casual", "registered"]
df[rental_cols].corr()

Unnamed: 0,casual,registered
casual,1.0,0.49725
registered,0.49725,1.0


In [10]:
# Pearson correlation test: yields the coefficient together with its p-value.
casual_counts = df["casual"]
registered_counts = df["registered"]
pearsonr(casual_counts, registered_counts)

PearsonRResult(statistic=0.49724968508700795, pvalue=0.0)

In [13]:
# Spearman rank correlation matrix for the two rental-count columns.
rental_counts = df[["casual", "registered"]]
rental_counts.corr(method="spearman")

Unnamed: 0,casual,registered
casual,1.0,0.775785
registered,0.775785,1.0


In [11]:
# Spearman rank correlation test: coefficient followed by its p-value.
casual_counts, registered_counts = df["casual"], df["registered"]
spearmanr(casual_counts, registered_counts)

SpearmanrResult(correlation=0.775785305888168, pvalue=0.0)

In [14]:
# Kendall tau correlation matrix for the two rental-count columns.
rental_counts = df[["casual", "registered"]]
rental_counts.corr(method="kendall")

Unnamed: 0,casual,registered
casual,1.0,0.582213
registered,0.582213,1.0


In [15]:
# Kendall tau correlation test: coefficient followed by its p-value.
casual_counts, registered_counts = df["casual"], df["registered"]
kendalltau(casual_counts, registered_counts)

KendalltauResult(correlation=0.5822127999867908, pvalue=0.0)

## Q1 기온, 체감온도, 상대습도, 총 자전거 대여 숫자의 상관관계를 분석하였을 때 가장 낮은 상관계수는 얼마인가?
1) 데이터 속성에 맞는 적절한 상관분석 기법 사용  
2) 자전거 대여 숫자는 casual 변수를 사용

In [16]:
# Reload the bike-sharing dataset for question 1 and preview the last five rows.
q1_csv_path = "강의자료/실습파일/bike.csv"
Q1 = pd.read_csv(q1_csv_path)
Q1.tail(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.91,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129
10885,2012-12-19 23:00:00,4,0,1,1,13.12,16.665,66,8.9981,4,84,88


In [20]:
# Pearson correlation matrix of the Q1 variables, rounded to three decimals.
q1_columns = ["temp", "atemp", "humidity", "casual"]
q1_corr = Q1[q1_columns].corr()
q1_corr.round(3)

Unnamed: 0,temp,atemp,humidity,casual
temp,1.0,0.985,-0.065,0.467
atemp,0.985,1.0,-0.044,0.462
humidity,-0.065,-0.044,1.0,-0.348
casual,0.467,0.462,-0.348,1.0


## Q2 계절별로 체감온도와 자전거 대여 숫자의 상관관계를 알아보고자 한다. 이 때 적절한 상관분석기법을 사용하였을 때 상관계수로 옳은 것은?

In [22]:
# Split the data into one frame per season code
# (1 -> Spring, 2 -> Summer, 3 -> Fall, 4 -> Winter).
Q2_Spring, Q2_Summer, Q2_Fall, Q2_Winter = (
    Q1[Q1["season"] == season_code] for season_code in (1, 2, 3, 4)
)

In [23]:
# Pearson correlation between apparent temperature and casual rentals in spring.
spring_pair = Q2_Spring[["atemp", "casual"]]
spring_pair.corr()

Unnamed: 0,atemp,casual
atemp,1.0,0.478312
casual,0.478312,1.0


In [24]:
# Pearson correlation between apparent temperature and casual rentals in summer.
summer_pair = Q2_Summer[["atemp", "casual"]]
summer_pair.corr()

Unnamed: 0,atemp,casual
atemp,1.0,0.378122
casual,0.378122,1.0


In [25]:
# Pearson correlation between apparent temperature and casual rentals in fall.
fall_pair = Q2_Fall[["atemp", "casual"]]
fall_pair.corr()

Unnamed: 0,atemp,casual
atemp,1.0,0.381423
casual,0.381423,1.0


In [26]:
# Pearson correlation between apparent temperature and casual rentals in winter.
winter_pair = Q2_Winter[["atemp", "casual"]]
winter_pair.corr()

Unnamed: 0,atemp,casual
atemp,1.0,0.443751
casual,0.443751,1.0


In [32]:
## Answer: per-season Pearson correlation between atemp and casual in one pass.
season_subset = Q1[["season", "atemp", "casual"]]
Q2_corr = season_subset.groupby("season").corr().reset_index()
# Keep only the rows whose atemp entry is below 1, i.e. drop the
# atemp-vs-atemp diagonal and leave the atemp-vs-casual coefficient per season.
Q2_corr[Q2_corr["atemp"] < 1]

Unnamed: 0,season,level_1,atemp,casual
1,1,casual,0.478312,1.0
3,2,casual,0.378122,1.0
5,3,casual,0.381423,1.0
7,4,casual,0.443751,1.0


## Q3 날씨에 따른 기온과 자전거 대여의 상관계수 변화를 알아보고자 한다. 날씨가 맑은 날과 그렇지 않은 날의 상관계수 차이의 절대값은 얼마인가?
1) weather 변수의 값이 1인 것이 맑은 날

In [33]:
# Reload the bike-sharing dataset for question 3 and preview the last five rows.
q3_csv_path = "강의자료/실습파일/bike.csv"
Q3 = pd.read_csv(q3_csv_path)
Q3.tail(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.91,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129
10885,2012-12-19 23:00:00,4,0,1,1,13.12,16.665,66,8.9981,4,84,88


In [38]:
# Partition observations by weather: code 1 is a clear day, anything else is not.
is_clear = Q3["weather"] == 1
Q3_clean = Q3[is_clear]
Q3_other = Q3[~is_clear]

In [41]:
# Pearson test on clear days; unpack the coefficient and its p-value.
clear_result = pearsonr(Q3_clean["temp"], Q3_clean["casual"])
stat1, p1 = clear_result

In [42]:
# Pearson test on non-clear days; unpack the coefficient and its p-value.
other_result = pearsonr(Q3_other["temp"], Q3_other["casual"])
stat2, p2 = other_result

In [43]:
# The question asks for the absolute value of the difference between the two
# coefficients, so take abs() rather than relying on stat1 being the larger one.
abs(stat1 - stat2)

0.0246916461938268