In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import koreanize_matplotlib

# 카카오 맵 API 불러오기 위해
import requests
import json

# 상관분석
from sklearn import preprocessing
from scipy.stats import chi2_contingency

## 데이터셋 불러오기

In [2]:
yangcheon_c = pd.read_csv('data/서울특별시_양천구_흡연시설 현황_20220801.csv')
print(yangcheon_c.shape)
yangcheon_c.head()

(23, 8)


Unnamed: 0,자치구,시설 구분,시설형태,설치 위치,규모(제곱미터),설치기관,데이터기준일자,주소
0,양천구,청사,완전개방형,양천구청 부지,8.0,양천구청,2022-08-01,서울특별시 양천구 목동동로 105
1,양천구,청사,완전개방형,해누리타운 4층 옥외정원,8.0,양천구청,2022-08-01,서울특별시 양천구 목동동로 81
2,양천구,청사,완전개방형,양천경찰서 부지,10.0,양천경찰서,2022-08-01,서울특별시 양천구 목동동로 99
3,양천구,청사,개방형,남부지방법원 후문 옆,17.0,남부지방법원,2022-08-01,서울특별시 양천구 신월로 386
4,양천구,청사,개방형,양천세무서 부지 우측,10.0,양천세무서,2022-08-01,서울특별시 양천구 목동동로 165


In [3]:
yangcheon_cbb = pd.read_csv('data/쓰레기통_위경도/양천구_담배꽁초쓰레기통_위경도.csv')
print(yangcheon_cbb.shape)
yangcheon_cbb.head()

(12, 6)


Unnamed: 0,address_name,경도,위도,도로명주소,설치장소,시군구명
0,서울 양천구 목동동로 293,126.875772,37.528178,서울시 양천구 목동동로 293,현대41타워앞 인도,양천구
1,서울 양천구 목동서로 57,126.879863,37.536632,서울시 양천구 목동서로 57,파리지앙A앞 공용주차장,양천구
2,서울 양천구 오목로 232,126.863206,37.525764,서울시 양천구 오목로 232,보성상가옆 먹자골목,양천구
3,서울 양천구 중앙로 276,126.852884,37.520836,서울시 양천구 중앙로 276,신정네거리역 뒤 먹자골목,양천구
4,서울 양천구 오목로 325,126.873627,37.525107,서울시 양천구 오목로 325,농협 옆 골목,양천구


In [4]:
yangcheon_c = yangcheon_c[['주소']]

In [5]:
yangcheon_cbb = yangcheon_cbb[['도로명주소']]

### mapo_c(흡연구역) 대로명 뽑기

In [7]:
a = yangcheon_c['주소'].apply(lambda x:x.split(" "))

for i in range(len(a)):
    a[i] = a[i][2]

yangcheon_c['주소'] = a

b = yangcheon_c['주소'].str.split('로')

for i in range(len(b)):
    b[i] = b[i][0] + '로'

yangcheon_c['주소'] = b

### mapo_cbb(쓰레기통) 대로명 뽑기

In [8]:
a = yangcheon_cbb['도로명주소'].apply(lambda x:x.split(" "))

for i in range(len(a)):
    a[i] = a[i][2]

yangcheon_cbb['도로명주소']=a

b = yangcheon_cbb['도로명주소'].str.split('로')

for i in range(len(b)):
    b[i] = b[i][0] + '로'

yangcheon_cbb['도로명주소'] = b

### concat

In [9]:
yangcheon_m = pd.concat([yangcheon_c, yangcheon_cbb], axis=1)
yangcheon_m

Unnamed: 0,주소,도로명주소
0,목동동로,목동동로
1,목동동로,양천구로
2,목동동로,양천구로
3,신월로,중앙로
4,목동동로,오목로
5,목동동로,오목로
6,지양로,양천구로
7,안양천로,양천구로
8,목동로,신정중앙로
9,신정이펜1로,오목로


In [10]:
yangcheon_m.columns = ['흡연구역', '쓰레기통']

## 검정

In [11]:
label = preprocessing.LabelEncoder()
data_encoded = pd.DataFrame()

for i in yangcheon_m.columns:
    data_encoded[i] = label.fit_transform(yangcheon_m[i])
    
data_encoded.head()

Unnamed: 0,흡연구역,쓰레기통
0,1,0
1,1,2
2,1,2
3,4,4
4,1,3


### 쓰레기통, 흡연구역 상관

In [12]:
df_crosstab = pd.crosstab(data_encoded['쓰레기통'], data_encoded['흡연구역'])
df_crosstab

흡연구역,0,1,2,3,4,5,6,7,8,9
쓰레기통,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0
2,0,4,0,0,0,0,0,1,0,1
3,0,2,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0,0,0
5,1,3,0,3,0,2,0,1,1,0


In [13]:
rows = [row.to_list() for i, row in df_crosstab.iterrows()]
rows

[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 4, 0, 0, 0, 0, 0, 1, 0, 1],
 [0, 2, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [1, 3, 0, 3, 0, 2, 0, 1, 1, 0]]

In [14]:
# 쓰레기통, 흡연구역 상관 없음
chi2_contingency(rows, correction=False)

(65.48030303030305,
 0.02466437384267831,
 45,
 array([[0.04347826, 0.43478261, 0.04347826, 0.13043478, 0.04347826,
         0.08695652, 0.04347826, 0.08695652, 0.04347826, 0.04347826],
        [0.04347826, 0.43478261, 0.04347826, 0.13043478, 0.04347826,
         0.08695652, 0.04347826, 0.08695652, 0.04347826, 0.04347826],
        [0.26086957, 2.60869565, 0.26086957, 0.7826087 , 0.26086957,
         0.52173913, 0.26086957, 0.52173913, 0.26086957, 0.26086957],
        [0.13043478, 1.30434783, 0.13043478, 0.39130435, 0.13043478,
         0.26086957, 0.13043478, 0.26086957, 0.13043478, 0.13043478],
        [0.04347826, 0.43478261, 0.04347826, 0.13043478, 0.04347826,
         0.08695652, 0.04347826, 0.08695652, 0.04347826, 0.04347826],
        [0.47826087, 4.7826087 , 0.47826087, 1.43478261, 0.47826087,
         0.95652174, 0.47826087, 0.95652174, 0.47826087, 0.47826087]]))

In [15]:
x2 = chi2_contingency(rows, correction=False)[0]
n = np.sum(rows)
minDimension = min(np.array(rows).shape)-1

V = np.sqrt((x2/n) / minDimension)

print(V)

0.7545819633372769


### 전체 상관 df

In [16]:
# 함수 정의 및 혼동행렬 생성
def cramers_V(var1, var2):
    crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))
    stat = chi2_contingency(crosstab)[0]
    obs = np.sum(crosstab)
    phi2 = stat / obs
    r, k = crosstab.shape
    phi2corr = max(0, phi2 - (((k-1)*(r-1))/(obs-1)))
    rcorr = r - ((r-1)**2)/(obs-1)
    kcorr = k - ((k-1)**2)/(obs-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

In [17]:
rows = []

for var1 in data_encoded:
    col = []
    for var2 in data_encoded:
        cramers = cramers_V(data_encoded[var1], data_encoded[var2])
        col.append(round(cramers, 2))
    rows.append(col)

In [18]:
cramers_results = np.array(rows)
cramers_results

array([[1.  , 0.46],
       [0.46, 1.  ]])

In [19]:
df = pd.DataFrame(cramers_results, columns = data_encoded.columns, index = data_encoded.columns)
df

Unnamed: 0,흡연구역,쓰레기통
흡연구역,1.0,0.46
쓰레기통,0.46,1.0
