# <center> 군집별로 알아보는 코로나의 영향 </center>

## __[목차]__

### 1. 사용데이터 정의
    1-1. 내부데이터
    1-2. 외부데이터
    1-3. 군집화데이터

### 2. 데이터 분석 및 시각화 방법

    2-1. 군집화 방법
    2-2. 변화추이 계산 방법
    2-3. 

### 3. 인사이트 분석

    3-1. 해석에 필요한 주요 이슈들
    3-2. 군집별 시각화 해석
    3-3. 인사이트 정리

### 4. 한계 및 추가 연구 내용

    4-1. 시각화의 한계점
    4-2. 추가 연구 내용

<img src="http://scimonitors.com/wp-content/uploads/2020/03/banner.png"  align = "center" > 

In [23]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# 1. 사용데이터 정의

In [2]:
# directory 정의
BASE_DIR = '../data/'
INTERNAL_DIR = 'internal_data'
EXTERNAL_DIR = 'external_data'
MERGE_DATA = 'merge_data'

In [3]:
category_dict = {'버스': '대중교통',
 '지하철': '대중교통',
 '택시': '대중교통',
 '국산신차': '교통',
 '렌트카': '교통',
 '전기차충전소': '교통',
 '주차장': '교통',
 '주유소': '교통',
 '자동차시트/타이어': '교통',
 '카인테리어': '교통',
 '중고자동차': '교통',
 '이륜차판매': '교통',
 '수입자동차': '교통',
 '캐쥬얼의류': '의류',
 '정장': '의류',
 '기타의류': '의류',
 '신발': '의류',
 '스포츠의류': '의류',
 '내의판매점': '의류',
 '맞춤복점': '의류',
 '와이셔츠/타이': '의류',
 '인터넷Mall': '온라인',
 '인터넷P/G': '온라인',
 '인터넷종합Mall': '온라인',
 '제과점': '음식점',
 '중국음식': '음식점',
 '서양읍식': '음식점',
 '스넥': '음식점',
 '일반한식': '음식점',
 '일식회집': '음식점',
 '한정식': '음식점',
 '기타건강식': '음식점',
 '당구장': '실내활동',
 '볼링장': '실내활동',
 '스크린골프': '실내활동',
 '수영장': '실내활동',
 '영화관': '실내활동',
 '카지노': '실내활동',
 '노래방': '실내활동',
 '수족관': '실내활동',
 '레져업소(회원제형태)': '실내활동',
 '헬스크럽': '실내활동',
 '여객선': '관광',
 '관광여행': '관광',
 '항공사': '관광',
 '2급 호텔': '관광',
 '1급호텔': '관광',
 '콘도': '관광',
 '기타숙박업': '관광',
 '종합레져타운': '관광',
 '한의원': '의료',
 '종합병원': '의료',
 '한약방': '의료',
 '의료용품': '의료',
 '치과의원': '의료',
 '치과병원': '의료',
 '병원': '의료',
 '한방병원': '의료',
 '약국': '의료',
 '기타의료기관 및 기타의료기기': '의료',
 '칵테일바': '유흥주점',
 '유흥주점': '유흥주점',
 '단란주점': '유흥주점',
 '테니스장': '실외활동',
 '골프연습장': '실외활동',
 '기타교육': '교육',
 '유치원': '교육',
 '기능학원': '교육',
 '보습학원': '교육',
 '학원(회원제형태)': '교육',
 '예체능학원': '교육',
 '외국어학원': '교육',
 '비료/농약/사료/종자': '농업',
 '기타농업관련': '농업',
 '농기계': '농업'}

### 1-1. 내부데이터 (Dacon 제공 데이터)
1. card.csv : 카드 소비 내역<br>
2. COVID_19/Time.csv : 코로나 확진자 수<br>

In [4]:
COVID_DIR = 'COVID_19'
CARD_FILENAME = 'card_20200717.csv'
COVID_FILENAME = 'Time.csv'
CATEGORISING_FILENAME = 'dacon_categorising.xlsx'

In [5]:
def make_datetime(x):
    x = str(x)
    year = x[:4]
    month = x[4:6]
    day = x[6:]
    date = f'{year}-{month}-{day}'
    return date

In [6]:
card_df = pd.read_csv(os.path.join(BASE_DIR, INTERNAL_DIR, CARD_FILENAME))
card_df['mrhst_induty_cl_nm'] = card_df['mrhst_induty_cl_nm'].str.replace(pat = ' ', repl = '')
card_df.shape

(3713952, 7)

In [7]:
object_df = card_df[card_df['mrhst_induty_cl_nm'].isin(category_dict)]
object_df['대분류'] = object_df['mrhst_induty_cl_nm'].replace(category_dict)
object_df.shape

(1348866, 8)

In [8]:
object_df = object_df.groupby(['receipt_dttm', '대분류', 'mrhst_induty_cl_nm'], as_index=False)['selng_cascnt'].sum()
object_df.columns = ['datetime', '대분류', '중분류', 'sales count']
object_df = object_df.reset_index(drop = True)
object_df['datetime'] = object_df['datetime'].map(make_datetime)
object_df.head()

Unnamed: 0,datetime,대분류,중분류,sales count
0,2020-01-04,관광,1급호텔,38
1,2020-01-04,관광,관광여행,1063
2,2020-01-04,관광,기타숙박업,3738
3,2020-01-04,관광,항공사,4800
4,2020-01-04,교육,기능학원,36


In [9]:
covid_df = pd.read_csv(os.path.join(BASE_DIR, INTERNAL_DIR,COVID_DIR, COVID_FILENAME))
covid_df.shape

(163, 7)

In [10]:
covid_df = covid_df[['date', 'confirmed']]

In [11]:
covid_df.columns = ['datetime', 'covid_confirmed']

In [12]:
internal_df = pd.merge(object_df, covid_df, on = 'datetime', how='left').sort_values('datetime').reset_index(drop = True)
internal_df = internal_df.rename(columns={'datetime':'사용일자', 'sales count':'중분류_카운트', 'covid_confirmed':'확진자수'})

In [13]:
check_df = internal_df.groupby(['중분류']).count()
check_df[check_df['사용일자']<50]

Unnamed: 0_level_0,사용일자,대분류,중분류_카운트,확진자수
중분류,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
카지노,3,3,3,3


In [14]:
internal_df = internal_df[internal_df['중분류']!="카지노"]

In [15]:
internal_df['확진자수'] = internal_df['확진자수'].fillna(0)

In [16]:
internal_df.head()

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트,확진자수
0,2020-01-04,관광,1급호텔,38,0.0
1,2020-01-04,유흥주점,단란주점,968,0.0
2,2020-01-04,유흥주점,유흥주점,3141,0.0
3,2020-01-04,유흥주점,칵테일바,986,0.0
4,2020-01-04,음식점,기타건강식,62,0.0


### 1-2. 외부 데이터 (서울 열린데이터 광장, SEOUL TOPIS)
1. bus/data/*.csv : 버스 이용 데이터<br>
2. subway/data/*.csv : 지하철 이용 데이터 <br>
3. traffic/data/*.xlsx : 교통량 이용 데이터 <br>

In [17]:
BUS_DIR = 'bus'
BUS_FILENAME = 'bus_df.csv'
SUBWAY_DIR = 'subway'
SUBWAY_FILENAME = 'subway_df.csv'
TRAFFIC_DIR = 'traffic'
TRAFFIC_FILENAME = 'traffic_df.csv'

In [18]:
bus_df = pd.read_csv(os.path.join(BASE_DIR, EXTERNAL_DIR, BUS_DIR, BUS_FILENAME), index_col=False)
subway_df = pd.read_csv(os.path.join(BASE_DIR, EXTERNAL_DIR, SUBWAY_DIR, SUBWAY_FILENAME), index_col=False)
traffic_df = pd.read_csv(os.path.join(BASE_DIR, EXTERNAL_DIR, TRAFFIC_DIR, TRAFFIC_FILENAME), index_col=False)

In [19]:
external_df = bus_df.append(subway_df)
external_df = external_df.append(traffic_df)
external_df.head()

Unnamed: 0,사용일자,중분류_카운트,중분류
0,2020-01-01,2509091,버스
1,2020-01-02,5286628,버스
2,2020-01-03,5580078,버스
3,2020-01-04,4036695,버스
4,2020-01-05,3138649,버스


### 1-3. 최종 데이터 (내부데이터 + 외부데이터)

- Internal + External = Result<br>

In [20]:
covid_confirmation_df = internal_df[['사용일자', '확진자수']].drop_duplicates()
covid_confirmation_df.head()

Unnamed: 0,사용일자,확진자수
0,2020-01-04,0.0
49,2020-01-05,0.0
96,2020-01-06,0.0
142,2020-01-07,0.0
191,2020-01-08,0.0


In [21]:
internal_df = internal_df.iloc[:, :-1]

In [24]:
external_df['대분류'] = np.nan

In [25]:
external_df.loc[external_df['중분류'] == '교통량', ['대분류']] = '교통'
external_df.loc[external_df['중분류'] != '교통량', ['대분류']] = '대중교통'

In [26]:
concated_df = pd.concat([internal_df, external_df], axis = 0)
concated_df

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트
0,2020-01-04,관광,1급호텔,38
1,2020-01-04,유흥주점,단란주점,968
2,2020-01-04,유흥주점,유흥주점,3141
3,2020-01-04,유흥주점,칵테일바,986
4,2020-01-04,음식점,기타건강식,62
...,...,...,...,...
177,2020-06-26,교통,교통량,9568914
178,2020-06-27,교통,교통량,8964257
179,2020-06-28,교통,교통량,7635665
180,2020-06-29,교통,교통량,9052687


In [27]:
median_v_series = concated_df.groupby(['대분류', '중분류'], as_index = False)['중분류_카운트'].median()
median_v_series.rename(columns = {'중분류_카운트' : '중분류_대표값'}, inplace = True)

In [28]:
combined_median_df = pd.merge(concated_df, median_v_series, on = ['대분류', '중분류'])
combined_median_df.head(2)

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트,중분류_대표값
0,2020-01-04,관광,1급호텔,38,338.0
1,2020-01-06,관광,1급호텔,19,338.0


In [29]:
combined_median_df['증가추이'] = ((combined_median_df['중분류_카운트']-combined_median_df['중분류_대표값']) / combined_median_df['중분류_대표값']) * 100

In [30]:
combined_median_df.loc[(combined_median_df['중분류_대표값'] == 0) & (combined_median_df['중분류_카운트'] == 0), ['증가추이']] = 0

In [31]:
combined_median_df[combined_median_df['중분류_대표값'] == 0]

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트,중분류_대표값,증가추이


In [32]:
result_df = combined_median_df.merge(covid_confirmation_df, on='사용일자')

In [33]:
result_df.head()

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트,중분류_대표값,증가추이,확진자수
0,2020-01-04,관광,1급호텔,38,338.0,-88.757396,0.0
1,2020-01-04,유흥주점,단란주점,968,1195.0,-18.995816,0.0
2,2020-01-04,유흥주점,유흥주점,3141,1581.0,98.671727,0.0
3,2020-01-04,유흥주점,칵테일바,986,1505.0,-34.48505,0.0
4,2020-01-04,음식점,기타건강식,62,650.0,-90.461538,0.0
