In [194]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings(action = 'ignore')

In [276]:
COVID_DIR = 'COVID_19'
CARD_FILENAME = 'card_20200717.csv'

In [288]:
def make_datetime(x):
    x = str(x)
    yr = x[:4]
    month = x[4:6]
    day = x[6:]
    date = f'{yr}-{month}-{day}'
    return date

## FILES

In [10]:
categorising_file = [file for file in os.listdir() if file.endswith('xlsx')][0]

In [123]:
card_df = pd.read_csv(CARD_FILENAME)
standard_df = pd.read_excel(categorising_file, na_values='nan').iloc[:12, :]
standard_df = pd.DataFrame(np.vstack([standard_df.columns, standard_df])).T
card_df.shape, standard_df.shape

((3713952, 7), (12, 13))

### Standard Categories

In [94]:
division_first = standard_df.T.iloc[0]
division_first

0     대중교통
1       교통
2       의류
3      온라인
4      음식점
5     실내활동
6       관광
7       의료
8     유흥주점
9     실외활동
10      교육
11      농업
Name: 0, dtype: object

In [138]:
division_second = (standard_df.iloc[:, 1:]).values
division_second = list(map(lambda x: [lmn for lmn in x if str(lmn) != 'nan'] , division_second))
division_second[:2]

[['버스', '지하철', '택시'],
 ['국산신차',
  '렌트카',
  '전기차충전소',
  '주차장',
  '주유소',
  '자동차시트/타이어',
  '카인테리어',
  '중고자동차',
  '이륜차판매',
  '수입자동차']]

In [182]:
division_combine_dict = {lmn:key for key, lmns in zip(division_first, division_second) for lmn in lmns}
division_combine_dict

{'버스': '대중교통',
 '지하철': '대중교통',
 '택시': '대중교통',
 '국산신차': '교통',
 '렌트카': '교통',
 '전기차충전소': '교통',
 '주차장': '교통',
 '주유소': '교통',
 '자동차시트/타이어': '교통',
 '카인테리어': '교통',
 '중고자동차': '교통',
 '이륜차판매': '교통',
 '수입자동차': '교통',
 '캐쥬얼의류': '의류',
 '정장': '의류',
 '기타의류': '의류',
 '신발': '의류',
 '스포츠의류': '의류',
 '내의판매점': '의류',
 '맞춤복점': '의류',
 '와이셔츠/타이': '의류',
 '인터넷Mall': '온라인',
 '인터넷P/G': '온라인',
 '인터넷종합Mall': '온라인',
 '제과점': '음식점',
 '중국음식': '음식점',
 '서양읍식': '음식점',
 '스넥': '음식점',
 '일반한식': '음식점',
 '일식회집': '음식점',
 '한정식': '음식점',
 '기타건강식': '음식점',
 '당구장': '실내활동',
 '볼링장': '실내활동',
 '스크린골프': '실내활동',
 '수영장': '실내활동',
 '영화관': '실내활동',
 '카지노': '실내활동',
 '노래방': '실내활동',
 '수족관': '실내활동',
 '레져업소(회원제형태)': '실내활동',
 '헬스크럽': '실내활동',
 '여객선': '관광',
 '관광여행': '관광',
 '항공사': '관광',
 '2급 호텔': '관광',
 '1급호텔': '관광',
 '콘도': '관광',
 '기타숙박업': '관광',
 '종합레져타운': '관광',
 '한의원': '의료',
 '종합병원': '의료',
 '한약방': '의료',
 '의료용품': '의료',
 '치과의원': '의료',
 '치과병원': '의료',
 '병원': '의료',
 '한방병원': '의료',
 '약국': '의료',
 '기타의료기관 및 기타의료기기': '의료',
 '칵테일바': '유흥주점',
 '유흥주점': '유흥주점',
 '단란주점': '유

## Card DATA

In [290]:
card_df['receipt_dttm'] = card_df['receipt_dttm'].map(make_datetime)

In [291]:
card_df['mrhst_induty_cl_nm'] = card_df['mrhst_induty_cl_nm'].str.replace(pat = ' ', repl = '')

In [292]:
extract_objects_list = division_combine_dict.keys()

In [293]:
object_df = card_df[card_df['mrhst_induty_cl_nm'].isin(extract_objects_list)]
object_df.reset_index(drop = True)

Unnamed: 0,receipt_dttm,adstrd_code,adstrd_nm,mrhst_induty_cl_code,mrhst_induty_cl_nm,selng_cascnt,salamt
0,2020-01-04,1174066000,성내3동,7041,약국,463,5843230
1,2020-01-04,1174066000,성내3동,7022,치과의원,33,7835550
2,2020-01-04,1174066000,성내3동,7021,한의원,53,4589800
3,2020-01-04,1174066000,성내3동,5199,기타교육,54,1446900
4,2020-01-04,1174066000,성내3동,5105,보습학원,12,3876000
...,...,...,...,...,...,...,...
1348861,2020-06-14,1174052000,상일동,3321,전기차충전소,1,5000
1348862,2020-06-14,1174052000,상일동,7112,의료용품,9,264500
1348863,2020-06-14,1174066000,성내3동,7112,의료용품,1,1200000
1348864,2020-06-14,1150053000,등촌2동,4205,내의판매점,3,2850000


In [387]:
object_df['div_first'] = object_df['mrhst_induty_cl_nm'].replace(division_combine_dict)
object_df.head()

Unnamed: 0,receipt_dttm,adstrd_code,adstrd_nm,mrhst_induty_cl_code,mrhst_induty_cl_nm,selng_cascnt,salamt,div_first
0,2020-01-04,1174066000,성내3동,7041,약국,463,5843230,의료
1,2020-01-04,1174066000,성내3동,7022,치과의원,33,7835550,의료
2,2020-01-04,1174066000,성내3동,7021,한의원,53,4589800,의료
5,2020-01-04,1174066000,성내3동,5199,기타교육,54,1446900,교육
6,2020-01-04,1174066000,성내3동,5105,보습학원,12,3876000,교육


In [400]:
object_df = object_df[['receipt_dttm',  'div_first', 'mrhst_induty_cl_nm', 'selng_cascnt']]
object_df = object_df.rename(columns = {'receipt_dttm' : 'datetime'})

In [None]:
# daily_cnt = object_df.groupby(['receipt_dttm', 'div_first', 'mrhst_induty_cl_nm'], as_index = False)['selng_cascnt'].sum()

In [373]:
# daily_median = daily_cnt.groupby(['div_first','mrhst_induty_cl_nm'],as_index =False).median()

In [380]:
# cnt_median_df = pd.merge(daily_cnt, daily_median, on = ['div_first', 'mrhst_induty_cl_nm'])
# cnt_median_df.columns = ['datetime', 'div_first', 'mrhst_induty_cl_nm', 'sales_cnt','daily_median']
# cnt_median_df.head(5)

Unnamed: 0,datetime,div_first,mrhst_induty_cl_nm,sales_cnt,daily_median
0,2020-01-04,관광,1급호텔,38,338.0
1,2020-01-06,관광,1급호텔,19,338.0
2,2020-01-07,관광,1급호텔,40,338.0
3,2020-01-10,관광,1급호텔,43,338.0
4,2020-01-11,관광,1급호텔,59,338.0


In [382]:
# cnt_median_df['pct_of_cnt'] = (cnt_median_df['sales_cnt'] / cnt_median_df['daily_median']) * 100
# cnt_median_df.head()

Unnamed: 0,datetime,div_first,mrhst_induty_cl_nm,sales_cnt,daily_median,pct_of_cnt
0,2020-01-04,관광,1급호텔,38,338.0,11.242604
1,2020-01-06,관광,1급호텔,19,338.0,5.621302
2,2020-01-07,관광,1급호텔,40,338.0,11.83432
3,2020-01-10,관광,1급호텔,43,338.0,12.721893
4,2020-01-11,관광,1급호텔,59,338.0,17.455621


## 코로나 확진자

In [392]:
covid_df = pd.read_csv(os.path.join(COVID_DIR, 'Time.csv'))
covid_df.shape

(163, 7)

In [393]:
covid_df.head(5)

Unnamed: 0,date,time,test,negative,confirmed,released,deceased
0,2020-01-20,16,1,0,1,0,0
1,2020-01-21,16,1,0,1,0,0
2,2020-01-22,16,4,3,1,0,0
3,2020-01-23,16,22,21,1,0,0
4,2020-01-24,16,27,25,2,0,0


In [394]:
covid_df = covid_df[['date', 'confirmed']].sort_values(by = 'date')
covid_df['date'].iloc[0], covid_df['date'].iloc[-1]

('2020-01-20', '2020-06-30')

In [401]:
covid_df.columns = ['datetime', 'covid_confirmed']

# Final Result

In [402]:
final = pd.merge(object_df, covid_df, on = 'datetime').sort_values('datetime').reset_index(drop = True)
final

Unnamed: 0,datetime,div_first,mrhst_induty_cl_nm,selng_cascnt,covid_confirmed
0,2020-01-20,음식점,제과점,50,1
1,2020-01-20,음식점,스넥,369,1
2,2020-01-20,음식점,중국음식,97,1
3,2020-01-20,음식점,일식회집,19,1
4,2020-01-20,음식점,일반한식,814,1
...,...,...,...,...,...
1290987,2020-06-14,의료,치과의원,8,12085
1290988,2020-06-14,음식점,제과점,179,12085
1290989,2020-06-14,의류,기타의류,23,12085
1290990,2020-06-14,음식점,제과점,145,12085


In [408]:
final.to_csv('internal_data.csv', index = False)