In [1]:
import pandas as pd
import numpy as np
import os
import itertools

In [2]:
# Input locations; all are relative paths, so they resolve against the
# notebook's working directory.
COVID_DIR = "COVID_19"                             # folder that holds Time.csv
CARD_FILENAME = "card_20200717.csv"                # card transaction records
CATEGORISING_FILENAME = "dacon_categorising.xlsx"  # major/sub-category sheet

## 선분류작업

In [3]:
# Load the category sheet, keep its first 14 rows and transpose it so that
# each row describes one major category: after reset_index() the 'index'
# column holds the sheet's column headers and columns 0..13 hold the cells
# that were listed beneath each header.
standard_df = (
    pd.read_excel(CATEGORISING_FILENAME)
    .iloc[:14]
    .T
    .reset_index()
)
standard_df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,대중교통,버스,지하철,택시,,,,,,,,,,,
1,교통,국산신차,렌트카,전기차충전소,주차장,주유소,자동차시트/타이어,카인테리어,중고자동차,이륜차판매,수입자동차,,,,
2,의류,캐쥬얼의류,정장,기타의류,신발,스포츠의류,내의판매점,맞춤복점,와이셔츠/타이,,,,,,
3,온라인,인터넷Mall,인터넷P/G,인터넷종합Mall,,,,,,,,,,,
4,음식점,제과점,중국음식,서양읍식,스넥,일반한식,일식회집,한정식,기타건강식,,,,,,


In [4]:
# First column of standard_df: the major category name of each row.
division_1st = standard_df.iloc[:, 0]
division_1st

0     대중교통
1       교통
2       의류
3      온라인
4      음식점
5     실내활동
6       관광
7       의료
8     유흥주점
9     실외활동
10      교육
11      농업
Name: index, dtype: object

In [5]:
# Sub-categories listed under each major category, one inner list per row of
# standard_df. Rows with fewer sub-categories are padded with missing values,
# which are dropped here. pd.notna() tests for missing cells directly instead
# of comparing their string form with 'nan'.
division_2nd = [
    [name for name in row if pd.notna(name)]
    for row in standard_df.iloc[:, 1:].values
]
# Flat list of every sub-category name, in sheet order.
div2list = list(itertools.chain.from_iterable(division_2nd))

In [6]:
# Reverse lookup: sub-category name -> the major category it is listed under.
# A sub-category listed under several major categories keeps the last one.
div2dict = dict(
    (minor, major)
    for major, minors in zip(division_1st, division_2nd)
    for minor in minors
)

# 카드 소비 내역

In [7]:
# Load the raw card transaction records.
card_df = pd.read_csv(CARD_FILENAME)
# Remove spaces from the industry names (presumably so they compare equal to
# the names in the category sheet). regex=False pins plain-text replacement:
# the default of `regex` differs between pandas versions, so leaving it
# implicit makes the result (and the warnings emitted) version dependent.
card_df['mrhst_induty_cl_nm'] = card_df['mrhst_induty_cl_nm'].str.replace(' ', '', regex=False)
card_df.shape

(3713952, 7)

In [8]:
# Keep only the transactions whose industry name is one of the sub-categories
# collected in div2list.
# NOTE(review): the category sheet preview lists '서양읍식', which looks like a
# misspelling of '서양음식'; if the card data uses the latter spelling, those
# rows are silently excluded by this filter — confirm against the sheet.
object_df = card_df.loc[
    card_df['mrhst_induty_cl_nm'].isin(div2list)
]
object_df.shape

(1348866, 7)

In [9]:
# Label every transaction with the major category of its industry name.
# object_df is a filtered selection of card_df, so assigning a column to it
# in place triggers pandas' SettingWithCopyWarning. assign() builds a new
# frame instead, which avoids the warning and keeps the cell safe to re-run.
object_df = object_df.assign(
    **{'대분류': object_df['mrhst_induty_cl_nm'].replace(div2dict)}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
# Inspect the labelled transactions (pandas abbreviates the display of a
# frame this large to its first and last rows).
object_df

Unnamed: 0,receipt_dttm,adstrd_code,adstrd_nm,mrhst_induty_cl_code,mrhst_induty_cl_nm,selng_cascnt,salamt,대분류
0,20200104,1174066000,성내3동,7041,약국,463,5843230,의료
1,20200104,1174066000,성내3동,7022,치과의원,33,7835550,의료
2,20200104,1174066000,성내3동,7021,한의원,53,4589800,의료
5,20200104,1174066000,성내3동,5199,기타교육,54,1446900,교육
6,20200104,1174066000,성내3동,5105,보습학원,12,3876000,교육
...,...,...,...,...,...,...,...,...
3713939,20200614,1174052000,상일동,3321,전기차충전소,1,5000,교통
3713940,20200614,1174052000,상일동,7112,의료용품,9,264500,의료
3713942,20200614,1174066000,성내3동,7112,의료용품,1,1200000,의료
3713946,20200614,1150053000,등촌2동,4205,내의판매점,3,2850000,의류


In [11]:
# Sum the transaction counts per (date, major category, sub-category) and
# give the resulting columns their reporting names; '대분류' keeps its name.
object_df = (
    object_df
    .groupby(['receipt_dttm', '대분류', 'mrhst_induty_cl_nm'], as_index=False)['selng_cascnt']
    .sum()
    .rename(columns={
        'receipt_dttm': 'datetime',
        'mrhst_induty_cl_nm': '중분류',
        'selng_cascnt': 'sales count',
    })
)

In [12]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
# NOTE(review): the preceding groupby is called with as_index=False, so the
# index is probably already a default range and this is likely a no-op — confirm.
object_df = object_df.reset_index(drop = True)

In [13]:
def make_datetime(x):
    """Format a compact ``YYYYMMDD`` value as a ``YYYY-MM-DD`` string.

    The value is converted with ``str`` and cut by position: characters 0-3
    become the year, 4-5 the month and everything from position 6 onwards the
    day. No validation is performed, so shorter or longer inputs simply yield
    shorter or longer pieces.

    Args:
        x: Date in compact form, e.g. ``20200104`` or ``'20200104'``.

    Returns:
        str: The three pieces joined with hyphens.
    """
    digits = str(x)
    pieces = (digits[:4], digits[4:6], digits[6:])
    return '-'.join(pieces)

In [14]:
# Convert the compact date values to 'YYYY-MM-DD' strings.
# Run this cell only once per kernel session: make_datetime cuts its input at
# fixed positions, so applying it to already formatted values would produce
# malformed dates.
object_df['datetime'] = object_df['datetime'].map(make_datetime)

## 코로나 확진자

In [15]:
# Load Time.csv from the COVID data folder.
covid_time_path = os.path.join(COVID_DIR, 'Time.csv')
covid_df = pd.read_csv(covid_time_path)
covid_df.shape

(163, 7)

In [16]:
# Keep only the date and the confirmed-case count.
covid_df = covid_df.loc[:, ['date', 'confirmed']]

In [17]:
# Rename the columns: 'datetime' is the key shared with the card data.
covid_df = covid_df.rename(
    columns={'date': 'datetime', 'confirmed': 'covid_confirmed'}
)

# Final Result

In [18]:
# Attach the confirmed-case count to every aggregated card row (left join, so
# card rows without a matching date are kept), order by date, and switch to
# the final column names.
result_df = (
    object_df
    .merge(covid_df, how='left', on='datetime')
    .sort_values('datetime')
    .reset_index(drop=True)
    .rename(columns={
        'datetime': '사용일자',
        'sales count': '중분류_카운트',
        'covid_confirmed': '확진자수',
    })
)

In [19]:
# Card rows whose date has no match in covid_df come out of the left merge
# with a missing case count; record those as 0.
# NOTE(review): 0 is only a sensible fill for dates before the COVID series
# begins. If the card data extends past the end of the series, those rows
# would be reported as 0 as well — confirm the two date ranges.
result_df['확진자수'] = result_df['확진자수'].fillna(0)

In [20]:
# Write the merged table to disk; index=False leaves the row index out of the file.
result_df.to_csv('internal_df.csv', index = False)