In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
# Directory name and CSV file name for each of the two input datasets.
INTERNAL_DIR, INTERNAL_FILENAME = 'internal_data', 'internal_df.csv'
EXTERNAL_DIR, EXTERNAL_FILENAME = 'external_data', 'external_df.csv'

In [3]:
# Both inputs are addressed relative to the parent of the current working
# directory (os.pardir is the '..' path component).
INTERNAL_PATH = os.path.join(os.pardir, INTERNAL_DIR, INTERNAL_FILENAME)
EXTERNAL_PATH = os.path.join(os.pardir, EXTERNAL_DIR, EXTERNAL_FILENAME)

In [4]:
# Load both CSVs with identical options; index_col=False keeps the first
# CSV column as data instead of using it as the index.
internal_df, external_df = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (INTERNAL_PATH, EXTERNAL_PATH)
)

In [5]:
# Keep the distinct (date, confirmed-case count) pairs from the internal data
# so the count can be re-attached by date later.
covid_confirmation_df = (
    internal_df
    .loc[:, ['사용일자', '확진자수']]
    .drop_duplicates()
)
covid_confirmation_df.head(n=5)

Unnamed: 0,사용일자,확진자수
0,2020-01-04,0.0
49,2020-01-05,0.0
96,2020-01-06,0.0
142,2020-01-07,0.0
191,2020-01-08,0.0


In [6]:
# Remove the confirmed-case column (already captured in covid_confirmation_df).
# Dropping by name instead of by position keeps this cell idempotent: with
# errors='ignore' a re-run is a no-op rather than silently removing whichever
# column happens to be last, and it no longer depends on CSV column order.
internal_df = internal_df.drop(columns=['확진자수'], errors='ignore')

In [7]:
# Preview the first five rows of the internal data.
internal_df.head(n=5)

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트
0,2020-01-04,관광,1급호텔,38
1,2020-01-04,유흥주점,단란주점,968
2,2020-01-04,유흥주점,유흥주점,3141
3,2020-01-04,유흥주점,칵테일바,986
4,2020-01-04,음식점,기타건강식,62


In [8]:
# Preview the first five rows of the external data.
external_df.head(n=5)

Unnamed: 0,사용일자,중분류_카운트,중분류
0,2020-01-01,2509091,버스
1,2020-01-02,5286628,버스
2,2020-01-03,5580078,버스
3,2020-01-04,4036695,버스
4,2020-01-05,3138649,버스


In [9]:
# Placeholder for the major-category label. It is created with object dtype
# (still all-NaN) because the labels assigned to it are strings; a float64
# NaN column would have to be upcast on assignment, which pandas deprecates.
external_df['대분류'] = np.full(len(external_df), np.nan, dtype=object)

In [10]:
# Label every external row with its major category in one pass:
# sub-category '교통량' -> '교통', anything else -> '대중교통'.
# Assigning the whole column (rather than writing strings through .loc into
# an existing float column) avoids an incompatible-dtype upcast and evaluates
# the condition only once.
external_df['대분류'] = np.where(
    external_df['중분류'] == '교통량',
    '교통',
    '대중교통',
)

In [11]:
# Display the external data to check the newly filled '대분류' column.
external_df

Unnamed: 0,사용일자,중분류_카운트,중분류,대분류
0,2020-01-01,2509091,버스,대중교통
1,2020-01-02,5286628,버스,대중교통
2,2020-01-03,5580078,버스,대중교통
3,2020-01-04,4036695,버스,대중교통
4,2020-01-05,3138649,버스,대중교통
...,...,...,...,...
541,2020-06-26,9568914,교통량,교통
542,2020-06-27,8964257,교통량,교통
543,2020-06-28,7635665,교통량,교통
544,2020-06-29,9052687,교통량,교통


In [12]:
# Display the per-date confirmed-case table extracted from the internal data.
covid_confirmation_df

Unnamed: 0,사용일자,확진자수
0,2020-01-04,0.0
49,2020-01-05,0.0
96,2020-01-06,0.0
142,2020-01-07,0.0
191,2020-01-08,0.0
...,...,...
9821,2020-06-10,11902.0
9889,2020-06-11,11947.0
9957,2020-06-12,12003.0
10026,2020-06-13,12051.0


In [13]:
# Stack the internal and external rows vertically. Each frame keeps its own
# index labels, so labels repeat in the combined frame.
concated_df = pd.concat((internal_df, external_df))
concated_df

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트
0,2020-01-04,관광,1급호텔,38
1,2020-01-04,유흥주점,단란주점,968
2,2020-01-04,유흥주점,유흥주점,3141
3,2020-01-04,유흥주점,칵테일바,986
4,2020-01-04,음식점,기타건강식,62
...,...,...,...,...
541,2020-06-26,교통,교통량,9568914
542,2020-06-27,교통,교통량,8964257
543,2020-06-28,교통,교통량,7635665
544,2020-06-29,교통,교통량,9052687


In [14]:
# Median count per (major category, sub-category), stored as the
# representative value '중분류_대표값'. The result is a DataFrame with the
# two key columns plus the renamed median column.
median_v_series = (
    concated_df
    .groupby(['대분류', '중분류'], as_index=False)['중분류_카운트']
    .median()
    .rename(columns={'중분류_카운트': '중분류_대표값'})
)

In [15]:
# Attach each sub-category's representative (median) value to its rows.
combined_median_df = concated_df.merge(
    median_v_series,
    on=['대분류', '중분류'],
)
combined_median_df.head(n=2)

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트,중분류_대표값
0,2020-01-04,관광,1급호텔,38,338.0
1,2020-01-06,관광,1급호텔,19,338.0


In [16]:
# Percent deviation of each count from its sub-category median:
# (count - median) / median * 100.
# A zero median makes the division undefined (NaN for 0/0, +/-inf otherwise).
combined_median_df['증가추이'] = (
    combined_median_df['중분류_카운트']
    .sub(combined_median_df['중분류_대표값'])
    .div(combined_median_df['중분류_대표값'])
    .mul(100)
)

In [17]:
# Inspect the rows whose representative (median) value is zero.
combined_median_df.loc[combined_median_df['중분류_대표값'].eq(0)]

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트,중분류_대표값,증가추이
7975,2020-01-22,실내활동,카지노,0,0.0,
7976,2020-01-23,실내활동,카지노,0,0.0,
7977,2020-01-27,실내활동,카지노,0,0.0,


In [18]:
# When both the median and the count are zero, the percent deviation is 0/0
# (NaN); define it as 0 for those rows. Rows with a zero median but a
# non-zero count are left unchanged.
combined_median_df.loc[
    combined_median_df['중분류_대표값'].eq(0)
    & combined_median_df['중분류_카운트'].eq(0),
    ['증가추이'],
] = 0

In [19]:
# Re-inspect the zero-median rows to confirm '증가추이' is now filled.
combined_median_df.loc[combined_median_df['중분류_대표값'].eq(0)]

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트,중분류_대표값,증가추이
7975,2020-01-22,실내활동,카지노,0,0.0,0.0
7976,2020-01-23,실내활동,카지노,0,0.0,0.0
7977,2020-01-27,실내활동,카지노,0,0.0,0.0


In [20]:
# Attach the confirmed-case count to every activity row by date.
# how='inner' (the default, made explicit): rows whose date has no entry in
# covid_confirmation_df are dropped.
# validate='many_to_one': raise a MergeError if a date appears more than once
# in covid_confirmation_df, instead of silently duplicating activity rows.
result_df = combined_median_df.merge(
    covid_confirmation_df,
    on='사용일자',
    how='inner',
    validate='many_to_one',
)

In [21]:
# Preview the first five rows of the final table.
result_df.head(n=5)

Unnamed: 0,사용일자,대분류,중분류,중분류_카운트,중분류_대표값,증가추이,확진자수
0,2020-01-04,관광,1급호텔,38,338.0,-88.757396,0.0
1,2020-01-04,유흥주점,단란주점,968,1195.0,-18.995816,0.0
2,2020-01-04,유흥주점,유흥주점,3141,1581.0,98.671727,0.0
3,2020-01-04,유흥주점,칵테일바,986,1505.0,-34.48505,0.0
4,2020-01-04,음식점,기타건강식,62,650.0,-90.461538,0.0


In [22]:
# Write the final table to the current working directory, without the index.
result_df.to_csv(
    'result_df.csv',
    index=False,
)