### Import

In [1]:
import pandas as pd
import numpy as np
import warnings;warnings.filterwarnings(action='ignore')

### Read data
DC_230910.pqt 파일로부터 공연별 정보, 좌석별 정보를 자세히 알아내고자 한다.

In [2]:
# Load the source parquet table; the trailing expression displays its (rows, columns) shape.
raw_path = '../data/DC_230910.pqt'
data = pd.read_parquet(raw_path)
data.shape

(929040, 28)

In [3]:
# External schedule data for performances held between 2018 and 2023.
# Source: https://www.sac.or.kr/site/main/program/schedule

yearly_frames = []
for year in range(2018, 2024):
    yearly = (
        pd.read_csv(f'../data/Outerdata/예술의전당_공연 및 전시 안내_{year}.csv', encoding='cp949',
                    usecols=['장르', '공연/전시명', '기간', '장소'])
        .query('장소=="콘서트홀"')
        .rename(columns={'공연/전시명': '공연명'})
    )
    # Rows that do not follow the expected format are not merged: only a 기간 value
    # of exactly 10 characters is kept (presumably a single 'YYYY-MM-DD' date rather
    # than a date range - TODO confirm). `.str.len()` yields NaN for missing values,
    # so those rows are dropped instead of raising TypeError inside len().
    yearly = yearly[yearly['기간'].str.len() == 10]
    yearly_frames.append(yearly)

# Concatenate once at the end instead of re-copying the growing frame every iteration.
outerPF = pd.concat(yearly_frames)

In [4]:
# Analyse only performances that were not cancelled.
# `na=False` treats a missing title as "not cancelled" instead of raising on it.
is_cancelled = outerPF['공연명'].str.contains('공연취소', regex=False, na=False)

# Tidy the columns:
#   - a missing 장르 is filled with '클래식',
#   - 기간 is parsed to datetime,
#   - 장소 is dropped (`errors='ignore'` keeps the cell re-runnable).
# Values are assigned back explicitly: `Series.fillna(..., inplace=True)` on a column
# selected from a filtered frame does not reliably update the frame (and is a no-op
# under pandas Copy-on-Write).
outerPF = (
    outerPF[~is_cancelled]
    .assign(장르=lambda df: df['장르'].fillna('클래식'),
            기간=lambda df: pd.to_datetime(df['기간']))
    .drop(columns='장소', errors='ignore')
)
print('2018~2023년 공연수:', outerPF.shape)

2018~2023년 공연수: (1328, 3)


### 공연별 정보
- 원가(discount_type이 "일반"인 데이터)와 복원한 원가로 공연별 등급제, 등급별 가격을 확인한다.
- 725개 공연 중 629개 공연 정보를 확보했다.

In [5]:
# Collect the columns needed later to attach performance titles: one row per distinct
# (performance_label, genre, play_date) combination, indexed and sorted by label.
label_cols = ['performance_label', 'genre', 'play_date']
performance = (
    data[label_cols]
    .drop_duplicates()
    .set_index('performance_label')
    .sort_index()
)
performance

Unnamed: 0_level_0,genre,play_date
performance_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,교향곡,2022-02-04
1,독주,2022-03-02
2,교향곡,2019-03-23
4,교향곡,2019-07-23
5,교향곡,2022-06-29
...,...,...
688,교향곡,2021-05-08
689,독주,2021-09-14
690,성악,2021-07-23
691,교향곡,2021-12-12


In [6]:
def _collapse_close_prices(prices, min_gap=5000):
    """Drop price tiers that sit less than ``min_gap`` below the preceding entry.

    ``prices`` is a non-empty list sorted in descending order. The first price is
    always kept; every later price is kept only when it is at least ``min_gap``
    lower than the entry directly before it in ``prices``.
    """
    return [prices[0]] + [prices[i] for i in range(1, len(prices))
                          if prices[i - 1] - prices[i] >= min_gap]


# Treat prices that land on a whole thousand as list (undiscounted) prices.
# NOTE(review): the check is textual - any value whose string form contains '000.0'
# passes (e.g. 30000.04 would too). It is kept unchanged so that the downstream
# counts stay the same; confirm whether a numeric check is wanted.
# `.assign` builds a new frame rather than writing a column onto a slice of `data`.
ease = data[['performance_label', 'origin_price']].assign(
    origin_price_unit=lambda df: df['origin_price'].apply(
        lambda x: round(x) if '000.0' in str(x) else np.nan))

# One descending list of distinct list prices per performance.
tier_prices = (
    ease.dropna(subset=['origin_price_unit'])
    .groupby('performance_label')['origin_price_unit']
    .agg(lambda x: sorted(set(x), reverse=True))
    .rename('prices')
    .reset_index()
)
performance = performance.merge(tier_prices, on='performance_label')

# Compensate for thousand-level differences caused by discounts.
# Author's note: this adds list-price information for 637 of 681 performances
# (seat-level list prices could not be recovered for the remaining 44).
performance['prices'] = performance['prices'].apply(_collapse_close_prices)
performance['n_grade'] = performance['prices'].map(len)

In [7]:
# Per the concert-hall rules, seats form either a single grade or 2-5 grades,
# so only performances with at most five price tiers are kept.
# Author's note: this yields seat prices, grade counts and grade names for the
# 629 performances whose price information is known.
grade_letters = ['R', 'S', 'A', 'B', 'C']
grade_names = {1: ['single']}
grade_names.update({n: grade_letters[:n] for n in range(2, 6)})

performance = performance[performance['n_grade'] <= 5].copy()
performance['grade'] = performance['n_grade'].map(grade_names)
performance

Unnamed: 0,performance_label,genre,play_date,prices,n_grade,grade
0,0,교향곡,2022-02-04,"[120000.0, 90000.0, 50000.0, 10000.0]",4,"[R, S, A, B]"
1,1,독주,2022-03-02,"[180000.0, 140000.0, 110000.0, 70000.0]",4,"[R, S, A, B]"
2,2,교향곡,2019-03-23,"[350000.0, 260000.0, 180000.0, 120000.0, 70000.0]",5,"[R, S, A, B, C]"
3,4,교향곡,2019-07-23,"[150000.0, 120000.0, 80000.0, 30000.0, 20000.0]",5,"[R, S, A, B, C]"
4,5,교향곡,2022-06-29,"[80000.0, 60000.0, 20000.0]",3,"[R, S, A]"
...,...,...,...,...,...,...
632,688,교향곡,2021-05-08,"[80000.0, 60000.0, 30000.0, 10000.0]",4,"[R, S, A, B]"
633,689,독주,2021-09-14,"[110000.0, 90000.0, 70000.0]",3,"[R, S, A]"
634,690,성악,2021-07-23,"[121000.0, 110000.0, 88000.0, 66000.0]",4,"[R, S, A, B]"
635,691,교향곡,2021-12-12,"[60000.0, 40000.0, 20000.0]",3,"[R, S, A]"


In [8]:
# Attach performance titles: left-join the external schedule on the performance date
# (play_date == 기간), so a performance can match zero, one or several schedule rows.
ease = performance.merge(outerPF, left_on='play_date', right_on='기간', how='left')

# Performances that matched two or more schedule rows are handled separately.
# Among those, the row passing the genre filter below is kept (first one after sorting
# by title); when no row passes, the title is left missing.
# Author's breakdown: 629 performances = 154 (no match between the two datasets)
#             + 442 (exactly one match)
#             + 19 (matched 2 rows, resolved via same genre and then drop_duplicates)
#             + 14 (matched 2 rows but no same genre, so the title is nulled)
# NOTE(review): the filter tests `genre == "클래식"` (the internal genre column), whereas
# '클래식' is the fill value used for the external 장르 column - confirm which was intended.
mul = ease.performance_label.value_counts()[ease.performance_label.value_counts() >= 2].index
ease_mul = ease.query('performance_label in @mul')

treated = ease_mul.query('genre == 장르 | genre == "클래식"').sort_values(by='공연명').drop_duplicates('performance_label')
performance = pd.concat([ease.query('performance_label not in @mul'), treated,
                         ease_mul.query('performance_label not in @treated.performance_label').drop('공연명', axis=1).drop_duplicates('performance_label')])
performance.drop(['장르','기간'], axis=1, inplace=True)
performance.reset_index(drop=True, inplace=True)
print('공연수:', performance.shape[0])

공연수: 629


In [9]:
# Merge the remaining per-performance attributes.
extra_cols = ['performance_label', 'play_st_time', 'pre_open_date',
              'open_date', 'running_time', 'intermission']
extra_info = data[extra_cols].drop_duplicates()
extra_info = extra_info.assign(
    # 1 when a pre-sale date is present, else 0.
    선예매여부=extra_info['pre_open_date'].notna().astype(int),
    # Days between pre-sale and general sale; 0 when both fall on the same day.
    선예매기간=(extra_info['open_date'] - extra_info['pre_open_date']).dt.days,
)

performance = pd.merge(performance, extra_info, on='performance_label')
performance = (
    performance
    .assign(공연요일=performance['play_date'].dt.weekday)
    .set_index('performance_label')
)
performance

Unnamed: 0_level_0,genre,play_date,prices,n_grade,grade,공연명,play_st_time,pre_open_date,open_date,running_time,intermission,선예매여부,선예매기간,공연요일
performance_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,교향곡,2022-02-04,"[120000.0, 90000.0, 50000.0, 10000.0]",4,"[R, S, A, B]",,2000,2022-01-14,2022-01-15,120,15,1,1.0,4
1,독주,2022-03-02,"[180000.0, 140000.0, 110000.0, 70000.0]",4,"[R, S, A, B]","국립합창단 기획공연 위대한 합창 시리즈Ⅰ- 칼 오르프, 카르미나 부라나",1930,2022-01-09,2022-01-10,90,0,1,1.0,2
2,교향곡,2019-03-23,"[350000.0, 260000.0, 180000.0, 120000.0, 70000.0]",5,"[R, S, A, B, C]",오페라 카니발 2019,2000,2018-11-19,2018-11-19,100,15,1,0.0,5
4,교향곡,2019-07-23,"[150000.0, 120000.0, 80000.0, 30000.0, 20000.0]",5,"[R, S, A, B, C]",제17회 코리아니쉬 플루트 오케스트라 정기연주회,2000,2019-06-02,2019-06-02,120,20,1,0.0,1
5,교향곡,2022-06-29,"[80000.0, 60000.0, 20000.0]",3,"[R, S, A]",<강남심포니오케스트라 제92회 정기연주회>,1930,2022-04-30,2022-04-30,110,15,1,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458,독주,2022-11-25,"[70000.0, 50000.0, 30000.0]",3,"[R, S, A]",,1930,2022-08-20,2022-08-21,100,20,1,1.0,4
540,성악,2021-08-28,"[150000.0, 120000.0, 100000.0, 70000.0, 50000.0]",5,"[R, S, A, B, C]",,1930,2021-06-19,2021-06-20,140,10,1,1.0,5
634,독주,2022-10-15,"[110000.0, 90000.0, 70000.0, 50000.0]",4,"[R, S, A, B]",,1700,2022-08-20,2022-08-21,70,0,1,1.0,5
649,독주,2022-08-18,"[50000.0, 30000.0]",2,"[R, S]",,1930,2022-06-18,2022-06-19,120,20,1,1.0,3


### 좌석별 정보
- 공연별 정보를 바탕으로 천원 단위로 떨어지지 않은 복원한 가격들을 가까운 등급의 가격으로 대체하고 좌석별 등급을 매긴다.

In [10]:
known_label = performance.loc[performance["prices"].notna()].index
# Rows paid with invitation tickets (origin_price == 0) are excluded here; if included,
# they would be assigned the lowest tier price.
known = data.loc[data['origin_price'].notna()].query('origin_price != 0 and performance_label in @known_label').copy()

# Fetch every tier list once, instead of two `performance.loc` lookups per row.
prices_by_label = performance['prices'].to_dict()

# Replace each price with the nearest tier price of its performance.
# `min` with a key returns the first tier on ties, like `diff.index(min(diff))`.
snapped = [
    min(prices_by_label[label], key=lambda tier: abs(tier - price))
    for label, price in zip(known['performance_label'], known['origin_price'])
]

known['origin_price'] = pd.Series(snapped, index=known.index)

In [11]:
# Derive each row's seat grade from the position of its price in the performance's
# tier list. The per-label lists are fetched once rather than via `performance.loc`
# twice per row.
prices_by_label = performance['prices'].to_dict()
grades_by_label = performance['grade'].to_dict()

add = [
    grades_by_label[label][prices_by_label[label].index(price)]
    for label, price in zip(known['performance_label'], known['origin_price'])
]

known['seat_grade'] = pd.Series(add, index=known.index)
known

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,price,ticket_cancel,discount_type,pre_open_date,...,corporate,performance_label,discount_rate,origin_price,층,블록,열,좌석번호,seat_label,seat_grade
0,50.0,F,2022-01-14,1512,2022-02-04,2000,10000,1,일반,2022-01-14,...,0.0,0,0.0,10000.0,3,BOX9,,10,1936,B
1,60.0,F,2020-01-16,38,2020-02-11,1930,30000,0,일반,2019-12-20,...,0.0,9,0.0,30000.0,3,BOX9,,10,1936,B
4,20.0,F,2023-04-29,1322,2023-05-23,1930,24000,0,가정의 달 특별할인(8매/4.28까지)20%,2023-02-25,...,0.0,14,20.0,30000.0,3,BOX9,,10,1936,B
5,50.0,M,2019-08-24,959,2019-08-28,2000,22000,0,골드회원 할인25%,2019-07-15,...,0.0,15,25.0,30000.0,3,BOX9,,10,1936,B
6,70.0,M,2022-06-24,1406,2022-08-30,1930,18000,1,노블회원 할인40%,2022-06-24,...,0.0,22,40.0,30000.0,3,BOX9,,10,1936,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929034,,,2019-01-07,1256,2019-01-11,2000,35000,0,장애인/국가유공자 할인50%,2018-12-10,...,,579,50.0,70000.0,2,A,8,13,1363,A
929035,60.0,M,2019-04-18,1028,2019-07-02,2000,75000,1,장애인/국가유공자 할인50%,2018-12-29,...,0.0,262,50.0,150000.0,2,A,8,12,1362,A
929036,60.0,M,2019-02-23,1433,2019-03-01,2000,5000,0,장애인/국가유공자 할인50%,2019-01-12,...,0.0,392,50.0,10000.0,2,A,8,12,1362,C
929038,,,2019-02-16,1055,2019-02-28,2000,15000,0,장애인/국가유공자 할인50%,NaT,...,,577,50.0,30000.0,2,A,8,12,1362,B


In [12]:
# Append the rows with origin_price == 0 (invitation tickets, per the author's note)
# that belong to the known performances.
invitation_rows = data.query('origin_price == 0 and performance_label in @known_label')
known = pd.concat([known, invitation_rows])

### Concat data
공연별 정보, 좌석별 정보를 추가한다. 이후 14개 이상의 블록 고유값을 가진 공연만을 대상으로 한다.

In [13]:
# Seat-level merge: replace the original origin_price column with the restored
# price and attach the matching seat grade, aligned on the row index.
restored_cols = known[['origin_price', 'seat_grade']]
data_without_price = data.drop(columns='origin_price')
data = pd.concat([data_without_price, restored_cols], axis=1)
data

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,price,ticket_cancel,discount_type,pre_open_date,...,corporate,performance_label,discount_rate,층,블록,열,좌석번호,seat_label,origin_price,seat_grade
0,50.0,F,2022-01-14,1512,2022-02-04,2000,10000,1,일반,2022-01-14,...,0.0,0,0.0,3,BOX9,,10,1936,10000.0,B
1,60.0,F,2020-01-16,38,2020-02-11,1930,30000,0,일반,2019-12-20,...,0.0,9,0.0,3,BOX9,,10,1936,30000.0,B
2,,,2019-09-09,1253,2019-10-15,2000,0,0,초대권,NaT,...,,10,1.0,3,BOX9,,10,1936,0.0,
3,,,2019-03-08,1447,2019-03-22,2000,0,0,초대권,2019-03-03,...,,12,1.0,3,BOX9,,10,1936,,
4,20.0,F,2023-04-29,1322,2023-05-23,1930,24000,0,가정의 달 특별할인(8매/4.28까지)20%,2023-02-25,...,0.0,14,20.0,3,BOX9,,10,1936,30000.0,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929035,60.0,M,2019-04-18,1028,2019-07-02,2000,75000,1,장애인/국가유공자 할인50%,2018-12-29,...,0.0,262,50.0,2,A,8,12,1362,150000.0,A
929036,60.0,M,2019-02-23,1433,2019-03-01,2000,5000,0,장애인/국가유공자 할인50%,2019-01-12,...,0.0,392,50.0,2,A,8,12,1362,10000.0,C
929037,,,2019-03-26,1650,2019-03-30,1700,15000,0,차액,2019-01-25,...,,481,,2,A,8,12,1362,,
929038,,,2019-02-16,1055,2019-02-28,2000,15000,0,장애인/국가유공자 할인50%,NaT,...,,577,50.0,2,A,8,12,1362,30000.0,B


In [14]:
# Performance-level merge: left-join the per-performance attributes onto every row.
perf_cols = ['performance_label', 'prices', 'n_grade', '공연명',
             '선예매여부', '선예매기간', '공연요일']
perf_info = performance.reset_index()[perf_cols]
data = pd.merge(data, perf_info, how='left', on='performance_label')
data

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,price,ticket_cancel,discount_type,pre_open_date,...,좌석번호,seat_label,origin_price,seat_grade,prices,n_grade,공연명,선예매여부,선예매기간,공연요일
0,50.0,F,2022-01-14,1512,2022-02-04,2000,10000,1,일반,2022-01-14,...,10,1936,10000.0,B,"[120000.0, 90000.0, 50000.0, 10000.0]",4.0,,1.0,1.0,4.0
1,60.0,F,2020-01-16,38,2020-02-11,1930,30000,0,일반,2019-12-20,...,10,1936,30000.0,B,"[120000.0, 90000.0, 60000.0, 30000.0]",4.0,,1.0,3.0,1.0
2,,,2019-09-09,1253,2019-10-15,2000,0,0,초대권,NaT,...,10,1936,0.0,,"[150000.0, 120000.0, 100000.0, 80000.0]",4.0,제 16회 차이콥스키 콩쿠르 우승자 갈라콘서트,0.0,,1.0
3,,,2019-03-08,1447,2019-03-22,2000,0,0,초대권,2019-03-03,...,10,1936,,,,,,,,
4,20.0,F,2023-04-29,1322,2023-05-23,1930,24000,0,가정의 달 특별할인(8매/4.28까지)20%,2023-02-25,...,10,1936,30000.0,B,"[110000.0, 90000.0, 60000.0, 30000.0]",4.0,코리아남성합창단 제22회 정기연주회,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929035,60.0,M,2019-04-18,1028,2019-07-02,2000,75000,1,장애인/국가유공자 할인50%,2018-12-29,...,12,1362,150000.0,A,"[250000.0, 200000.0, 150000.0, 100000.0, 60000.0]",5.0,2019 세종솔로이스츠의 힉엣눙크! 갈라콘서트,1.0,1.0,1.0
929036,60.0,M,2019-02-23,1433,2019-03-01,2000,5000,0,장애인/국가유공자 할인50%,2019-01-12,...,12,1362,10000.0,C,"[60000.0, 40000.0, 30000.0, 20000.0, 10000.0]",5.0,,1.0,1.0,4.0
929037,,,2019-03-26,1650,2019-03-30,1700,15000,0,차액,2019-01-25,...,12,1362,,,"[121000.0, 99000.0, 77000.0, 55000.0]",4.0,LG와 함께하는 제15회 서울국제음악콩쿠르(결선/3.30),1.0,1.0,5.0
929038,,,2019-02-16,1055,2019-02-28,2000,15000,0,장애인/국가유공자 할인50%,NaT,...,12,1362,30000.0,B,"[120000.0, 90000.0, 60000.0, 30000.0, 10000.0]",5.0,2019 서울시향 슈베르트 교향곡 9번 그레이트,0.0,,3.0


In [15]:
# For rows with a known origin_price, count the distinct (층, 블록) combinations seen
# per performance and keep only performances with at least 14 of them.
priced_rows = data[data['origin_price'].notna()]
ease = (
    priced_rows
    .drop_duplicates(['performance_label', '층', '블록'])['performance_label']
    .value_counts()
)
target = ease.loc[ease >= 14].index

# Rows of the selected performances, restricted to those with a known origin_price.
selected_rows = data.query('performance_label in @target')
performance_grouping = selected_rows.dropna(subset='origin_price')
performance_grouping

Unnamed: 0,age,gender,tran_date,tran_time,play_date,play_st_time,price,ticket_cancel,discount_type,pre_open_date,...,좌석번호,seat_label,origin_price,seat_grade,prices,n_grade,공연명,선예매여부,선예매기간,공연요일
0,50.0,F,2022-01-14,1512,2022-02-04,2000,10000,1,일반,2022-01-14,...,10,1936,10000.0,B,"[120000.0, 90000.0, 50000.0, 10000.0]",4.0,,1.0,1.0,4.0
1,60.0,F,2020-01-16,38,2020-02-11,1930,30000,0,일반,2019-12-20,...,10,1936,30000.0,B,"[120000.0, 90000.0, 60000.0, 30000.0]",4.0,,1.0,3.0,1.0
2,,,2019-09-09,1253,2019-10-15,2000,0,0,초대권,NaT,...,10,1936,0.0,,"[150000.0, 120000.0, 100000.0, 80000.0]",4.0,제 16회 차이콥스키 콩쿠르 우승자 갈라콘서트,0.0,,1.0
4,20.0,F,2023-04-29,1322,2023-05-23,1930,24000,0,가정의 달 특별할인(8매/4.28까지)20%,2023-02-25,...,10,1936,30000.0,B,"[110000.0, 90000.0, 60000.0, 30000.0]",4.0,코리아남성합창단 제22회 정기연주회,1.0,1.0,1.0
5,50.0,M,2019-08-24,959,2019-08-28,2000,22000,0,골드회원 할인25%,2019-07-15,...,10,1936,30000.0,B,"[120000.0, 80000.0, 50000.0, 30000.0, 10000.0]",5.0,노부스 콰르텟〈Slavic〉,1.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929034,,,2019-01-07,1256,2019-01-11,2000,35000,0,장애인/국가유공자 할인50%,2018-12-10,...,13,1363,70000.0,A,"[140000.0, 100000.0, 70000.0, 50000.0]",4.0,,1.0,0.0,4.0
929035,60.0,M,2019-04-18,1028,2019-07-02,2000,75000,1,장애인/국가유공자 할인50%,2018-12-29,...,12,1362,150000.0,A,"[250000.0, 200000.0, 150000.0, 100000.0, 60000.0]",5.0,2019 세종솔로이스츠의 힉엣눙크! 갈라콘서트,1.0,1.0,1.0
929036,60.0,M,2019-02-23,1433,2019-03-01,2000,5000,0,장애인/국가유공자 할인50%,2019-01-12,...,12,1362,10000.0,C,"[60000.0, 40000.0, 30000.0, 20000.0, 10000.0]",5.0,,1.0,1.0,4.0
929038,,,2019-02-16,1055,2019-02-28,2000,15000,0,장애인/국가유공자 할인50%,NaT,...,12,1362,30000.0,B,"[120000.0, 90000.0, 60000.0, 30000.0, 10000.0]",5.0,2019 서울시향 슈베르트 교향곡 9번 그레이트,0.0,,3.0


### Save data

In [16]:
# Persist every result frame as a parquet file, in the same order as before.
outputs = {
    '../data/DC_230910_final.pqt': data,
    '../data/DC_performancegrouping.pqt': performance_grouping,
    '../data/performance.pqt': performance,
    '../data/known.pqt': known,
}
for out_path, frame in outputs.items():
    frame.to_parquet(out_path)