## Import

In [1]:
import pandas as pd
import numpy as np
import pickle

# W2V
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# ignore warnings
import warnings ; warnings.filterwarnings('ignore')

## Read data

In [2]:
# Read the booking records from parquet into `data`
# (presumably the output of the data-cleansing step, judging by the file name).
data = pd.read_parquet('../data/DataCleansing_final.pqt')

In [3]:
# Load the per-genre clustering results (one object per genre, in this fixed order).
# The file is opened in a `with` block so the handle is closed as soon as it is read.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open('../data/ClusteringResult.pkl', 'rb') as clustering_file:
    symphony, chorus, voice, solo, chamber = pickle.load(clustering_file)

In [4]:
def _attach_cluster(genre_query, clustering, title):
    """Join the rows of `data` selected by `genre_query` with their seat cluster.

    The join is an inner merge on `seat_label`, so rows whose seat is absent
    from `clustering` are dropped. The resulting row count is printed.
    """
    seat_clusters = clustering[['seat_label', 'cluster']]
    joined = data.query(genre_query).merge(seat_clusters, on='seat_label')
    print(f'{title} 데이터수: {joined.shape[0]}')
    return joined


data_symphony = _attach_cluster('genre=="교향곡"', symphony, '교향곡')

data_chorus = _attach_cluster('genre=="합창"', chorus, '합창')

data_voice = _attach_cluster('genre=="성악"', voice, '성악')

data_solo = _attach_cluster('genre=="독주"', solo, '독주')

data_chamber = _attach_cluster('genre=="실내악"', chamber, '실내악')

교향곡 데이터수: 476396
합창 데이터수: 108114
성악 데이터수: 38279
독주 데이터수: 58517
실내악 데이터수: 33189


## Feature Generation
장르별 Clustering 결과에 Feature Generation 함수를 적용할 수 있도록 한다.

- 기초 Features<br>
  : 공연년도, 공연월, 공연시간, 공연요일, 예매년도, 예매월, 선예매기간, 공연일과 예매일 차이, 공연진행시간, 휴식시간, 좌석등급수<br>
  $~$ 공연진행시간 대비 휴식시간, 동월 해당장르 공연수, 동월 전체 공연 대비 해당장르 공연수, 평균 예매수

In [5]:
def basic_features(DATA, all_data=None):
    """Build the basic per-performance feature table.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Booking rows of one genre. Must contain the columns selected below
        plus `play_date` and `price`.
    all_data : pandas.DataFrame, optional
        Booking rows of every genre, used as the denominator of
        `n_performance_rate`. Defaults to the notebook-level `data` frame,
        which is what this function always used before.

    Returns
    -------
    pandas.DataFrame
        Indexed by `performance_label`: the static performance columns,
        `intermission_rate`, `n_performance_month`, `n_performance_rate`,
        `sales` and `rolling_sales`.
    """
    if all_data is None:
        # Backward-compatible default: the frame loaded at the top of the notebook.
        all_data = data

    month_keys = ['play_year', 'play_month']
    static_columns = ['performance_label', 'play_year', 'play_month', 'play_time', 'play_weekday',
                      'open_year', 'open_month', 'pre_open_gap', 'open_gap', 'running_time',
                      'intermission', 'n_grade']
    # One row per performance (the first booking row seen); copy so the
    # column assignment below never writes into a view of DATA.
    ease = DATA.drop_duplicates('performance_label')[static_columns].copy()

    # Intermission relative to running time. A zero running time yields +inf,
    # which is mapped to 0; 0/0 stays NaN exactly as before.
    # (np.inf replaces np.infty, which NumPy 2.0 removed.)
    intermission_ratio = ease['intermission'] / ease['running_time']
    ease['intermission_rate'] = intermission_ratio.replace(np.inf, 0)

    # Performances of this genre per month, and their share of all performances that month.
    genre_per_month = DATA.groupby(month_keys)['performance_label'].nunique().rename('n_performance_month')
    total_per_month = all_data.groupby(month_keys)['performance_label'].nunique()
    genre_share = genre_per_month.divide(total_per_month.loc[genre_per_month.index]).rename('n_performance_rate')
    month = pd.concat([genre_per_month, genre_share], axis=1)
    ease = ease.merge(month.reset_index(), on=month_keys).set_index('performance_label').sort_index()

    # Bookings per performance, ordered by play date.
    performance = DATA.groupby(['play_date', 'performance_label'])['price'].size().reset_index()
    bookings = performance['price']

    # Mean bookings over all strictly earlier performances: row j gets cumsum[j-1] / j.
    # The first row has no history; it is filled with the column mean.
    n_earlier = pd.Series(range(len(performance)), index=performance.index)
    sales = bookings.cumsum().shift(1) / n_earlier
    # Assign the filled series back instead of a chained inplace fillna,
    # which does not update the frame under pandas Copy-on-Write.
    performance['sales'] = sales.fillna(sales.mean())

    # Mean bookings of the two preceding performances.
    rolling_sales = bookings.rolling(2, closed='left').mean()
    performance['rolling_sales'] = rolling_sales.fillna(rolling_sales.mean())

    return pd.concat([ease, performance.set_index('performance_label')[['sales', 'rolling_sales']]], axis=1)

- 좌석특성별 시계열 평균 예매수 관련 Features<br>
  : 피아노뷰 통로 좌석, 사운드 좌석, 다리를 펼 수 있는 좌석, 3층의 안 좋은 좌석, 소리가 안 좋은 좌석, 지휘자 뒤를 보는 좌석

In [6]:
# (source column, output prefix) for every seat-characteristic flag.
_SEAT_FLAG_COLUMNS = (
    ('piano_hallway', 'piano'),     # piano-view aisle seats
    ('sound_quality', 'sound'),     # good-sound seats
    ('leg_free', 'leg'),            # seats with leg room
    ('3rd_bad', 'thirdbad'),        # poor seats on the 3rd floor
    ('bad_sound', 'bad'),           # poor-sound seats
    ('conductor_back', 'back'),     # seats facing the conductor's back
)


def _seat_mean_of_earlier_rows(cumulative):
    """Turn a running total into the mean over strictly earlier rows.

    Row j receives cumulative[j-1] / j; row 0 has no history and is NaN.
    """
    n_earlier = pd.Series(range(len(cumulative)), index=cumulative.index)
    return cumulative.shift(1) / n_earlier


def _seat_flag_history(DATA, flag_column, prefix):
    """Return `<prefix>_0` / `<prefix>_1` for one flag column, indexed by performance_label."""
    counts = pd.pivot_table(DATA, index=['play_date', 'performance_label'], columns=flag_column,
                            values='seat_label', aggfunc=np.size)
    if counts.shape[1] < 2:
        raise ValueError(f'{flag_column!r} needs at least two categories, got {counts.shape[1]}')
    running = counts.cumsum()
    names = [f'{prefix}_0', f'{prefix}_1']
    # Both source columns are read from the pivot itself, by position, BEFORE
    # any feature column exists. The previous code read `iloc[:, -1]` after
    # appending `<prefix>_0`, so `<prefix>_1` was computed from `<prefix>_0`
    # instead of from the last flag category.
    history = pd.DataFrame({
        names[0]: _seat_mean_of_earlier_rows(running.iloc[:, -2]),
        names[1]: _seat_mean_of_earlier_rows(running.iloc[:, -1]),
    })
    # Rows without usable history are filled with the column mean.
    history = history.fillna(history.mean())
    return history.reset_index().set_index('performance_label')[names]


def seat_features(DATA):
    """Expanding-mean booking counts per seat characteristic.

    For each flag column, bookings are counted per (play_date,
    performance_label) and flag category. For every performance the feature
    is the mean count over all earlier performances (in play_date order) of
    the second-to-last (`*_0`) and last (`*_1`) flag category.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Booking rows with `play_date`, `performance_label`, `seat_label` and
        the six flag columns. Each flag column must have at least two
        categories (presumably 0 and 1; not verifiable from this file).

    Returns
    -------
    pandas.DataFrame
        Indexed by `performance_label` with columns `piano_0`, `piano_1`,
        `sound_0`, ..., `back_1`.

    Raises
    ------
    ValueError
        If a flag column has fewer than two categories.
    """
    return pd.concat([_seat_flag_history(DATA, flag_column, prefix)
                      for flag_column, prefix in _SEAT_FLAG_COLUMNS], axis=1)

In [7]:
def seat_rolling_features(DATA):
    """Rolling-mean booking counts per seat characteristic.

    For each flag column, bookings are counted per (play_date,
    performance_label) and flag category, then averaged over the two
    preceding rows (`rolling(2, closed='left')`). Rows without two preceding
    rows are filled with the column mean. The pivot must have exactly two
    columns, since they are renamed to `<prefix>_0` / `<prefix>_1`.

    Returns a frame indexed by `performance_label` whose columns are
    `rolling_piano_0`, `rolling_piano_1`, ..., `rolling_back_1`.
    """
    flag_prefixes = [
        ('piano_hallway', 'piano'),     # piano-view aisle seats
        ('sound_quality', 'sound'),     # good-sound seats
        ('leg_free', 'leg'),            # seats with leg room
        ('3rd_bad', 'thirdbad'),        # poor seats on the 3rd floor
        ('bad_sound', 'bad'),           # poor-sound seats
        ('conductor_back', 'back'),     # seats facing the conductor's back
    ]

    per_flag = []
    for flag_column, prefix in flag_prefixes:
        names = [f'{prefix}_0', f'{prefix}_1']
        counts = pd.pivot_table(DATA, index=['play_date', 'performance_label'], columns=flag_column,
                                values='seat_label', aggfunc=np.size)
        smoothed = counts.rolling(2, closed='left').mean()
        smoothed.columns = names
        smoothed[names] = smoothed[names].fillna(smoothed[names].mean())
        per_flag.append(smoothed.reset_index().set_index('performance_label')[names])

    # Prefix with "rolling_" to tell these apart from the expanding-mean features.
    ease = pd.concat(per_flag, axis=1)
    ease.columns = [f'rolling_{name}' for name in ease.columns]
    return ease

- 가격 관련 Features<br>
  : 등급별 가격

In [8]:
def price_features(DATA):
    """Price-related features per performance.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Booking rows with `play_date`, `performance_label`, `price` and
        `origin_price`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `performance_label` with:
        * `G1`..`G5` - the distinct non-zero list prices of the performance,
          highest first, in units of 1000 and zero-padded to five grades
          (more than five distinct prices raises, as before);
        * `price_rate` - history of the paid/list price ratio of earlier
          performances;
        * `rolling_price_rate` - mean paid/list ratio of the two preceding
          performances.
    """
    keys = ['play_date', 'performance_label']

    # Distinct list prices per performance, sorted descending and padded with zeros.
    listed = DATA.dropna(subset=['origin_price']).query('origin_price != 0')\
                 .groupby('performance_label')['origin_price'].unique()
    padded = listed.apply(lambda prices: sorted(prices, reverse=True) + [0] * (5 - len(prices)))
    grade = pd.DataFrame([*padded.values], index=padded.index,
                         columns=[f'G{i}' for i in range(1, 6)]) // 1000

    totals = DATA.groupby(keys)[['price', 'origin_price']].sum()

    # Cumulative paid/list ratio up to the previous performance, divided by
    # the number of earlier performances (row j = ratio[j-1] / j).
    # NOTE(review): the ratio is already cumulative, so dividing by the count
    # again looks unintended; kept as-is pending confirmation.
    running = totals.cumsum()
    running_ratio = running['price'] / running['origin_price']
    n_earlier = pd.Series(range(len(running_ratio)), index=running_ratio.index)
    price_rate = (running_ratio.shift(1) / n_earlier).rename('price_rate')
    # Fill with this series' own mean. The previous code passed the means of
    # the price/origin_price/price_rate columns, whose index can never align
    # with this series, so the missing first row was never filled.
    price_rate = price_rate.fillna(price_rate.mean())

    # Mean per-performance ratio of the two preceding performances.
    show_ratio = totals['price'] / totals['origin_price']
    rolling_rate = show_ratio.rolling(2, closed='left').mean().rename('rolling_price_rate')
    rolling_rate = rolling_rate.fillna(rolling_rate.mean())

    return pd.concat([grade,
                      price_rate.reset_index().set_index('performance_label')['price_rate'],
                      rolling_rate.reset_index().set_index('performance_label')['rolling_price_rate']], axis=1)

- 날짜 간격 관련 Features<br>
  : 최초 100건의 예매가 발생하기까지 걸린 시간의 시계열 평균, 예매일과 공연일 차이의 시계열 평균

In [9]:
def gap_features(DATA):
    """Expanding-mean date-gap features per performance.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Booking rows with `play_date`, `performance_label`, `tran_gap` and
        `play_gap`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `performance_label` with:
        * `100_tran_gap` - for each performance, the mean over all earlier
          performances of "mean of the 100 smallest `tran_gap` values";
        * `play_gap` - the mean over all earlier performances of the mean
          `play_gap`.
        The first performance has no history and is filled with the column
        mean.
    """
    grouped = DATA.groupby(['play_date', 'performance_label'])

    # Mean of the 100 smallest gaps: sort first, then take the head. The
    # previous `sorted(x[:100])` sliced before sorting, so it averaged the
    # first 100 rows in file order and the sort had no effect. NaN gaps sort
    # last and are skipped by `mean`.
    early_gap = grouped['tran_gap'].agg(lambda gaps: gaps.sort_values().head(100).mean())
    n_earlier = pd.Series(range(len(early_gap)), index=early_gap.index)
    # Row j = cumsum[j-1] / j, i.e. the mean over strictly earlier performances.
    tran_history = (early_gap.cumsum().shift(1) / n_earlier).rename('100_tran_gap')
    # Assign the filled result (a chained inplace fillna is a no-op under Copy-on-Write).
    tran_history = tran_history.fillna(tran_history.mean())

    mean_play_gap = grouped['play_gap'].mean()
    play_history = (mean_play_gap.cumsum().shift(1) / n_earlier).rename('play_gap')
    play_history = play_history.fillna(play_history.mean())

    return pd.concat([tran_history.reset_index().set_index('performance_label')['100_tran_gap'],
                      play_history.reset_index().set_index('performance_label')['play_gap']], axis=1)

In [10]:
def gap_rolling_features(DATA):
    """Rolling-mean date-gap features per performance.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Booking rows with `play_date`, `performance_label`, `tran_gap` and
        `play_gap`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `performance_label` with:
        * `rolling_100_tran_gap` - mean over the three preceding performances
          of "mean of the 100 smallest `tran_gap` values";
        * `rolling_play_gap` - mean over the two preceding performances of
          the mean `play_gap`.
        Rows without enough history are filled with the column mean.
    """
    grouped = DATA.groupby(['play_date', 'performance_label'])

    # Sort before taking the head so the 100 smallest gaps are averaged. The
    # previous `sorted(x[:100])` sliced first, so the sort had no effect.
    # NaN gaps sort last and are skipped by `mean`.
    early_gap = grouped['tran_gap'].agg(lambda gaps: gaps.sort_values().head(100).mean())
    # NOTE(review): this window is 3 while every other rolling feature in the
    # notebook uses 2 - confirm that the difference is intentional.
    tran = early_gap.rolling(3, closed='left').mean().rename('rolling_100_tran_gap')
    tran = tran.fillna(tran.mean())

    play = grouped['play_gap'].mean().rolling(2, closed='left').mean().rename('rolling_play_gap')
    play = play.fillna(play.mean())

    return pd.concat([tran.reset_index().set_index('performance_label')['rolling_100_tran_gap'],
                      play.reset_index().set_index('performance_label')['rolling_play_gap']], axis=1)

- 공연명 Doc2Vec 임베딩

In [11]:
def W2V(DATA):
    """Embed each performance title as a 2-dimensional Doc2Vec vector.

    One title (`공연명전처리`) is taken per `performance_label`; performances
    without a title are dropped. A Doc2Vec model is trained on the tokenized
    titles and each title is then passed through `infer_vector`.

    Returns a DataFrame indexed by `performance_label` with one `vector_<i>`
    column per embedding dimension.

    NOTE(review): no seed is set and vectors are re-inferred rather than read
    from the trained model, so results presumably vary between runs - confirm
    whether reproducibility matters here.
    """
    titles = DATA.drop_duplicates('performance_label')\
                 .set_index('performance_label')\
                 .sort_index()[['공연명전처리']]
    titles = titles.dropna(subset=['공연명전처리'])

    # Tokenize every title.
    tokenized = [word_tokenize(title) for title in titles['공연명전처리'].dropna()]
    # Tag each token list with its position.
    tagged = [TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(tokenized)]

    # Initialise the model, build its vocabulary, then train it.
    model = Doc2Vec(vector_size=2, window=2, min_count=1, workers=4, epochs=100)
    model.build_vocab(tagged)
    model.train(tagged, total_examples=model.corpus_count, epochs=model.epochs)

    vectors = [model.infer_vector(tokens) for tokens in tokenized]
    column_names = ['vector_{}'.format(i) for i in range(model.vector_size)]
    return pd.DataFrame(vectors, index=titles.index, columns=column_names)

- 클러스터 관련 Features<br>
  :클러스터별 평균 예매수, 클러스터별 최초 50개 예매 발생까지의 평균 시간, 클러스터별 예매가격/정가

In [12]:
def _cluster_history(table, prefix):
    """Mean of each column of `table` over strictly earlier rows.

    `table` is indexed by (play_date, performance_label) with one column per
    cluster. Row j receives cumsum[j-1] / j; rows without usable history are
    filled with the column mean. Columns are renamed to `<prefix><cluster>`
    and the result is indexed by `performance_label`.
    """
    n_earlier = pd.Series(range(len(table)), index=table.index)
    history = table.cumsum().shift(1).divide(n_earlier, axis=0)
    names = [f'{prefix}{cluster}' for cluster in table.columns]
    history.columns = names
    history = history.fillna(history.mean())
    return history.reset_index().set_index('performance_label')[names]


def cluster_features(DATA):
    """Expanding-mean features per seat cluster.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Booking rows with `play_date`, `performance_label`, `cluster`,
        `seat_label`, `tran_gap`, `price` and `origin_price`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `performance_label`; for every cluster `c`:
        * `meansale_c` - mean booking count over earlier performances;
        * `tran_gap_c` - mean over earlier performances of "mean of the 50
          smallest `tran_gap` values";
        * `price_rate_c` - mean paid/list price ratio over earlier
          performances.

    Notes
    -----
    Fixes relative to the previous implementation:
    * the running count in the denominator is now paired with a cumulative
      sum in the numerator, matching the other expanding features in this
      notebook (before, only the previous row was divided by the count);
    * gaps are sorted before the 50 smallest are taken (`sorted(x[:50])`
      sliced first);
    * `np.inf` replaces `np.infty`, removed in NumPy 2.0, and `replace`
      stands in for the deprecated `applymap`.
    """
    keys = ['play_date', 'performance_label']

    # Bookings per cluster (0 where a cluster sold nothing).
    sale = pd.pivot_table(DATA, index=keys, columns='cluster', values='seat_label',
                          aggfunc=np.size, fill_value=0)

    # Mean of the 50 smallest transaction gaps per cluster (NaN where a cluster sold nothing).
    time = pd.pivot_table(DATA, index=keys, columns='cluster', values='tran_gap',
                          aggfunc=lambda gaps: pd.Series(gaps).sort_values().head(50).mean())

    # Paid price / list price per cluster; a zero list price gives +inf, treated as missing.
    paid = pd.pivot_table(DATA, index=keys, columns='cluster', values='price', aggfunc='sum')
    listed = pd.pivot_table(DATA, index=keys, columns='cluster', values='origin_price',
                            aggfunc='sum', fill_value=0)
    price_rate = paid.divide(listed, axis=0).replace(np.inf, np.nan)

    return pd.concat([_cluster_history(sale, 'meansale_'),
                      _cluster_history(time, 'tran_gap_'),
                      _cluster_history(price_rate, 'price_rate_')], axis=1)

In [13]:
def _cluster_rolling(table, prefix):
    """Mean of the two preceding rows for each column of `table`.

    `table` is indexed by (play_date, performance_label) with one column per
    cluster. Rows without two preceding rows are filled with the column mean.
    Columns are renamed to `<prefix><cluster>` and the result is indexed by
    `performance_label`.
    """
    rolled = table.rolling(2, closed='left').mean()
    rolled = rolled.fillna(rolled.mean())
    names = [f'{prefix}{cluster}' for cluster in table.columns]
    rolled.columns = names
    return rolled.reset_index().set_index('performance_label')[names]


def cluster_rolling_features(DATA):
    """Rolling-mean features per seat cluster.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Booking rows with `play_date`, `performance_label`, `cluster`,
        `seat_label`, `tran_gap`, `price` and `origin_price`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `performance_label`; for every cluster `c`, the mean over
        the two preceding performances of the booking count
        (`rolling_meansale_c`), of the mean of the 50 smallest `tran_gap`
        values (`rolling_tran_gap_c`) and of the paid/list price ratio
        (`rolling_price_rate_c`).

    Notes
    -----
    Gaps are sorted before the 50 smallest are taken (the previous
    `sorted(x[:50])` sliced first). `np.inf` replaces `np.infty`, removed in
    NumPy 2.0, and `replace` stands in for the deprecated `applymap`.
    """
    keys = ['play_date', 'performance_label']

    # Bookings per cluster (0 where a cluster sold nothing).
    sale = pd.pivot_table(DATA, index=keys, columns='cluster', values='seat_label',
                          aggfunc=np.size, fill_value=0)

    # Mean of the 50 smallest transaction gaps per cluster.
    time = pd.pivot_table(DATA, index=keys, columns='cluster', values='tran_gap',
                          aggfunc=lambda gaps: pd.Series(gaps).sort_values().head(50).mean())

    # Paid price / list price per cluster. +inf (zero list price) is turned
    # into NaN before averaging so it cannot leak into a window.
    paid = pd.pivot_table(DATA, index=keys, columns='cluster', values='price', aggfunc='sum')
    listed = pd.pivot_table(DATA, index=keys, columns='cluster', values='origin_price',
                            aggfunc='sum', fill_value=0)
    price_rate = paid.divide(listed, axis=0).replace(np.inf, np.nan)

    return pd.concat([_cluster_rolling(sale, 'rolling_meansale_'),
                      _cluster_rolling(time, 'rolling_tran_gap_'),
                      _cluster_rolling(price_rate, 'rolling_price_rate_')], axis=1)

- `TARGET` 생성<br>
  :클러스터별 좌석예매수

In [14]:
def target(DATA):
    """Build the prediction target: per-cluster booking rate of each performance.

    Bookings are counted per (play_date, performance_label) and cluster
    (0 where a cluster has none) and divided by the number of distinct
    `seat_label` values observed for that cluster in DATA.

    Returns a DataFrame indexed by `performance_label` with one
    `TARGET_<cluster>` column per cluster.
    """
    keys = ['play_date', 'performance_label']
    booked = pd.pivot_table(DATA, index=keys, columns='cluster', values='seat_label',
                            aggfunc=np.size, fill_value=0)
    seats_per_cluster = DATA.groupby('cluster')['seat_label'].nunique()

    flat = booked.divide(seats_per_cluster, axis=1).reset_index()
    key_names = flat.columns.tolist()[:2]
    target_names = [f'TARGET_{cluster}' for cluster in flat.columns[2:]]
    flat.columns = key_names + target_names

    selected = [name for name in flat.columns if 'TARGET' in name]
    return flat.set_index('performance_label')[selected]

## Merge features & target

In [15]:
# Symphony: run every feature builder (and the target) on the symphony rows
# and join the results column-wise on performance_label.
symphony_builders = (basic_features,
                     seat_features, seat_rolling_features,
                     price_features,
                     gap_features, gap_rolling_features,
                     W2V,
                     cluster_features, cluster_rolling_features,
                     target)
symphony_feature = pd.concat([build(data_symphony) for build in symphony_builders], axis=1)
print(f'데이터 크기: {symphony_feature.shape}')

# Replace the remaining missing values with 0.
symphony_feature = symphony_feature.fillna(0)

데이터 크기: (344, 95)


In [16]:
# Chorus: run every feature builder (and the target) on the chorus rows
# and join the results column-wise on performance_label.
chorus_builders = (basic_features,
                   seat_features, seat_rolling_features,
                   price_features,
                   gap_features, gap_rolling_features,
                   W2V,
                   cluster_features, cluster_rolling_features,
                   target)
chorus_feature = pd.concat([build(data_chorus) for build in chorus_builders], axis=1)
print(f'데이터 크기: {chorus_feature.shape}')

# Replace the remaining missing values with 0.
chorus_feature = chorus_feature.fillna(0)

데이터 크기: (69, 102)


In [17]:
# Vocal: run every feature builder (and the target) on the vocal rows
# and join the results column-wise on performance_label.
voice_builders = (basic_features,
                  seat_features, seat_rolling_features,
                  price_features,
                  gap_features, gap_rolling_features,
                  W2V,
                  cluster_features, cluster_rolling_features,
                  target)
voice_feature = pd.concat([build(data_voice) for build in voice_builders], axis=1)
print(f'데이터 크기: {voice_feature.shape}')

# Replace the remaining missing values with 0.
voice_feature = voice_feature.fillna(0)

데이터 크기: (28, 102)


In [18]:
# Solo: run every feature builder (and the target) on the solo rows
# and join the results column-wise on performance_label.
solo_builders = (basic_features,
                 seat_features, seat_rolling_features,
                 price_features,
                 gap_features, gap_rolling_features,
                 W2V,
                 cluster_features, cluster_rolling_features,
                 target)
solo_feature = pd.concat([build(data_solo) for build in solo_builders], axis=1)
print(f'데이터 크기: {solo_feature.shape}')

# Replace the remaining missing values with 0.
solo_feature = solo_feature.fillna(0)

데이터 크기: (39, 102)


In [19]:
# Chamber music: run every feature builder (and the target) on the chamber rows
# and join the results column-wise on performance_label.
chamber_builders = (basic_features,
                    seat_features, seat_rolling_features,
                    price_features,
                    gap_features, gap_rolling_features,
                    W2V,
                    cluster_features, cluster_rolling_features,
                    target)
chamber_feature = pd.concat([build(data_chamber) for build in chamber_builders], axis=1)
print(f'데이터 크기: {chamber_feature.shape}')

# Replace the remaining missing values with 0.
chamber_feature = chamber_feature.fillna(0)

데이터 크기: (26, 95)


## Save data

In [20]:
# Persist the five per-genre feature tables as one tuple, in this fixed order.
# The `with` block closes (and therefore flushes) the file even if dump raises,
# which the previous bare open() did not guarantee.
with open('../data/SaleRatePredictionFeatures.pkl', 'wb') as feature_file:
    pickle.dump((symphony_feature, chorus_feature, voice_feature, solo_feature, chamber_feature),
                feature_file)