## Import

In [1]:
import pandas as pd
import numpy as np
import pickle

# W2V
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# ignore warnings
import warnings ; warnings.filterwarnings('ignore')

## Read data

In [2]:
# Load the base table from Parquet; the per-genre frames built in the next
# cell are all merged from this DataFrame.
data = pd.read_parquet('../data/DataCleansing_final.pqt')

In [3]:
# Load the clustering result of each genre (one CSV per genre).
symphony = pd.read_csv('../data/ClusteringResult/symphony_clustering.csv')
chorus = pd.read_csv('../data/ClusteringResult/chorus_clustering.csv')
voice = pd.read_csv('../data/ClusteringResult/voice_clustering.csv')
solo = pd.read_csv('../data/ClusteringResult/solo_clustering.csv')

# Attach the seat-level cluster id to the base table, once per genre.
# The default inner join keeps only rows whose seat_label appears in the
# corresponding clustering file.
data_symphony = pd.merge(data, symphony[['seat_label', 'cluster']], on='seat_label')
data_chorus = pd.merge(data, chorus[['seat_label', 'cluster']], on='seat_label')
data_voice = pd.merge(data, voice[['seat_label', 'cluster']], on='seat_label')
data_solo = pd.merge(data, solo[['seat_label', 'cluster']], on='seat_label')

## Feature Generation
장르별 Clustering 결과에 Feature Generation 함수를 적용할 수 있도록 한다.

- 기초 Features<br>
  : 공연년도, 공연월, 공연시간, 공연요일, 예매년도, 예매월, 선예매기간, 동월 해당장르 공연수, 동월 전체 공연 대비 해당 장르 공연수

In [4]:
def basic_features(DATA):
    """Build performance-level base features, one row per ``performance_label``.

    The result contains
      * the static columns of each performance (taken from its first row),
      * ``n_performance_month``: number of distinct performances in ``DATA``
        that share the same ``play_year`` / ``play_month``,
      * ``sales``: mean row count of the two preceding performances, with
        performances ordered by ``(play_date, performance_label)``.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Row-level table containing the columns referenced below.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``performance_label``. ``sales`` stays NaN when ``DATA``
        holds fewer than three performances (no two-performance history).
    """
    static_columns = ['performance_label', 'play_year', 'play_month', 'play_time',
                      'play_weekday', 'open_year', 'open_month', 'pre_open_gap',
                      'open_gap', 'running_time', 'intermission', 'n_grade']
    ease = DATA.drop_duplicates('performance_label')[static_columns]

    # Number of distinct performances per (year, month).
    month = (DATA.groupby(['play_year', 'play_month'])['performance_label']
             .nunique().rename('n_performance_month'))
    ease = (ease.merge(month.reset_index(), on=['play_year', 'play_month'])
            .set_index('performance_label').sort_index())

    # Average booking count of the two previous performances; closed='left'
    # keeps the current performance out of its own window.
    performance = (DATA.groupby(['play_date', 'performance_label'])['price'].size()
                   .rolling(2, closed='left').mean().rename('sales').reset_index())
    # Position 2 is the first row with a full window; the rows without a
    # history inherit it. The filled column is assigned back because the
    # chained `df[col].fillna(..., inplace=True)` form does not update the
    # frame under pandas Copy-on-Write. Short inputs are left as NaN instead
    # of raising IndexError.
    if len(performance) > 2:
        performance['sales'] = performance['sales'].fillna(performance['sales'].iloc[2])

    # NOTE(review): the column-wise concat aligns on performance_label, so it
    # assumes each label maps to a single play_date - confirm against the data.
    return pd.concat([ease, performance.set_index('performance_label')['sales']], axis=1)

- 가격 관련 Features<br>
  : 등급별 가격

In [5]:
def price_features(DATA):
    """Build price features, one row per ``performance_label``.

    The result contains
      * ``G1`` .. ``G5``: the distinct non-zero ``origin_price`` values of the
        performance in descending order, in units of 1000, padded with 0,
      * ``price_rate``: mean of ``sum(price) / sum(origin_price)`` over the
        two preceding performances, with performances ordered by
        ``(play_date, performance_label)``.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Row-level table with ``performance_label``, ``play_date``, ``price``
        and ``origin_price``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``performance_label``. ``price_rate`` stays NaN when
        ``DATA`` holds fewer than three performances.
    """
    n_grades = 5

    priced = DATA.dropna(subset=['origin_price']).query('origin_price != 0')
    tiers = priced.groupby('performance_label')['origin_price'].unique()
    padded = []
    for prices in tiers:
        # Keep at most the n_grades highest tiers: a performance with more
        # distinct prices used to make the fixed-width frame construction fail.
        top = sorted(prices, reverse=True)[:n_grades]
        padded.append(top + [0] * (n_grades - len(top)))
    grade = pd.DataFrame(padded, index=tiers.index,
                         columns=[f'G{i}' for i in range(1, n_grades + 1)]) // 1000

    rate = DATA.groupby(['play_date', 'performance_label'])[['price', 'origin_price']].sum()
    # closed='left' keeps the current performance out of its own window.
    price_rate = ((rate['price'] / rate['origin_price'])
                  .rolling(2, closed='left').mean().rename('price_rate'))
    # Position 2 is the first row with a full window and is used as the fill
    # value. Reassigning (instead of a chained in-place fillna on a column
    # selection) also works under pandas Copy-on-Write; short inputs are left
    # as NaN instead of raising IndexError.
    if len(price_rate) > 2:
        price_rate = price_rate.fillna(price_rate.iloc[2])

    price_rate = price_rate.reset_index().set_index('performance_label')['price_rate']
    return pd.concat([grade, price_rate], axis=1)

- 날짜 간격 관련 Features<br>
  : 공연별 최초 100건 예매까지 걸린 시간(tran_gap)의 시계열 평균, 예매일과 공연일 차이(play_gap)의 시계열 평균

In [6]:
def gap_features(DATA):
    """Build date-gap features, one row per ``performance_label``.

    The result contains
      * ``100_tran_gap``: for each performance, the mean of its 100 smallest
        ``tran_gap`` values; the feature is the mean of that statistic over
        the three preceding performances,
      * ``play_gap``: mean ``play_gap`` per performance, averaged over the two
        preceding performances.

    Performances are ordered by ``(play_date, performance_label)``.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Row-level table with ``play_date``, ``performance_label``,
        ``tran_gap`` and ``play_gap``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``performance_label``. A column stays NaN when there are
        too few performances to fill a single window.
    """
    grouped = DATA.groupby(['play_date', 'performance_label'])

    # Sort first, then keep the 100 smallest gaps. The previous
    # `sorted(x[:100])` sliced before sorting, so the sort had no effect on
    # the mean and the rows used depended on the row order of DATA.
    # np.sort places NaN last, so missing gaps are only used when a group has
    # fewer than 100 valid values.
    tran = (grouped['tran_gap']
            .agg(lambda gaps: np.sort(gaps.to_numpy())[:100].mean())
            .rolling(3, closed='left').mean().rename('100_tran_gap'))
    # Position 3 is the first row with a full three-performance window.
    # Short inputs are left as NaN instead of raising IndexError.
    if len(tran) > 3:
        tran = tran.fillna(tran.iloc[3])

    play = grouped['play_gap'].mean().rolling(2, closed='left').mean()
    # Position 2 is the first row with a full two-performance window.
    if len(play) > 2:
        play = play.fillna(play.iloc[2])

    return pd.concat([tran.reset_index().set_index('performance_label')['100_tran_gap'],
                      play.reset_index().set_index('performance_label')['play_gap']], axis=1)

- 공연명 임베딩 (함수명은 W2V이지만 실제로는 gensim Doc2Vec을 사용한다)

In [7]:
def W2V(DATA):
    """Embed each performance title as a 2-dimensional Doc2Vec vector.

    One title per ``performance_label`` is taken from the '공연명전처리'
    column (rows without a title are dropped), tokenized with NLTK and used
    to train a Doc2Vec model. The returned frame is indexed by
    ``performance_label`` and holds one ``vector_<k>`` column per embedding
    dimension, obtained with ``infer_vector`` on the same tokens.
    """
    # One (sorted) row per performance, restricted to the title column.
    titles = (DATA.drop_duplicates('performance_label')
              .set_index('performance_label')
              .sort_index()[['공연명전처리']]
              .dropna(subset=['공연명전처리']))

    # Tokenize every title.
    tokenized = [word_tokenize(text) for text in titles['공연명전처리']]

    # Tag each token list with its position so Doc2Vec can identify it.
    documents = [TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(tokenized)]

    # Initialise the model, build its vocabulary, then train it.
    model = Doc2Vec(vector_size=2, window=2, min_count=1, workers=4, epochs=100)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

    vectors = [model.infer_vector(tokens) for tokens in tokenized]
    column_names = ['vector_{}'.format(dim) for dim in range(model.vector_size)]
    return pd.DataFrame(vectors, index=titles.index, columns=column_names)

- 클러스터 관련 Features<br>
  : 클러스터별 평균 예매수, 최초 50건 예매까지의 평균 시간, 정가 대비 실제 결제 가격 비율

In [8]:
def _cluster_lagged_mean(values, window=2):
    """Mean of the previous ``window`` rows, with gaps filled from position ``window``.

    The current row is excluded from its own window (``closed='left'``).
    Every NaN is replaced by the value at position ``window``, the first row
    with a full window. Series too short to have such a row are returned
    unfilled instead of raising IndexError.
    """
    lagged = values.rolling(window, closed='left').mean()
    if len(lagged) > window:
        lagged = lagged.fillna(lagged.iloc[window])
    return lagged


def cluster_features(DATA):
    """Build the target and cluster-level features per (cluster, performance).

    For every value of ``DATA.cluster`` and every performance in that cluster
    the result holds
      * ``TARGET``: distinct seats booked for the performance divided by the
        number of distinct seats in the cluster,
      * ``meansale``: mean ``TARGET`` of the two preceding performances,
      * ``cluster_tran_gap``: mean of the 50 smallest ``tran_gap`` values per
        performance, averaged over the two preceding performances,
      * ``cluster_price_rate``: ``sum(price) / sum(origin_price)`` averaged
        over the two preceding performances (``+inf`` mapped to 1),
      * one ``cluster_<id>`` indicator column per cluster.

    Performances are ordered by ``(play_date, performance_label)`` within each
    cluster.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Row-level table with ``cluster``, ``play_date``, ``performance_label``,
        ``seat_label``, ``tran_gap``, ``price`` and ``origin_price``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``performance_label``; the index is not unique because a
        performance has one row per cluster.
    """
    keys = ['play_date', 'performance_label']
    frames = []
    for c in DATA.cluster.unique():
        subset = DATA[DATA['cluster'] == c]
        grouped = subset.groupby(keys)

        # Target: share of the cluster's seats that were booked.
        ease = grouped[['seat_label']].nunique() / subset.seat_label.nunique()

        # Lagged average of the target. Columns are assigned on the frame:
        # the chained `ease[col].fillna(..., inplace=True)` form does not
        # update it under pandas Copy-on-Write.
        ease['meansale'] = _cluster_lagged_mean(ease['seat_label'])

        # Mean of the 50 smallest gaps. The previous `sorted(x[:50])` sliced
        # before sorting, so the sort had no effect on the mean.
        first_gaps = grouped['tran_gap'].agg(lambda gaps: np.sort(gaps.to_numpy())[:50].mean())
        ease['cluster_tran_gap'] = _cluster_lagged_mean(first_gaps)

        # Price actually paid relative to the list price.
        totals = grouped[['price', 'origin_price']].sum()
        price_rate = _cluster_lagged_mean(totals['price'] / totals['origin_price'])
        # np.inf replaces the alias np.infty, which NumPy 2.0 removed.
        ease['cluster_price_rate'] = price_rate.mask(price_rate == np.inf, 1)

        ease['cluster'] = c
        frames.append(ease)

    # Concatenate once instead of growing a frame inside the loop.
    mean = pd.concat(frames)
    mean = (mean.reset_index().set_index('performance_label')
            [['seat_label', 'meansale', 'cluster_tran_gap', 'cluster_price_rate', 'cluster']]
            .rename(columns={'seat_label': 'TARGET'}))
    dummies = pd.get_dummies(mean['cluster'], prefix='cluster', dtype=int)
    return pd.concat([mean.drop(columns='cluster'), dummies], axis=1)

## Merge features & target

In [9]:
# Symphony: start from the cluster-level targets, then attach the
# performance-level features that share the same performance_label.
symphony_feature = cluster_features(data_symphony).reset_index()
symphony_feature = symphony_feature.merge(
    pd.concat(
        [
            basic_features(data_symphony),
            price_features(data_symphony),
            gap_features(data_symphony),
            W2V(data_symphony),
        ],
        axis=1,
    ).reset_index(),
    on='performance_label',
)
symphony_feature = symphony_feature.set_index('performance_label')
print(f'데이터 크기: {symphony_feature.shape}')

# Handle missing values: grade prices get the sentinel -1, everything else 0.
symphony_feature[['G1', 'G2', 'G3', 'G4', 'G5']] = (
    symphony_feature[['G1', 'G2', 'G3', 'G4', 'G5']].fillna(-1)
)
symphony_feature = symphony_feature.fillna(0)

데이터 크기: (3858, 33)


In [10]:
# Chorus: start from the cluster-level targets, then attach the
# performance-level features that share the same performance_label.
chorus_feature = cluster_features(data_chorus).reset_index()
chorus_feature = chorus_feature.merge(
    pd.concat(
        [
            basic_features(data_chorus),
            price_features(data_chorus),
            gap_features(data_chorus),
            W2V(data_chorus),
        ],
        axis=1,
    ).reset_index(),
    on='performance_label',
)
chorus_feature = chorus_feature.set_index('performance_label')
print(f'데이터 크기: {chorus_feature.shape}')

# Handle missing values: grade prices get the sentinel -1, everything else 0.
chorus_feature[['G1', 'G2', 'G3', 'G4', 'G5']] = (
    chorus_feature[['G1', 'G2', 'G3', 'G4', 'G5']].fillna(-1)
)
chorus_feature = chorus_feature.fillna(0)

데이터 크기: (4115, 34)


In [11]:
# Vocal music: start from the cluster-level targets, then attach the
# performance-level features that share the same performance_label.
voice_feature = cluster_features(data_voice).reset_index()
voice_feature = voice_feature.merge(
    pd.concat(
        [
            basic_features(data_voice),
            price_features(data_voice),
            gap_features(data_voice),
            W2V(data_voice),
        ],
        axis=1,
    ).reset_index(),
    on='performance_label',
)
voice_feature = voice_feature.set_index('performance_label')
print(f'데이터 크기: {voice_feature.shape}')

# Handle missing values: grade prices get the sentinel -1, everything else 0.
voice_feature[['G1', 'G2', 'G3', 'G4', 'G5']] = (
    voice_feature[['G1', 'G2', 'G3', 'G4', 'G5']].fillna(-1)
)
voice_feature = voice_feature.fillna(0)

데이터 크기: (4406, 34)


In [12]:
# Solo recital: start from the cluster-level targets, then attach the
# performance-level features that share the same performance_label.
solo_feature = cluster_features(data_solo).reset_index()
solo_feature = solo_feature.merge(
    pd.concat(
        [
            basic_features(data_solo),
            price_features(data_solo),
            gap_features(data_solo),
            W2V(data_solo),
        ],
        axis=1,
    ).reset_index(),
    on='performance_label',
)
solo_feature = solo_feature.set_index('performance_label')
print(f'데이터 크기: {solo_feature.shape}')

# Handle missing values: grade prices get the sentinel -1, everything else 0.
solo_feature[['G1', 'G2', 'G3', 'G4', 'G5']] = (
    solo_feature[['G1', 'G2', 'G3', 'G4', 'G5']].fillna(-1)
)
solo_feature = solo_feature.fillna(0)

데이터 크기: (4564, 34)


## Save data

In [13]:
# Persist the four genre feature tables as a single tuple, in the order
# (symphony, chorus, voice, solo). The context manager closes the handle,
# so the pickle is flushed to disk even if dumping raises part-way.
with open('../data/Feature/Feature_230925.pkl', 'wb') as feature_file:
    pickle.dump((symphony_feature, chorus_feature, voice_feature, solo_feature),
                feature_file)