## Import

In [1]:
import pandas as pd
import numpy as np
import pickle

# ignore warnings
import warnings ; warnings.filterwarnings('ignore')

## Read data

In [2]:
# Load the cleansed booking records that every feature builder below consumes.
data = pd.read_parquet(path='../data/DataCleansing_final.pqt')

## Feature Generation

In [3]:
def basic_features(DATA):
    """Build the per-seat table of static seat attributes.

    Keeps the first row seen for each ``seat_label``, indexes the result by
    ``seat_label`` (sorted), selects the seat attribute columns and adds a
    combined ``층/블록`` column (``층`` concatenated with ``블록``).

    Parameters
    ----------
    DATA : pandas.DataFrame
        Must contain ``seat_label`` and every column in ``seat_columns``.

    Returns
    -------
    pandas.DataFrame
        One row per ``seat_label``. ``블록`` is dropped after being folded
        into ``층/블록``, which is appended as the last column.
    """
    seat_columns = [
        '층', '블록', '열', '층_1', '층_2', '층_3', '층_합창석', 'BOX',
        'side_seats', 'middle', 'left', 'right', 'front', 'mid', 'front_mid', 'back',
        'piano_hallway', 'sound_quality', 'leg_free', '3rd_bad', 'bad_sound',
        'singer_face', 'conductor_back', 'row_ratio',
    ]
    per_seat = (
        DATA.drop_duplicates('seat_label')
        .set_index('seat_label')
        .sort_index()[seat_columns]
    )
    # assign() returns a new frame instead of writing into the column subset,
    # so no SettingWithCopyWarning is raised (the original relied on the
    # global warning filter to hide it).
    per_seat = per_seat.assign(**{'층/블록': per_seat['층'] + per_seat['블록']})
    return per_seat.drop(columns='블록')

In [4]:
def booked_features(DATA):
    """Count bookings per seat and convert the count to a booking rate (%).

    Parameters
    ----------
    DATA : pandas.DataFrame
        Booking records with ``seat_label``, ``층``, ``price`` and
        ``performance_label`` columns; one row is counted as one booking.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``seat_label`` with columns ``booked_count`` (rows per
        seat/floor group) and ``booking_rate`` (count divided by the number
        of distinct performances, times 100, rounded to 3 decimals).
    """
    booked = (
        DATA.groupby(['seat_label', '층'])['price']
        .size()
        .rename('booked_count')
        .reset_index()
    )
    # Choir seats (합창석) are released for only some performances, so their
    # rate is measured against the performances that actually sold them.
    total = DATA['performance_label'].nunique()
    choir_total = DATA.loc[DATA['층'] == '합창석', 'performance_label'].nunique()
    # Pick the denominator per row by label. The original read x[0] / x[1]
    # positionally from a string-labelled Series, which pandas has deprecated.
    denominators = np.where(booked['층'] == '합창석', choir_total, total)
    booked['booking_rate'] = (booked['booked_count'] / denominators * 100).round(3)
    return booked.set_index('seat_label')[['booked_count', 'booking_rate']]

In [5]:
def time_features(DATA):
    """Return each seat's mean ``res_time_rank_scaled``, rounded to 4 decimals.

    The result is a Series indexed by ``seat_label`` and named
    ``res_time_rank_mean``.
    """
    mean_rank = DATA.groupby('seat_label')['res_time_rank_scaled'].mean()
    rounded = mean_rank.apply(lambda value: round(value, 4))
    return rounded.rename('res_time_rank_mean')

- 교향곡

In [6]:
# Symphony table: static seat attributes from all records, placed side by side
# with booking and reservation-time statistics from symphony rows only.
# Columns are aligned on the seat_label index.
symphony_features = pd.concat(
    objs=[
        basic_features(data),
        booked_features(data.query('genre=="교향곡"')),
        time_features(data.query('genre=="교향곡"')),
    ],
    axis='columns',
)

- 합창

In [7]:
# Chorus table: static seat attributes from all records, placed side by side
# with booking and reservation-time statistics from chorus rows only.
# Columns are aligned on the seat_label index.
chorus_features = pd.concat(
    objs=[
        basic_features(data),
        booked_features(data.query('genre=="합창"')),
        time_features(data.query('genre=="합창"')),
    ],
    axis='columns',
)

- 성악

In [8]:
# Vocal table: static seat attributes from all records, placed side by side
# with booking and reservation-time statistics from vocal rows only.
# Columns are aligned on the seat_label index.
voice_features = pd.concat(
    objs=[
        basic_features(data),
        booked_features(data.query('genre=="성악"')),
        time_features(data.query('genre=="성악"')),
    ],
    axis='columns',
)

- 독주

In [9]:
# Solo table: static seat attributes from all records, placed side by side
# with booking and reservation-time statistics from solo rows only.
# Columns are aligned on the seat_label index.
solo_features = pd.concat(
    objs=[
        basic_features(data),
        booked_features(data.query('genre=="독주"')),
        time_features(data.query('genre=="독주"')),
    ],
    axis='columns',
)

- 실내악

In [10]:
# Chamber-music table: static seat attributes from all records, placed side by
# side with booking and reservation-time statistics from chamber rows only.
# Columns are aligned on the seat_label index.
chamber_features = pd.concat(
    objs=[
        basic_features(data),
        booked_features(data.query('genre=="실내악"')),
        time_features(data.query('genre=="실내악"')),
    ],
    axis='columns',
)

## Feature Transformation

- 교향곡

In [11]:
# Replace each value v with 1 - v in both columns, reversing their direction.
# NOTE(review): presumably both columns are scaled to [0, 1] - confirm upstream.
symphony_features[['res_time_rank_mean', 'row_ratio']] = (
    1 - symphony_features[['res_time_rank_mean', 'row_ratio']]
)

- 합창

In [12]:
# Replace each value v with 1 - v in both columns, reversing their direction.
# NOTE(review): presumably both columns are scaled to [0, 1] - confirm upstream.
chorus_features[['res_time_rank_mean', 'row_ratio']] = (
    1 - chorus_features[['res_time_rank_mean', 'row_ratio']]
)

- 성악

In [13]:
# Replace each value v with 1 - v in both columns, reversing their direction.
# NOTE(review): presumably both columns are scaled to [0, 1] - confirm upstream.
voice_features[['res_time_rank_mean', 'row_ratio']] = (
    1 - voice_features[['res_time_rank_mean', 'row_ratio']]
)

- 독주

In [14]:
# Replace each value v with 1 - v in both columns, reversing their direction.
# NOTE(review): presumably both columns are scaled to [0, 1] - confirm upstream.
solo_features[['res_time_rank_mean', 'row_ratio']] = (
    1 - solo_features[['res_time_rank_mean', 'row_ratio']]
)

- 실내악

In [15]:
# Replace each value v with 1 - v in both columns, reversing their direction.
# NOTE(review): presumably both columns are scaled to [0, 1] - confirm upstream.
chamber_features[['res_time_rank_mean', 'row_ratio']] = (
    1 - chamber_features[['res_time_rank_mean', 'row_ratio']]
)

## Feature Selection

- 교향곡

In [16]:
# Columns removed from the symphony table; the frame is modified in place.
unuse = ['front_mid', 'singer_face']
symphony_features.drop(columns=unuse, inplace=True)

- 합창

In [17]:
# Columns removed from the chorus table; the frame is modified in place.
unuse = ['front_mid', 'piano_hallway', 'bad_sound', 'singer_face']
chorus_features.drop(columns=unuse, inplace=True)

- 성악

In [18]:
# Columns removed from the vocal table; the frame is modified in place.
unuse = ['front_mid', 'sound_quality', 'conductor_back']
voice_features.drop(columns=unuse, inplace=True)

- 독주

In [19]:
# Columns removed from the solo table; the frame is modified in place.
unuse = ['front_mid', 'sound_quality', 'bad_sound', 'singer_face', 'conductor_back']
solo_features.drop(columns=unuse, inplace=True)

- 실내악

In [20]:
# Columns removed from the chamber-music table; the frame is modified in place.
unuse = ['piano_hallway', 'singer_face', 'conductor_back']
chamber_features.drop(columns=unuse, inplace=True)

## Save data

In [21]:
# Persist the five per-genre tables as a single tuple, in the order
# (symphony, chorus, voice, solo, chamber). The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
with open('../data/ClusteringFeatures.pkl', 'wb') as pkl_file:
    pickle.dump(
        (symphony_features, chorus_features, voice_features, solo_features, chamber_features),
        pkl_file,
    )