In [1]:
#########
# BASIC #
#########
import os
import sys
import platform
import warnings
from collections import Counter
import calendar
from tqdm.notebook import tqdm_notebook

#############
# LOAD DATA #
#############
import numpy as np
import pandas as pd

#############################
# EXPLORATORY DATA ANALYSIS #
#############################
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

######################
# DATA PREPROCESSING #
######################
import re
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
## NATURAL LANGUAGE PROCESSING 
from konlpy.tag import Komoran
from gensim.models import Word2Vec

#######################
# FEATURE ENGINEERING #
#######################
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#####################
# FEATURE SELECTION #
#####################


############
# MODELING #
############
## MACHINE LEARNING
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor

## DEEP LEARNING
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

###########
# TESTING #
###########
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [2]:
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings(action='ignore')
# Show floats with four decimals in DataFrame output.
pd.options.display.float_format = '{:.4f}'.format

# Apply the base style first, then the rcParams overrides below.
plt.style.use('ggplot')

# Pick a font family with Hangul glyphs so Korean labels render in plots.
# platform.system() returns 'Windows', 'Darwin' (macOS) or 'Linux'.
# NOTE(review): 'NanumGothic' is assumed to be installed on Linux hosts — confirm.
HANGUL_FONT_BY_SYSTEM = {
    'Windows': 'Malgun Gothic',
    'Darwin': 'AppleGothic',
    'Linux': 'NanumGothic',
}
# Unknown systems keep the previous behaviour (AppleGothic).
mpl.rcParams['font.family'] = HANGUL_FONT_BY_SYSTEM.get(platform.system(), 'AppleGothic')

# Draw the minus sign as ASCII '-' instead of the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False

# Query CUDA availability once and reuse the answer for both the device and the message.
cuda_available = torch.cuda.is_available()
device = 'cuda' if cuda_available else 'cpu'
if cuda_available:
    print(f'현재 CUDA를 사용할 수 있습니다. 사용하고 있는 장치는 {device} 입니다.')
else:
    print(f'현재 CUDA를 사용할 수 없습니다. 사용하고 있는 장치는 {device} 입니다. CPU로 학습시 오래 걸릴 수 있습니다.')

현재 CUDA를 사용할 수 있습니다. 사용하고 있는 장치는 cuda 입니다.


In [3]:
# Notebook-wide constants. None of them is referenced in the cells visible here,
# so the notes below are assumptions based on the names — TODO confirm at the call sites.
# Presumably the seed passed as random_state to splitters/models.
RANDOM_STATE = 42
# Presumably the hold-out fraction for train_test_split.
TEST_SIZE = 0.2
# Presumably the optimizer step size for the torch model.
learning_rate = 1e-4

In [7]:
# Container for the notebook's datasets, keyed by split name.
data = {}

In [8]:
# Read the 2019 sales sheet. header=1 takes the second spreadsheet row as the
# header row, and `names` replaces those headers with the labels used below.
train_loaded = pd.read_excel(
    r'../../data/01_제공데이터/2020 빅콘테스트 데이터분석분야-챔피언리그_2019년 실적데이터.xlsx',
    header=1,
    names=['방송일시', '노출(분)', '마더코드', '상품코드',
           '상품명', '상품군', '판매단가', '취급액'],
    thousands=',',
)

# Use the broadcast timestamp as a DatetimeIndex (the column itself is dropped).
train_loaded['방송일시'] = pd.to_datetime(train_loaded['방송일시'])

# Keep only rows whose timestamp falls in 2019. Partial-string row selection must
# go through .loc: DataFrame['2019'] was deprecated in pandas 1.2 and removed in 2.0.
# .copy() gives an independent frame so columns can be added to it later without
# SettingWithCopyWarning.
data['train'] = train_loaded.set_index('방송일시').loc['2019'].copy()
data['train']

Unnamed: 0_level_0,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액
방송일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-01 06:00:00,20.0000,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0000
2019-01-01 06:00:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0000
2019-01-01 06:20:00,20.0000,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0000
2019-01-01 06:20:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0000
2019-01-01 06:40:00,20.0000,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0000
...,...,...,...,...,...,...,...
2019-12-31 23:20:00,,100448,201391,일시불쿠첸압력밥솥 6인용,주방,148000,1664000.0000
2019-12-31 23:40:00,20.0000,100448,201383,무이자쿠첸압력밥솥 10인용,주방,178000,9149000.0000
2019-12-31 23:40:00,,100448,201390,일시불쿠첸압력밥솥 10인용,주방,168000,15282000.0000
2019-12-31 23:40:00,,100448,201384,무이자쿠첸압력밥솥 6인용,주방,158000,2328000.0000


In [10]:
# Expand the DatetimeIndex into one feature column per calendar component.
# Each extractor receives a single index element (a timestamp).
# NOTE(review): pandas' `.week` appears to follow ISO 8601 week numbering, in which
# case 2019-12-30 and 2019-12-31 are labelled week 1 — confirm this is intended.
broadcast_index = data['train'].index
calendar_parts = {
    '연': lambda ts: ts.year,
    '월': lambda ts: ts.month,
    '주': lambda ts: ts.week,
    '일': lambda ts: ts.day,
    '시': lambda ts: ts.hour,
    '분': lambda ts: ts.minute,
    # Abbreviated weekday name looked up from the stdlib calendar table.
    '요일': lambda ts: calendar.day_abbr[ts.weekday()],
}
for column_name, extract in calendar_parts.items():
    data['train'][column_name] = broadcast_index.map(extract)

In [11]:
# Work on an independent copy so columns added to `train` do not alter data['train'].
train = data['train'].copy()

# General Prime Time

+ WEEKDAY 20:00 ~ 24:00
+ SAT 19:00 ~ 23:30
+ SUN 18:00 ~ 23:30

In [40]:
def prime_time(timestamp):
    """Return whether a broadcast timestamp falls in the general prime-time window.

    Windows are inclusive and compared as HHMM integers:

    * Monday-Friday: 20:00-24:00
    * Saturday:      19:00-23:30
    * Sunday:        18:00-23:30

    The day of week is taken from ``timestamp.weekday()`` (0 = Monday ... 6 = Sunday)
    instead of ``calendar.day_abbr``, whose strings follow the current locale.
    With non-English day names the string comparison would match no branch and
    the function would return None.

    Args:
        timestamp: Object exposing ``weekday()``, ``hour`` and ``minute``
            (e.g. ``datetime.datetime`` or ``pandas.Timestamp``).

    Returns:
        bool: True when the time of day lies inside that day's window.
    """
    weekday = timestamp.weekday()
    # Encode the time of day as HHMM, e.g. 19:05 -> 1905.
    hhmm = timestamp.hour * 100 + timestamp.minute

    if weekday < 5:
        # HHMM never exceeds 2359, so this window runs to the end of the day.
        start, end = 2000, 2400
    elif weekday == 5:
        start, end = 1900, 2330
    else:
        start, end = 1800, 2330

    return start <= hhmm <= end

In [42]:
# Flag each row by applying prime_time to its index value (the broadcast timestamp).
train['prime_time'] = train.index.map(prime_time)

# Specific Prime Time

상품군별로 다르게 팔리는 시간이 있는가 확인

In [118]:
import plotly.express as px

In [133]:
# Mean 취급액 per hour of day for every product group, Sunday rows only.
product_groups = train['상품군'].unique()
sunday_rows = train[train['요일'] == 'Sun']

# numeric_only=True is required on pandas >= 2.0: the frame holds string columns
# (상품명, 상품군, 요일), and mean() no longer drops them silently.
groups = [
    sunday_rows[sunday_rows['상품군'] == name].groupby('시').mean(numeric_only=True)
    for name in product_groups
]

# One column per product group, indexed by hour. A group with no Sunday rows
# contributes an empty series and shows up as an all-NaN column.
totals = pd.DataFrame({
    name: hourly_mean['취급액']
    for name, hourly_mean in zip(product_groups, groups)
})

fig = px.line(totals)
fig.show()

In [134]:
# Mean 취급액 per hour of day for every product group, across all days of the week.
product_groups = train['상품군'].unique()

# numeric_only=True is required on pandas >= 2.0: the frame holds string columns
# (상품명, 상품군, 요일), and mean() no longer drops them silently.
groups = [
    train[train['상품군'] == name].groupby('시').mean(numeric_only=True)
    for name in product_groups
]

# One column per product group, indexed by hour.
totals = pd.DataFrame({
    name: hourly_mean['취급액']
    for name, hourly_mean in zip(product_groups, groups)
})

fig = px.line(totals)
fig.show()

# Duration Word Cloud