In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from matplotlib import font_manager, rc
import platform
import re

import datetime
import os

# Choose the plotting font family per operating system
# (presumably so that Korean labels render correctly -- confirm).
if platform.system() != 'Windows':
    # Every non-Windows platform lands here and is treated as macOS.
    rc('font', family='AppleGothic')
else:
    # Windows: resolve the family name from the Malgun Gothic font file.
    font_name = font_manager.FontProperties(
        fname="c:/Windows/Fonts/malgun.ttf"
    ).get_name()
    rc('font', family=font_name)

plt.style.use('ggplot')

# Seed NumPy's global RNG; get_merge() below draws from it.
np.random.seed(0)
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting
# it here likely only affects child processes, not this kernel -- confirm.
os.environ["PYTHONHASHSEED"] = "123"


In [12]:
## Provided data
# Both CSV files are cp949-encoded; the first row is the header and the first
# column becomes the DataFrame index.
train = pd.read_csv(
    'train_after_processing_weatheropt3.csv',
    encoding='cp949',
    index_col=0,
    header=0,
)
test = pd.read_csv(
    'test_after_processing_weatheropt3.csv',
    encoding='cp949',
    index_col=0,
    header=0,
)


In [13]:
# Keep only rows whose 상품군 is not "무형"; for train additionally discard
# rows whose 취급액 is 0 or 50000.  reset_index() moves the previous index
# into a regular column named 'index'.
train = train.query('상품군 != "무형" & 취급액 != 0 & 취급액 != 50000').reset_index()
test = test.query('상품군 != "무형"').reset_index()

# Set aside the target and the original row labels before they are dropped.
target = train['취급액']
orgin_tr_index = train['index']
orgin_te_index = test['index']

# Remove the columns that are not used from here on.
train = train.drop(columns=['index', '취급액', '브랜드', '시청률'])
test = test.drop(columns=['index', '취급액', '브랜드'])

In [14]:
def get_merge(df, n, columns):
    """Build one token list per row of ``df`` and enlarge it with shuffled copies.

    For each row the values of ``columns`` are collected in the given order.
    Then, ``n`` times, a random permutation of the current list (drawn with
    ``np.random.choice(..., replace=False)``) is appended to it, so every
    round doubles the list and each row ends up with
    ``len(columns) * 2 ** n`` tokens.

    Rows are read by position, so ``df`` does not need a default
    ``0..len(df)-1`` index.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the token columns.
    n : int
        Number of shuffle-and-append rounds; ``0`` returns the raw values.
    columns : list
        Column labels whose values become the tokens of a row.

    Returns
    -------
    list of list
        One token list per row, in row order.  For ``n >= 1`` the tokens have
        passed through ``np.append`` and are therefore NumPy scalars.

    Notes
    -----
    Draws from NumPy's global random state, so results depend on
    ``np.random.seed`` and on the order in which this function is called.
    """
    merge_list = []
    # Selecting the columns once and iterating the underlying array avoids a
    # label lookup per row and works for any index.
    for row in df[columns].values:
        value = list(row)
        for _ in range(n):
            # Append a full permutation of the current tokens (no repeats
            # within the drawn sample).
            value = list(np.append(value, np.random.choice(value, len(value), replace=False)))
        merge_list.append(value)
    return merge_list

In [15]:
# Columns whose values are used as word2vec tokens
# (presumably the product-category hierarchy -- confirm).
columns_name = ['상품군', '대', '중', '소']

# Three shuffle-and-append rounds per row.  Train is processed before test so
# the global NumPy RNG is consumed in the same order on every run.
w2v_train = get_merge(train, n=3, columns=columns_name)
w2v_test = get_merge(test, n=3, columns=columns_name)

In [16]:
# Scratch check of the append-and-sample idiom, applied to the column names.
# Unlike get_merge, no replace=False is passed here, so the same name may be
# drawn more than once.  This call also advances NumPy's global RNG.
np.append(columns_name, np.random.choice(columns_name, 3))

array(['상품군', '대', '중', '소', '중', '중', '중'], dtype='<U3')

In [17]:
num_features = 200 # dimensionality of each word vector
min_word_count = 3 # minimum token count passed to Word2Vec as min_count
context = 5 # training window size (list of neighbouring words)

# Initialise and train the model
from gensim.models import word2vec

# Train the model on the train-set token lists only.
# NOTE(review): `size=` and `init_sims` look like the gensim 3.x API (gensim 4
# uses `vector_size` and deprecates `init_sims`) -- confirm the pinned version.
# seed=5 with workers=1 is presumably meant to make training repeatable.
w2v = word2vec.Word2Vec(w2v_train, 
                        size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=5, workers=1)
# Unload memory that is no longer needed.
# NOTE(review): per the gensim docs, replace=True keeps only the L2-normalised
# vectors, so the features built later come from normalised vectors -- confirm.
w2v.init_sims(replace=True)

### Make features
# 구매상품에 해당하는 벡터의 평균/최소/최대 벡터를 feature로 만드는 전처리기(pipeline에서 사용 가능)
class EmbeddingVectorizer(object):
    """Turn token lists into fixed-length features built from word vectors.

    For every token list, the vectors of the tokens known to ``word2vec`` are
    reduced element-wise with max, min and mean, and the three results are
    concatenated (in that order) into one row of length ``3 * dim``.  A list
    with no known token is represented by a single zero vector, i.e. an
    all-zero row.

    Parameters
    ----------
    word2vec : mapping-like
        Object supporting ``token in word2vec`` and ``word2vec[token]``.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Length of the zero vector used for rows without any known token;
        # read from the module-level ``num_features`` setting.
        self.dim = num_features

    def fit(self, X, y=None):
        """No-op fit; ``y`` is accepted (and ignored) so that callers that
        pass a target, such as a scikit-learn pipeline, can use this class.

        Returns
        -------
        EmbeddingVectorizer
            ``self``.
        """
        return self

    def transform(self, X):
        """Return an array with one ``[max | min | mean]`` row per token list.

        Parameters
        ----------
        X : iterable of iterable
            Token lists to embed.

        Returns
        -------
        numpy.ndarray
            Stacked feature rows, in the order of ``X``.
        """
        rows = []
        for words in X:
            # Look the vectors up once per row and reuse them for all three
            # reductions; fall back to one zero vector when nothing is known.
            vectors = [self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)]
            rows.append(np.hstack([
                np.max(vectors, axis=0),
                np.min(vectors, axis=0),
                np.mean(vectors, axis=0),
            ]))
        return np.array(rows)
    

In [18]:
def _w2v_feature_frame(token_lists):
    """Embed ``token_lists`` with the trained model and wrap the result in a
    DataFrame whose columns are named ``w2v_0``, ``w2v_1``, ..."""
    matrix = EmbeddingVectorizer(w2v).transform(token_lists)
    names = [f'w2v_{i}' for i in range(matrix.shape[1])]
    return pd.DataFrame(matrix, columns=names)


w2v_tr_feature = _w2v_feature_frame(w2v_train)
w2v_te_feature = _w2v_feature_frame(w2v_test)

# The raw token lists are not needed once the features exist.
del w2v_train, w2v_test

In [9]:
# Preview the test-set embedding features: 600 columns, i.e. the max, min and
# mean blocks of 200 values each.
# NOTE(review): this cell's execution count (9) is out of sequence with the
# surrounding cells (18, 20); re-run it after the features are rebuilt so the
# displayed output is current.
w2v_te_feature

Unnamed: 0,w2v_0,w2v_1,w2v_2,w2v_3,w2v_4,w2v_5,w2v_6,w2v_7,w2v_8,w2v_9,...,w2v_590,w2v_591,w2v_592,w2v_593,w2v_594,w2v_595,w2v_596,w2v_597,w2v_598,w2v_599
0,0.161326,0.071568,0.137064,0.080376,0.083341,0.106520,0.018159,0.000766,0.057134,0.106968,...,-0.009090,0.076115,0.017956,-0.018893,-0.057691,0.001090,-0.025813,-0.031951,-0.080963,0.038917
1,0.161326,0.071568,0.137064,0.080376,0.083341,0.106520,0.018159,0.000766,0.057134,0.106968,...,-0.009090,0.076115,0.017956,-0.018893,-0.057691,0.001090,-0.025813,-0.031951,-0.080963,0.038917
2,0.161326,0.071568,0.137064,0.080376,0.083341,0.106520,0.018159,0.000766,0.057134,0.106968,...,-0.009090,0.076115,0.017956,-0.018893,-0.057691,0.001090,-0.025813,-0.031951,-0.080963,0.038917
3,0.161326,0.085906,0.120582,0.112020,0.058155,0.099753,0.178326,-0.014310,0.082873,0.161616,...,-0.053645,0.015761,0.000398,0.036898,0.025368,0.046607,-0.052769,0.039640,-0.006543,0.087846
4,0.161326,0.085906,0.120582,0.112020,0.058155,0.099753,0.178326,-0.014310,0.082873,0.161616,...,-0.053645,0.015761,0.000398,0.036898,0.025368,0.046607,-0.052769,0.039640,-0.006543,0.087846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2711,0.161326,0.114629,0.120582,0.112020,0.058155,0.099753,0.033954,-0.014310,0.082873,0.161616,...,-0.047873,-0.002005,0.019865,0.016011,0.036317,0.057281,-0.058927,0.020623,0.008343,0.105750
2712,0.161326,0.114629,0.120582,0.112020,0.058155,0.099753,0.033954,-0.014310,0.082873,0.161616,...,-0.047873,-0.002005,0.019865,0.016011,0.036317,0.057281,-0.058927,0.020623,0.008343,0.105750
2713,0.161326,0.114629,0.120582,0.112020,0.058155,0.099753,0.033954,-0.014310,0.082873,0.161616,...,-0.047873,-0.002005,0.019865,0.016011,0.036317,0.057281,-0.058927,0.020623,0.008343,0.105750
2714,0.033771,0.086086,0.101374,0.093991,0.006654,0.065398,0.047921,-0.033960,0.121550,0.142028,...,-0.058189,0.048869,-0.017648,0.008747,-0.031993,0.045282,-0.026095,-0.037091,0.006614,0.014509


In [20]:
# Write both feature tables as cp949-encoded CSV files without the row index.
w2v_tr_feature.to_csv(
    'w2v_tr_feature_.csv',
    encoding='cp949',
    index=False,
)
w2v_te_feature.to_csv(
    'w2v_te_feature_.csv',
    encoding='cp949',
    index=False,
)
# w2v features are fixed at this point