# Preprocess

In [1]:
### Import ###
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.rc('font', family='Malgun Gothic')

# Load the raw sheet; header=1 takes the column names from the second spreadsheet row.
# `encoding` is not a read_excel option and recent pandas releases reject it with a
# TypeError, so it is no longer passed.
outcome = pd.read_excel('./rsc/given/outcome.xlsx', header=1)
# Drop the last 9 rows.
# NOTE(review): presumably trailing footer/summary rows of the sheet — confirm.
outcome = outcome.iloc[:-9]

FileNotFoundError: [Errno 2] No such file or directory: './rsc/given/outcome.xlsx'

In [None]:
def re_sub(series: pd.Series) -> pd.Series:
    """Clean a Series of strings before tokenization.

    Applied in order:
      1. remove standalone Hangul consonant jamo (ㄱ-ㅎ), e.g. the chat filler "ㅋㅋ";
      2. remove every character that is neither a word character nor whitespace;
      3. remove ideographic spaces (U+3000);
      4. collapse runs of two or more ASCII spaces into a single space.

    Collapsing runs last on purpose: deleting a U+3000 that sits between two
    spaces creates a new double space, which would otherwise be left behind.

    Args:
        series: Series of strings (accessed through ``.str``).

    Returns:
        A new Series with the cleaned strings; the input is not modified.
    """
    series = series.str.replace(pat=r'[ㄱ-ㅎ]', repl=r'', regex=True)  # drop lone consonants such as "ㅋ"
    series = series.str.replace(pat=r'[^\w\s]', repl=r'', regex=True)  # drop special characters
    series = series.str.replace(pat=r'[\u3000]+', repl=r'', regex=True)  # drop ideographic spaces
    series = series.str.replace(pat=r'[ ]{2,}', repl=r' ', regex=True)  # collapse repeated spaces
    return series

In [None]:
# Keep only the product-name ('상품명') and product-group ('상품군') columns,
# on an independent copy so `outcome` itself is left untouched.
train = outcome.loc[:, ['상품명', '상품군']].copy()
# Replace the product names with their cleaned form.
train['상품명'] = re_sub(train['상품명'])

In [None]:
# One "<product name> <product group>" string per row, skipping rows whose group is '무형'.
# Original author's note: len(prod_names) > 1692 (with '무형' excluded).
prod_names = (
    train.loc[train['상품군'] != '무형']
    .pipe(lambda kept: kept['상품명'] + ' ' + kept['상품군'])
    .values
)

In [None]:
from konlpy.tag import Komoran
# Komoran tagger configured with the user dictionary file.
komoran = Komoran(userdic='./user_dict_0913.txt')
# Smoke test: print the nouns extracted from a sample sentence.
print(komoran.nouns("KOMORAN은 한국어 형태소 분석기입니다."))

# POS-tag every product string; one komoran.pos() result per entry of prod_names.
naive_tokens = [komoran.pos(name) for name in prod_names]

In [None]:
def morph_filter(lst, morphs=None):
    """Return the surface forms of the tagged morphemes whose tag is allowed.

    Args:
        lst: iterable of tagged morphemes; element ``[0]`` of each item is
            read as the surface form and element ``[1]`` as its tag.
        morphs: collection of tags to keep. When omitted, the notebook-level
            ``usable_morphs`` is used (looked up at call time, so it may be
            defined after this function).

    Returns:
        list of surface forms, in their original order.
    """
    allowed = usable_morphs if morphs is None else morphs
    return [pair[0] for pair in lst if pair[1] in allowed]

# Tags kept by morph_filter.
# NOTE(review): presumably Komoran's noun-like tags (general/proper noun, pronoun,
# numeral) — confirm against the KoNLPy tag table.
usable_morphs = ['NNG', 'NNP', 'NP', 'NR'] #+ ['SL', 'SN']
# Rebinds naive_tokens from tagged pairs to plain strings; re-run the tagging cell
# before re-running this one, otherwise already-filtered data is filtered again.
naive_tokens = [morph_filter(tagged) for tagged in naive_tokens]

In [None]:
# Tokens removed from every token list.
exclude_words = ['종', '인용']
def exclude(lst):
    """Return the items of `lst` that are not listed in `exclude_words`, order kept."""
    return [token for token in lst if token not in exclude_words]
# Apply the stop-word filter to each product's token list.
processed_tokens = [exclude(tokens) for tokens in naive_tokens]

In [None]:
from gensim.models import Word2Vec
# Train Word2Vec on the per-product token lists.
# NOTE(review): `size` is the gensim 3.x spelling; gensim >= 4.0 reportedly renamed it
# to `vector_size` — confirm which gensim version this notebook is pinned to.
model = Word2Vec(
    sentences=processed_tokens,
    size=10,
    window=6,
    min_count=5,
    workers=4,
    sg=1,
)

In [None]:
# Keyed vectors of the trained model; sum_embed reads this notebook-level name.
word_vectors = model.wv

# NOTE(review): `.vocab` is the gensim 3.x attribute; gensim >= 4.0 reportedly exposes
# `key_to_index` instead — confirm against the installed version.
vocabs = word_vectors.vocab.keys()
# One vector per vocabulary entry, in the iteration order of `vocabs`.
word_vectors_list = list(map(word_vectors.__getitem__, vocabs))

In [None]:
def sum_embed(lst, embeddings=None):
    """Sum the embedding vectors of the tokens in `lst`.

    Tokens missing from the embedding table are skipped. Only ``KeyError`` is
    treated as "missing"; any other error is allowed to surface.

    Args:
        lst: iterable of tokens.
        embeddings: mapping-like object supporting ``embeddings[token]`` and
            exposing ``vector_size``. Defaults to the notebook-level
            ``word_vectors`` (looked up at call time).

    Returns:
        The element-wise sum of the found vectors. If no token is found, a
        zero vector of length ``embeddings.vector_size`` is returned (instead
        of the integer 0), so every result has the same shape and the results
        can be stacked into a matrix.
    """
    if embeddings is None:
        embeddings = word_vectors
    found = []
    for word in lst:
        try:
            found.append(embeddings[word])
        except KeyError:
            # token not in the vocabulary: skip it
            continue
    if not found:
        # NOTE(review): float32 is assumed to match the dtype of the stored vectors — confirm.
        return np.zeros(embeddings.vector_size, dtype=np.float32)
    return sum(found)

In [None]:
# De-duplicate the token lists. dict.fromkeys keeps first-seen order, so the row order
# of everything derived from unique_data is the same on every kernel run
# (iterating a set of string tuples is not, because of hash randomization).
unique_data = [list(tokens) for tokens in dict.fromkeys(map(tuple, processed_tokens))]

In [None]:
# One summed embedding per unique token list, in the order of unique_data.
embedded_tokens = [sum_embed(tokens) for tokens in unique_data]

# Dimensionality Reduction

## 2D

In [None]:
from sklearn.manifold import TSNE

# Project the product embeddings to 2 dimensions.
# t-SNE is stochastic; a fixed random_state makes the layout reproducible across runs.
tsne = TSNE(n_components=2, random_state=42)
mat_2d = tsne.fit_transform(embedded_tokens)
# Split the columns of the result into per-axis coordinate arrays.
xs_2d, ys_2d = mat_2d[:, 0], mat_2d[:, 1]

## 3D

In [None]:
from sklearn.manifold import TSNE

# Project the product embeddings to 3 dimensions (rebinds `tsne` from the 2D section).
# t-SNE is stochastic; a fixed random_state makes the layout reproducible across runs.
tsne = TSNE(n_components=3, random_state=42)
mat_3d = tsne.fit_transform(embedded_tokens)
# Split the columns of the result into per-axis coordinate arrays.
xs_3d, ys_3d, zs_3d = mat_3d[:, 0], mat_3d[:, 1], mat_3d[:, 2]

# Plot with Plotly

In [None]:
# Map each distinct last token to an integer id. The product group was appended to the
# end of every product string, so the last token is used as the group label
# (assumes the group survives tokenization as the final token — TODO confirm).
gdict = {
    label: idx
    for idx, label in enumerate({tokens[-1] for tokens in unique_data})
}
gdict

In [None]:
# Plot table for the 2D projection: coordinates, the last token of each product
# (used as the product group) and the space-joined tokens (used as the product name).
df_2d = pd.DataFrame({
    'x': xs_2d,
    'y': ys_2d,
    '상품군': [tokens[-1] for tokens in unique_data],
    '상품명': [' '.join(tokens) for tokens in unique_data],
})
df_2d

In [None]:
import plotly.express as px

In [None]:
# 2D scatter coloured by product group; the product name is shown on hover.
fig = px.scatter(df_2d, x='x', y='y', color='상품군', hover_data=['상품명'])
fig.show()

In [None]:
# Save the current figure as an HTML file. `fig` is rebound to the 3D scatter
# further down, so run this cell before that one to export the 2D plot.
fig.write_html('./2d.html')

In [None]:
# Plot table for the 3D projection; same label columns as the 2D table plus a
# constant 'size' column (0.1 for every row) used as the marker size.
df_3d = pd.DataFrame({
    'x': xs_3d,
    'y': ys_3d,
    'z': zs_3d,
    '상품군': [tokens[-1] for tokens in unique_data],
    '상품명': [' '.join(tokens) for tokens in unique_data],
    'size': [.1] * len(xs_3d),
})
df_3d

In [None]:
import plotly.graph_objects as go

In [None]:
# 3D scatter coloured by product group; marker size comes from the constant 'size' column.
fig = px.scatter_3d(
    df_3d,
    x='x',
    y='y',
    z='z',
    color='상품군',
    hover_data=['상품명'],
    size_max=5,
    size='size',
)
fig.show()

In [None]:
# Save the current figure (the 3D scatter when cells are run in order) as an HTML file.
fig.write_html('./3d.html')

# WordCloud

In [None]:
from wordcloud import WordCloud

# One space-joined string per unique product; all of them are fed to the word cloud as a single text.
texts = [' '.join(tokens) for tokens in unique_data]
wordcloud = WordCloud(
    font_path='font/NanumGothic.ttf',
    background_color='white',
).generate(' '.join(texts))

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from PIL import Image
import numpy as np

# Mask image loaded as an array and passed to WordCloud as `mask`.
alice_mask = np.array(Image.open("ns_mask_conversion.png"))
# NOTE(review): wordcloud reportedly ignores width/height when a mask is supplied — confirm.
wordcloud = WordCloud(
    font_path='font/NanumGothic.ttf',
    width=800,
    height=800,
    background_color="black",
    mask=alice_mask,
).generate(' '.join(texts))

# Render the masked cloud as an image with the axes hidden.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()