This notebook contains sample code with Korean comments.

# 3.3 텍스트 데이터 다루기

In [1]:
import pandas as pd

In [2]:
# Toy corpus: three short English sentences, one per row of the `text` column.
corpus = [
    'I like kaggle very much',
    'I do not like kaggle',
    'I do really love machine learning',
]
df = pd.DataFrame({'text': corpus})
df

Unnamed: 0,text
0,I like kaggle very much
1,I do not like kaggle
2,I do really love machine learning


# Bag of Words

In [3]:
from sklearn.feature_extraction.text import CountVectorizer


# Token pattern matching any run of one or more word characters, so that
# one-letter words such as "I" are kept as tokens (see the vocabulary below).
token_pattern = u'(?u)\\b\\w+\\b'

# Learn the vocabulary from the `text` column and build the document-term
# count matrix in a single pass.
vectorizer = CountVectorizer(token_pattern=token_pattern)
bag = vectorizer.fit_transform(df['text'])

# Convert the sparse result to a dense array for display: one row per sentence.
bag.toarray()

array([[0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1],
       [1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0],
       [1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0]], dtype=int64)

In [4]:
# Print the fitted token -> column-index mapping of the count matrix.
vocab = vectorizer.vocabulary_
print(vocab)

{'i': 1, 'like': 4, 'kaggle': 2, 'very': 10, 'much': 7, 'do': 0, 'not': 8, 'really': 9, 'love': 5, 'machine': 6, 'learning': 3}


# TF-IDF

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


# Stage 1: raw term counts. The token pattern accepts runs of one or more
# word characters, so one-letter words such as "I" stay in the vocabulary.
token_pattern = u'(?u)\\b\\w+\\b'
vectorizer = CountVectorizer(token_pattern=token_pattern)
tf = vectorizer.fit_transform(df['text'])

# Stage 2: re-weight the counts with TF-IDF (transformer left at its defaults).
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(tf)

# Show the weighted matrix densely: one row per sentence, one column per token.
print(tfidf.toarray())

[[0.         0.31544415 0.40619178 0.         0.40619178 0.
  0.         0.53409337 0.         0.         0.53409337]
 [0.43306685 0.33631504 0.43306685 0.         0.43306685 0.
  0.         0.         0.56943086 0.         0.        ]
 [0.34261996 0.26607496 0.         0.45050407 0.         0.45050407
  0.45050407 0.         0.         0.45050407 0.        ]]


In [6]:
# Print the token -> column-index mapping of the vectorizer fitted for TF-IDF.
vocab = vectorizer.vocabulary_
print(vocab)

{'i': 1, 'like': 4, 'kaggle': 2, 'very': 10, 'much': 7, 'do': 0, 'not': 8, 'really': 9, 'love': 5, 'machine': 6, 'learning': 3}


# Word2vec

In [7]:
from gensim.models import word2vec


# Tokenise each sentence on whitespace. str.split() does no case folding,
# so "I" stays capitalised in the Word2vec vocabulary.
sentences = [d.split() for d in df['text']]

# Hyper-parameters shared by both gensim API generations.
# workers=1: gensim documents that `seed` alone is not enough for a
# deterministic run; training must also be limited to a single worker thread.
# (Across interpreter launches PYTHONHASHSEED must be fixed as well.)
w2v_params = dict(min_count=1, window=2, seed=7, workers=1)

try:
    # gensim >= 4.0 calls the embedding dimensionality `vector_size`.
    model = word2vec.Word2Vec(sentences, vector_size=10, **w2v_params)
except TypeError:
    # gensim < 4.0 rejects `vector_size`; there the same parameter is `size`.
    model = word2vec.Word2Vec(sentences, size=10, **w2v_params)

In [8]:
# Look up the learned embedding of the token 'like' and display it.
like_vector = model.wv['like']
like_vector

array([-0.04932676, -0.01171829,  0.04239148,  0.01735417, -0.04764815,
       -0.03205363, -0.02873827,  0.04682567,  0.04185081,  0.00795709],
      dtype=float32)

In [9]:
# List the vocabulary tokens most similar to 'like', each with its similarity score.
similar_to_like = model.wv.most_similar('like')
similar_to_like

[('much', 0.31108221411705017),
 ('really', 0.11813490092754364),
 ('not', 0.07177764177322388),
 ('learning', -0.014833025634288788),
 ('very', -0.03584161400794983),
 ('do', -0.11829414963722229),
 ('machine', -0.12069450318813324),
 ('kaggle', -0.532151997089386),
 ('love', -0.5468614101409912),
 ('I', -0.7641928195953369)]

In [10]:
# Take the sentence stored under index label 0 and split it on whitespace.
first_sentence = df.loc[0, 'text']
first_sentence.split()

['I', 'like', 'kaggle', 'very', 'much']

In [11]:
import numpy as np


# Tokens of the first sentence, in order of appearance.
tokens = df['text'][0].split()

# Stack the embedding of every token into a 2-D array: one row per token.
wordvec = np.array([model.wv[token] for token in tokens])
wordvec

array([[-0.00070634,  0.04390315, -0.03669089,  0.02026465,  0.04046954,
         0.02365695,  0.020924  , -0.03109757, -0.04436051, -0.00691835],
       [-0.04932676, -0.01171829,  0.04239148,  0.01735417, -0.04764815,
        -0.03205363, -0.02873827,  0.04682567,  0.04185081,  0.00795709],
       [ 0.03825201, -0.04983004, -0.03085005, -0.0421443 ,  0.04703034,
        -0.01274201,  0.00586073, -0.02872854,  0.01241979, -0.03893603],
       [-0.01407814, -0.03944685,  0.01979917, -0.00788147,  0.03230685,
         0.04465036, -0.01564248,  0.04261149, -0.04766037,  0.03080159],
       [ 0.0160588 , -0.04853946,  0.02299253,  0.00940678, -0.04020066,
         0.00423941,  0.00689822,  0.02838706, -0.02563218,  0.02724046]],
      dtype=float32)

In [12]:
# Average pooling: mean over the token axis (axis 0) gives one vector for the sentence.
wordvec.mean(axis=0)

array([-0.00196009, -0.0211263 ,  0.00352845, -0.00060003,  0.00639158,
        0.00555022, -0.00213956,  0.01159962, -0.01267649,  0.00402895],
      dtype=float32)

In [13]:
# Max pooling: per-dimension maximum over the token axis (axis 0).
wordvec.max(axis=0)

array([0.03825201, 0.04390315, 0.04239148, 0.02026465, 0.04703034,
       0.04465036, 0.020924  , 0.04682567, 0.04185081, 0.03080159],
      dtype=float32)