# DictVectorizer

In [9]:
from sklearn.feature_extraction import DictVectorizer

# sparse=False makes fit_transform return a dense NumPy array, which is what
# gets displayed at the end of the cell.
v = DictVectorizer(sparse=False)
# Two samples expressed as feature-name -> value mappings; 'A' and 'C' each
# occur in only one of them.
D = [{'A':1, 'B':2}, {'B':3, 'C':1}]
# Learn one column per distinct key and encode D. A key that is absent from a
# sample is encoded as 0 in that sample's row.
X = v.fit_transform(D)
X

array([[1., 2., 0.],
       [0., 3., 1.]])

In [10]:
# Column order of the encoded matrix: one name per key seen during fit.
v.feature_names_

['A', 'B', 'C']

In [11]:
# 'D' was not among the keys seen during fit, so it has no column and is
# dropped; only the value for 'C' appears in the result.
v.transform({"C":4, 'D':3})

array([[0., 0., 4.]])

# CountVectorizer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Small English corpus reused by the cells below.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]

# fit() returns the estimator itself, so construction and fitting can be
# chained. The learned mapping is token -> column index.
vect = CountVectorizer().fit(corpus)
vect.vocabulary_

{'this': 9,
 'is': 3,
 'the': 7,
 'first': 2,
 'document': 1,
 'second': 6,
 'and': 0,
 'third': 8,
 'one': 5,
 'last': 4}

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short Korean phrases reused by the cells below.
corpus_kr = [
    '보험계약의 보험료 결제를 위해',
    '카드 유효 기간을',
    ' 변경하실 카드번호',
]

# Rebinds vect: from here on it holds the vocabulary learned from corpus_kr,
# not the English one.
vect = CountVectorizer().fit(corpus_kr)
vect.vocabulary_

{'보험계약의': 3,
 '보험료': 4,
 '결제를': 0,
 '위해': 5,
 '카드': 7,
 '유효': 6,
 '기간을': 1,
 '변경하실': 2,
 '카드번호': 8}

In [29]:
# vect was last fitted on corpus_kr, so none of these English tokens are in
# its vocabulary and the resulting row is all zeros.
vect.transform(['This is the second document.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [30]:
# Same vectorizer on a sentence made of fitted Korean tokens: each of the four
# words is counted once; the trailing '.' does not prevent '위해' from matching.
vect.transform(['보험계약의 보험료 결제를 위해.']).toarray()

array([[1, 0, 0, 1, 1, 1, 0, 0, 0]], dtype=int64)

In [31]:
# Encoding the whole English corpus with the Korean-fitted vectorizer: one row
# per document, every count zero because no token is in the vocabulary.
vect.transform(corpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [32]:
# Encoding the corpus the vectorizer was fitted on: one row per phrase, with a
# 1 in the column of every word that phrase contains.
vect.transform(corpus_kr).toarray()

array([[1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1]], dtype=int64)

# Stop Words

In [33]:
# Explicit stop-word list: these four tokens are excluded from the vocabulary.
vect = CountVectorizer(stop_words=["and", "is", "the", "this"])
vect.fit(corpus)
vect.vocabulary_

{'first': 1, 'document': 0, 'second': 4, 'third': 5, 'one': 3, 'last': 2}

In [35]:
# Same idea for the Korean corpus: '유효' and '위해' are left out of the
# vocabulary.
vect = CountVectorizer(stop_words=["유효", "위해"])
vect.fit(corpus_kr)
vect.vocabulary_

{'보험계약의': 3, '보험료': 4, '결제를': 0, '카드': 5, '기간을': 1, '변경하실': 2, '카드번호': 6}

# Token

In [37]:
# analyzer="char" makes every single character a feature, including the space
# and the punctuation marks.
vect = CountVectorizer(analyzer="char")
vect.fit(corpus)
vect.vocabulary_

{'t': 16,
 'h': 8,
 'i': 9,
 's': 15,
 ' ': 0,
 'e': 6,
 'f': 7,
 'r': 14,
 'd': 5,
 'o': 13,
 'c': 4,
 'u': 17,
 'm': 11,
 'n': 12,
 '.': 1,
 'a': 3,
 '?': 2,
 'l': 10}

In [38]:
# Character-level features for the Korean corpus: one entry per distinct
# syllable block, plus the space.
vect = CountVectorizer(analyzer="char")
vect.fit(corpus_kr)
vect.vocabulary_

{'보': 11,
 '험': 22,
 '계': 4,
 '약': 13,
 '의': 17,
 ' ': 0,
 '료': 7,
 '결': 2,
 '제': 18,
 '를': 8,
 '위': 14,
 '해': 21,
 '카': 19,
 '드': 6,
 '유': 15,
 '효': 24,
 '기': 5,
 '간': 1,
 '을': 16,
 '변': 10,
 '경': 3,
 '하': 20,
 '실': 12,
 '번': 9,
 '호': 23}

In [39]:
# Keep only tokens that match the regex: a 't' followed by one or more word
# characters. The pattern is written as a raw string so that "\w" reaches the
# regex engine as-is; in a normal string literal "\w" is an invalid escape
# sequence that Python warns about.
vect = CountVectorizer(token_pattern=r"t\w+").fit(corpus)
vect.vocabulary_

{'this': 2, 'the': 0, 'third': 1}

In [41]:
# A literal pattern: only the exact string '위해' is extracted as a token, so
# the vocabulary has a single entry.
vect = CountVectorizer(token_pattern="위해")
vect.fit(corpus_kr)
vect.vocabulary_

{'위해': 0}

In [45]:
import nltk

# Delegate tokenisation to NLTK. Unlike the default pattern, word_tokenize
# keeps '.' and '?' as tokens of their own, so they show up in the vocabulary.
vect = CountVectorizer(tokenizer=nltk.word_tokenize)
vect.fit(corpus)
vect.vocabulary_

{'this': 11,
 'is': 5,
 'the': 9,
 'first': 4,
 'document': 3,
 '.': 0,
 'second': 8,
 'and': 2,
 'third': 10,
 'one': 7,
 '?': 1,
 'last': 6}

In [46]:
# NLTK tokeniser on the Korean corpus: the phrases contain no punctuation, and
# the resulting vocabulary matches the one from the default tokeniser.
vect = CountVectorizer(tokenizer=nltk.word_tokenize)
vect.fit(corpus_kr)
vect.vocabulary_

{'보험계약의': 3,
 '보험료': 4,
 '결제를': 0,
 '위해': 5,
 '카드': 7,
 '유효': 6,
 '기간을': 1,
 '변경하실': 2,
 '카드번호': 8}

# N-gram

In [47]:
# ngram_range=(2, 2): features are pairs of consecutive words only; no single
# words are kept.
vect = CountVectorizer(ngram_range=(2, 2))
vect.fit(corpus)
vect.vocabulary_

{'this is': 12,
 'is the': 2,
 'the first': 7,
 'first document': 1,
 'the second': 9,
 'second second': 6,
 'second document': 5,
 'and the': 0,
 'the third': 10,
 'third one': 11,
 'is this': 3,
 'this the': 13,
 'the last': 8,
 'last document': 4}

In [48]:
# Word bigrams for the Korean corpus.
vect = CountVectorizer(ngram_range=(2, 2))
vect.fit(corpus_kr)
vect.vocabulary_

{'보험계약의 보험료': 2,
 '보험료 결제를': 3,
 '결제를 위해': 0,
 '카드 유효': 5,
 '유효 기간을': 4,
 '변경하실 카드번호': 1}

In [49]:
# Unigrams and bigrams (ngram_range=(1, 2)) built from the tokens the regex
# keeps, i.e. words starting with 't'. Bigrams join neighbouring *kept*
# tokens, so 'this the' appears even though 'is' sat between them in the text.
# The pattern is a raw string so that "\w" is passed to the regex engine
# unchanged instead of being an invalid string escape.
vect = CountVectorizer(ngram_range=(1, 2), token_pattern=r"t\w+").fit(corpus)
vect.vocabulary_

{'this': 3, 'the': 0, 'this the': 4, 'third': 2, 'the third': 1}

In [55]:
# Word bigrams over the Korean corpus. Same configuration and same result as
# the earlier ngram_range=(2,2) cell on corpus_kr.
vect = CountVectorizer(ngram_range=(2, 2)).fit(corpus_kr)
vect.vocabulary_

{'보험계약의 보험료': 2,
 '보험료 결제를': 3,
 '결제를 위해': 0,
 '카드 유효': 5,
 '유효 기간을': 4,
 '변경하실 카드번호': 1}

# Frequency

In [56]:
# Integer thresholds are document counts: keep tokens found in at least 2 and
# at most 4 documents. 'the' (present in all 5) and the tokens that occur in a
# single document are dropped and reported in stop_words_.
vect = CountVectorizer(max_df=4, min_df=2)
vect.fit(corpus)
vect.vocabulary_, vect.stop_words_

({'this': 3, 'is': 2, 'first': 1, 'document': 0},
 {'and', 'last', 'one', 'second', 'the', 'third'})

In [58]:
# No effective frequency filtering: every token occurs in at least one
# document (min_df=1) and corpus_kr has only 3 documents (max_df=4), so the
# full vocabulary is kept and stop_words_ is empty.
# min_df=1 is used rather than an integer 0: both keep every token, but
# integer thresholds are document counts and newer scikit-learn releases
# validate them as >= 1.
# NOTE(review): confirm against the installed scikit-learn version.
vect = CountVectorizer(max_df=4, min_df=1).fit(corpus_kr)
vect.vocabulary_, vect.stop_words_

({'보험계약의': 3,
  '보험료': 4,
  '결제를': 0,
  '위해': 5,
  '카드': 7,
  '유효': 6,
  '기간을': 1,
  '변경하실': 2,
  '카드번호': 8},
 set())

In [59]:
# Total count per vocabulary entry over the English corpus. vect holds the
# Korean vocabulary at this point, so every total is 0.
vect.transform(corpus).toarray().sum(axis=0)

array([0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [60]:
# Total count per vocabulary entry over the Korean corpus: each word occurs
# exactly once across the three phrases.
vect.transform(corpus_kr).toarray().sum(axis=0)

array([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

# TF-IDF

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn vocabulary and IDF weights from the English corpus, then encode the
# same corpus as a dense matrix of TF-IDF weights (one row per document).
tfidv = TfidfVectorizer()
tfidv.fit(corpus)
tfidv.transform(corpus).toarray()

array([[0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.24151532, 0.        , 0.28709733, 0.        ,
        0.        , 0.85737594, 0.20427211, 0.        , 0.28709733],
       [0.55666851, 0.        , 0.        , 0.        , 0.        ,
        0.55666851, 0.        , 0.26525553, 0.55666851, 0.        ],
       [0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.45333103, 0.        , 0.        , 0.80465933,
        0.        , 0.        , 0.38342448, 0.        , 0.        ]])

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF for the Korean corpus. Every word occurs in exactly one phrase, so
# all words of a phrase get the same weight and each row has unit length
# (e.g. four entries of 0.5).
tfidv = TfidfVectorizer()
tfidv.fit(corpus_kr)
tfidv.transform(corpus_kr).toarray()

array([[0.5       , 0.        , 0.        , 0.5       , 0.5       ,
        0.5       , 0.        , 0.        , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.        , 0.57735027, 0.57735027, 0.        ],
       [0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.70710678]])

# Hashing Trick

In [63]:
from sklearn.datasets import fetch_20newsgroups
# Load the 20 newsgroups text dataset with default arguments; twenty.data is
# the list of raw documents and its length is shown below.
twenty = fetch_20newsgroups()
len(twenty.data)

11314

In [65]:
%time
CountVectorizer().fit(twenty.data).transform(twenty.data)

CPU times: total: 0 ns
Wall time: 0 ns


<11314x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [66]:
from sklearn.feature_extraction.text import HashingVectorizer
# n_features fixes the number of output columns up front (300000 here),
# independent of the data.
hv = HashingVectorizer(n_features=300000)

In [67]:
# transform is called directly on the raw documents, with no prior fit call.
# The result has the 300000 columns requested above.
%time hv.transform(twenty.data)

CPU times: total: 1.58 s
Wall time: 1.59 s


<11314x300000 sparse matrix of type '<class 'numpy.float64'>'
	with 1786336 stored elements in Compressed Sparse Row format>

In [100]:
import warnings
# Silence all warnings from here on (including any raised while the packages
# below are imported) so the cell output stays readable.
# NOTE(review): this is process-wide and also hides useful warnings.
warnings.simplefilter("ignore")

import string
from urllib.request import urlopen
from konlpy.utils import pprint
from konlpy.tag import Hannanum
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from bs4 import BeautifulSoup

hannanum = Hannanum()

url = "https://news.naver.com/"

# Open the page and parse the HTML with BeautifulSoup. The context manager
# closes the HTTP response as soon as parsing is done instead of leaving the
# connection open for the rest of the session.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

# Use a deliberately broad selector: paragraphs, every heading level, spans
# and links.
texts = soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "span", "a"])

# Text content of every matched element.
cell_texts = [text.get_text() for text in texts]

# Build the documents: each "document" is one noun extracted by Hannanum from
# the joined page text. Nouns whose first character is numeric or ASCII
# punctuation are discarded.
docs = [
    w for w in hannanum.nouns(" ".join(cell_texts))
    if ((not w[0].isnumeric()) and (w[0] not in string.punctuation))
]

# Vectorise and rank the features by descending total count.
vect = CountVectorizer().fit(docs)
# Sum the columns on the sparse matrix directly; converting the whole
# documents-by-vocabulary matrix to a dense array just to add it up is
# unnecessary. The sum comes back as a 1 x n matrix, hence asarray + ravel.
count = np.asarray(vect.transform(docs).sum(axis=0)).ravel()
idx = np.argsort(-count)
count = count[idx]
feature_name = np.array(vect.get_feature_names_out())[idx]

pprint(list(zip(feature_name, count)))


[('심층기획', 66),
 ('구독', 54),
 ('언론사별', 54),
 ('부산', 42),
 ('대통령', 36),
 ('기자', 30),
 ('서울', 28),
 ('한국', 26),
 ('뉴스', 24),
 ('단독', 23),
 ('대표', 22),
 ('폐렴', 22),
 ('김홍', 22),
 ('세계', 21),
 ('윤석열', 20),
 ('글로벌', 20),
 ('인터뷰', 19),
 ('정부', 18),
 ('시대', 17),
 ('올해', 17),
 ('이스라엘', 16),
 ('금융', 16),
 ('오늘', 16),
 ('회장', 16),
 ('그룹', 16),
 ('확대', 15),
 ('한동훈', 15),
 ('하마스', 15),
 ('경찰', 15),
 ('요소', 15),
 ('고금리', 14),
 ('이유', 14),
 ('우려', 14),
 ('문제', 14),
 ('팔레스타', 14),
 ('마약', 14),
 ('전세사기', 13),
 ('내년', 13),
 ('정전', 13),
 ('인류', 13),
 ('장관', 13),
 ('의혹', 13),
 ('노조', 13),
 ('유동규', 13),
 ('경제', 13),
 ('주가', 13),
 ('논란', 13),
 ('아내', 13),
 ('지난해', 12),
 ('조사', 12),
 ('체결', 12),
 ('선거구', 12),
 ('미국', 12),
 ('동영상', 12),
 ('국내', 12),
 ('총선', 12),
 ('아파트', 12),
 ('이준석', 12),
 ('격려', 12),
 ('중국', 12),
 ('이상', 12),
 ('이날', 12),
 ('혐의', 11),
 ('우리', 11),
 ('이슈', 11),
 ('변호사', 11),
 ('애플', 11),
 ('후보자', 11),
 ('방송', 11),
 ('이재명', 11),
 ('피해자', 11),
 ('국민의힘', 11),
 ('이후', 10),
 ('일본', 10),
 ('비판', 1