In [1]:
sentences = ["It was the best of times", 
             "it was the worst of times", 
             "it was the age of wisdom", 
             "it was the age of foolishness"]

tokenized_sentences = [[t for t in sentence.split()] for sentence in sentences]

vocabulary = set([w for s in tokenized_sentences for w in s])

import pandas as pd
[[w, i] for i,w in enumerate(vocabulary)]

[['foolishness', 0],
 ['times', 1],
 ['wisdom', 2],
 ['worst', 3],
 ['age', 4],
 ['it', 5],
 ['best', 6],
 ['of', 7],
 ['It', 8],
 ['was', 9],
 ['the', 10]]

In [2]:
def onehot_encode(tokenized_sentence):
    return [1 if w in tokenized_sentence else 0 for w in vocabulary]

onehot = [onehot_encode(tokenized_sentence) for tokenized_sentence in tokenized_sentences]

for (sentence, oh) in zip(sentences, onehot):
    print("%s: %s" % (oh, sentence))

[0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1]: It was the best of times
[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]: it was the worst of times
[0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1]: it was the age of wisdom
[1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1]: it was the age of foolishness


In [4]:
vocabulary

{'It',
 'age',
 'best',
 'foolishness',
 'it',
 'of',
 'the',
 'times',
 'was',
 'wisdom',
 'worst'}

In [6]:
pd.DataFrame(onehot, columns=list(vocabulary))

Unnamed: 0,foolishness,times,wisdom,worst,age,it,best,of,It,was,the
0,0,1,0,0,0,0,1,1,1,1,1
1,0,1,0,1,0,1,0,1,0,1,1
2,0,0,1,0,1,1,0,1,0,1,1
3,1,0,0,0,1,1,0,1,0,1,1


In [8]:
sim = [onehot[0][i] & onehot[1][i] for i in range(0, len(vocabulary))]
sum(sim)

4

In [10]:
import numpy as np
np.dot(onehot[0], onehot[1])

4

In [14]:
onehot

[[0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1],
 [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1],
 [0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1],
 [1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1]]

In [15]:
np.transpose(onehot)

array([[0, 0, 0, 1],
       [1, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 1],
       [0, 1, 1, 1],
       [1, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1]])

In [16]:
np.dot(onehot, np.transpose(onehot))

array([[6, 4, 3, 3],
       [4, 6, 4, 4],
       [3, 4, 6, 5],
       [3, 4, 5, 6]])

<br>

## 단어 가방 모델

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [18]:
more_sentences = sentences + ["John likes to watch movies. Mary likes movies too.",
                              "Mary also likes to watch football games."]
pd.DataFrame(more_sentences)

Unnamed: 0,0
0,It was the best of times
1,it was the worst of times
2,it was the age of wisdom
3,it was the age of foolishness
4,John likes to watch movies. Mary likes movies ...
5,Mary also likes to watch football games.


In [19]:
cv.fit(more_sentences)

In [23]:
print(cv.get_feature_names_out())

['age' 'also' 'best' 'foolishness' 'football' 'games' 'it' 'john' 'likes'
 'mary' 'movies' 'of' 'the' 'times' 'to' 'too' 'was' 'watch' 'wisdom'
 'worst']


In [28]:
dt=cv.transform(more_sentences)

In [29]:
dt

<6x20 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [30]:
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,2,1,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0


단어 가방모델을 이용해 문서간의 유사성 계산: 벡터간의 각도를 활용

In [32]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(dt[0], dt[1])

array([[0.83333333]])

In [34]:
pd.DataFrame(cosine_similarity(dt, dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.833333,0.666667,0.666667,0.0,0.0
1,0.833333,1.0,0.666667,0.666667,0.0,0.0
2,0.666667,0.666667,1.0,0.833333,0.0,0.0
3,0.666667,0.666667,0.833333,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.524142
5,0.0,0.0,0.0,0.0,0.524142,1.0


<br>

## TF-IDF 모델

In [36]:
from sklearn.feature_extraction.text import TfidfTransformer

In [37]:
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.56978,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.56978
2,0.467228,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.56978,0.0
3,0.467228,0.0,0.0,0.56978,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305609,0.501208,0.250604,0.611219,0.0,0.0,0.0,0.250604,0.305609,0.0,0.250604,0.0,0.0
5,0.0,0.419233,0.0,0.0,0.419233,0.419233,0.0,0.0,0.343777,0.343777,0.0,0.0,0.0,0.0,0.343777,0.0,0.0,0.343777,0.0,0.0


In [38]:
pd.DataFrame(cosine_similarity(tfidf_dt, tfidf_dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.675351,0.457049,0.457049,0.0,0.0
1,0.675351,1.0,0.457049,0.457049,0.0,0.0
2,0.457049,0.457049,1.0,0.675351,0.0,0.0
3,0.457049,0.457049,0.675351,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.43076
5,0.0,0.0,0.0,0.0,0.43076,1.0


In [40]:
headlines = pd.read_csv("./data/abcnews-date-text.csv", parse_dates=["publish_date"])

In [41]:
print(len(headlines))
headlines.head()

1103663


Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
dt=tfidf.fit_transform(headlines["headline_text"])

In [43]:
dt

<1103663x95878 sparse matrix of type '<class 'numpy.float64'>'
	with 7001357 stored elements in Compressed Sparse Row format>

In [44]:
%%time
cosine_similarity(dt[0:10000], dt[0:10000])

CPU times: total: 297 ms
Wall time: 532 ms


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.16913596,
        0.16792138],
       [0.        , 0.        , 0.        , ..., 0.16913596, 1.        ,
        0.33258708],
       [0.        , 0.        , 0.        , ..., 0.16792138, 0.33258708,
        1.        ]])

In [45]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [46]:
print(len(stopwords))

326


In [50]:
tfidf = TfidfVectorizer(stop_words=list(stopwords))
dt = tfidf.fit_transform(headlines["headline_text"])
dt



<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

In [52]:
tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x58527 sparse matrix of type '<class 'numpy.float64'>'
	with 5607113 stored elements in Compressed Sparse Row format>

In [54]:
tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=0.0001)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x6772 sparse matrix of type '<class 'numpy.float64'>'
	with 4816381 stored elements in Compressed Sparse Row format>

In [55]:
tfidf = TfidfVectorizer(stop_words=list(stopwords), max_df=0.1)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

In [59]:
from tqdm.auto import tqdm
import spacy
nlp = spacy.load("en_core_web_sm")
nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]
for i, row in tqdm(headlines.iterrows(), total=len(headlines)):
    doc = nlp(str(row["headline_text"]))
    headlines.at[i, "lemmas"] = " ".join([token.lemma_ for token in doc])
    headlines.at[i, "nav"] = " ".join([token.lemma_ for token in doc if token.pos_ in nouns_adjectives_verbs])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1103663/1103663 [44:46<00:00, 410.77it/s]


In [60]:
headlines

Unnamed: 0,publish_date,headline_text,lemmas,nav
0,2003-02-19,aba decides against community broadcasting lic...,aba decide against community broadcasting licence,aba decide community broadcasting licence
1,2003-02-19,act fire witnesses must be aware of defamation,act fire witness must be aware of defamation,act fire witness aware defamation
2,2003-02-19,a g calls for infrastructure protection summit,a g call for infrastructure protection summit,g call infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise,air nz staff in aust strike for pay rise,air nz staff aust strike pay rise
4,2003-02-19,air nz strike to affect australian travellers,air nz strike to affect australian traveller,air nz strike affect australian traveller
...,...,...,...,...
1103658,2017-12-31,the ashes smiths warners near miss liven up bo...,the ashe smith warner near miss liven up box d...,ashe smith warner miss liven box day test
1103659,2017-12-31,timelapse: brisbanes new year fireworks,timelapse : brisbane new year firework,timelapse brisbane new year firework
1103660,2017-12-31,what 2017 meant to the kids of australia,what 2017 mean to the kid of australia,mean kid australia
1103661,2017-12-31,what the papodopoulos meeting may mean for ausus,what the papodopoulos meeting may mean for ausus,papodopoulos meeting mean ausus


### 원형을 사용한 문서 벡터화

In [62]:
tfidf = TfidfVectorizer(stop_words=list(stopwords))
dt = tfidf.fit_transform(headlines["lemmas"].map(str))
dt



<1103663x82761 sparse matrix of type '<class 'numpy.float64'>'
	with 5546562 stored elements in Compressed Sparse Row format>

### 단어 유형 제한

In [63]:
tfidf = TfidfVectorizer(stop_words=list(stopwords))
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

<1103663x79759 sparse matrix of type '<class 'numpy.float64'>'
	with 5443874 stored elements in Compressed Sparse Row format>

### 일반 단어 제거

In [64]:
top_10000 = pd.read_csv("https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt", header=None)

In [83]:
tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,2), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
dt



<1103663x559961 sparse matrix of type '<class 'numpy.float64'>'
	with 8415675 stored elements in Compressed Sparse Row format>

In [84]:
tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,3), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x747988 sparse matrix of type '<class 'numpy.float64'>'
	with 9045013 stored elements in Compressed Sparse Row format>

In [85]:
tfidf = TfidfVectorizer(stop_words=list(top_10000.iloc[:,0].values), ngram_range=(1,2), min_df=2)
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

<1103663x94689 sparse matrix of type '<class 'numpy.float64'>'
	with 1531237 stored elements in Compressed Sparse Row format>

<br>

## 구문 유사성

In [86]:
# 불용어와 바이그램을 이용한 벡터화
stopwords.add("test")
tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,2), min_df=2, norm='l2')
dt = tfidf.fit_transform(headlines["headline_text"])

In [87]:
dt

<1103663x559346 sparse matrix of type '<class 'numpy.float64'>'
	with 8405225 stored elements in Compressed Sparse Row format>

In [88]:
made_up = tfidf.transform(["australia and new zealand discuss optimal apple size"])

In [89]:
sim = cosine_similarity(made_up, dt)
headlines.iloc[np.argmax(sim)]

publish_date                                2014-05-09 00:00:00
headline_text    call for australia and new zealand to give job
lemmas           call for australia and new zealand to give job
nav                         call australia new zealand give job
Name: 873886, dtype: object

In [93]:
%%time
batch = 10000
max_sim = 0.0
max_a = None
max_b = None
for a in range(0, dt.shape[0], batch):
    for b in range(0, a+batch, batch):
        r = np.dot(dt[a:a+batch], np.transpose(dt[b:b+batch]))
        r[r > 0.9999] = np.nan
        sim = r.max()
        if sim > max_sim:
            (max_a, max_b) = np.unravel_index(np.argmax(r), r.shape)
            max_a += a
            max_b += b
            max_sim = sim

CPU times: total: 59.4 s
Wall time: 4min 12s


In [94]:
print(max_sim, max_a, max_b)

0.985505841515539 904965 364042


In [95]:
print(headlines.iloc[max_a])
print(headlines.iloc[max_b])

publish_date                                2014-09-18 00:00:00
headline_text    vline fails to meet punctuality targets report
lemmas             vline fail to meet punctuality target report
nav                   vline fail meet punctuality target report
Name: 904965, dtype: object
publish_date                         2008-02-15 00:00:00
headline_text    vline fails to meet punctuality targets
lemmas             vline fail to meet punctuality target
nav                   vline fail meet punctuality target
Name: 364042, dtype: object


### 관련 단어 탐색 (문서-용어 행렬 대신 용어-문서 행렬을 이용 -> 열벡터를 사용)

In [103]:
tfidf_word = TfidfVectorizer(stop_words=list(stopwords), min_df=1000)
dt_word = tfidf_word.fit_transform(headlines["headline_text"])

In [104]:
dt_word

<1103663x1132 sparse matrix of type '<class 'numpy.float64'>'
	with 2980495 stored elements in Compressed Sparse Row format>

In [105]:
r = cosine_similarity(dt_word.T, dt_word.T)
np.fill_diagonal(r, 0)

In [113]:
# 유사도가 가장 큰 항목 찾기
# 1차원배열로 변환 후 np.argsort를 통해 정렬된 요소의 인덱스를 가져오고 어휘 조회를 위해 원래 인덱스를 복원
voc = tfidf_word.get_feature_names_out()
size = r.shape[0]
for index in np.argsort(r.flatten())[::-1][0:40]:
    a = int(index / size)
    b = index % size
    if a > b:
        print(f'{voc[a]} related to {voc[b]}')

sri related to lanka
hour related to country
seekers related to asylum
springs related to alice
pleads related to guilty
hill related to broken
trump related to donald
violence related to domestic
climate related to change
driving related to drink
care related to aged
gold related to coast
royal related to commission
mental related to health
wind related to farm
flu related to bird
murray related to darling
world related to cup
north related to korea
hour related to 2014
