In [1]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from scipy.sparse import dok_matrix
from stop_words import get_stop_words
import numpy as np
import json

In [2]:
# Load the raw tweet records. The encoding is pinned to UTF-8 (the standard
# JSON encoding) so decoding does not depend on the platform's locale default.
with open("twit_new.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Collect the distinct tweet texts and the raw vocabulary in one pass.
twitTexts = set()

voca = set()
for twit in data:
    # Tokenise on whitespace; re-joining with single spaces below collapses
    # runs of whitespace, so texts differing only in spacing are deduplicated.
    tokens = twit['body'].split()
    # Keep only tweets with more than three tokens (the `> 3` test also drops
    # tweets that have exactly three tokens).
    if len(tokens) > 3:
        voca.update(tokens)
        twitTexts.add(" ".join(tokens))

# Release the raw records; re-running this cell requires reloading `data`.
del data
# Sort instead of list(): set iteration order for strings changes between
# interpreter runs (hash randomisation), which would make the row index of
# every tweet differ from one kernel restart to the next.
twitTexts = sorted(twitTexts)

In [4]:
# Stop-word set: the 'en' list from stop_words plus a few extra filler words.
stopwords = set(get_stop_words('en'))
stopwords.update(['via', 'will', 'just'])
# Keep only words that are at least three characters long.
voca = {v for v in voca if len(v) > 2}
# Drop the stop words and sort: a sorted list gives every word the same
# column index on every run, whereas list(set) order varies between
# interpreter runs because of string hash randomisation.
voca = sorted(voca - stopwords)
# Map each vocabulary word to its column index.
voca_id = {w: i for i, w in enumerate(voca)}

In [5]:
# Allocate an empty sparse (DOK) count matrix:
# one row per tweet, one column per vocabulary word.
tdm = dok_matrix(
    (len(twitTexts), len(voca)),
    dtype=np.float32,
)
print(tdm.shape)

(20185, 17253)


In [6]:
# Fill the matrix with raw counts: entry (i, j) is the number of times
# vocabulary word j occurs in tweet i.
for i, twit in enumerate(twitTexts):
    for word in twit.split():
        col = voca_id.get(word)
        if col is None:
            # Word was filtered out of the vocabulary (stop word or shorter
            # than three characters) -- skip it. An explicit lookup replaces
            # the former bare `except:`, which also hid unrelated errors.
            continue
        tdm[i, col] += 1

In [7]:
# Scale every document (row) of the count matrix to unit L2 norm.
tdm_ = normalize(tdm, norm="l2", axis=1)

In [8]:
# Factorise the L2-normalised matrix into K topics with NMF.
K = 10
# A fixed random_state makes the factorisation repeatable across runs.
nmf = NMF(n_components=K, init='nndsvd', random_state=0)
W = nmf.fit_transform(tdm_)  # one row of topic weights per document
H = nmf.components_          # one row of word weights per topic

In [9]:
# Print the 20 highest-weighted vocabulary words of each topic.
for k in range(K):
    print(f"{k}th topic")
    top_terms = H[k].argsort()[::-1][:20]
    # Each word is followed by a single space, all on one line.
    print("".join(f"{voca[j]} " for j in top_terms))
    print()

0th topic
ransomware new know threat now bitcoin hit data pay get hackers need healthcare files read nhscyberattack virus infected ransom prevent 

1th topic
wannacry windows now know patch need microsoft like bitcoin don ransom wcry new nsa kill wannacrypt switch analysis virus infected 

2th topic
cybersecurity infosec hackers defstar5 databreach cybercrime cyberattacks iot 2017 makeyourownlane business infographic bigdata machinelearning hacking fintech experts top steps ciso 

3th topic
attack cyber global massive nhs countries hit hospitals wake hits call nsa recent next news microsoft behind korea says biggest 

4th topic
security cyber infosec experts iot hacking microsoft threats data technology now tech windows internet news updates patch lessons cybercrime privacy 

5th topic
latest thanks daily news paper blog hacker wannacry tech diario times updates tcdisrupt read check business ciberataque cloud infosec journal 

6th topic
attacks cyber wannacrypt healthcare like recent t

In [10]:
# Re-weight the raw counts with TF-IDF as the input for a second NMF run.
# Note: this overwrites the L2-normalised matrix held in `tdm_`.
tfidf = TfidfTransformer().fit(tdm)
tdm_ = tfidf.transform(tdm)

In [11]:
# Factorise the matrix currently held in `tdm_` into K topics with NMF.
K = 10
# A fixed random_state makes the factorisation repeatable across runs.
nmf = NMF(n_components=K, init='nndsvd', random_state=0)
W = nmf.fit_transform(tdm_)  # one row of topic weights per document
H = nmf.components_          # one row of word weights per topic

In [12]:
# Print the 20 highest-weighted vocabulary words of each topic.
for k in range(K):
    print(f"{k}th topic")
    ranked = H[k].argsort()[::-1]
    # Each word is followed by a single space, all on one line.
    print("".join(f"{voca[j]} " for j in ranked[:20]))
    print()

0th topic
ransomware attacks new healthcare threat data pay prevent now get bitcoin ransom can files free hit read businesses 2016 hackers 

1th topic
cybersecurity infosec malware defstar5 hackers databreach cybercrime ransomware cyberattacks iot steps bigdata machinelearning hacking makeyourownlane fintech infographic 2017 top ciso 

2th topic
wannacry kill switch like don wcry now ransom analysis bitcoin north worm korea ransomeware virus make infected spread btc total 

3th topic
protect can business ransomware learn help tips attacks steps organization ways data computer simple best webinar don check advice take 

4th topic
latest thanks daily news paper wannacry hacker blog tech tcdisrupt ransomware times diario updates ciberataque read roundup stories check update 

5th topic
know need everything ransomware things wannacry wcry want wannacrypt don recent faq immediately cybersecurity crisis video anyone global five defend 

6th topic
attack cyber global massive ransomware countr

In [14]:
# Transpose so that row k of W_ holds the weight of topic k in every tweet,
# then print the five highest-weighted tweets of each topic.
W_ = W.T
for k in range(K):
    print(f"{k}th topic")
    strongest = np.argsort(W_[k])[::-1][:5]
    for doc_idx in strongest:
        print(twitTexts[doc_idx])
    print()

0th topic
what is ransomware via
the growing threat of ransomware ransomware ransomware
how to deal with the rising threat of ransomware ransomware ransomware ransomware
us hospitals face growing ransomware threat ransomware ransomware ransomware
5 things to know about ransomware ransomware ransomware

1th topic
here is how not to wannacry cybersecurity
10 steps to cybersecurity cyberattack ransomware malware infosec ai
what are 10 steps of cybersecurity by cyberattack ransomware malware infosec ai ml
infosec security cybersecurity on the ransomware attack
how to recover from a ransomware attack cybersecurity infosec malware

2th topic
wannacry wannacry 1 wannacry wannacry 1 wannacry wannacry 1 wannacry wannacry 1
here we go wannacry
wannacry when him go
what is the ransomware wannacry worm wannacry wannacry
make s you wannacry wannacry via

3th topic
how to protect yourself against ransomware
how to protect yourself from ransomware via
how to protect yourself from ransomware
how to pr