# Analysis

In [29]:
from tqdm import tqdm
import pandas as pd

# Influenza table; the correlation cell below reads its 'ratio' column.
flu_rate = pd.read_csv('./data/170102_210531_influenza.csv')
# Per-word trend table. The correlation cell iterates over every column except
# the first (presumably a date/index column - TODO confirm).
word_trends = pd.read_csv('./data/독감+증상_word_trends.csv')

In [3]:
def cross_corr(x, y, max_lag=0):
    '''
    Correlate ``x`` with progressively shifted copies of ``y``.

    Args:
        x (pandas.Series): series that stays in place
        y (pandas.Series): series that is moved via ``y.shift(lag)``
        max_lag (int): largest shift to evaluate; every lag in 0..max_lag is computed

    Returns:
        corr (pandas.Series): correlation per lag, indexed by the lag value
    '''
    lags = range(max_lag + 1)
    return pd.Series({shift: x.corr(y.shift(shift)) for shift in lags})


In [27]:
# For every word column (all columns after the first), correlate the flu 'ratio'
# series with that word's series shifted by 0..12 rows. Result: {word: Series of
# correlations indexed by lag}.
word_trends_with_lag = {word: cross_corr(flu_rate['ratio'], word_trends[word], max_lag=12)
                         for word in tqdm(word_trends.columns[1:])}

100%|██████████| 1000/1000 [00:05<00:00, 194.77it/s]


In [30]:
# Rank every word by its strongest correlation with the flu ratio, remembering
# the lag at which that peak occurs. Each entry is (word, max_corr, lag).
corr_topn = []
for word, corr in word_trends_with_lag.items():
    # idxmax() returns the index *label* of the peak, which is the lag itself.
    # argmax() returns a position, which only matches the label while the index
    # happens to be 0..n, so the label-based lookup below needs the label.
    lag = corr.idxmax()
    max_corr = corr[lag]
    corr_topn.append((word, max_corr, lag))

# Strongest correlation first.
corr_topn.sort(key=(lambda x: x[1]), reverse=True)

## Extract "flu-related" words

In [1]:
from gensim.models import KeyedVectors

# Load the saved word vectors; this file name matches the one written by the
# "save KeyedVectors" cell further down in this notebook.
kv_fname = 'kowiki-neg-300.kv'
w2v_kv = KeyedVectors.load(f'./models/{kv_fname}')

In [3]:
# Preview the 10 words most_similar_cosmul returns for the positive pair '독감' + '증상'.
w2v_kv.most_similar_cosmul(positive=['독감', '증상'], topn=10)

[('질병', 0.6456781029701233),
 ('합병증', 0.6416015028953552),
 ('패혈증', 0.6396576166152954),
 ('피부병', 0.6389033198356628),
 ('황달', 0.637260377407074),
 ('매독', 0.6342878341674805),
 ('기관지염', 0.6325851678848267),
 ('폐렴', 0.6323192715644836),
 ('급성', 0.6281660795211792),
 ('복통', 0.6278930902481079)]

In [21]:
def chunks(lst, n):
    """Generate consecutive slices of ``lst``, each at most ``n`` items long.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

In [4]:
from pytrends.request import TrendReq

# Query list: the seed word '독감' followed by the 1000 words that
# most_similar_cosmul returns for the positive pair ('독감', '증상'); the
# similarity scores are discarded.
flu_related_words = ['독감'] + [w for w, _ in w2v_kv.most_similar_cosmul(positive=['독감', '증상'], topn=1000)]
# pytrends client created with hl='ko-KR'.
pyt = TrendReq(hl='ko-KR')

In [25]:
import pandas as pd
import time
from tqdm import tqdm

# Download Google Trends interest for every keyword, five keywords per request,
# and checkpoint the accumulated table to CSV after each request that returned data.
trends = pd.DataFrame()  # defined up front so the cell also runs on a fresh kernel
trend_frames = []        # one frame per request that returned data
keyword_batches = list(chunks(flu_related_words, 5))

for i, keywords in tqdm(enumerate(keyword_batches), total=len(keyword_batches)):
    try:
        pyt.build_payload(keywords, timeframe='2017-01-01 2021-06-05', geo='KR')
        batch = pyt.interest_over_time()
        # NOTE(review): pytrends appears to return an empty frame (without the
        # 'isPartial' column) when no keyword has data - confirm. Such a batch
        # is skipped instead of aborting the whole download with a KeyError.
        if not batch.empty:
            trend_frames.append(batch.drop(columns='isPartial'))
            trends = pd.concat(trend_frames, axis=1)
            trends.to_csv('./data/독감+증상_google_trends.csv')
        time.sleep(5)  # pause between consecutive requests
    except Exception as err:
        # Report where the download stopped and why, then give up on the rest.
        # KeyboardInterrupt is deliberately not caught here.
        print(i, keywords, repr(err))
        break


199it [21:14,  6.40s/it]


In [31]:
# Display the accumulated Google Trends table (one column per keyword).
trends

Unnamed: 0_level_0,독감,질병,합병증,패혈증,피부병,황달,매독,기관지염,폐렴,급성,...,성욕,뇌막,뇌일혈,에리트로포이에틴,원추각막,열대병,요법,마비저,손상,통풍
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,16,6,1,0,2,0,0,0,1,0,...,68,0,0,0,0,0,63,0,63,52
2017-01-08,10,6,1,1,1,0,0,0,1,0,...,55,0,0,0,4,0,52,0,37,61
2017-01-15,9,5,1,0,1,1,0,0,1,1,...,42,0,0,0,0,0,40,0,44,41
2017-01-22,3,4,1,1,2,0,0,0,1,1,...,40,0,0,0,0,0,31,0,35,29
2017-01-29,2,5,2,1,1,0,0,0,1,2,...,65,0,0,0,0,0,46,0,21,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-02,3,22,1,2,1,0,0,0,1,1,...,33,0,0,0,4,0,31,0,47,37
2021-05-09,3,33,2,1,0,0,0,0,1,1,...,39,0,0,0,0,0,40,0,34,35
2021-05-16,3,17,2,1,0,0,0,0,1,1,...,40,0,0,0,0,0,37,0,61,35
2021-05-23,2,26,0,2,0,0,0,0,1,1,...,44,0,0,0,0,0,32,0,47,63


## Make word2vec

In [4]:
from gensim.models import Word2Vec

# Word2Vec settings passed to the constructor below.
min_count = 5
window_size = 5
num_neg = 15
vector_size = 300

corpus_fname = 'kowiki_corpus.txt'

print('Making sentences as list...')
# Each corpus line is one sentence; tokens are separated by whitespace.
with open(f'./data/{corpus_fname}', 'r', encoding='utf8') as fin:
    sents = [line.split() for line in fin]

print('Making word vectors...')
w2v_model = Word2Vec(
    sents,
    vector_size=vector_size,
    min_count=min_count,
    negative=num_neg,
    window=window_size,
)

# Persist the full model (the vectors alone are saved in a separate cell).
w2v_model.save('./models/kowiki-neg-300.bin')

Making sentences as list...
Making word vectors...


In [13]:
# Sanity check: words the freshly trained model ranks as most similar to '독감'.
w2v_model.wv.most_similar('독감')

[('인플루엔자', 0.7366062998771667),
 ('홍역', 0.6661979556083679),
 ('콜레라', 0.652269721031189),
 ('출혈열', 0.648910641670227),
 ('유행병', 0.6479013562202454),
 ('전염병', 0.6393334269523621),
 ('뎅기열', 0.6355220079421997),
 ('대유행', 0.6307356357574463),
 ('말라리아', 0.6306702494621277),
 ('광견병', 0.6233310103416443)]

In [14]:
# Keep only the word vectors of the trained model and save them on their own;
# this is the file the "Extract flu-related words" section loads.
w2v_kv = w2v_model.wv
w2v_kv.save('./models/kowiki-neg-300.kv')

## Build corpus

In [4]:
from xml.etree import ElementTree as ET

# Wikipedia dump file under ./data/; the part before the first '-' ("kowiki")
# is reused below to name the corpus output file.
wiki_fname = 'kowiki-latest-pages-articles.xml'

In [5]:
import re

def clean_text(text):
    '''
    Strip wiki/HTML markup from an article body, keeping Hangul and sentence punctuation.

    Args:
      text: A string of raw wiki markup. None or '' is accepted.

    Returns:
      A string in which references, tags, entities, brace markup, link targets
      and quote markers are removed, every character other than whitespace,
      Hangul syllables (가-힣) and ``.?!`` is replaced by a space, and runs of
      two or more whitespace characters are squeezed to one space.
      Returns '' for None or empty input.
    '''
    if not text:
        return ''

    # All patterns are raw strings so that \[ and \s reach the regex engine
    # unchanged instead of triggering Python's invalid-escape-sequence warning.

    # Common
    text = re.sub(r"(?s)<ref>.+?</ref>", "", text)  # remove reference links
    text = re.sub(r"(?s)<[^>]+>", "", text)  # remove html tags
    text = re.sub(r"&[a-z]+;", "", text)  # remove html entities
    text = re.sub(r"(?s){{.+?}}", "", text)  # remove markup tags
    text = re.sub(r"(?s){.+?}", "", text)  # remove markup tags
    text = re.sub(r"(?s)\[\[([^]]+\|)", "", text)  # remove link target strings
    text = re.sub(r"(?s)\[\[([^]]+\:.+?]])", "", text)  # remove media links

    # Longest quote run first, so five quotes are not consumed as 3 + 2.
    text = re.sub(r"[']{5}", "", text)  # remove italic+bold symbols
    text = re.sub(r"[']{3}", "", text)  # remove bold symbols
    text = re.sub(r"[']{2}", "", text)  # remove italic symbols

    text = re.sub(r"[^\s\r\n가-힣.?!]", " ", text)  # replace unacceptable characters with a space
    text = re.sub(r"([.?!]){2,}", r"\1", text)  # collapse repeated punctuation
    # Remove isolated punctuation but leave one space behind, so the words on
    # either side are not glued together.
    text = re.sub(r"\s[.?!]\s", " ", text)

    # Common
    text = re.sub(r"\s{2,}", " ", text)  # squeeze spaces
    return text

In [10]:
def sentence_segment(text):
    '''
    Split a paragraph into sentences.

    A sentence boundary is either one or more newlines (together with a
    directly preceding ``.``, ``?`` or ``!``) or one of ``.?!`` followed by a
    space. The boundary characters themselves are not returned.

    Args:
      text: A string. An unsegmented paragraph.

    Returns:
      A list of non-empty sentence strings.
    '''
    # The pattern has no capturing group: with one, re.split() would insert the
    # captured punctuation (or None when the group did not match) into the
    # result alongside the sentences.
    pieces = re.split(r'[.?!]?\n+|[.?!] ', text)
    return [piece for piece in pieces if piece]

In [11]:
from konlpy.tag import Mecab

mecab = Mecab()


def word_segment(text):
    """Return the first item of every pair from ``mecab.pos(text)``, dropping the second."""
    tokens = []
    for surface, _tag in mecab.pos(text):
        tokens.append(surface)
    return tokens

In [47]:
# Create (or truncate) the corpus output file. Leaving the `with` block closes
# the handle, so no explicit close() call is needed.
with open(f'./data/{wiki_fname.split("-")[0]}_corpus.txt', 'w', encoding='utf-8') as fout:
    pass

In [49]:
import itertools
from tqdm import tqdm

# Namespace prefix that is stripped from each element tag before comparison.
ns = '{http://www.mediawiki.org/xml/export-0.10/}'
n_failed = 0  # number of elements whose processing raised an exception

# Stream the dump element by element and write one tokenised sentence per line.
with open(f'./data/{wiki_fname.split("-")[0]}_corpus.txt', 'w', encoding='utf-8') as fout:
    for _, elem in tqdm(ET.iterparse(f'./data/{wiki_fname}')):
        try:
            tag = elem.tag.replace(ns, '')
            # Elements without any text are skipped explicitly rather than by
            # letting clean_text() raise on None.
            if tag == 'text' and elem.text:
                running_text = clean_text(elem.text)
                sents = sentence_segment(running_text)
                for sent in sents:
                    if sent:
                        words = word_segment(sent)
                        # Keep only sentences longer than 10 tokens.
                        if len(words) > 10:
                            fout.write(' '.join(words) + '\n')
        except Exception:
            # Best effort: skip an element that cannot be processed, but let
            # KeyboardInterrupt/SystemExit through so a long run can be stopped.
            n_failed += 1
        finally:
            # Always release the element's contents, even when it was skipped.
            elem.clear()

print(f'{n_failed} elements skipped because of errors')

25979812it [30:56, 13991.10it/s]
