In [None]:
from collections import defaultdict, Counter
from functools import reduce
import glob

import pandas as pd
import simplejson as json

In [None]:
# Relative path of the words CSV file that is loaded with pd.read_csv.
words_filename = '../words/words.csv'

In [None]:
def wordlist_to_wordset(word_list):
    """Flatten a list of '/'-separated word groups into a set of words.

    Each entry whose type is exactly ``str`` is split on '/' and every
    piece is stripped of surrounding whitespace. Entries of any other type
    (for example ``None`` or float NaN) are ignored.

    Args:
        word_list: Iterable of word groups.

    Returns:
        A ``set`` containing every stripped piece found.
    """
    return {
        piece.strip()
        for group in word_list
        if type(group) is str
        for piece in group.split('/')
    }

def build_tw_cn_dict(tw_word_list, cn_word_list):
    """Build two-way lookup tables between TW and CN words.

    The two lists are walked in parallel (row by row). Each entry may hold
    several variants separated by '/'; every TW variant of a row is paired
    with every CN variant of the same row.

    A non-string entry (e.g. ``None`` or a float NaN) is treated as a
    missing value and represented by the single placeholder ``None``, so
    the word on the other side is still recorded with ``None`` as its
    counterpart.

    Args:
        tw_word_list: Iterable of TW word groups.
        cn_word_list: Iterable of CN word groups, aligned with
            ``tw_word_list``.

    Returns:
        A tuple ``(tw2cn, cn2tw)`` of ``defaultdict(list)`` objects mapping
        each TW word to its CN counterparts and vice versa.
    """

    def split_group(word_group):
        # Missing values are mapped to a single None placeholder.
        if isinstance(word_group, str):
            return [c.strip() for c in word_group.split('/')]
        return [None]

    tw2cn = defaultdict(list)
    cn2tw = defaultdict(list)

    for tw_word_group, cn_word_group in zip(tw_word_list, cn_word_list):
        tw_words = split_group(tw_word_group)
        # The original assigned the placeholder to tw_words here, leaving
        # cn_words undefined or stale from the previous row.
        cn_words = split_group(cn_word_group)

        for tw_word in tw_words:
            for cn_word in cn_words:
                tw2cn[tw_word].append(cn_word)
                cn2tw[cn_word].append(tw_word)

    return tw2cn, cn2tw

def new_defined_wordset(tw_word_list, cn_word_list):
    """Collect CN words that do not occur inside any TW variant of their row.

    Rows where either side is not exactly a ``str`` are skipped. Within a
    row, a CN variant is kept only if it is not a substring of any TW
    variant of that same row. Finally, every word that appears anywhere in
    ``tw_word_list`` (after '/' splitting) is removed from the result.

    Args:
        tw_word_list: Iterable of TW word groups ('/'-separated variants).
        cn_word_list: Iterable of CN word groups, aligned with
            ``tw_word_list``.

    Returns:
        A ``set`` of the remaining CN words.
    """
    candidates = set()
    for tw_group, cn_group in zip(tw_word_list, cn_word_list):
        if type(tw_group) is not str or type(cn_group) is not str:
            continue
        tw_variants = [part.strip() for part in tw_group.split('/')]
        for cn_variant in {part.strip() for part in cn_group.split('/')}:
            # Keep the CN word only if no TW variant contains it.
            if not any(cn_variant in tw_variant for tw_variant in tw_variants):
                candidates.add(cn_variant)

    # Drop anything that is itself a TW word somewhere in the table.
    return candidates - wordlist_to_wordset(tw_word_list)

# Load the word table and pull the 'cn_word' / 'tw_word' columns out as lists.
df = pd.read_csv(words_filename)
cn_word_list = df['cn_word'].tolist()
tw_word_list = df['tw_word'].tolist()
# Flatten each column into a set of individual words ('/'-separated groups are split).
cn_word_set = wordlist_to_wordset(cn_word_list)
tw_word_set = wordlist_to_wordset(tw_word_list)
# Disabled alternative: plain set difference between the two word sets.
# filtered_word_set = cn_word_set - tw_word_set
# Word set that is searched for in the articles further down.
filtered_word_set = new_defined_wordset(tw_word_list, cn_word_list)
# Lookup tables in both directions between TW and CN words.
tw2cn, cn2tw = build_tw_cn_dict(tw_word_list, cn_word_list)

In [None]:
def find_cn_words(article, word_set):
    """Report which words from ``word_set`` occur in ``article``.

    Presence is tested with the ``in`` operator, so a word is recorded once
    per entry of ``word_set`` no matter how many times it occurs in the
    article.

    Args:
        article: Text (or other container) to search in.
        word_set: Iterable of words to look for.

    Returns:
        A ``Counter`` with a count of 1 for each word of ``word_set`` found
        in ``article``; empty if none were found.
    """
    return Counter(word for word in word_set if word in article)

In [None]:
# Tally, over all matched news files, how many files each filtered word was found in.
freq_counter = Counter()
for filename in glob.glob('../news/*/*/*'):
    with open(filename, 'r', encoding='utf-8') as f:
        # Each file is parsed as JSON; its 'content' and 'url' keys are read below.
        article = json.load(f)
        result = find_cn_words(article['content'], filtered_word_set)
        # Only report and accumulate files in which at least one word was found.
        if result:
            print(article['url'], result)
            freq_counter += result