In [1]:
from copy import copy
import pandas as pd
import numpy as np

from collections import Counter

In [2]:
class Analyzer(object):
    """Summarise an article CSV: word rankings and per-media article counts.

    The CSV is loaded once with ``pd.read_csv``. The methods below read the
    columns 'Word Frequency', 'Media', 'Category' and 'Date'.

    'Word Frequency' cells are parsed as ``word:freq`` entries joined by '/'
    (e.g. ``'cat:2/dog:1'``); cells that are not strings (such as NaN for an
    empty cell) are treated as "no words extracted".
    """

    # Exclusive upper bounds of periods 1..3, compared as plain strings, so
    # 'Date' values must sort lexicographically (e.g. zero-padded YYYY-MM-DD).
    # NOTE(review): the last bound is 12-10 while the others are 12-11 --
    # kept as in the original; confirm this is intended.
    _PERIOD_BOUNDS = ('2017-12-11', '2018-12-11', '2019-12-10')

    def __init__(self, csv_filename, blacklist_filename):
        """Load the article CSV and the word blacklist.

        Args:
            csv_filename: path of the CSV file read into ``self.df``.
            blacklist_filename: path of a UTF-8 text file with one
                blacklisted word per line.
        """
        self.csv_filename = csv_filename
        self.blacklist_filename = blacklist_filename
        self.df = pd.read_csv(csv_filename)
        self.build_blacklist_word()

    def build_blacklist_word(self):
        """(Re)build ``self.blacklist_words`` from ``self.blacklist_filename``.

        Every line is stripped of surrounding whitespace and stored in a set.
        """
        with open(self.blacklist_filename, 'r', encoding='utf-8') as f:
            self.blacklist_words = {line.strip() for line in f}

    @staticmethod
    def _iter_words(word_freq_str):
        """Yield the word of every well-formed ``word:freq`` entry in a cell."""
        for entry in word_freq_str.split('/'):
            # Split on the LAST ':' so a word that itself contains ':' stays
            # intact; entries without a separator or without a word (e.g. the
            # empty piece produced by a trailing '/') are skipped instead of
            # raising ValueError on unpacking.
            word, separator, _freq = entry.rpartition(':')
            if separator and word:
                yield word

    def word_ranking(self):
        """Rank non-blacklisted words by how many entries mention them.

        Each ``word:freq`` entry adds 1 to its word's total.
        NOTE(review): the per-article ``freq`` value is parsed but not added
        up, so 'Frequency' is a count of entries rather than a sum of
        occurrences -- kept as in the original; confirm this is intended.

        Returns:
            DataFrame with columns 'Word' and 'Frequency', most common first.
        """
        counter = Counter()
        for word_freq_str in self.df['Word Frequency']:
            if not isinstance(word_freq_str, str):
                continue  # missing cell: nothing to count
            counter.update(
                word
                for word in self._iter_words(word_freq_str)
                if word not in self.blacklist_words
            )
        return pd.DataFrame(counter.most_common(), columns=['Word', 'Frequency'])

    def _article_counts(self, frame, keys):
        """Sum per-group article counts of ``frame`` grouped by ``keys``.

        'Word Frequency' is replaced by a 0/1 flag (1 when the cell is not
        null) and an 'Article Count' column of ones is added, so their group
        sums are the number of articles with words and the number of articles.
        ``frame`` itself is not modified.
        """
        flagged = frame.assign(**{
            'Word Frequency': frame['Word Frequency'].notnull().astype(int),
            'Article Count': 1,
        })
        # numeric_only=True: text columns (dates, categories, ...) must be
        # left out of the sum; recent pandas no longer drops them implicitly
        # and would concatenate the strings or raise.
        return flagged.groupby(keys).sum(numeric_only=True).reset_index()

    def category_article_count(self):
        """Return article counts grouped by 'Media' and 'Category'.

        Returns:
            DataFrame with the two group keys followed by the summed numeric
            columns, including 'Word Frequency' (articles with a non-null
            word list) and 'Article Count' (all articles).
        """
        return self._article_counts(self.df, ['Media', 'Category'])

    def determine_date(self, date_str):
        """Map a date string to its period number (1-4).

        Returns the 1-based index of the first bound in ``_PERIOD_BOUNDS``
        that is strictly greater than ``date_str`` (string comparison), or 4
        when ``date_str`` is not below any bound.
        """
        for period, upper_bound in enumerate(self._PERIOD_BOUNDS, start=1):
            if date_str < upper_bound:
                return period
        return len(self._PERIOD_BOUNDS) + 1

    def year_article_count(self):
        """Return article counts grouped by 'Media' and period ('Year').

        'Year' is derived from the 'Date' column via ``determine_date``.

        Returns:
            DataFrame with the two group keys followed by the summed numeric
            columns, including 'Word Frequency' (articles with a non-null
            word list) and 'Article Count' (all articles).
        """
        with_year = self.df.assign(Year=self.df['Date'].apply(self.determine_date))
        return self._article_counts(with_year, ['Media', 'Year'])
        

In [3]:
# Build the analyzer from the extracted-words CSV and the blacklist file.
analyzer = Analyzer(
    csv_filename='../result/article_words.csv',
    blacklist_filename='../words/analyze_black_list.txt',
)

In [4]:
# Write the word ranking to CSV without the index, encoded as 'utf_8_sig'.
analyzer.word_ranking().to_csv(
    '../result/word_ranking.csv',
    index=False,
    encoding='utf_8_sig',
)

In [5]:
# Write the per-media, per-category article counts to CSV without the index.
analyzer.category_article_count().to_csv(
    '../result/category_article_count.csv',
    index=False,
    encoding='utf_8_sig',
)

In [6]:
# Write the per-media, per-period article counts to CSV without the index.
analyzer.year_article_count().to_csv(
    '../result/year_article_count.csv',
    index=False,
    encoding='utf_8_sig',
)