## Preprocessing
I used python generators to deal with huge size of data. The generators read, process, and write a row by a row.

In [1]:
import csv
import copy
import numpy as np
import os
from itertools import tee
from datetime import datetime
import pandas as pd
import re

word2id = dict()
id2word = dict()

def preprocessing(gen):
    '''
    This function edits data from generator.
    year, month, day, weekday, hour information from time_created
    change boolean and string to integer
    
    yield row
    '''
    id_ = 0
    
    for row in gen:
        
        row['time_created'] = datetime.fromtimestamp(int(row['time_created'])+3600*8)
        row['year'] = row['time_created'].year
        row['month'] = row['time_created'].month
        row['day'] = row['time_created'].day
        row['weekday'] = row['time_created'].weekday()
        row['hour'] = row['time_created'].hour

        row['up_votes'] = int(row['up_votes'])
        row['down_votes'] = int(row['down_votes'])
        row['over_18'] = (row['over_18']=='True')*1
        
        row['title'] = row['title'].lstrip(' \t\r\n').rstrip('\t\n\r ')
        row['title'] = re.sub(r"[\n\t\r]", " ", row['title'])

        if row['author'] not in word2id:
            word2id[row['author']] = id_
            id2word[id_] = row['author']
            row['author'] = id_
            id_ += 1
        else:
            row['author'] = word2id[row['author']]

        del row['time_created'], row['date_created'], row['down_votes']
        
        yield row

def csv_generator(path):
    '''
    read csv file as dictionary
    yield line(a row from a csv file)
    '''
    with open(path, 'r', encoding='UTF-8', newline='') as csvfile:
        f = csv.DictReader(csvfile)
        for line in f:
            yield line

def save_json(data, path):
    '''
    save dictionary as json file
    '''
    import json

    with open(path, 'w') as fp:
        json.dump(data, fp)

            
def train_test_val_split(gen, file_path, columns, train_path, val_path, test_path, val_rt, tst_rt, seed=777):
    '''
    It split one data in to three different files(traning, validation, and test set).
    '''
    
    np.random.seed(seed)
    
    with open(train_path, 'w', encoding='UTF-8', newline='') as f1, \
    open(test_path, 'w', encoding='UTF-8', newline='') as f2,\
    open(val_path, 'w', encoding='UTF-8', newline='') as f3:
        train_writes = csv.DictWriter(f1, fieldnames=columns)
        test_writes = csv.DictWriter(f2, fieldnames=columns)
        val_writes = csv.DictWriter(f3, fieldnames=columns)
        
        train_writes.writeheader()
        test_writes.writeheader()
        val_writes.writeheader()
        
        
        for line in gen:
            
            rand = np.random.uniform()

            if rand < tst_rt:
                test_writes.writerow(line)
                
            elif tst_rt < rand < val_rt + tst_rt:
                val_writes.writerow(line)
            
            else:
                train_writes.writerow(line)

                    
def undersampling(file_path, save_path, columns, sampling_rate, seed=777):
    '''
    Training and validation set were undersampled (only 1% of data) when the target(over_18) is zero, 
    since it is extremely imbalanced (about 1600(neg) : 1(pos))
    After the undersampling, the ratio becomes about 14-18 : 1.
    
    yield undersample   
    '''
    
    np.random.seed(seed)
    
    with open(file_path, 'r', encoding='UTF-8', newline='') as read_file,\
    open(save_path, 'w', encoding='UTF-8', newline='') as write_file:
        reads = csv.DictReader(read_file)
        writes = csv.DictWriter(write_file, fieldnames=columns)
        writes.writeheader()
        
        for line in reads:
            
            rand = np.random.uniform()
            if line['over_18'] == '1' or rand < sampling_rate:
                writes.writerow(line)
        
    
def filtering(gen, **kwargs):
    '''
    This function is for filtering the data with specific conditions.
    It can accept only 'key = value' format.
    '''
    
    filters = []
    for key, value in kwargs.items():
        filters.append((key,value))
        
    for row in gen:
        rulls = []
        for f in filters:
            rulls.append(row[f[0]] == f[1])
        
        if all(rulls):
            yield row['title']

In [2]:
file_path = '../data/Eluvio_DS_Challenge.csv'
tr_path = '../data/train.csv'
test_path = '../data/test.csv'
val_path = '../data/val.csv'
tr_un_sam_path = '../data/train_undersampled.csv'
val_un_sam_path = '../data/val_undersampled.csv'
gen = csv_generator(file_path)
gen = preprocessing(gen)

In [3]:
columns = ['over_18', 'up_votes', 'year', 'month', 'day', 'weekday', 'hour', 'category', 'author', 'title']
train_test_val_split(gen, file_path, columns, tr_path, val_path, test_path, 0.2, 0.2)
undersampling(tr_path, tr_un_sam_path, columns, 0.01)
undersampling(val_path, val_un_sam_path, columns, 0.01)
save_json(word2id, '../data/word2id_author.json')
save_json(id2word, '../data/id2word_author.json')

## Topic Modeling

Topic modeling is a statistical method to extract topics from documents. In my opinion, Topic modeling can be useful to understand of the given data that change by time. For example, a dominant topic in 2008 might be the financial crisis. Likewise, we can see topics that are related world cup in June 2010 and 2014 data.  
I used the latent Dirichlet allocation (LDA) for topic modeling. Unfortunately, the LDA model failed to build an accurate topic model since the titles were too short as a document. If the given data were whole newspaper articles or bigger, it could induce better results.

In [4]:
# from https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/topic_modeling_Gensim.ipynb

import spacy
from spacy.lang.en import English

import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

from gensim import corpora
import gensim

import random
import pickle


parser = English()
nltk.download('omw')
nltk.download('wordnet')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package omw to C:\Users\Jay
[nltk_data]     Kim\AppData\Roaming\nltk_data...
[nltk_data]   Package omw is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Jay
[nltk_data]     Kim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jay
[nltk_data]     Kim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Process for LDA. Tokenization, lemmatization, stop words, and biagram were used for this task.

In [5]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def process_for_lda(gen):
    
    text_data = []
    for line in gen:
        tokens = gensim.utils.tokenize(line, lower=True)
        tokens = [get_lemma(token) for token in tokens]
        tokens = [token for token in tokens if token not in en_stop]
        text_data.append(tokens)
    
    bigram = gensim.models.Phrases(text_data)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    text_data = bigram_mod[text_data]

    processed_data = []
    for text in text_data:
        tokens = [token for token in text if len(token) > 1]
        processed_data.append(tokens)
    
    return processed_data

In [6]:
corpus_gen = csv_generator(tr_path)
filtered_gen = filtering(corpus_gen, year='2008', month='10')
titles = process_for_lda(filtered_gen)

Get dictionary from the titles

In [7]:
dictionary = corpora.Dictionary(titles)
corpus = [dictionary.doc2bow(title) for title in titles]
pickle.dump(corpus, open('../models/corpus.pkl', 'wb'))
dictionary.save('../models/dictionary.gensim')

LDA with 5 topics

In [8]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, workers = 7, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
ldamodel.save('../models/model5.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.010*"uk" + 0.008*"say" + 0.007*"war" + 0.007*"ha" + 0.006*"one"')
(1, '0.007*"us" + 0.006*"ha" + 0.005*"world" + 0.005*"kill" + 0.004*"say"')
(2, '0.008*"uk" + 0.007*"government" + 0.005*"war" + 0.005*"china" + 0.005*"wa"')
(3, '0.011*"us" + 0.009*"world" + 0.008*"pakistan" + 0.007*"military" + 0.007*"strike"')
(4, '0.005*"wa" + 0.005*"israel" + 0.005*"bank" + 0.005*"us" + 0.005*"say"')


In [9]:
from gensim.models.coherencemodel import CoherenceModel
score = CoherenceModel(model=ldamodel, texts=titles, dictionary=dictionary, coherence='c_v').get_coherence()
print('Coherence score:',score)

Coherence score: 0.5385849516814162


LDA with 10 topics

In [10]:
ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, workers = 7, num_topics = 10, id2word=dictionary, passes=20)
ldamodel.save('../models/model10.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.010*"wa" + 0.008*"new" + 0.006*"right" + 0.006*"government" + 0.006*"world"')
(1, '0.009*"us" + 0.008*"military" + 0.006*"war" + 0.005*"continue" + 0.005*"pakistan"')
(2, '0.015*"us" + 0.013*"pakistan" + 0.010*"kill" + 0.008*"strike" + 0.007*"government"')
(3, '0.017*"uk" + 0.010*"us" + 0.010*"say" + 0.009*"pakistan" + 0.007*"ha"')
(4, '0.011*"israeli" + 0.010*"china" + 0.009*"say" + 0.007*"arrest" + 0.005*"iraq"')
(5, '0.009*"crisis" + 0.008*"military" + 0.006*"iraq" + 0.006*"india" + 0.006*"say"')
(6, '0.008*"kill" + 0.006*"uk" + 0.006*"death" + 0.005*"afghanistan" + 0.005*"un"')
(7, '0.013*"world" + 0.008*"us" + 0.008*"ha" + 0.007*"crisis" + 0.007*"uk"')
(8, '0.007*"market" + 0.005*"us" + 0.005*"chinese" + 0.005*"world" + 0.004*"new"')
(9, '0.008*"kill" + 0.006*"say" + 0.006*"uk" + 0.006*"one" + 0.006*"world"')


In [11]:
from gensim.models.coherencemodel import CoherenceModel
score = CoherenceModel(model=ldamodel, texts=titles, dictionary=dictionary, coherence='c_v').get_coherence()
print('Coherence score:',score)

Coherence score: 0.5449451342290867


LDA visualization

In [12]:
dictionary = gensim.corpora.Dictionary.load('../models/dictionary.gensim')
corpus = pickle.load(open('../models/corpus.pkl', 'rb'))

import pyLDAvis.gensim
lda = gensim.models.ldamodel.LdaModel.load('../models/model5.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [13]:
lda10 = gensim.models.ldamodel.LdaModel.load('../models/model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)