In [110]:
import os, datetime
import torch
import pandas as pd
import numpy as np
import gensim, re, os 
import string
import subprocess
import multiprocessing
import pickle as pk
import time, sys
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from gensim import corpora
from gensim.models import Word2Vec
from numpy.linalg import norm
from gensim.models import ldamulticore

# Seed NumPy's global random generator.
np.random.seed(42)

# Restrict CUDA-aware libraries to device 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Shared text-normalisation resources read by clean().
lemma = WordNetLemmatizer()
stop = set(stopwords.words('english'))
include = set(string.printable)                      # characters allowed to survive
exclude = set(string.punctuation + string.digits)    # characters stripped from the text

# (phrase as it appears after punctuation removal, replacement).
# Kept as pairs so the two parallel lists below cannot drift out of step.
_phrase_pairs = [
    ('hong kong', 'hong-kong'),
    ('north korea', 'north-korea'),
    ('new york', 'new-york'),
    ('south korea', 'south-korea'),
    ('united state', 'united-state'),
    ('united kingdom', 'united-kingdom'),
    (' corp ', ' corporation '),
    (' cant ', ' cannot '),
    ('white house', 'white-house'),
    ('united nation', 'united-nation'),
]
origin = [_old for _old, _new in _phrase_pairs]
joined = [_new for _old, _new in _phrase_pairs]

In [18]:
def clean(doc):
    """Turn one raw document string into a list of normalised tokens.

    Steps, in order:
      1. lower-case and collapse every whitespace run to a single space;
      2. keep only characters in ``include`` that are not in ``exclude``;
      3. rewrite each phrase in ``origin`` to its counterpart in ``joined``;
      4. drop tokens found in ``stop``;
      5. lemmatise each remaining token with ``lemma``;
      6. drop stop words again, since lemmatising can produce new ones.

    Returns the surviving tokens as a list of strings.
    """
    text = re.sub(r'\s+', ' ', doc.lower())

    # Single pass over the characters: allowed and not explicitly excluded.
    kept = ''.join(c for c in text if c in include and c not in exclude)

    for phrase, replacement in zip(origin, joined):
        kept = kept.replace(phrase, replacement)

    tokens = [tok for tok in kept.split() if tok not in stop]

    # Join and re-split so the lemmatiser's output is tokenised on whitespace.
    lemmas = ' '.join(lemma.lemmatize(tok) for tok in tokens).split()

    return [tok for tok in lemmas if tok not in stop]

In [11]:
# Load the news CSV; the 'content' column is read by a later cell.
df = pd.read_csv(filepath_or_buffer='./data/news_guardian.csv')

In [149]:
# Number of rows loaded.
df.shape[0]

132542

In [21]:
# Flatten each article onto one line. Missing bodies arrive as NaN, and
# str(NaN) would otherwise inject the literal word "nan" into the corpus,
# so they are replaced by an empty string first.
doc = [('' if pd.isna(x) else str(x)).replace('\n', ' ') for x in df['content'].tolist()]
len_lst = [len(x) for x in doc]

# Tokenise every article in parallel, one worker process per CPU core.
with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
    cleaned = pool.map(clean, doc)

# Build the vocabulary, then prune it: no_below is 1% of the corpus size
# (as an absolute document count) and no_above is a fraction of 0.8.
dictionary = corpora.Dictionary(cleaned)
dictionary.filter_extremes(no_below=int(len(cleaned)*0.01), no_above=0.8)
print("DICTIONARY", len(dictionary))

# Make sure the target directory exists so the dump cannot fail on a fresh checkout.
os.makedirs("./savepoints", exist_ok=True)
with open("./savepoints/dictionary.pickle" , "wb") as output_file:
    pk.dump(dictionary, output_file)
print('Loading Complete')

DICTIONARY 4662


NameError: name 'target_path' is not defined

In [22]:
# Persist the pruned dictionary. The directory is created on demand so this
# cell also works on a fresh checkout where ./savepoints does not exist yet.
os.makedirs("./savepoints", exist_ok=True)
with open("./savepoints/dictionary.pickle" , "wb") as output_file:
    pk.dump(dictionary, output_file)
print('Loading Complete')

Loading Complete


In [47]:
# Train skip-gram (sg=1) word vectors on the tokenised articles.
# min_count=1 keeps every token, so each dictionary term has a vector.
# NOTE(review): `size` is the gensim 3.x spelling; newer gensim releases
# reportedly call it `vector_size` — confirm against the installed version.
skip_model = Word2Vec(
    sentences=cleaned,
    min_count=1,
    size=128,
    window=5,
    sg=1,
    workers=4,
)

In [69]:
# Persist the trained embedding model with pickle.
with open('./savepoints/w2.pk', 'wb') as fh:
    pk.dump(skip_model, fh)

In [55]:
# Vocabulary size after pruning.
len(dictionary)

4662

In [88]:
def get_top(dictionary, keyword, model, topk = 10):
    """Return the ``topk`` dictionary terms nearest to ``keyword`` in embedding space.

    Distance is the Euclidean norm of ``model.wv[keyword] - model.wv[term]``,
    evaluated for every term in ``dictionary.token2id``.

    Parameters
    ----------
    dictionary : object whose ``token2id`` is iterated for candidate terms.
    keyword : key looked up in ``model.wv`` as the reference vector.
    model : object exposing ``wv``, indexable by term.
    topk : int, maximum number of neighbours to return (default 10).

    Returns
    -------
    (terms, distances) : two lists of equal length, ordered by increasing
        distance; equal distances keep dictionary iteration order. The lists
        hold ``min(topk, number of terms)`` entries, with no placeholder
        padding when the dictionary is smaller than ``topk``.

    Any lookup error raised by ``model.wv[...]`` for an unknown keyword or
    term propagates to the caller.
    """
    if topk <= 0:
        return [], []

    # Loop-invariant: fetch the reference vector once.
    anchor = model.wv[keyword]

    scored = [(norm(anchor - model.wv[term]), term) for term in dictionary.token2id]
    # list.sort is stable, so ties stay in dictionary order.
    scored.sort(key=lambda pair: pair[0])

    nearest = scored[:topk]
    return [term for _, term in nearest], [dist for dist, _ in nearest]

In [57]:
# Length of the 'tariff' embedding, for a sense of scale of the distances.
np.linalg.norm(skip_model.wv['tariff'])

5.7209263

In [127]:
# The 200 dictionary terms closest to 'tariff'; word_lst is read by the
# cell that builds the LDA eta prior.
word_lst, val_lst = get_top(dictionary, 'tariff', skip_model, topk=200)
print(word_lst)

In [148]:
# Exploratory sanity check of the embedding only. The result is stored under
# its own names so that a top-to-bottom run does not overwrite word_lst /
# val_lst (the 'tariff' neighbours), which the eta-prior cell reads.
economy_words, economy_vals = get_top(dictionary, 'economy', skip_model, 20)
print(economy_words)

['economy', 'economic', 'growth', 'manufacturing', 'sector', 'gdp', 'recession', 'market', 'country', 'financial', 'global', 'britain', 'investment', 'industry', 'business', 'finance', 'outlook', 'uncertainty', 'export', 'infrastructure']


### LDA Topic Modelling

In [78]:
# Bag-of-words encoding of every tokenised article against the pruned dictionary.
doc_term_matrix = list(map(dictionary.doc2bow, cleaned))

In [136]:
num_topics = 10
vocab_size = len(dictionary)

# Flat 1/V weight for every (topic, term) pair.
prior_eta_mat = np.ones((num_topics, vocab_size)) / vocab_size

# Column indices of the seed words, in rank order (closest first).
one_idx = [dictionary.token2id[x] for x in word_lst]

# Row 0: cancel the flat weight, then give the seed words a weight that
# decreases with rank (len(word_lst) for the closest, 1 for the last).
prior_eta_mat[0, :] -= 1 / vocab_size
prior_eta_mat[0, one_idx] = np.arange(len(word_lst), 0, -1)

# NOTE(review): the training cell passes num_topics=30 and only row 0 of this
# matrix, so num_topics = 10 here does not match — confirm which is intended.

In [102]:
# Look the id up through the dictionary itself rather than through
# dictionary.id2token: gensim fills id2token lazily, so direct access can
# raise KeyError on a dictionary that has not been indexed yet.
dictionary[3734]

'tariff'

In [None]:
print("Training Start", datetime.datetime.now())
# random_state pins the model's initialisation; without it the starting point
# depends on how much of NumPy's global RNG earlier cells have consumed.
# NOTE(review): only row 0 of prior_eta_mat is passed, as a 1-D eta, and
# num_topics is 30 here versus 10 in the prior cell — confirm this is intended.
ldamodel = ldamulticore.LdaMulticore(
    doc_term_matrix,
    id2word=dictionary,
    workers=20,
    num_topics=30,
    eta=prior_eta_mat[0, :],
    passes=100,
    random_state=42,
)
print("Training End", datetime.datetime.now())

Training Start 2019-10-16 01:47:59.125181


In [139]:
# Display up to 20 topics with their 10 highest-weighted words
# (the method's default arguments, spelled out).
ldamodel.print_topics(num_topics=20, num_words=10)

[(24,
  '0.022*"johnson" + 0.022*"brexit" + 0.019*"mp" + 0.017*"minister" + 0.015*"would" + 0.015*"labour" + 0.014*"may" + 0.014*"said" + 0.012*"prime" + 0.011*"parliament"'),
 (4,
  '0.020*"say" + 0.015*"like" + 0.012*"people" + 0.011*"one" + 0.011*"get" + 0.010*"dont" + 0.010*"time" + 0.010*"think" + 0.009*"thing" + 0.009*"would"'),
 (12,
  '0.014*"fashion" + 0.010*"brand" + 0.008*"twitter" + 0.008*"new" + 0.007*"make" + 0.007*"pinterest" + 0.007*"look" + 0.007*"one" + 0.007*"also" + 0.007*"store"'),
 (28,
  '0.152*"animal" + 0.133*"dog" + 0.080*"cat" + 0.068*"stone" + 0.044*"pet" + 0.019*"owner" + 0.017*"crystal" + 0.013*"human" + 0.006*"farmer" + 0.006*"good"'),
 (5,
  '0.018*"school" + 0.015*"people" + 0.015*"health" + 0.014*"child" + 0.012*"university" + 0.010*"student" + 0.010*"year" + 0.009*"said" + 0.007*"care" + 0.007*"service"'),
 (11,
  '0.039*"letter" + 0.033*"guardian" + 0.017*"email" + 0.016*"reader" + 0.015*"click" + 0.012*"share" + 0.011*"like" + 0.011*"well" + 0.011*"