In [1]:
from gensim import models, corpora
import numpy as np
import os
import json
import pickle as cp 
import nltk
from nltk.corpus import stopwords as sw
from itertools import chain
from gensim.test.utils import datapath
from gensim import utils



### Gensim preliminaries

In [2]:
# English stop words from NLTK (needs the NLTK 'stopwords' corpus to be available locally).
# Used by the LDA training cell to drop stop words before building the bag-of-words corpus.
stoplist = sw.words('english')

In [3]:
# Root directory of the article dump. The cells below read the sub-directory
# '2006-2012' for training and one sub-directory per year 2013..2020 for testing;
# every file inside is read line by line, each line being a JSON object with a 'text' field.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
corpus_path = '/data1/sunchak/news_dataset/toi/articles/'

In [4]:
# gensim Dictionary loaded from disk; used for doc2bow() and as id2word of the LDA model.
# NOTE(review): 'toi.dict' is not created by any cell visible in this notebook — it must already exist.
dictionary = corpora.Dictionary.load('toi.dict')

In [None]:
# Load the saved LDA model. The file is written by the LDA training cell further
# down (lda.save('toi.LDA.lda')), so that cell must have been run at least once before.
lda = models.ldamodel.LdaModel.load('toi.LDA.lda')

In [None]:
# Load the saved Word2Vec model. The file is written by the Word2Vec training cell
# further down (w2vmodel.save('toi.W2V.w2v')), so that cell must have been run at least once before.
w2v = models.Word2Vec.load('toi.W2V.w2v')

### Computing the embeddings

In [None]:
def daily_w2v_emb(day_docs):
    """Return the mean Word2Vec vector over all in-vocabulary tokens of one day.

    Parameters
    ----------
    day_docs : iterable of iterable of str
        Tokenised articles of a single day.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of every token found in the global
        ``w2v`` model. If no token of the day is in the vocabulary, a vector
        of NaN with length ``w2v.wv.vector_size`` is returned, so the result
        always has the same shape and can be stacked with the other days.

    Notes
    -----
    Out-of-vocabulary tokens are skipped. Their number is reported once per
    call instead of printing one line per token.
    """
    keyed_vectors = w2v.wv
    day_w2v = []
    n_missing = 0
    for doc in day_docs:
        for wrd in doc:
            try:
                day_w2v.append(keyed_vectors[wrd])
            except KeyError:
                # Only "token not in vocabulary" is expected here; anything
                # else is a real error and should surface.
                n_missing += 1
    if n_missing:
        print("words not found: ", n_missing)
    if not day_w2v:
        # Taking the mean of an empty array would emit a RuntimeWarning and
        # return a scalar NaN; keep the "missing" marker but with vector shape.
        return np.full(keyed_vectors.vector_size, np.nan)
    return np.array(day_w2v).mean(axis=0)

In [None]:
def daily_lda_embedding(day_docs):
    '''
    Return the raw per-article topic probabilities of one day, without
    aggregating them across articles.

    Every article in `day_docs` (a list of token lists) is converted to a
    bag-of-words with the global `dictionary` and passed to the global `lda`
    model with minimum_probability=0.0. The result holds one list of
    probabilities per article; an article whose topic inference raises is
    reported on stdout and left out of the result.
    '''
    bows = [dictionary.doc2bow(tokens) for tokens in day_docs]

    topic_rows = []
    for bow in bows:
        try:
            topic_rows.append(
                [pair[1] for pair in lda.get_document_topics(bow, minimum_probability=0.0)]
            )
        except Exception as error:
            print("An exception occurred in the function:", error)
    return topic_rows

In [None]:
### For training data
# Builds one Word2Vec embedding per daily file of the training period.
# The LDA part is currently switched off (the daily_lda_embedding call is
# commented out), so emb_lda_tr stays empty unless it is restored from the
# pickle below, and the max/avg lists are then empty as well.

emb_lda_tr = []
emb_w2v_tr = []
dates = []  # not filled in this cell
yr = '2006-2012'

for k in sorted(os.listdir(os.path.join(corpus_path,str(yr)))):
    print(k)
    # One JSON article per line; `with` closes the handle as soon as the day is read.
    with open(os.path.join(corpus_path,str(yr),k)) as day_file:
        day_docs = [nltk.word_tokenize(json.loads(d)['text'].lower()) for d in day_file]

    #daily_topics = daily_lda_embedding(day_docs)
    daily_w2v = daily_w2v_emb(day_docs)

    #emb_lda_tr.append(daily_topics)
    emb_w2v_tr.append(daily_w2v)

## cp.dump(emb_lda_tr,open('raw_lda_train.list','wb'))
## emb_lda_tr = cp.load(open('raw_lda_train.list','rb'))

### max
# Per day: element-wise maximum over the topic vectors of that day's articles.
lda_train_max = [np.array(daily_docs).max(axis=0) for daily_docs in emb_lda_tr]

### avg
# Per day: element-wise mean over the topic vectors of that day's articles.
lda_train_avg = [np.array(daily_docs).mean(axis=0) for daily_docs in emb_lda_tr]

# np.save('toi.maxLDA_train.npy',np.array(lda_train_max))
# np.save('toi.avgLDA_train.npy',np.array(lda_train_avg))

# np.save('toi.w2v_train.npy',emb_w2v_tr)



In [None]:
### For testing data
# Builds, for every daily file of the test years 2013..2020, the raw per-article
# LDA topic vectors and the mean Word2Vec vector, aggregates the LDA vectors per
# day (max and mean) and saves the three test arrays.

emb_lda_tst = []
emb_w2v_tst = []
dates = []  # not filled in this cell
test_years = range(2013,2021)
for yr in test_years:
    for k in sorted(os.listdir(os.path.join(corpus_path,str(yr)))):
        # One JSON article per line; `with` closes the handle as soon as the day is read.
        with open(os.path.join(corpus_path,str(yr),k)) as day_file:
            day_docs = [nltk.word_tokenize(json.loads(d)['text'].lower()) for d in day_file]

        daily_topics = daily_lda_embedding(day_docs)
        daily_w2v = daily_w2v_emb(day_docs)

        emb_lda_tst.append(daily_topics)
        emb_w2v_tst.append(daily_w2v)

## cp.dump(emb_lda_tst,open('raw_lda_test.list','wb'))
## emb_lda_tst = cp.load(open('raw_lda_test.list','rb'))

### max
# Per day: element-wise maximum over the topic vectors of that day's articles.
# (The original appended to the undefined name `lda_train`.)
lda_test_max = [np.array(daily_docs).max(axis=0) for daily_docs in emb_lda_tst]

### avg
# Per day: element-wise mean over the topic vectors of that day's articles.
# (The original iterated over the undefined name `raw_lda`.)
lda_test_avg = [np.array(daily_docs).mean(axis=0) for daily_docs in emb_lda_tst]

# Save the TEST arrays under test filenames; the original re-saved the training
# arrays here and would have overwritten the training files.
np.save('toi.maxLDA_test.npy',np.array(lda_test_max))
np.save('toi.avgLDA_test.npy',np.array(lda_test_avg))

np.save('toi.w2v_test.npy',emb_w2v_tst)

### Training

#### LDA

In [21]:
# Train a 250-topic LDA model on all articles of the 2006-2012 directory and
# save it as 'toi.LDA.lda' (the file loaded near the top of the notebook).

# Set lookup for the per-token stop-word test; same filter as testing against the list.
stop_set = set(stoplist)

all_docs = []
for k in sorted(os.listdir(os.path.join(corpus_path,'2006-2012'))):
    # One JSON article per line; `with` closes the handle as soon as the file is read.
    with open(os.path.join(corpus_path,'2006-2012',k)) as day_file:
        all_docs += [json.loads(d)['text'] for d in day_file]

# Lower-case, tokenise and drop stop words, then convert to bag-of-words.
texts = [[word for word in nltk.word_tokenize(document.lower()) if word not in stop_set] for document in all_docs]
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=250)

lda.save('toi.LDA.lda')



#### Word2Vec

In [15]:
# Train a 250-dimensional Word2Vec model on the sentences of all 2006-2012
# articles and save it as 'toi.W2V.w2v' (the file loaded near the top of the notebook).

all_docs = []  # despite the name, holds one entry per sentence
for k in sorted(os.listdir(os.path.join(corpus_path,'2006-2012'))):
    # One JSON article per line; `with` closes the handle as soon as the file is read.
    with open(os.path.join(corpus_path,'2006-2012',k)) as day_file:
        for d in day_file:
            all_docs.extend(nltk.sent_tokenize(json.loads(d)['text']))

# simple_preprocess turns each sentence into the token list Word2Vec trains on.
sentences = [utils.simple_preprocess(doc) for doc in all_docs]

w2vmodel = models.Word2Vec(sentences=sentences, vector_size=250, window=5, min_count=10, workers=10)

w2vmodel.save('toi.W2V.w2v')


In [16]:
# Sanity check of the freshly trained embeddings: the five most similar
# vocabulary entries for the combined query 'mumbai' + 'bangalore'.
w2vmodel.wv.most_similar(positive=['mumbai', 'bangalore'], topn=5)

[('hyderabad', 0.768227219581604),
 ('kolkata', 0.7237099409103394),
 ('pune', 0.6968643069267273),
 ('ahmedabad', 0.6388577818870544),
 ('chennai', 0.613448977470398)]