In [1]:
import re
import os
import gensim
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
import json
import time
from collections import Counter
from tqdm import tqdm

In [2]:
trainfilename = '../../data/train.tsv'
validfilename = '../../data/valid.tsv'
testfilename = '../../data/personalized_test.tsv'
docsfilename = '../../data/news.tsv'
stop_words = set(stopwords.words('english'))

In [3]:
WORD_FREQ_THRESHOLD = 3
MAX_CONTENT_LEN = 500
MAX_BODY_LEN = 100
MAX_TITLE_LEN = 16
WORD_EMBEDDING_DIM = 300
MAX_CLICK_LEN = 50

word2freq = {}
word2index = {}

In [4]:
def word_tokenize(sent):
    pat = re.compile(r'[\w]+|[.,!?;|]')
    if isinstance(sent, str):
        return pat.findall(sent.lower())
    else:
        return []

In [5]:
def read_news(filename,filer_num=3):
    """
    Input: Takes news file
    Output: 
        news: newsid_newsdetail map: {'N10000': ["category", ["tokenized", "title", "..."], ["tokenized", "body", "..."] ]
        news_index: newsid_index map {'N10000': 1, 'N10001': 2,...}
        category_dict: category_index map {"cat1": 1, "cat2":2,...}
        word_dict: word_index map {"word1": 3, "word2":4...}
    """
    news={}
    category, subcategory=[], []
    news_index={}
    index=1
    word_cnt=Counter()
    err = 0
    news_data = pd.read_csv(filename, sep='\t')
    news_data.fillna(value=" ", inplace=True)
    for i in tqdm(range(len(news_data))):
        doc_id,vert,_, title, snipplet= news_data.loc[i,:][:5]
        news_index[doc_id]=index
        index+=1

        title = title.lower()
        title = word_tokenize(title)
        snipplet = snipplet.lower()
        snipplet = word_tokenize(snipplet)
        category.append(vert)
        news[doc_id] = [vert,title,snipplet]     
        word_cnt.update(snipplet+title)
    # 0: pad; 1: <sos>; 2: <eos>
    word = [k for k , v in word_cnt.items() if v >= filer_num]
    word_dict = {k:v for k, v in zip(word, range(3,len(word)+3))}
    category=list(set(category))
    category_dict={k:v for k, v in zip(category, range(1,len(category)+1))}

    return news,news_index,category_dict,word_dict

In [6]:
%time news,news_index,category_dict,word_dict = read_news(docsfilename)

100%|████████████████████████████████████████████████| 113762/113762 [00:24<00:00, 4732.05it/s]


CPU times: user 42.7 s, sys: 5.75 s, total: 48.5 s
Wall time: 48.7 s


In [7]:
# min(news_index.keys()), max(news_index.keys())
list(news.values())[0][0]

'sports'

In [8]:
word_dict['unk'] = 0
word_dict['<sos>'] = 1
word_dict['<eos>'] = 2

In [9]:
# TODO: avoid pickle, use parquet or csv instead 
with open('../../data2/dict.pkl', 'wb') as f:
    pickle.dump([news_index,category_dict,word_dict], f)
with open('../../data2/news.pkl', 'wb') as f:
    pickle.dump(news, f)

## get inputs for user encoder

In [11]:

def get_rep_for_userencoder(news,news_index,category_dict,word_dict):
    """
    Input: 
        news: newsid_newsdetail map: {'N10000': ["category", ["tokenized", "title", "..."], ["tokenized", "body", "..."] ]
        news_index: newsid_index map {'N10000': 1, 'N10001': 2,...}
        category_dict: category_index map {"cat1": 1, "cat2":2,...}
        word_dict: word_index map {"word1": 3, "word2":4...}
    Output:
        news_vert: Numerical representation of category of each article. numpy tensor of shape (news_num).
        every article has single catgory hence represented by single number 
        news_title: Numerical representation of all titles. numpy tensor of shape (news_num, MAX_TITLE_LEN)
        news_body: Numerical representation of all news bodies. numpy tensor of shape (news_num, MAX_BODY_LEN)
    """
    news_num=len(news)+1
    news_title=np.zeros((news_num,MAX_TITLE_LEN),dtype='int32')
    news_body=np.zeros((news_num,MAX_BODY_LEN),dtype='int32')
    news_vert=np.zeros((news_num),dtype='int32')
    for key in news:    
        vert,title,body=news[key]
        doc_index=news_index[key]
        # encode category
        news_vert[doc_index] = category_dict[vert]
        
        # encode title
        counter = 0
        for word_id in range(min(MAX_TITLE_LEN,len(title))):
            if title[word_id] in word_dict:
                news_title[doc_index,counter]=word_dict[title[word_id].lower()]
                counter += 1
        
        # encode body
        counter = 0
        for word_id in range(min(MAX_BODY_LEN,len(body))):
            if body[word_id] in word_dict:
                news_body[doc_index,counter]=word_dict[body[word_id].lower()]
                counter += 1
    return news_vert, news_title, news_body

In [12]:
%time news_vert, news_title, news_body = get_rep_for_userencoder(news,news_index,category_dict,word_dict)

CPU times: user 3.22 s, sys: 0 ns, total: 3.22 s
Wall time: 3.22 s


In [13]:
# len(news_vert),len(news_title), len(news_body)
news_title[2]

array([298, 299, 407, 303, 301, 122, 324, 325, 326,   0,   0,   0,   0,
         0,   0,   0], dtype=int32)

In [14]:
np.save('../../data2/news_vert.npy', news_vert)
np.save('../../data2/news_title.npy', news_title)
np.save('../../data2/news_body.npy', news_body)

## get inputs/ targets for seq2seq model

In [15]:
def get_rep_for_seq2seq(news,news_index,word_dict):
    """
    input:
        news: newsid_newsdetail map: {'N10000': ["category", ["tokenized", "title", "..."], ["tokenized", "body", "..."] ]
        news_index: newsid_index map {'N10000': 1, 'N10001': 2,...}
        word_dict: word_index map {"word1": 3, "word2":4...}
    output:
        sources: numpy tensor of shape (news_num,MAX_CONTENT_LEN)
        target_inputs: numpy tensor of shape (news_num,MAX_TITLE_LEN)
        target_outputs: numpy tensor of shape (news_num,MAX_TITLE_LEN) same as target inputs left shifted by one
    """
    news_num=len(news)+1
    sources=np.zeros((news_num,MAX_CONTENT_LEN),dtype='int32')
    target_inputs=np.zeros((news_num,MAX_TITLE_LEN),dtype='int32')
    target_outputs=np.zeros((news_num,MAX_TITLE_LEN),dtype='int32')
    
    for key in tqdm(news):    
        _, title, body = news[key]
        doc_index = news_index[key]
        
        # get numerical indexes of body words from word_dict   
        counter = 0
        for word_id in range(min(MAX_CONTENT_LEN-1,len(body))):
            if body[word_id] in word_dict:
                sources[doc_index,counter]=word_dict[body[word_id].lower()]
                counter += 1
        sources[doc_index,counter] = 2 
        
        # get numerical indexes of title words from word_dict
        target_inputs[doc_index,0] = 1 # set first word as <sos>
        counter = 1
        for word_id in range(min(MAX_TITLE_LEN-1,len(title))):
            if title[word_id] in word_dict:
                target_inputs[doc_index,counter]=word_dict[title[word_id].lower()]
                counter += 1

        # get numerical indexes of title words from word_dict   
        counter = 0
        for word_id in range(min(MAX_TITLE_LEN-1,len(title))):
            if title[word_id] in word_dict:
                target_outputs[doc_index,counter]=word_dict[title[word_id].lower()]
                counter += 1
        target_outputs[doc_index,counter] = 2 # set last target word as <eos>
        
    return sources, target_inputs, target_outputs

In [16]:
%time sources, target_inputs, target_outputs = get_rep_for_seq2seq(news,news_index,word_dict)

100%|███████████████████████████████████████████████| 113762/113762 [00:11<00:00, 10123.65it/s]

CPU times: user 11.1 s, sys: 172 ms, total: 11.3 s
Wall time: 11.2 s





In [23]:
# sources[2]
print(target_inputs[2])
print(target_outputs[2])

[  1 298 299 407 303 301 122 324 325 326   0   0   0   0   0   0]
[298 299 407 303 301 122 324 325 326   2   0   0   0   0   0   0]


In [24]:
np.save('../../data2/sources.npy', sources)
np.save('../../data2/target_inputs.npy', target_inputs)
np.save('../../data2/target_outputs.npy', target_outputs)

## get embedding matrix

In [27]:
def load_matrix(embedding_path,word_dict):
    """
    input: 
        embedding_path: path where model is stored 
        word_dict: word_index map
    output:
        embedding_matrix: tensor of shape (word_dict, 300) embedding corresponding to each vocab word available. 
        if unavailable, random values between 0 and 0.1
        have_word: list of words in vocab which are available in embedding model.
    """
    mu, sigma = 0, 0.1
    embedding_zero = np.zeros((1,300))
    embedding_matrix = np.random.normal(mu, sigma, (len(word_dict)-1, WORD_EMBEDDING_DIM))
    embedding_matrix = np.concatenate((embedding_zero,embedding_matrix))
    have_word=[]
    with open(os.path.join(embedding_path,'glove.840B.300d.txt'),'rb') as f:
        while True:
            l=f.readline()
            if len(l)==0:
                break
            l=l.split()
            word = l[0].decode()
            if word in word_dict:
                index = word_dict[word]
                tp = [float(x) for x in l[1:]]
                embedding_matrix[index]=np.array(tp)
                have_word.append(word)
    return embedding_matrix,have_word

In [29]:
%time embedding_matrix, have_word = load_matrix('../../data',word_dict)

CPU times: user 26.9 s, sys: 4.32 s, total: 31.2 s
Wall time: 35.4 s


In [34]:
embedding_matrix.shape
# len(word_dict)

(141910, 300)

In [35]:
len(word_dict),len(have_word)

(141910, 100875)

In [36]:
np.save('../../data2/embedding_matrix.npy', embedding_matrix)

## get train/ valid/ test examples from user logs

In [38]:
def Doc2ID(doclist,news2id):
    """
    convert news_id list  to index_id list using lookup dictionary news2id 
    """
    return [news2id[i] for i in doclist if i in news2id ]

In [39]:
def PadDoc(doclist):
    """
    pad document list from left side according to MAX_CLICK_LEN 
    """
    if len(doclist) >= MAX_CLICK_LEN:
        return doclist[-MAX_CLICK_LEN:]
    else:
        return [0] * (MAX_CLICK_LEN-len(doclist)) + doclist[:MAX_CLICK_LEN]

In [40]:
def user2dict(users):
    user_set = set(users)
    user_dict = {k:v for k, v in zip(user_set, range(0,len(user_set)))}
    return user_dict

In [37]:
def parse_train_user(filename,news_index):
    """
    input:
        filename: path to train/test/valid filename
        news_index: newsid_index map {'N10000': 1, 'N10001': 2,...}
    output:
        train_users: list of list. [[0padded, click, news, indexes],...]
        train_samples: [userId, [news_id1, news_id2], [1(pos), 0(neg)] 
        
    """
    df = pd.read_csv(filename, sep='\t')
    df.fillna(value=" ", inplace=True)

    # convert news id to index id and perform padding
    df['ClicknewsID'] = df['ClicknewsID'].apply(lambda x: PadDoc(Doc2ID(x.split(),news_index)))

    # convert pos, neg news_id to index_id
    df['pos']  = df['pos'].apply(lambda x: Doc2ID(x.split(),news_index))
    df['neg'] = df['neg'].apply(lambda x: Doc2ID(x.split(),news_index))
    
    pos_neg_lists = []
    
    for userindex, (pos_list, neg_list) in tqdm(enumerate(zip(df['pos'].values.tolist(), df['neg'].values.tolist()))):
        """
        For each user
        1. Shuffle pos and neg list
        2. Ensure positive negative lists are of same length
        """
        if len(pos_list) and len(neg_list):
            # sampling 1 negative sample for 1 pos sample
            min_len = min(len(pos_list), len(neg_list))
            np.random.shuffle(pos_list)
            np.random.shuffle(neg_list)
            for i in range(min_len):
                pos_neg_lists.append([userindex, [pos_list[i],neg_list[i]],[1,0]])
        
    return df['ClicknewsID'].values.tolist(), pos_neg_lists

In [41]:
%time TrainUsers, TrainSamples = parse_train_user(trainfilename, news_index)

400000it [00:04, 97143.54it/s] 

CPU times: user 21.9 s, sys: 1.6 s, total: 23.5 s
Wall time: 24.9 s





In [65]:
# len(TrainSamples[0])
print(len(TrainUsers))
print(len(TrainSamples))
[s for s in TrainSamples[:1]]

400000
763188


[[0, [88304, 90055], [1, 0]]]

In [66]:
with open('../../data2/TrainUsers.pkl', 'wb') as f:
    pickle.dump(TrainUsers, f)
with open('../../data2/TrainSamples.pkl', 'wb') as f:
    pickle.dump(TrainSamples, f)

In [67]:
def parse_valid_user(filename,news_index):
    """
    input: 
        filename: path to validation file
        news_index: newsid_index map {'N10000': 1, 'N10001': 2,...}
    output:
        ValidUsers: list of list. [[0padded, click, news, indexes],...]
        ValidSamples: list of list. [userId, [news_id1, news_id2, ..., news_idn], [pos_neglabel1, pos_neglabel2, ..., pos_neglabeln] 
    """
    df = pd.read_csv(filename, sep='\t')
    df.fillna(value=" ", inplace=True)
    
    df['ClicknewsID'] = df['ClicknewsID'].apply(lambda x: PadDoc(Doc2ID(x.split(),news_index)))
    
    df['pos']  = df['pos'].apply(lambda x: Doc2ID(x.split(),news_index))
    df['neg'] = df['neg'].apply(lambda x: Doc2ID(x.split(),news_index))
    
    pos_neg_lists = []
    for userindex, (pos_list, neg_list) in enumerate(zip(df['pos'].values.tolist(), df['neg'].values.tolist())):
        if len(pos_list) and len(neg_list):
            pos_neg_lists.append([userindex, pos_list+neg_list,[1]*len(pos_list)+[0]*len(neg_list)])
        
    return df['ClicknewsID'].values.tolist(), pos_neg_lists

In [68]:
%time ValidUsers, ValidSamples = parse_valid_user(validfilename,news_index)

CPU times: user 4.81 s, sys: 391 ms, total: 5.2 s
Wall time: 5.27 s


In [76]:
with open('../../data2/ValidUsers.pkl', 'wb') as f:
    pickle.dump(ValidUsers, f)
with open('../../data2/ValidSamples.pkl', 'wb') as f:
    pickle.dump(ValidSamples, f)

In [77]:
def parse_test_user(filename,news_index):
    """
    input: 
        filename: path to test file
        news_index: newsid_index map {'N10000': 1, 'N10001': 2,...}
    output:
        TestUsers:list of list. [[0padded, click, news, indexes],...]
        TestSamples: list of list. [[userindex, pos_id, rewrite_title]]
    """
    df = pd.read_csv(filename, sep='\t')
    
    df['clicknewsID'] = df['clicknewsID'].apply(lambda x: PadDoc(Doc2ID(x.split(','),news_index)))
    
    df['posnewID']  = df['posnewID'].apply(lambda x: Doc2ID(x.split(','),news_index))
    
    df['rewrite_titles'] = df['rewrite_titles'].apply(lambda x: [i.lower() for i in x.split(';;')] )
    
    pos_lists = []
    for userindex, (pos_lis, rewrite_title_lis) in enumerate(zip(df['posnewID'].values.tolist(), df['rewrite_titles'].values.tolist())):
        for pos, rewrite_title in zip(pos_lis, rewrite_title_lis):
            if rewrite_title.strip() == '':
                continue
            else:
                pos_lists.append([userindex, pos, rewrite_title])
    
    return df['clicknewsID'].values.tolist(), pos_lists

In [78]:
%time TestUsers, TestSamples = parse_test_user(testfilename,news_index)

CPU times: user 30.1 ms, sys: 3.02 ms, total: 33.2 ms
Wall time: 32.5 ms


In [80]:
TestSamples[0]

[0,
 14111,
 "legal battle looms over trump epa's rule change of obama's clean power plan rule"]

In [81]:
with open('../../data2/TestUsers.pkl', 'wb') as f:
    pickle.dump(TestUsers, f)
with open('../../data2/TestSamples.pkl', 'wb') as f:
    pickle.dump(TestSamples, f)

In [None]:
TestSamples[0]