In [1]:
import json
import pandas as pd
import random
import numpy as np
from tqdm.auto import tqdm
import csv
from collections import defaultdict
import re
import ast

In [2]:
negative_sampling_ratio = 5
least_fav_topic_count = 4
min_word_freq = 1
embedding_size = 300
val_split_pct = 0.2

In [3]:
!ls ../data/

filtered_train.csv           tokenized_vectors.json
final_embeddings.csv         tokenized_vectors_train.json
final_embeddings.npy         tokenized_vectors_val.json
glove.6B.300d.txt            train_data.csv
glove.6B.300d.txt.zip        user_least_fav.json
glove.840B.300d.txt          user_topic_dist.json
glove.840B.300d.txt.zip      word2int.csv
prepared_data.csv            word2int.json
prepared_data_tokenized.csv


In [4]:
spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\x10', '\x7f', '\x9d', '\xad', '\xa0', '\u202f']

def remove_space(text):
    for space in spaces:
        text = text.replace(space, ' ')
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [5]:
df = pd.read_csv('../data/filtered_train.csv')

In [6]:
df.head()

Unnamed: 0,id,id.1,pred_class,user_favorite,title
0,18322229,29661868,News and Politics,0-_-0,Promise of Analog AI Feeds Neural Net Hardware...
1,29978099,27894903,Lifestyle,0-_-0,Show HN: WrittenRealms – a modern platform for...
2,16534781,27559017,News and Politics,0-_-0,Advancing AI theory with first-principles unde...
3,26271851,26767441,Technology,0-_-0,A common mistake when NumPy’s RNG with PyTorch
4,22151396,26371052,Culture and Arts,0-_-0,Problems with Eric Weinstein's “Geometric Unity”


In [7]:
df['title'] = (df['title']
               .str.lower()
               .apply(lambda x: re.sub("""(?<=\w)([!?,.-:/"/'])""", r' \1 ', x))
               .apply(remove_space)
               .str.strip())

In [8]:
df.head()

Unnamed: 0,id,id.1,pred_class,user_favorite,title
0,18322229,29661868,News and Politics,0-_-0,promise of analog ai feeds neural net hardware...
1,29978099,27894903,Lifestyle,0-_-0,show hn : writtenrealms – a modern platform fo...
2,16534781,27559017,News and Politics,0-_-0,advancing ai theory with first-principles unde...
3,26271851,26767441,Technology,0-_-0,a common mistake when numpy’s rng with pytorch
4,22151396,26371052,Culture and Arts,0-_-0,problems with eric weinstein ' s “geometric un...


In [9]:
df = df.drop(['id', 'id.1'], axis=1)

In [10]:
user_dist = (df
             .groupby('user_favorite')['pred_class']
             .value_counts(normalize=True)
             .reset_index(name='dist')
             .pivot(index='user_favorite', columns='pred_class', values='dist')
             .fillna(0.)
            )

In [11]:
with open('../data/user_topic_dist.json', 'w') as fp:
    json.dump(user_dist.T.to_dict(), fp)

In [12]:
least_fav = pd.melt(
    user_dist.reset_index(),
    id_vars=['user_favorite'],
    value_vars=['Business', 'Culture and Arts', 'Education', 'Entertainment', 'Lifestyle', 'News and Politics', 'Sports', 'Technology']
)
least_fav = least_fav.sort_values(by='value', ascending=False)

In [13]:
least_fav = least_fav.groupby('user_favorite')['pred_class'].apply(list).apply(lambda x: x[-least_fav_topic_count:])

In [14]:
with open('../data/user_least_fav.json', 'w') as fp:
    json.dump(least_fav.to_dict(), fp)

In [15]:
df.head()

Unnamed: 0,pred_class,user_favorite,title
0,News and Politics,0-_-0,promise of analog ai feeds neural net hardware...
1,Lifestyle,0-_-0,show hn : writtenrealms – a modern platform fo...
2,News and Politics,0-_-0,advancing ai theory with first-principles unde...
3,Technology,0-_-0,a common mistake when numpy’s rng with pytorch
4,Culture and Arts,0-_-0,problems with eric weinstein ' s “geometric un...


### sampling

In [16]:
user_titles = df.groupby('user_favorite')['title'].apply(list).reset_index()
final_data = []

for i, row in tqdm(user_titles.head(200).iterrows(), total=user_titles.shape[0]):
    pairs = []
    user = row['user_favorite']
    titles = row['title']
    user_least_fav = least_fav[user]
    
    
    lfav_titles = df.loc[(df['user_favorite'] != user) & (df['pred_class'].isin(user_least_fav)), 'title'].sample(frac=1.)
    lfav_titles_len = lfav_titles.shape[0]
    
    if lfav_titles_len < negative_sampling_ratio:
        continue
    
    for title_i, title in enumerate(titles):
        pair = [[title]]
        title_is = title_i % lfav_titles_len
        title_ie = title_is + negative_sampling_ratio
        browsed_news = titles[:title_i] + titles[title_i+1:]
        browsed_news = random.sample(browsed_news, len(browsed_news))
        
        pair[0].extend(lfav_titles.iloc[title_is:title_ie].tolist())
        pair.append(browsed_news)
        pairs.append(pair)
        
    final_data.append({'user': user, 'candidate_news': pairs})

  0%|          | 0/89144 [00:00<?, ?it/s]

In [17]:
final_data = pd.DataFrame(final_data).explode('candidate_news')

In [18]:
final_data['browsed_news'] = final_data['candidate_news'].apply(lambda x: x[1])
final_data['candidate_news'] = final_data['candidate_news'].apply(lambda x: x[0])

In [19]:
final_data.to_csv('../data/prepared_data.csv', header=True, index=False)

In [20]:
final_data = pd.read_csv('../data/prepared_data.csv')
final_data['candidate_news'] = final_data['candidate_news'].apply(ast.literal_eval)
final_data['browsed_news'] = final_data['browsed_news'].apply(ast.literal_eval)

### word2vec

In [21]:
word_freq = defaultdict(int)
word2int = {}

for titles in final_data['candidate_news']:
    for title in titles:
        for word in title.split():
            word_freq[word] += 1

for word in word_freq:
    if word_freq[word] >= min_word_freq:
        word2int[word] = len(word2int) + 1

In [22]:
glove_path = '../data/glove.6B.300d.txt'

In [23]:
source_embedding = pd.read_table(glove_path, index_col=0, sep=' ', header=None, quoting=csv.QUOTE_NONE, names=range(embedding_size))
source_embedding = source_embedding.reset_index()
source_embedding.columns = ['word'] + list(range(embedding_size))

In [24]:
word2int = pd.DataFrame(word2int.items(), columns=['word', 'index'])

In [25]:
final_embeddings = word2int.merge(source_embedding, how='left', on='word')
final_embeddings.shape

(12045, 302)

In [26]:
orig_missed_embeddings = final_embeddings.loc[final_embeddings[0].isnull(), ['word', 'index']]
final_embeddings = final_embeddings.loc[~final_embeddings[0].isnull()]

missed_embeddings = pd.DataFrame(
    data=np.random.normal(size=(len(orig_missed_embeddings), embedding_size)),
    index=orig_missed_embeddings['word']
).reset_index()
missed_embeddings = missed_embeddings.merge(orig_missed_embeddings, how='left', on='word')

final_embeddings = pd.concat([final_embeddings, missed_embeddings])
final_embeddings.shape

(12045, 302)

In [27]:
final_embeddings = final_embeddings.drop('word', axis=1)

In [28]:
final_embeddings = final_embeddings.sort_values(by='index').drop('index', axis=1).values
final_embeddings.shape

(12045, 300)

In [29]:
np.save('../data/final_embeddings.npy', final_embeddings)

word2int = word2int.set_index('word')['index'].to_dict()
with open('../data/word2int.json', 'w') as fp:
    json.dump(word2int, fp)

### tokenize titles

In [30]:
def tokenize(text, word2int):
    res = []
    for word in text.split():
        if word in word2int:
            res.append(word2int[word])
    return res

In [31]:
final_data['candidate_news'] = final_data['candidate_news'].apply(lambda x: [tokenize(word, word2int) for word in x])
final_data['browsed_news'] = final_data['browsed_news'].apply(lambda x: [tokenize(word, word2int) for word in x])

In [32]:
final_data.head()

Unnamed: 0,user,candidate_news,browsed_news
0,0-_-0,"[[1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13,...","[[99, 72, 100, 101, 102, 103, 104, 105], [192,..."
1,0-_-0,"[[51, 52, 53, 54, 55, 13, 56, 57, 58, 59, 60],...","[[256, 257, 102, 103, 78, 258, 77, 259, 260, 5..."
2,0-_-0,"[[70, 4, 71, 72, 73, 74, 2, 75, 6, 76], [30, 3...","[[107, 291, 292, 293, 294, 48, 295, 296, 297],..."
3,0-_-0,"[[13, 83, 84, 85, 86, 87, 72, 88], [13, 35, 27...","[[70, 4, 71, 72, 73, 74, 2, 75, 6, 76], [106, ..."
4,0-_-0,"[[99, 72, 100, 101, 102, 103, 104, 105], [42, ...","[[192, 193, 194, 2, 111, 195, 26, 17, 27, 170,..."


In [33]:
final_data = final_data.sample(frac=1.)
final_data = final_data.sort_values(by='user')

In [34]:
final_data['user_ix'] = final_data.groupby('user').ngroup()

In [35]:
unique_users = final_data['user_ix'].max() + 1
val_split = int(unique_users * (1 - val_split_pct))

final_data_train = final_data.loc[final_data['user_ix'] < val_split].reset_index(drop=True).drop('user_ix', axis=1)
final_data_val = final_data.loc[final_data['user_ix'] >= val_split].reset_index(drop=True).drop('user_ix', axis=1)

In [36]:
with open('../data/tokenized_vectors_train.json', 'w') as fp:
    json.dump(final_data_train.T.to_dict(), fp)
    
with open('../data/tokenized_vectors_val.json', 'w') as fp:
    json.dump(final_data_val.T.to_dict(), fp)

In [39]:
final_data_train.T.to_dict()[1530]

{'user': '0x210f',
 'candidate_news': [[6645, 6646],
  [1587, 93, 122],
  [10, 6642, 918, 2, 30, 1321, 658],
  [365, 252, 264, 619, 6644],
  [74, 10, 2262, 184, 3734, 93, 183],
  [107, 212, 3871, 279, 212, 320]],
 'browsed_news': [[246, 820, 48, 10, 2209, 889, 548, 170, 28, 64, 29],
  [5559, 2, 1105, 548, 170, 170, 62, 29],
  [10, 1798, 579],
  [51, 52, 53, 95, 10, 311, 1696, 1637, 48, 346, 114],
  [51, 52, 53, 6809, 55, 3038, 58, 4566],
  [246, 5899, 188, 48, 2590],
  [2506, 27, 63, 17, 53, 10, 163, 5769, 39, 5770, 1900, 2, 10, 5771, 5772],
  [2607, 183, 901],
  [212, 1862, 6436, 58, 314, 1800, 6325, 1763, 706, 6437],
  [217, 103, 200, 871, 872, 1508, 267, 93, 2060],
  [989, 252, 266, 2090, 381, 27, 17, 17, 1163, 2091],
  [106, 52, 53, 266, 109, 774, 13, 1382, 114],
  [675, 61, 705, 445, 26, 17, 27, 170, 29],
  [747, 5915],
  [5726, 706, 2487, 71, 26, 17, 27, 64, 29],
  [222, 6404],
  [2907, 53, 2908, 2373, 2909, 2910],
  [177, 102, 103, 188, 48, 2350, 954],
  [279, 140, 1305, 600, 68