In [15]:
import json
import pandas as pd
import random
import numpy as np
from tqdm.auto import tqdm
import csv
from collections import defaultdict
import re
import ast

In [16]:
negative_sampling_ratio = 5
least_fav_topic_count = 4
min_word_freq = 1
embedding_size = 300

In [17]:
!ls ../data/

datamodule.py               glove.840B.300d.txt.zip
dataset.py                  prepared_data.csv
filtered_train.csv          prepared_data_tokenized.csv
final_embeddings.csv        train_data.csv
final_embeddings.npy        user_least_fav.json
glove.6B.300d.txt           user_topic_dist.json
glove.6B.300d.txt.zip       word2int.csv
glove.840B.300d.txt         word2int.json


In [4]:
spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\x10', '\x7f', '\x9d', '\xad', '\xa0', '\u202f']

def remove_space(text):
    for space in spaces:
        text = text.replace(space, ' ')
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [5]:
df = pd.read_csv('../data/filtered_train.csv')

In [6]:
df.head()

Unnamed: 0,id,id.1,pred_class,user_favorite,title
0,18322229,29661868,News and Politics,0-_-0,Promise of Analog AI Feeds Neural Net Hardware...
1,29978099,27894903,Lifestyle,0-_-0,Show HN: WrittenRealms – a modern platform for...
2,16534781,27559017,News and Politics,0-_-0,Advancing AI theory with first-principles unde...
3,26271851,26767441,Technology,0-_-0,A common mistake when NumPy’s RNG with PyTorch
4,22151396,26371052,Culture and Arts,0-_-0,Problems with Eric Weinstein's “Geometric Unity”


In [7]:
df['title'] = (df['title']
               .str.lower()
               .apply(lambda x: re.sub("""(?<=\w)([!?,.-:/"/'])""", r' \1 ', x))
               .apply(remove_space)
               .str.strip())

In [8]:
df.head()

Unnamed: 0,id,id.1,pred_class,user_favorite,title
0,18322229,29661868,News and Politics,0-_-0,promise of analog ai feeds neural net hardware...
1,29978099,27894903,Lifestyle,0-_-0,show hn : writtenrealms – a modern platform fo...
2,16534781,27559017,News and Politics,0-_-0,advancing ai theory with first-principles unde...
3,26271851,26767441,Technology,0-_-0,a common mistake when numpy’s rng with pytorch
4,22151396,26371052,Culture and Arts,0-_-0,problems with eric weinstein ' s “geometric un...


In [9]:
df = df.drop(['id', 'id.1'], axis=1)

In [10]:
user_dist = (df
             .groupby('user_favorite')['pred_class']
             .value_counts(normalize=True)
             .reset_index(name='dist')
             .pivot(index='user_favorite', columns='pred_class', values='dist')
             .fillna(0.)
            )

In [11]:
with open('../data/user_topic_dist.json', 'w') as fp:
    json.dump(user_dist.T.to_dict(), fp)

In [12]:
least_fav = pd.melt(
    user_dist.reset_index(),
    id_vars=['user_favorite'],
    value_vars=['Business', 'Culture and Arts', 'Education', 'Entertainment', 'Lifestyle', 'News and Politics', 'Sports', 'Technology']
)
least_fav = least_fav.sort_values(by='value', ascending=False)

In [13]:
least_fav = least_fav.groupby('user_favorite')['pred_class'].apply(list).apply(lambda x: x[-least_fav_topic_count:])

In [14]:
with open('../data/user_least_fav.json', 'w') as fp:
    json.dump(least_fav.to_dict(), fp)

In [15]:
df.head()

Unnamed: 0,pred_class,user_favorite,title
0,News and Politics,0-_-0,promise of analog ai feeds neural net hardware...
1,Lifestyle,0-_-0,show hn : writtenrealms – a modern platform fo...
2,News and Politics,0-_-0,advancing ai theory with first-principles unde...
3,Technology,0-_-0,a common mistake when numpy’s rng with pytorch
4,Culture and Arts,0-_-0,problems with eric weinstein ' s “geometric un...


### sampling

In [16]:
user_titles = df.groupby('user_favorite')['title'].apply(list).reset_index()
final_data = []

for i, row in tqdm(user_titles.head(200).iterrows(), total=user_titles.shape[0]):
    pairs = []
    user = row['user_favorite']
    titles = row['title']
    user_least_fav = least_fav[user]
    
    
    lfav_titles = df.loc[(df['user_favorite'] != user) & (df['pred_class'].isin(user_least_fav)), 'title'].sample(frac=1.)
    lfav_titles_len = lfav_titles.shape[0]
    
    if lfav_titles_len < negative_sampling_ratio:
        continue
    
    for title_i, title in enumerate(titles):
        pair = [[title]]
        title_is = title_i % lfav_titles_len
        title_ie = title_is + negative_sampling_ratio
        browsed_news = titles[:title_i] + titles[title_i+1:]
        browsed_news = random.sample(browsed_news, len(browsed_news))
        
        pair[0].extend(lfav_titles.iloc[title_is:title_ie].tolist())
        pair.append(browsed_news)
        pairs.append(pair)
        
    final_data.append({'user': user, 'candidate_news': pairs})

  0%|          | 0/89144 [00:00<?, ?it/s]

In [17]:
final_data = pd.DataFrame(final_data).explode('candidate_news')

In [18]:
final_data['browsed_news'] = final_data['candidate_news'].apply(lambda x: x[1])
final_data['candidate_news'] = final_data['candidate_news'].apply(lambda x: x[0])

In [19]:
final_data.to_csv('../data/prepared_data.csv', header=True, index=False)

In [74]:
final_data = pd.read_csv('../data/prepared_data.csv')
final_data['candidate_news'] = final_data['candidate_news'].apply(ast.literal_eval)
final_data['browsed_news'] = final_data['browsed_news'].apply(ast.literal_eval)

### word2vec

In [75]:
word_freq = defaultdict(int)
word2int = {}

for titles in final_data['candidate_news']:
    for title in titles:
        for word in title.split():
            word_freq[word] += 1

for word in word_freq:
    if word_freq[word] >= min_word_freq:
        word2int[word] = len(word2int) + 1

In [76]:
glove_path = '../data/glove.6B.300d.txt'

In [77]:
source_embedding = pd.read_table(glove_path, index_col=0, sep=' ', header=None, quoting=csv.QUOTE_NONE, names=range(embedding_size))
source_embedding = source_embedding.reset_index()
source_embedding.columns = ['word'] + list(range(embedding_size))

In [78]:
word2int = pd.DataFrame(word2int.items(), columns=['word', 'index'])

In [79]:
final_embeddings = word2int.merge(source_embedding, how='left', on='word')
final_embeddings.shape

(11944, 302)

In [80]:
orig_missed_embeddings = final_embeddings.loc[final_embeddings[0].isnull(), ['word', 'index']]
final_embeddings = final_embeddings.loc[~final_embeddings[0].isnull()]

missed_embeddings = pd.DataFrame(
    data=np.random.normal(size=(len(orig_missed_embeddings), embedding_size)),
    index=orig_missed_embeddings['word']
).reset_index()
missed_embeddings = missed_embeddings.merge(orig_missed_embeddings, how='left', on='word')

final_embeddings = pd.concat([final_embeddings, missed_embeddings])
final_embeddings.shape

(11944, 302)

In [81]:
final_embeddings = final_embeddings.drop('word', axis=1)

In [82]:
final_embeddings = final_embeddings.sort_values(by='index').drop('index', axis=1).values
final_embeddings.shape

(11944, 300)

In [83]:
np.save('../data/final_embeddings.npy', final_embeddings)

word2int = word2int.set_index('word')['index'].to_dict()
with open('../data/word2int.json', 'w') as fp:
    json.dump(word2int, fp)

### tokenize titles

In [84]:
def tokenize(text, word2int):
    res = []
    for word in text.split():
        if word in word2int:
            res.append(word2int[word])
    return res

In [85]:
final_data['candidate_news'] = final_data['candidate_news'].apply(lambda x: [tokenize(word, word2int) for word in x])
final_data['browsed_news'] = final_data['browsed_news'].apply(lambda x: [tokenize(word, word2int) for word in x])

In [86]:
final_data.head()

Unnamed: 0,user,candidate_news,browsed_news
0,0-_-0,"[[1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13,...","[[52, 75, 76, 77, 78, 79, 37, 80], [151, 152, ..."
1,0-_-0,"[[49, 33, 14, 50, 51, 52, 53, 54, 55, 56, 57],...","[[239, 240, 241, 242, 2, 243, 28, 29, 12, 244,..."
2,0-_-0,"[[63, 4, 64, 37, 65, 66, 2, 67, 6, 68], [22, 2...","[[94, 123, 124, 215, 216], [49, 33, 14, 6, 134..."
3,0-_-0,"[[52, 75, 76, 77, 78, 79, 37, 80], [32, 33, 14...","[[32, 33, 14, 34, 127, 19, 115, 250, 251, 55, ..."
4,0-_-0,"[[84, 37, 85, 86, 87, 88, 89, 90], [32, 33, 14...","[[49, 33, 14, 50, 51, 52, 53, 54, 55, 56, 57],..."


In [87]:
final_data['tokenized_vectors'] = final_data[['candidate_news', 'browsed_news']].apply(lambda x: [x[0], x[1]], axis=1)

In [91]:
final_data = final_data[['user', 'tokenized_vectors']]

In [92]:
final_data = final_data.set_index('user')['tokenized_vectors'].to_dict()
with open('../data/tokenized_vectors.json', 'w') as fp:
    json.dump(final_data, fp)

In [69]:
final_data.to_csv('../data/tokenized_vectors.csv', header=True, index=False)

In [70]:
final_data = pd.read_csv('../data/prepared_data_tokenized.csv')

In [73]:
final_data.head()['candidate_news'][0]

'[[1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 13, 14, 15, 16, 17, 18], [19, 20, 20, 21], [22, 23, 24, 25, 26, 27, 28, 29, 12, 30, 31], [32, 33, 14, 34, 35, 36, 35, 37, 38, 39, 40, 41], [32, 33, 14, 42, 16, 43, 44, 45, 46, 16, 47, 48, 41]]'

In [8]:
final_data.iloc[0]['candidate_news']

'[[1], [2], [3], [4], [5], [6], [7], [8], [9], [], [5], [10], [], [11], [12], [11], [13], [5], [14], [], [11], [7], [], [10], [9], [9], [15], [8], [], [12], [9], [16], [4], [11], [13], [], [12], [9], [17], [], [18], [11], [4], [15], [19], [11], [4], [9], [], [3], [7], [3], [9], [13], [7], [12], [9], [2], [20], [], [2], [8], [17], [11], [12], [10], [5], [4], [15], [], [21], [8], [], [22], [], [23], [], [23], [], [24], [], [7], [12], [17], [4], [5], [15], [16], [21], [17], [7], [5], [12], [], [17], [5], [], [21], [5], [6], [3], [16], [17], [9], [4], [], [12], [9], [17], [19], [5], [4], [25], [7], [12], [14], [2], [20], [], [2], [17], [18], [9], [], [3], [4], [11], [14], [6], [11], [17], [7], [21], [], [3], [4], [11], [14], [6], [11], [17], [7], [21], [], [3], [4], [5], [14], [4], [11], [6], [6], [9], [4], [2], [20], [], [2], [9], [26], [9], [8], [], [18], [7], [12], [17], [], [11], [17], [], [18], [7], [15], [15], [9], [12], [], [6], [9], [12], [17], [11], [13], [27], [18], [9], [11], [1

In [5]:
final_data.ilo

Unnamed: 0,user,candidate_news,browsed_news
0,0-_-0,"[[1], [2], [3], [4], [5], [6], [7], [8], [9], ...","[[1], [2], [11], [], [21], [5], [6], [6], [5],..."
1,0-_-0,"[[1], [2], [8], [18], [5], [19], [], [18], [12...","[[1], [2], [13], [9], [37], [7], [21], [11], [..."
2,0-_-0,"[[1], [2], [11], [15], [38], [11], [12], [21],...","[[1], [2], [15], [11], [17], [11], [], [8], [2..."
3,0-_-0,"[[1], [2], [11], [], [21], [5], [6], [6], [5],...","[[1], [2], [11], [8], [25], [], [18], [12], []..."
4,0-_-0,"[[1], [40], [3], [4], [5], [33], [13], [9], [6...","[[1], [2], [8], [18], [5], [19], [], [18], [12..."
