In [461]:
import pandas as pd
import re
import json
import random

In [462]:
# Invisible, zero-width, control and non-breaking characters that are turned
# into a plain ASCII space before whitespace is collapsed.
spaces = [
    "\u200b",  # zero width space
    "\u200e",  # left-to-right mark
    "\u202a",  # left-to-right embedding
    "\u202c",  # pop directional formatting
    "\ufeff",  # zero width no-break space / byte order mark
    "\uf0d8",  # private-use code point
    "\u2061",  # function application (invisible operator)
    "\x10",  # data link escape (C0 control)
    "\x7f",  # delete (control)
    "\x9d",  # operating system command (C1 control)
    "\xad",  # soft hyphen
    "\xa0",  # no-break space
    "\u202f",  # narrow no-break space
]


def remove_space(text):
    """Normalise the whitespace of ``text``.

    Every character listed in ``spaces`` is replaced by a regular space,
    runs of whitespace are squeezed into a single space, and leading and
    trailing whitespace is removed.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    for invisible in spaces:
        text = text.replace(invisible, " ")
    collapsed = re.sub(r"\s+", " ", text.strip())
    return collapsed.strip()


def tokenize(text, word2int):
    """Convert ``text`` into a list of vocabulary ids.

    The text is split on whitespace; each word that is a key of
    ``word2int`` is mapped to its value, and words that are not in the
    mapping are silently dropped.

    Args:
        text: whitespace-separated string.
        word2int: mapping from word to id.

    Returns:
        List of ids in the order the known words occur in ``text``.
    """
    return [word2int[token] for token in text.split() if token in word2int]

In [463]:
# Load the raw stories. The encoding is passed explicitly so that decoding
# non-ASCII titles does not depend on the platform's locale default.
with open('data/new_stories.json', 'r', encoding='utf-8') as fp:
    stories = json.load(fp)

In [464]:
# Tabulate the loaded stories; the following cells rely on the 'item_id',
# 'title' and 'url' columns being present.
df = pd.DataFrame(stories)

In [465]:
# Cast ids to str: they are used further down as dictionary keys in the
# exported JSON files.
df['item_id'] = df['item_id'].astype(str)

In [467]:
# Keep only the columns used downstream. The explicit copy makes `df` an
# independent frame, so the column assignments in the following cells do not
# raise SettingWithCopyWarning.
df = df[['item_id', 'title', 'url']].copy()

### tokenization

In [468]:
# Keep the untouched title (it is exported later) before normalising in place.
df['orig_title'] = df['title'].copy()

# Normalise titles: lowercase, surround punctuation that directly follows a
# word character with spaces so it becomes its own token, then replace
# invisible characters and collapse whitespace via remove_space.
# NOTE(review): inside the character class, `.-:` is a range ('.' through
# ':'), so it also matches '/' and the digits 0-9, while a literal '-' is NOT
# matched. Presumably unintended, but word2int was likely built with the same
# pattern - confirm before changing it.
df["title"] = (
    df["title"]
    .str.lower()
    .apply(lambda x: re.sub(r"""(?<=\w)([!?,.-:/"/'])""", r" \1 ", x))
    .apply(remove_space)
    .str.strip()
)

In [469]:
# Load the word -> id vocabulary. The encoding is passed explicitly so that
# non-ASCII vocabulary entries decode the same way on every platform.
with open("data/word2int.json", 'r', encoding='utf-8') as fp:
    word2int = json.load(fp)

In [470]:
# Replace each normalised title string by its list of vocabulary ids
# (out-of-vocabulary words are dropped by tokenize).
df["title"] = df["title"].apply(tokenize, args=(word2int,))

In [471]:
# Drop stories whose title has no in-vocabulary token. The explicit copy
# detaches the result from the unfiltered frame, so adding columns in later
# cells does not raise SettingWithCopyWarning.
df = df.loc[df['title'].apply(len) > 0].copy()

### topic classification

In [472]:
# Topic labels a story can receive.
CLASSES = [
    'Education',
    'Business',
    'Sports',
    'Technology',
    'News and Politics',
    'Lifestyle',
    'Culture and Arts',
    'Entertainment',
]

# Placeholder "classifier": create the column, then overwrite every row with
# a label drawn uniformly at random from CLASSES. No seed is set here, so the
# assignment differs from run to run.
df['topic'] = 'Sports'
df['topic'] = df['topic'].apply(lambda _: random.sample(CLASSES, k=1)[0])

In [473]:
# Build {item_id: {'orig_title': ..., 'topic': ..., 'url': ...}} by indexing
# on item_id and transposing, so that each story becomes one dict entry.
topic_data = (
    df[['item_id', 'orig_title', 'topic', 'url']]
    .set_index('item_id')
    .T
    .to_dict()
)

# Persist the per-story topic table.
with open('data/topic_data_pred.json', 'w') as fp:
    json.dump(topic_data, fp)

### generate embeddings

In [431]:
from config import TANRConfig
from models.module import TANRModule
import torch

In [432]:
# Model configuration with default values; later cells read
# config.num_words_title and config.num_clicked_news_a_user from it.
config = TANRConfig()

In [433]:
# Right-pad every token-id list with zeros (or cut it) to exactly
# config.num_words_title entries, stack the rows into one tensor and wrap it
# in the {'title': tensor} dict that is handed to the model.
candidate_news = {
    'title': torch.tensor([
        (tokens + [0] * config.num_words_title)[:config.num_words_title]
        for tokens in df['title'].tolist()
    ])
}

In [434]:
# Instantiate the TANR module from the config.
# NOTE(review): no visible cell loads a checkpoint or switches the module to
# eval mode - confirm that trained weights are restored elsewhere (or inside
# TANRModule) before trusting the vectors produced below.
embed_model = TANRModule(config)



In [435]:
# Sanity check: show the padded title-id tensor as the cell output.
candidate_news

{'title': tensor([[  37, 2429,  111,  ...,    0,    0,    0],
         [ 598,  138, 6108,  ...,    0,    0,    0],
         [  37, 9701,   14,  ...,    0,    0,    0],
         ...,
         [8713,    0,    0,  ...,    0,    0,    0],
         [ 105,   34,   35,  ..., 1994,  113,    0],
         [  15,  106,   87,  ...,    0,    0,    0]])}

In [436]:
# Encode all candidate titles with the model's news encoder.
# NOTE(review): the ranking cell maps scores back to df rows by position, so
# this assumes get_news_vector preserves input row order - confirm.
news_vector = embed_model.get_news_vector(candidate_news)

In [437]:
# news_vector = {df['item_id'].iloc[i]: vec for i, vec in enumerate(news_vector)}

### User data

In [438]:
# Users for whom recommendations are generated.
users = [
    'AlexClay',
    'HowardStark',
    'plibither8',
    'JimMorrison723',
    'NovaDev',
]

In [439]:
# Load the favourites table and keep only the rows of the selected users.
fav_data = pd.read_csv('data/filtered_train.csv')
fav_data = fav_data[fav_data['user_favorite'].isin(users)]
# Last expression of the cell: number of favourites per user.
fav_data['user_favorite'].value_counts()

plibither8        327
HowardStark        45
JimMorrison723     43
AlexClay           42
NovaDev            38
Name: user_favorite, dtype: int64

In [440]:
# Keep only the user name and the favourited title. The explicit copy makes
# `fav_data` an independent frame, so overwriting its 'title' column in the
# next cell does not raise SettingWithCopyWarning.
fav_data = fav_data[['user_favorite', 'title']].copy()

In [441]:
# Apply to the favourited titles the same normalisation used for the candidate
# stories: lowercase, space out punctuation that follows a word character,
# then clean and collapse whitespace via remove_space.
# NOTE(review): inside the character class, `.-:` is a range ('.' through
# ':'), so it also matches '/' and the digits 0-9, while a literal '-' is NOT
# matched. Presumably unintended, but word2int was likely built with the same
# pattern - confirm before changing it.
fav_data["title"] = (
    fav_data["title"]
    .str.lower()
    .apply(lambda x: re.sub(r"""(?<=\w)([!?,.-:/"/'])""", r" \1 ", x))
    .apply(remove_space)
    .str.strip()
)

# Map each normalised title to its list of vocabulary ids.
fav_data["title"] = fav_data["title"].apply(
    lambda x: tokenize(x, word2int)
)

# Discard favourites whose title has no in-vocabulary token.
fav_data = fav_data.loc[fav_data['title'].apply(len) > 0]

In [442]:
# Collapse to one row per user; 'title' becomes the list of that user's
# token-id lists.
fav_data = fav_data.groupby('user_favorite')['title'].apply(list).reset_index()

In [443]:
# Right-pad every favourited title with zeros (or cut it) to exactly
# config.num_words_title token ids.
fav_data['title'] = fav_data['title'].apply(
    lambda x: [(news + [0] * config.num_words_title)[:config.num_words_title] for news in x]
)
# Keep at most the last config.num_clicked_news_a_user titles per user. The
# cap has to be the history length that the following cell pads up to (the
# previous code used config.num_words_abstract, a per-text word count);
# otherwise users with long histories end up with a different number of rows
# and cannot be stacked into one batch.
fav_data['title'] = fav_data['title'].apply(lambda x: x[-config.num_clicked_news_a_user:])

In [444]:
# Left-pad each user's history with all-zero titles until it holds
# config.num_clicked_news_a_user entries.
fav_data['title'] = fav_data['title'].apply(
    lambda history: (
        [[0] * config.num_words_title] * (config.num_clicked_news_a_user - len(history))
        + history
    )
)

In [445]:
# Map each user name to {'title': tensor of that user's padded history},
# i.e. the same input format that is used for the candidate news.
user_vector = {
    name: {'title': torch.tensor(history)}
    for name, history in fav_data.set_index('user_favorite')['title'].to_dict().items()
}

In [446]:
# Run every user's history through the news encoder, keeping the dict order.
user_vector = {
    name: embed_model.get_news_vector(history)
    for name, history in user_vector.items()
}

In [447]:
# Stack the per-user tensors into one batch along a new leading dimension,
# in dict order; torch.stack requires every user's tensor to have the same
# shape.
browsed_news = torch.stack(list(user_vector.values()))

In [448]:
# Encode the stacked histories with the user encoder and convert the result
# to nested Python lists (presumably one vector per user - confirm against
# TANRModule.get_user_vector).
user_vector = embed_model.get_user_vector(browsed_news).tolist()

In [449]:
# Re-key the vectors by user name and turn them back into tensors. This
# relies on position i of the list matching row i of fav_data.
user_vector = {fav_data['user_favorite'].iloc[i]: torch.tensor(vec) for i, vec in enumerate(user_vector)}

In [450]:
# For every user, score all candidate news against the user vector, squash
# the scores with a sigmoid and store them as a plain Python list.
preds = {
    user: embed_model.get_prediction(news_vector, vec).sigmoid().tolist()
    for user, vec in user_vector.items()
}

In [451]:
# Item ids in df row order, looked up once. The previous code built a full
# row Series with df.iloc[i] for every item of every user.
item_ids = df['item_id'].tolist()

for k, v in preds.items():
    # Pair each score with the id of the story at the same row position.
    scored = [{item_ids[i]: prob} for i, prob in enumerate(v)]
    # Keep the ten highest-scoring stories, best first. Sorting on the negated
    # score keeps the original row order among ties.
    preds[k] = sorted(scored, key=lambda entry: -next(iter(entry.values())))[:10]

In [452]:
# Persist the per-user recommendation lists as JSON.
with open('data/new_stories_user_pred.json', 'w') as fp:
    json.dump(preds, fp)