# Imports, data loading

In [1]:
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import normalize

behaviors_path = 'MINDsmall_train/behaviors.tsv'
news_path = 'MINDsmall_train/news.tsv'
news_data = pd.read_csv(news_path, sep='\t', header=None, names=['ArticleID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbsEntities'])

news_data['Title'] = news_data['Title'].fillna('')
news_data['Abstract'] = news_data['Abstract'].fillna('')

# Concatenating title and abstract for a comprehensive representation
news_data['content'] = news_data['Title'] + " " + news_data['Abstract']

# Vectorizing the content using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

tfidf_matrix = tfidf_vectorizer.fit_transform(news_data['content'])

# Loading entity embeddings

In [2]:
def load_entity_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            entity_id = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[entity_id] = vector
    return embeddings

entity_embeddings_path = './MINDsmall_train/entity_embedding.vec'
entity_embeddings = load_entity_embeddings(entity_embeddings_path)

In [3]:
def article_to_embedding(article_entities, entity_embeddings):
    try:
        q_entities = [entity['WikidataId'] for entity in json.loads(article_entities)]
    except:
        q_entities = []
    embeddings = [entity_embeddings[entity] for entity in q_entities if entity in entity_embeddings]
    
    if embeddings:
        article_embedding = np.mean(embeddings, axis=0)
    else:
        article_embedding = np.zeros(next(iter(entity_embeddings.values())).shape)
    return article_embedding

def embeddings_count(article_entities):
    try:
        return len([entity['WikidataId'] for entity in json.loads(article_entities)])
    except:
        return 0

def combine_embs_hstack(row): #Kanskje mean istedenfor hstack
    if row["AbsEmbeddingsCount"] == 0 and row["TitleEmbeddingsCount"] != 0:
        return np.hstack((row["TitleEmbs"], row["TitleEmbs"], row["TfidfEmbs"]))
    elif row["AbsEmbeddingsCount"] != 0 and row["TitleEmbeddingsCount"] == 0:
        return np.hstack((row["AbsEmbs"], row["AbsEmbs"], row["TfidfEmbs"]))
    return np.hstack((row["TitleEmbs"], row["AbsEmbs"], row["TfidfEmbs"]))


# Compute raw embeddings
news_data["TitleEmbs"] = news_data['TitleEntities'].apply(lambda x: article_to_embedding(x, entity_embeddings))
news_data["AbsEmbs"] = news_data['AbsEntities'].apply(lambda x: article_to_embedding(x, entity_embeddings))

# title_embs = np.array([article_to_embedding(entities, entity_embeddings) for entities in news_data['TitleEntities'].fillna('')])
# abs_embs = np.array([article_to_embedding(entities, entity_embeddings) for entities in news_data['AbsEntities'].fillna('')])

# Normalize embs
normalized_title_embs = normalize(news_data["TitleEmbs"].values.tolist(), axis=1)
normalized_abs_embs = normalize(news_data["AbsEmbs"].values.tolist(), axis=1)
tfidf_normalized = normalize(tfidf_matrix.toarray(), axis=1)

list_normalized_tfidf = [list(x) for x in tfidf_normalized]
list_normalized_title_embs = [list(x) for x in normalized_title_embs]
list_normalized_abs_embs = [list(x) for x in normalized_abs_embs]

news_data["TitleEmbs"] = list_normalized_title_embs
news_data["AbsEmbs"] = list_normalized_abs_embs
news_data["TfidfEmbs"] = list_normalized_tfidf

# compute count
news_data["AbsEmbeddingsCount"] = news_data['AbsEntities'].apply(lambda x: embeddings_count(x))
news_data["TitleEmbeddingsCount"] = news_data['TitleEntities'].apply(lambda x: embeddings_count(x))

news_data["CombinedEmbeddings"] = news_data.apply(lambda x: combine_embs_hstack(x), axis=1)

In [4]:
news_data['TfidfEmbs'][0]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [5]:
embeddings_matrix = np.stack(news_data['CombinedEmbeddings'].values)
similarity_matrix = cosine_similarity(embeddings_matrix)
article_ids = list(news_data['ArticleID'])
article_index_dict = {id: index for index, id in enumerate(article_ids)}

In [7]:
all([article_ids[id] == index for index, id in article_index_dict.items()])

True

In [8]:
def find_n_closest_articles(article_id, similarity_matrix, article_ids, n=5):
    # Get the index of the article_id in the DataFrame
    article_idx = article_ids.index(article_id)
    
    # Get the similarity scores for all other articles from the similarity matrix
    similarity_scores = similarity_matrix[article_idx]
    
    # Get indices of the scores sorted from highest to lowest
    sorted_indices = np.argsort(similarity_scores)[::-1]
    
    # Find the top n closest article indices, ignoring the first one as it's the article itself
    closest_indices = sorted_indices[1:n+1]
    
    # Retrieve the corresponding article IDs
    closest_article_ids = [article_ids[i] for i in closest_indices]
    
    return closest_article_ids

In [9]:
target_article_id = 'N34418'
closest_articles = find_n_closest_articles(target_article_id, similarity_matrix, article_ids, n=5)
for article in closest_articles:
    title = news_data[news_data['ArticleID'] == article]['Title'].values[0]
    category = news_data[news_data['ArticleID'] == article]['Category'].values[0]
    subcategory = news_data[news_data['ArticleID'] == article]['SubCategory'].values[0]
    print(f"Article: {title}, Category: {category}, Subcategory: {subcategory}")

Article: Seahawks congratulate Sounders MLS Cup triumph, Category: sports, Subcategory: soccer
Article: Fan Voices: Make them hear you, Category: sports, Subcategory: soccer_mls
Article: Sounders stun LAFC, advance to third MLS Cup in four years, Category: sports, Subcategory: soccer
Article: Seattle Sounders 3, Toronto FC 1: Takeaways from MLS Cup, Category: sports, Subcategory: soccer
Article: From Hendersonville to MLS Cup: Toronto FC goalkeeper gives back to community with camp, Category: sports, Subcategory: soccer


In [10]:
behaviors_df = pd.read_csv(behaviors_path, sep='\t', header=None, names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])

In [11]:
def get_similarity_of_article_to_history(article_id, history_articles, similarity_matrix, article_index_dict):
    history_indices = [article_index_dict.get(article, -1) for article in history_articles]
    history_indices = [i for i in history_indices if i != -1]
    if article_id in article_index_dict:
        article_index = article_index_dict[article_id]
        similarities = similarity_matrix[article_index, history_indices]
        return np.mean(similarities) if len(history_indices) > 0 else 0.0
    return 0.0

In [12]:
def check_acc(history, impressions, sim_matrix, article_index_dict):
    impression_similarities = {}
    for h2 in impressions:
        sign = True if h2.split('-')[1] == '1' else False
        id = h2.split('-')[0]
        pred_similarity = get_similarity_of_article_to_history(id, history, sim_matrix, article_index_dict)
        impression_similarities[id] = (pred_similarity, sign)
    impression_similarities = sorted(impression_similarities.items(), key=lambda x: x[1][0], reverse=True)
    return impression_similarities[0][1][1], impression_similarities

In [13]:
res_bool = []
res_sim = []

for iter, row in tqdm(behaviors_df.iterrows()):
    if isinstance(row['History'], float):
        continue
    history = row['History'].split(' ')
    impressions = row['Impressions'].split(' ')
    truth, simdict = check_acc(history, impressions, similarity_matrix, article_index_dict)
    res_bool.append(truth)
    res_sim.append(simdict)

0it [00:00, ?it/s]

In [14]:
# how many top1 recommendations were correct
print("Top1 accuracy: ", sum(res_bool) / len(res_bool), " total: ", sum(res_bool), " out of ", len(res_sim),
      " avg imp count in top1: ", np.mean([len(x) for x in res_sim if x[0][1][1]]), 
      " random baseline is ", 1/np.mean([len(x) for x in res_sim if x[0][1][1]]))

# how many top5 recommendations were correct
top5 = 0
top5_len = []
for r in res_sim:
    for i in range(min(5, len(r))):
        if r[i][1][1]:
            top5_len.append(len(r))
            top5 += 1
            break
print("Top5 accuracy: ", top5 / len(res_bool), " total: ", top5, " out of ", len(res_sim),
      " avg imp count in top5: ", np.mean(top5_len),
      " random baseline is ", 5/np.mean(top5_len))

# how many top10 recommendations were correct, careful out of bounds
top10 = 0
top10_len = []
for r in res_sim:
    for i in range(min(10, len(r))):
        if r[i][1][1]:
            top10_len.append(len(r))
            top10 += 1
            break
print("Top10 accuracy: ", top10 / len(res_bool), " total: ", top10, " out of ", len(res_sim),
      " avg imp count in top10: ", np.mean(top10_len),
      " random baseline is ", 10/np.mean(top10_len))


top15 = 0
top15_len = []
for r in res_sim:
    for i in range(min(15, len(r))):
        if r[i][1][1]:
            top15_len.append(len(r))
            top15 += 1
            break
print("Top15 accuracy: ", top15 / len(res_bool), " total: ", top15, " out of ", len(res_sim),
      " avg imp count in top15: ", np.mean(top15_len),
      " random baseline is ", 15/np.mean(top15_len))

Top1 accuracy:  0.14588849063599757  total:  22427  out of  153727  avg imp count in top1:  17.87287644357248  random baseline is  0.055950702907680214
Top5 accuracy:  0.46419952253019964  total:  71360  out of  153727  avg imp count in top5:  20.385089686098656  random baseline is  0.24527731184865398
Top10 accuracy:  0.6403494506495281  total:  98439  out of  153727  avg imp count in top10:  23.812686028911305  random baseline is  0.4199442258575477
Top15 accuracy:  0.740663643992272  total:  113860  out of  153727  avg imp count in top15:  26.165861584401895  random baseline is  0.5732660455920879


# Recommending articles

In [15]:
def recommend_similar_article(history_articles, all_articles, similarity_matrix, article_index_dict, n):
    # Filter history articles to those with valid indices in the article_index_dict
    history_indices = [article_index_dict[article] for article in history_articles if article in article_index_dict]

    if not history_indices:
        return []

    # Vectorized operation to get similarity scores for all articles against all history articles
    similarity_scores = similarity_matrix[:, history_indices]
    average_similarities = np.mean(similarity_scores, axis=1)

    # Get valid indices and corresponding articles that are present in both all_articles and article_index_dict
    valid_indices = [article_index_dict[article] for article in all_articles if article in article_index_dict]
    valid_articles = [article for article in all_articles if article in article_index_dict]

    # Filtered array of average similarities for valid articles
    filtered_similarities = average_similarities[valid_indices]

    # Sort valid articles based on filtered average similarity in descending order
    sorted_indices = np.argsort(-filtered_similarities)
    top_indices = sorted_indices[:n]
    top_articles = [(valid_articles[i], filtered_similarities[top_indices[i]]) for i in range(len(top_indices))]

    return top_articles

# Kanskje visualisere likhet mellom alle i historien

In [16]:
interactions_preds_dict = {}
all_articles = set(news_data['ArticleID'].unique())

#randomly sample 10000 interactions

for iter, row in tqdm(behaviors_df.sample(10000, random_state=42).iterrows()):
    if isinstance(row['History'], float):
        continue
    interactionid = row['ImpressionID']
    history = row['History'].split(' ')
    impressions = row['Impressions'].split(' ')
    sorted_articles = recommend_similar_article(history, all_articles, similarity_matrix, article_index_dict, 40)
    interactions_preds_dict[interactionid] = (sorted_articles, impressions)

0it [00:00, ?it/s]

In [17]:
recs = []
for interactionid, (preds, impressions) in interactions_preds_dict.items():
    for article, predicted_sim in preds:
        if f'{article}-1' in impressions or f'{article}-0' in impressions:
            recs.append((interactionid, article, predicted_sim, 1 if f'{article}-1' in impressions else 0))

In [18]:
print(sum([x[3] for x in recs]))
print(len(recs))
print(sum([x[3] for x in recs]) / len(recs))

32
539
0.059369202226345084


In [19]:
recs_10 = []
for interactionid, (preds, impressions) in interactions_preds_dict.items():
    for article, predicted_sim in preds[:10]:
        if f'{article}-1' in impressions or f'{article}-0' in impressions:
            recs_10.append((interactionid, article, predicted_sim, 1 if f'{article}-1' in impressions else 0))
print(sum([x[3] for x in recs_10]))
print(len(recs_10))
print(sum([x[3] for x in recs_10]) / len(recs_10))

19
196
0.09693877551020408


# Checking predicted similarities vs all impressions and history, individually

In [20]:
similarity_hist_imp = {}

for iter, row in tqdm(behaviors_df.iterrows()):
    if isinstance(row['History'], float):
        continue
    interactionid = row['ImpressionID']
    history = row['History'].split(' ')
    impressions = row['Impressions'].split(' ')
    # get each article similarity to history
    hist_idx = [article_index_dict.get(article) for article in history if article in article_index_dict]
    imp_idx = [article_index_dict.get(article.split('-')[0]) for article in impressions if article.split('-')[0] in article_index_dict]

    hist_imp_sim = similarity_matrix[hist_idx, :][:, imp_idx]
    
    similarity_hist_imp[interactionid] = hist_imp_sim
    if iter == 10000:
        break

0it [00:00, ?it/s]

In [21]:
checking_sim = []
for i, row in behaviors_df.iterrows():
    iids = []
    impression_id = row['ImpressionID']
    history = row['History']
    if isinstance(history, float):
        continue
    for impression in row['Impressions'].split(' '):
        # if impression.split('-')[1] == '1':
        iids.append(impression)
    
    for iid in iids:
        iid_idx = article_index_dict.get(iid.split('-')[0])
        chosen = int(iid.split('-')[1])
        sims = []
        for h in history.split(' '):
            hist_idx = article_index_dict.get(h)
            # print(f"Now checking: {iid} and {h}, similarity: {similarity_matrix[iid_idx, hist_idx]}, iid_idx: {iid_idx}, hist_idx: {hist_idx}")
            # break
            sims.append(similarity_matrix[iid_idx, hist_idx])
        checking_sim.append((impression_id, sims, chosen, np.mean(sims), np.median(sims), np.max(sims), np.min(sims)))
    if i == 2000:
        break

In [28]:
checking_sim_chosen = [x for x in checking_sim if x[2] == 1]
# plot the similarity-progression in the history for the chosen articles, checking_sim_chosen[n][1] is the list of similarities
chosen_sims = {} 
for chosen_sim in checking_sim_chosen:
    for i, sim in enumerate(chosen_sim[1]):
        if i not in chosen_sims:
            chosen_sims[i] = [sim]
        chosen_sims[i].append(sim)

In [41]:
chosen_avg_mean = []
chosen_avg_median = []
chosen_avg_max = []
chosen_avg_min = []

not_chosen_avg_mean = []
not_chosen_avg_median = []
not_chosen_avg_max = []
not_chosen_avg_min = []


chosen_first_history_vals = []
chosen_last_history_vals = []

chosen_first_half_hist_vals = []
chosen_second_half_hist_vals = []

for sim in checking_sim:
    if sim[2] == 1:
        chosen_avg_mean.append(sim[3])
        chosen_avg_median.append(sim[4])
        chosen_avg_max.append(sim[5])
        chosen_avg_min.append(sim[6])
        chosen_first_history_vals.append(sim[1][0])
        chosen_last_history_vals.append(sim[1][-1])
        chosen_first_half_hist_vals.extend(sim[1][:len(sim[1])//2])
        chosen_second_half_hist_vals.extend(sim[1][len(sim[1])//2:])
    else:
        not_chosen_avg_mean.append(sim[3])
        not_chosen_avg_median.append(sim[4])
        not_chosen_avg_max.append(sim[5])
        not_chosen_avg_min.append(sim[6])

print("Chosen average mean: ", np.mean(chosen_avg_mean))
print("Chosen average median: ", np.mean(chosen_avg_median))
print("Chosen average max: ", np.mean(chosen_avg_max))
print("Chosen average min: ", np.mean(chosen_avg_min))

print("Not chosen average mean: ", np.mean(not_chosen_avg_mean))
print("Not chosen average median: ", np.mean(not_chosen_avg_median))
print("Not chosen average max: ", np.mean(not_chosen_avg_max))
print("Not chosen average min: ", np.mean(not_chosen_avg_min))

print()
print("Chosen first history mean: ", np.mean(chosen_first_history_vals))
print("Chosen first history median: ", np.median(chosen_first_history_vals))
print("Chosen first history max: ", np.max(chosen_first_history_vals))
print("Chosen first history min: ", np.min(chosen_first_history_vals))

print()
print("Chosen last history mean: ", np.mean(chosen_last_history_vals))
print("Chosen last history median: ", np.median(chosen_last_history_vals))
print("Chosen last history max: ", np.max(chosen_last_history_vals))
print("Chosen last history min: ", np.min(chosen_last_history_vals))

print()
print("Chosen first half history mean: ", np.mean(chosen_first_half_hist_vals))
print("Chosen first half history median: ", np.median(chosen_first_half_hist_vals))
print("Chosen first half history max: ", np.max(chosen_first_half_hist_vals))
print("Chosen first half history min: ", np.min(chosen_first_half_hist_vals))

print()
print("Chosen second half history mean: ", np.mean(chosen_second_half_hist_vals))
print("Chosen second half history median: ", np.median(chosen_second_half_hist_vals))
print("Chosen second half history max: ", np.max(chosen_second_half_hist_vals))
print("Chosen second half history min: ", np.min(chosen_second_half_hist_vals))

Chosen average mean:  0.12220226161141695
Chosen average median:  0.1097030126364747
Chosen average max:  0.3598597403552351
Chosen average min:  -0.052103100464929795
Not chosen average mean:  0.09639943552229116
Not chosen average median:  0.08533578509223544
Not chosen average max:  0.29917559162290674
Not chosen average min:  -0.05192935092406395

Chosen first history mean:  0.11353611625340301
Chosen first history median:  0.06319438214486366
Chosen first history max:  1.0
Chosen first history min:  -0.2224091287965253

Chosen last history mean:  0.12263551495478624
Chosen last history median:  0.07441929792132591
Chosen last history max:  1.0
Chosen last history min:  -0.2853365833829339

Chosen first half history mean:  0.12114822759450043
Chosen first half history median:  0.0764871071490434
Chosen first half history max:  1.0000000000000002
Chosen first half history min:  -0.4101518084216964

Chosen second half history mean:  0.12264855649817724
Chosen second half history medi

In [104]:
bidx_hist = article_index_dict.get('N55189')
idx_iid = article_index_dict.get('N55689')
similarity_matrix[idx_iid, bidx_hist] == similarity_matrix[bidx_hist, idx_iid]

True

In [27]:
print(len(similarity_hist_imp[1]))
print(similarity_hist_imp[1])

9
[[ 0.20304136  0.34363462]
 [ 0.28050044  0.24382612]
 [ 0.12461182 -0.02835792]
 [ 0.18872453  0.36146383]
 [ 0.04357624  0.        ]
 [ 0.          0.        ]
 [ 0.18503037  0.14891398]
 [ 0.29415855  0.19921355]
 [ 0.14528373  0.28617642]]


In [48]:
users = behaviors_df['UserID'].unique()
user_histories = {}
for user in users:
    histories = behaviors_df[behaviors_df['UserID'] == user].History.values
    user_histories[user] = histories

In [51]:
equal = 0
not_eq = 0
not_eqs = []
for u_hist in user_histories.values():
    if not all(x == u_hist[0] for x in u_hist):
        not_eq += 1
        not_eqs.append(u_hist)
    else:
        equal += 1

In [57]:
b = [x for x in not_eqs if not all(y for y in x if not isinstance(y, float))]
b

[]

# Evals
#### Making testset from the last day of data (typically how data is gathered in industry), and adding prev impressions to history for each user 

In [63]:
behaviors_df["Time"] = pd.to_datetime(behaviors_df["Time"])
last_day = behaviors_df["Time"].max().date()

test_df = behaviors_df[behaviors_df["Time"].dt.date == last_day]
train_df = behaviors_df[behaviors_df["Time"].dt.date != last_day]
print(len(test_df), len(train_df))

30270 126695


In [80]:
def get_all_user_history_sets(df):
    user_histories = {}
    for user in df['UserID'].unique():
        hist_set = set()
        histories = df[df['UserID'] == user].History.values
        if isinstance(histories[0], float):
            continue
        histories = [history.split(' ') for history in histories]
        hist_list = [hist for history in histories for hist in history]
        impressions = df[df['UserID'] == user].Impressions.values
        impressions = [impression.split(' ') for impression in impressions]
        impressions = [imp for impression in impressions for imp in impression]
        imps = [impression.split('-')[0] if impression.split('-')[1] == '1' else None for impression in impressions]
        imps = [imp for imp in imps if imp is not None]
        hist_set.update(imps)
        hist_set.update(hist_list)
        if user in user_histories:
            user_histories[user].update(hist_set)
        else:
            user_histories[user] = hist_set
    return user_histories

In [108]:
user_histories = get_all_user_history_sets(train_df)
test_df['UserHistory'] = test_df['UserID'].apply(lambda x: ' '.join(list(user_histories[x])) if x in user_histories else '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['UserHistory'] = test_df['UserID'].apply(lambda x: ' '.join(list(user_histories[x])) if x in user_histories else '')


In [109]:
test_df

Unnamed: 0,ImpressionID,UserID,Time,History,Impressions,UserHistory
2,3,U73700,2019-11-14 07:01:48,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,N62058 N24233 N25792 N47817 N33164 N60384 N188...
10,11,U89744,2019-11-14 08:38:04,N24422 N25287 N39121 N41777 N58226 N119 N29197...,N47572-0 N45523-0 N64560-0 N53245-0 N8509-0 N5...,N62058 N29197 N28088 N45794 N52631 N13057 N260...
13,14,U29155,2019-11-14 12:26:47,N60785 N11885 N38939 N25114 N44984 N4830 N2068...,N44698-0 N37204-0 N36612-0 N64174-0 N29212-0 N...,N18708 N60785 N25114 N6890 N59139 N12576 N2386...
20,21,U70879,2019-11-14 10:45:51,N47823 N44013 N17354 N26531 N22570 N16215 N298...,N38442-0 N50601-0 N36016-0 N42457-0 N23446-0 N...,N62058 N29197 N3388 N50890 N3086 N14029 N15788...
39,40,U27024,2019-11-14 14:24:04,N38629 N50155 N29177 N56426 N63842 N36565 N307...,N20394-0 N28072-0 N29212-0 N47572-0 N54321-0 N...,N7242 N4607 N42801 N50155 N39634 N45794 N32098...
...,...,...,...,...,...,...
156947,156948,U30039,2019-11-14 11:44:21,N39556 N932 N13079 N26729 N14454 N60615 N24002...,N10960-0 N29369-0 N53515-0 N20676-0 N23814-0 N...,N11005 N34004 N40716 N41375 N28088 N38179 N476...
156959,156960,U72015,2019-11-14 16:20:44,N53895 N48715 N5469,N14478-0 N9621-0 N22257-0 N23391-0 N61595-0 N2...,
156960,156961,U21593,2019-11-14 22:24:05,N7432 N58559 N1954 N43353 N14343 N13008 N28833...,N2235-0 N22975-0 N64037-0 N47652-0 N11378-0 N4...,N5831 N25114 N4607 N42470 N63003 N19347 N61773...
156962,156963,U75630,2019-11-14 10:58:13,N29898 N59704 N4408 N9803 N53644 N26103 N812 N...,N55913-0 N62318-0 N53515-0 N10960-0 N9135-0 N5...,N62058 N18708 N5283 N14029 N41375 N40207 N2610...


## Making predictions on each impression in test_df

In [110]:
interactions_preds_dict = {}
all_articles = set(news_data['ArticleID'].unique())

#randomly sample 10000 interactions

for iter, row in tqdm(test_df.sample(200, random_state=42).iterrows()):
    if row['UserHistory'] == '':
        continue
    interactionid = row['ImpressionID']
    history = row['UserHistory'].split(' ')
    impressions = row['Impressions'].split(' ')
    sorted_articles = recommend_similar_article(history, all_articles, similarity_matrix, article_index_dict, 40)
    interactions_preds_dict[interactionid] = (sorted_articles, impressions)

0it [00:00, ?it/s]

KeyboardInterrupt: 

In [None]:
recs = []
for interactionid, (preds, impressions) in interactions_preds_dict.items():
    for article, predicted_sim in preds:
        if f'{article}-1' in impressions or f'{article}-0' in impressions:
            recs.append((interactionid, article, predicted_sim, 1 if f'{article}-1' in impressions else 0))

In [None]:
print(sum([x[3] for x in recs]))
print(len(recs))
print(sum([x[3] for x in recs]) / len(recs))
recs_10 = []
for interactionid, (preds, impressions) in interactions_preds_dict.items():
    for article, predicted_sim in preds[:10]:
        if f'{article}-1' in impressions or f'{article}-0' in impressions:
            recs_10.append((interactionid, article, predicted_sim, 1 if f'{article}-1' in impressions else 0))
print(sum([x[3] for x in recs_10]))
print(len(recs_10))
print(sum([x[3] for x in recs_10]) / len(recs_10))