# Imports, data loading

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import json

# Location of the news table, relative to the notebook's working directory.
news_path = 'MINDsmall_train/news.tsv'
# The file is read as tab-separated with no header row, so the column names are supplied here.
# NOTE(review): read_csv is left at its default quoting, so '"' characters inside titles or
# abstracts are treated as quote delimiters — confirm the loaded row count matches the file,
# or consider quoting=csv.QUOTE_NONE if rows turn out to be merged.
news_data = pd.read_csv(news_path, sep='\t', header=None, names=['ArticleID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbsEntities'])

# Replace missing titles/abstracts with empty strings so the concatenation below
# yields a plain string for every row instead of NaN.
news_data['Title'] = news_data['Title'].fillna('')
news_data['Abstract'] = news_data['Abstract'].fillna('')

# Concatenating title and abstract for a comprehensive representation
news_data['content'] = news_data['Title'] + " " + news_data['Abstract']

# Vectorizing the content using TF-IDF
# NOTE(review): max_features=10 restricts the vocabulary to 10 terms after English
# stop-word removal — confirm this is intended and not a leftover debugging value.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10)

# Learn the vocabulary from every article's content and build the document-term
# matrix (one row per article) in a single pass.
tfidf_matrix = tfidf_vectorizer.fit_transform(news_data['content'])

In [25]:
# Display the loaded frame, including the derived 'content' column, as a quick visual check.
news_data

Unnamed: 0,ArticleID,Category,SubCategory,Title,Abstract,URL,TitleEntities,AbsEntities,content
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],"The Brands Queen Elizabeth, Prince Charles, an..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",50 Worst Habits For Belly Fat These seemingly ...
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",The Cost of Trump's Aid Freeze in the Trenches...
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",I Was An NBA Wife. Here's How It Affected My M...
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","How to Get Rid of Skin Tags, According to a De..."
...,...,...,...,...,...,...,...,...,...
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...,https://assets.msn.com/labs/mind/BBWzQJK.html,"[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...","[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...","Adapting, Learning And Soul Searching: Reflect..."
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,,https://assets.msn.com/labs/mind/BBWzQYV.html,"[{""Label"": ""Broadway theatre"", ""Type"": ""F"", ""W...",[],Family says 13-year-old Broadway star died fro...
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b...",https://assets.msn.com/labs/mind/BBWzQnK.html,[],[],St. Dominic soccer player tries to kick cancer...
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ...",https://assets.msn.com/labs/mind/BBWzQuK.html,"[{""Label"": ""MLS Cup"", ""Type"": ""U"", ""WikidataId...",[],"How the Sounders won MLS Cup Mark, Jeremiah an..."


# Loading entity embeddings

In [26]:
def load_entity_embeddings(file_path):
    """Load entity embedding vectors from a whitespace-separated text file.

    Every non-blank line is expected to hold an entity ID followed by the
    components of its vector, separated by whitespace (spaces or tabs).

    Parameters
    ----------
    file_path : str or path-like
        Location of the embedding file; it is read as UTF-8 text.

    Returns
    -------
    dict
        Maps each entity ID (str) to a 1-D ``float32`` NumPy array. If an ID
        appears more than once, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be converted to a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # Blank or whitespace-only lines (e.g. an extra newline at the end
            # of the file) carry no entity; skip them instead of failing with
            # IndexError on values[0].
            if not values:
                continue
            entity_id, *components = values
            embeddings[entity_id] = np.asarray(components, dtype='float32')
    return embeddings

# The embedding file sits in the same MINDsmall_train directory as news.tsv.
entity_embeddings_path = './MINDsmall_train/entity_embedding.vec'
# Dictionary of entity ID -> float32 vector, used below to embed each article's entities.
entity_embeddings = load_entity_embeddings(entity_embeddings_path)

In [28]:
def article_to_embedding(article_entities, entity_embeddings):
    """Average the embeddings of the entities annotated on one article.

    Parameters
    ----------
    article_entities : str
        JSON-encoded list of entity dicts, each carrying a ``'WikidataId'``
        key. Input that cannot be read that way (invalid JSON, a non-string
        such as NaN, or an entity without ``'WikidataId'``) is treated as
        "no entities".
    entity_embeddings : dict
        Maps entity ID to a 1-D NumPy vector. Entities missing from this
        mapping are ignored.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all entities found in
        ``entity_embeddings``, or a zero vector with the shape of the
        mapping's first vector when none are found.

    Raises
    ------
    ValueError
        If no entity vector is found and ``entity_embeddings`` is empty, so
        the shape of the zero vector cannot be determined.
    """
    try:
        q_entities = [entity['WikidataId'] for entity in json.loads(article_entities)]
    except (TypeError, ValueError, KeyError):
        # TypeError: input is not a string (e.g. NaN) or the JSON is not a list of dicts.
        # ValueError: malformed JSON (json.JSONDecodeError is a ValueError subclass).
        # KeyError: an entity has no 'WikidataId'.
        # Anything else (KeyboardInterrupt in particular) is deliberately not swallowed.
        q_entities = []
    embeddings = [entity_embeddings[entity] for entity in q_entities if entity in entity_embeddings]

    if embeddings:
        return np.mean(embeddings, axis=0)

    if not entity_embeddings:
        raise ValueError(
            "entity_embeddings is empty; cannot determine the shape of the zero vector"
        )
    # No usable entity: fall back to zeros shaped like any stored vector.
    return np.zeros(next(iter(entity_embeddings.values())).shape)

def embeddings_count(article_entities):
    """Count the entities annotated on one article.

    Note that this counts entities listed in the JSON, not entities for which
    an embedding vector exists.

    Parameters
    ----------
    article_entities : str
        JSON-encoded list of entity dicts, each carrying a ``'WikidataId'``
        key.

    Returns
    -------
    int
        Number of entities in the list, or 0 when the input cannot be read
        as such a list (invalid JSON, a non-string such as NaN, or an entity
        without ``'WikidataId'``).
    """
    try:
        entities = json.loads(article_entities)
        # The key lookup is kept so an entity without 'WikidataId' makes the
        # whole article count as 0.
        return len([entity['WikidataId'] for entity in entities])
    except (TypeError, ValueError, KeyError):
        # Only parse/shape problems are treated as "no entities"; other
        # exceptions (KeyboardInterrupt in particular) propagate.
        return 0

# Mean entity vector per article, computed separately for the entities linked
# in the title and for those linked in the abstract.
news_data["TitleEmbeddings"] = np.array(
    news_data['TitleEntities'].apply(article_to_embedding, args=(entity_embeddings,))
)
news_data["AbsEmbeddings"] = np.array(
    news_data['AbsEntities'].apply(article_to_embedding, args=(entity_embeddings,))
)

# Number of entities annotated in the title / abstract of each article.
news_data["TitleEmbeddingsCount"] = news_data['TitleEntities'].apply(embeddings_count)
news_data["AbsEmbeddingsCount"] = news_data['AbsEntities'].apply(embeddings_count)

In [32]:
# Keep only the article ID and the entity-derived columns; the text, URL and raw
# entity columns are dropped from news_data here.
# NOTE(review): this rebinds news_data, so the earlier cells that read
# 'TitleEntities' / 'AbsEntities' cannot be re-run after this one without
# reloading the data first.
news_data = news_data[['ArticleID', 'TitleEmbeddings', 'AbsEmbeddings', 'TitleEmbeddingsCount', 'AbsEmbeddingsCount']]
news_data

Unnamed: 0,ArticleID,TitleEmbeddings,AbsEmbeddings,TitleEmbeddingsCount,AbsEmbeddingsCount
0,N55528,"[0.0040573333, -0.03991733, -0.008374, 0.07914...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3,0
1,N19639,"[-0.013597, -0.009758, 0.01712, -0.051993, 0.0...","[-0.013597, -0.009758, 0.01712, -0.051993, 0.0...",1,1
2,N61837,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.065324, -0.088163, -0.015203, -0.031949, 0...",0,1
3,N53526,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.003752, -0.061771, -0.037073, 0.02677, -0.0...",0,1
4,N38324,"[0.014257, 0.018351, 0.005199, 0.001773, 0.031...","[-0.008593666, -0.00016800004, 0.027127668, -0...",1,3
...,...,...,...,...,...
51277,N16909,"[0.055938, -0.002104, 0.03155, -0.022865, -0.0...","[0.055938, -0.002104, 0.03155, -0.022865, -0.0...",1,1
51278,N47585,"[0.069955, -0.007432, 0.075966, 0.00397, -0.08...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0
51279,N7482,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0
51280,N34418,"[0.00011349935, 0.0340165, 0.00024800003, -0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,0


In [4]:
from sklearn.preprocessing import normalize

# Normalize both matrices
# (axis=1 rescales each row, i.e. each article, independently of the others.)
tfidf_matrix_normalized = normalize(tfidf_matrix, axis=1)
# NOTE(review): article_embeddings is not defined in any cell visible above — only the
# per-article 'TitleEmbeddings' / 'AbsEmbeddings' columns of news_data exist — and this
# cell's execution count (In [4]) is lower than the preceding cells'. Verify where
# article_embeddings comes from; on a fresh kernel this line would raise NameError
# unless it is defined elsewhere.
article_embeddings_normalized = normalize(article_embeddings, axis=1)

# Combine TF-IDF and entity embeddings by concatenating them
# (.toarray() converts the TF-IDF matrix to a dense array so np.hstack can join
# the two feature sets column-wise, one row per article.)
combined_features = np.hstack([tfidf_matrix_normalized.toarray(), article_embeddings_normalized])