# MIND Dataset Preprocessing

## Libraries, Imports, Setup

In [1]:
import pandas as pd
import numpy as np
import random
import string
import ast
import wordninja
import warnings
from scipy import sparse
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christopherstephan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/christopherstephan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading Datasets

In [2]:
# Creating the path to the training dataset
behaviours_train_path = "/Users/christopherstephan/Documents/IE/Term 3/CAPSTONE/Datasets/MINDlarge_train/behaviors.tsv"  
news_train_path = "/Users/christopherstephan/Documents/IE/Term 3/CAPSTONE/Datasets/MINDlarge_train/news.tsv"

# Loading TSV file from the specified path
behaviours = pd.read_csv(behaviours_train_path, sep="\t", header=None)
news = pd.read_csv(news_train_path, sep="\t", header=None)

In [3]:
# Define the Column Names for the datasets
behaviours.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

In [4]:
# Path to files
entity_embedding_path = "/Users/christopherstephan/Documents/IE/Term 3/CAPSTONE/Datasets/MINDlarge_train/entity_embedding.vec"  
relation_embedding_path = "/Users/christopherstephan/Documents/IE/Term 3/CAPSTONE/Datasets/MINDlarge_train/relation_embedding.vec"

with open(entity_embedding_path, "r") as f:
    for _ in range(5):  # Read first 5 lines
        print(f.readline())

with open(relation_embedding_path, "r") as f:
    for _ in range(5):  # Read first 5 lines
        print(f.readline())

Q34433	0.017808	-0.073256	0.102521	-0.059926	-0.060665	0.027027	-0.091728	-0.003057	-0.170798	0.111819	0.006821	-0.049873	-0.050532	-0.003127	-0.074472	-0.115891	-0.067093	-0.095272	0.019178	-0.083725	-0.060890	0.017744	0.049417	-0.026014	-0.048549	0.017528	0.044163	0.022111	-0.081519	0.046278	-0.183939	-0.063143	-0.014518	-0.080644	-0.099994	0.085905	-0.083003	-0.092844	-0.216481	0.125441	0.179819	0.036735	-0.085375	0.021276	-0.154971	0.039009	0.016059	0.067725	-0.148213	0.158773	-0.028527	0.125790	0.006361	0.067541	0.077552	0.060792	-0.044511	-0.005862	-0.068080	-0.063204	-0.094127	0.115441	-0.016472	0.106616	0.047839	-0.151805	-0.111083	-0.142330	-0.120680	-0.050393	-0.073787	0.017424	-0.081620	0.062599	-0.022102	-0.102688	-0.128149	-0.075895	0.095134	0.000984	0.010143	-0.068552	-0.026573	0.019735	-0.000981	-0.126635	0.008300	0.170557	0.002250	-0.157175	-0.077962	0.013433	0.045894	-0.071253	0.086445	-0.120466	0.059235	-0.071865	0.058854	0.024765	

Q41	-0.063388	-0.181451	0.057501	-0

In [5]:
# Load entity embeddings
entity_embeddings = {}
with open(entity_embedding_path, "r") as f:
    for line in f:
        values = line.strip().split()
        entity = values[0]  # First value is entity name
        vector = np.array(values[1:], dtype=np.float32)  # Rest are vector values
        entity_embeddings[entity] = vector


# Load relation embeddings
relation_embeddings = {}
with open(relation_embedding_path, "r") as f:
    for line in f:
        values = line.strip().split()
        entity = values[0]  # First value is entity name
        vector = np.array(values[1:], dtype=np.float32)  # Rest are vector values
        relation_embeddings[entity] = vector

## Cleaning, Preprocessing, Feature Engineering

### Behaviours

In [8]:
def clean_behaviours(behaviours):
    """
    Cleans the behaviors dataset by handling missing values and removing duplicates.

    Parameters:
        behaviours (pd.DataFrame): DataFrame containing the behaviors dataset.

    Returns:
        pd.DataFrame: Cleaned behaviors dataset.
    """
    behaviours["history"].fillna("No_History", inplace=True)

    # Remove duplicate rows
    behaviours.drop_duplicates(inplace=True)

    return behaviours

def convert_time(behaviours):
    """
    Converts the 'Time' column to datetime format and extracts the hour.

    Parameters:
        behaviours (pd.DataFrame): DataFrame containing the behaviors dataset.

    Returns:
        pd.DataFrame: Updated behaviors dataset with formatted datetime and extracted hour.
    """
    behaviours["time"] = pd.to_datetime(behaviours["time"], format="%m/%d/%Y %I:%M:%S %p", errors="coerce")
    behaviours["hour"] = behaviours["time"].dt.hour  # Extract hour
    return behaviours

def extract_num_clicks(behaviours):
    """
    Extracts the number of clicks per session based on the 'Impressions' column.

    Parameters:
        behaviours (pd.DataFrame): DataFrame containing the behaviors dataset.

    Returns:
        pd.DataFrame: Updated DataFrame with 'num_clicks' feature.
    """
    def count_clicks(impressions):
        """
        Counts the number of clicked news articles in an impression session.

        Parameters:
            impressions (str): String of impressions formatted as "N1-1 N2-0".

        Returns:
            int: Number of clicked items.
        """
        if not isinstance(impressions, str) or impressions.strip() == "":
            return 0
        return sum([int(i.split('-')[1]) for i in impressions.split() if '-' in i])

    behaviours["num_clicks"] = behaviours["impressions"].apply(count_clicks)

    return behaviours

def extract_history_length(behaviours):
    """
    Extracts the length of the user's reading history.

    Parameters:
        behaviours (pd.DataFrame): DataFrame containing the behaviors dataset.

    Returns:
        pd.DataFrame: Updated DataFrame with 'history_length' feature.
    """
    behaviours["history_length"] = behaviours["history"].apply(lambda x: 0 if x == "No_History" else len(x.split()))
    return behaviours

def split_clicked_nonclicked(behaviours):
    """
    Splits the impressions column into separate clicked and non-clicked news lists.

    Parameters:
        behaviours (pd.DataFrame): DataFrame containing the behaviors dataset.

    Returns:
        pd.DataFrame: Updated DataFrame with 'clicked_news' and 'non_clicked_news'.
    """
    def split_impressions(impressions):
        """
        Splits impressions into clicked and non-clicked news IDs.

        Parameters:
            impressions (str): String of impressions formatted as "N1-1 N2-0".

        Returns:
            tuple: Two lists (clicked_news, non_clicked_news).
        """
        clicked = []
        non_clicked = []
        if isinstance(impressions, str) and impressions.strip():
            for item in impressions.split():
                parts = item.split('-')
                if len(parts) == 2:
                    news_id, clicked_status = parts
                    if clicked_status == "1":
                        clicked.append(news_id)
                    else:
                        non_clicked.append(news_id)
        return clicked, non_clicked

    behaviours[["clicked_news", "non_clicked_news"]] = behaviours["impressions"].apply(lambda x: pd.Series(split_impressions(x)))
    return behaviours

def convert_user_id(behaviours):
    """
    Converts 'User ID' into a numerical index for easier model use.

    Parameters:
        behaviours (pd.DataFrame): DataFrame containing the behaviors dataset.

    Returns:
        pd.DataFrame: Updated DataFrame with numerical 'user_index'.
    """
    user_mapping = {uid: idx for idx, uid in enumerate(behaviours["user_id"].unique())}
    behaviours["user_index"] = behaviours["user_id"].map(user_mapping)
    return behaviours

def compute_user_avg_clicks(behaviours):
    """
    Computes the average number of clicks per user.

    Parameters:
        behaviours (pd.DataFrame): DataFrame containing the behaviors dataset.

    Returns:
        pd.DataFrame: Updated behaviors dataset with 'user_avg_clicks' feature.
    """
    user_click_counts = behaviours.groupby("user_id")["num_clicks"].sum()
    behaviours["user_avg_clicks"] = behaviours["user_id"].map(user_click_counts)
    return behaviours

def compute_recency(behaviours):
    """
    Computes recency of last interaction by measuring the time since the last recorded interaction.

    Parameters:
        behaviours (pd.DataFrame): DataFrame containing the behaviors dataset.

    Returns:
        pd.DataFrame: Updated behaviors dataset with 'recency' feature.
    """
    max_time = behaviours["time"].max()
    behaviours["recency"] = (max_time - behaviours["time"]).dt.total_seconds() / 3600  # Convert to hours
    return behaviours

In [9]:
def preprocess_behaviours(behaviours):
    """
    Full preprocessing pipeline for the behaviors dataset.

    Parameters:
        behaviours (pd.DataFrame): DataFrame containing the behaviors dataset.

    Returns:
        pd.DataFrame: Fully processed behaviors dataset.
    """
    behaviours = clean_behaviours(behaviours)
    behaviours = convert_time(behaviours)
    behaviours = extract_num_clicks(behaviours)
    behaviours = extract_history_length(behaviours)
    behaviours = split_clicked_nonclicked(behaviours)
    behaviours = convert_user_id(behaviours)
    behaviours = compute_user_avg_clicks(behaviours)
    behaviours = compute_recency(behaviours)

    return behaviours

In [10]:
# Run the full preprocessing pipeline
processed_behaviours = preprocess_behaviours(behaviours)

processed_behaviours.head()

Unnamed: 0,impression_id,user_id,time,history,impressions,hour,num_clicks,history_length,clicked_news,non_clicked_news,user_index,user_avg_clicks,recency
0,1,U87243,2019-11-10 11:30:54,N8668 N39081 N65259 N79529 N73408 N43615 N29379 N32031 N110232 N101921 N12614 N129591 N105760 N60457 N1229 N64932,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N58258-0 N18478-0 N2591-0 N97778-0 N32954-0 N94157-1 N39404-0 N108809-0 N78699-1 N71090-1 N40282-0 N31174-1 N37924-0 N27822-0,11,4,16,"[N94157, N78699, N71090, N31174]","[N78206, N26368, N7578, N58592, N19858, N58258, N18478, N2591, N97778, N32954, N39404, N108809, N40282, N37924, N27822]",0,9,108.484722
1,2,U598644,2019-11-12 13:45:29,N56056 N8726 N70353 N67998 N83823 N111108 N107520 N53168 N78756 N106411 N81824 N85691 N16173 N24446 N120926 N19175 N126159 N26373 N72369 N73228 N111873 N88638 N29503 N31055,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 N21431-0 N106162-0 N59646-0 N118348-0 N25587-1 N37544-0 N20780-0 N36266-1 N46329-0 N64102-0 N120930-0 N62024-0 N11017-0 N76539-0 N26359-0 N108379-0 N87913-0 N125161-0 N23946-0 N83702-0 N62788-0 N6306-0 N66902-0 N93643-0,13,2,24,"[N25587, N36266]","[N47996, N82719, N117066, N8491, N123784, N21431, N106162, N59646, N118348, N37544, N20780, N46329, N64102, N120930, N62024, N11017, N76539, N26359, N108379, N87913, N125161, N23946, N83702, N62788, N6306, N66902, N93643]",1,11,58.241667
2,3,U532401,2019-11-13 11:23:03,N128643 N87446 N122948 N9375 N82348 N129412 N54948 N36094 N44660 N3948 N21332 N81364 N83062 N87788 N59280 N31323,N103852-0 N53474-0 N127836-0 N47925-1,11,1,16,[N47925],"[N103852, N53474, N127836]",2,4,36.615556
3,4,U593596,2019-11-12 12:24:09,N31043 N39592 N4104 N8223 N114581 N92747 N12070 N61321 N40052 N50176 N101119 N76810 N37509,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0 N3061-0 N89658-0 N33030-0 N110473-0 N125655-0 N100206-0 N37544-0 N60370-0 N59935-0 N114935-1 N114135-0 N46262-0 N61065-0 N46329-0 N27368-0 N84381-0 N83412-0 N2624-0 N34269-0 N19053-0 N115724-0 N20990-0 N100380-0 N2074-0 N128367-0 N8719-0 N109344-0 N33078-0 N100539-0 N119665-0 N7937-0 N121580-0 N108581-0 N92498-0 N18947-0 N109642-0 N79048-0 N47277-0 N53225-0 N41710-0 N122819-0 N30417-0 N79044-0 N76539-0 N57763-0 N54239-0 N62203-0,12,1,13,[N114935],"[N38902, N76434, N71593, N100073, N108736, N3061, N89658, N33030, N110473, N125655, N100206, N37544, N60370, N59935, N114135, N46262, N61065, N46329, N27368, N84381, N83412, N2624, N34269, N19053, N115724, N20990, N100380, N2074, N128367, N8719, N109344, N33078, N100539, N119665, N7937, N121580, N108581, N92498, N18947, N109642, N79048, N47277, N53225, N41710, N122819, N30417, N79044, N76539, N57763, N54239, N62203]",3,10,59.597222
4,5,U239687,2019-11-14 20:03:01,N65250 N122359 N71723 N53796 N41663 N41484 N112765 N100765 N87642 N83576 N33881 N110863 N67875 N100466 N36761 N116312 N82374 N93136 N10659 N4857 N32369 N21104 N96120 N50191 N77001 N59349 N66535 N19719 N127567 N96764 N54850 N61319 N113547 N13277 N36604 N32800 N116279 N73179 N124109 N16608 N98215 N6229 N17492 N64552 N91231 N63676 N67779 N82799 N9375 N26586 N48453 N10711 N101263 N21773 N115066 N70097 N18031 N15471 N454 N56610 N112933 N75756 N72571 N41544 N48112 N42209 N44759 N76970 N69300 N108284 N76557 N104961 N46069 N81745 N117275 N12252 N116750 N18171 N87002 N60556 N45946 N20573 N6718 N50901 N3290 N10869 N40259 N14986 N97548 N51302 N39307 N86320 N89086 N29786 N37094 N38627 N51450 N107878 N126611 N57342 N81982 N87654 N3435 N83253 N123097 N113417 N71728 N100119 N57900 N7391 N13604 N71728 N111704 N3565 N53215 N39384 N44431 N129207 N31418 N32534 N124532 N15922 N118298 N19577 N34004 N104234 N103357 N7992 N87859 N80126 N86141 N4289 N127530 N36450 N91985 N9273 N105191 N2927 N49485 N34520 N129412 N75078 N97753 N88892 N79285 N115007 N68429 N56365 N60132 N91985 N79724 N32641 N29709 N36823 N96616 N29709 N46994 N128125 N100655 N18914 N43911 N119601 N43312 N4012 N102366 N92079 N71665 N54177 N43104 N21264 N44671 N98656 N108393 N94450 N75220 N101271 N59645 N55120 N128115 N262 N117511 N70198 N88893 N9892 N9035 N102952 N117229 N65740 N72408 N117791 N80178 N477 N38485 N104029 N93006 N121168 N29466 N64862 N36573 N61712 N49594 N94499 N66959 N11266 N71922 N114335 N24344 N45083 N90679 N100033 N128429 N72135 N3113 N25166 N89408 N106403 N23264 N5967 N12820 N31820 N93575 N51431 N53933 N121845 N25163 N82045 N10996 N104737 N86842 N52677 N50720 N71857 N1118 N71977 N65749 N124752 N18515 N55558 N93359 N72976 N30755 N98095 N89631 N124966 N59447 N65044 N2664 N23666 N98051 N18548 N24446 N113698 N91597 N61321 N29936 N16202 N3188 N108312 N48258 N4266 N91979 N33197 N107039 N70664 N92747 N14032 N74723 N97466 N82926 N3242 N49418 N22773 N110586 N149 N76018 N39270 N34431 N59033 N26572 N78741 N10705 N13901 N89509 N5326 N94706 N77135 N46477 N5653 N114343 N64300 N48084 N91968 N13434 N73137 N1886 N90966 N18904 N1713 N40765 N120031 N21641 N67466 N35667 N10490 N112667 N108016 N41056 N67497 N120196 N50684 N11952 N41214 N31494 N64035 N70868 N73183 N81315 N78570 N88019 N54753 N75342 N73240 N11653 N116736 N127001 N18681 N97815 N51166 N41083 N93681 N38254 N101083 N7416 N126000 N101119 N78377 N80304 N22444 N98657,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N34254-0 N89112-0 N27471-0 N72919-0 N48758-0 N57001-0 N45782-0 N25249-0 N12907-0 N46931-0 N55816-0 N34424-0 N49048-0 N123028-0 N44109-0 N56451-0 N44264-0 N80770-0 N127593-0 N120147-0 N20553-0 N32433-0 N2297-0 N51048-0 N107893-0 N117695-0 N88596-0 N67588-0 N113680-0 N78508-0 N20312-0 N83707-0 N20250-0 N103772-0 N99529-0 N46223-0 N110603-0 N104990-0 N24735-0 N96840-0 N102499-0 N51163-0 N26122-0 N54845-0 N29477-0 N55761-0 N59866-0 N41737-0 N23748-0 N45724-0 N51569-0 N32863-0 N122559-0 N13761-0 N38861-0 N114449-0 N35236-0 N81570-0 N61964-0 N18070-0 N121758-0 N5496-0 N122150-0 N40795-0 N91390-0 N92199-0 N64785-0 N9447-0 N25756-0 N4232-0 N34629-0 N83374-0 N63342-0 N92449-0 N72485-0 N69201-0 N28821-0 N92077-0 N79044-0 N123120-0 N76189-0 N45410-0 N27862-0 N32419-0 N75305-0 N76665-0 N3623-0 N33011-0 N126134-0 N48205-0 N65446-0 N14804-0 N110439-0 N75646-0 N33702-0 N44620-0 N93873-0 N91238-0 N100289-0 N33539-0 N68624-0 N77712-0 N19455-0 N92300-0 N25814-0 N25443-0 N84574-0 N112156-0 N117411-0 N70883-0 N4371-0 N103810-0 N87146-0 N23077-0 N27836-0 N11846-0 N67955-0 N86258-1 N88329-0 N16161-0 N94999-0 N129503-0 N87070-0 N120708-0,20,1,339,[N86258],"[N76209, N48841, N67937, N62235, N6307, N34254, N89112, N27471, N72919, N48758, N57001, N45782, N25249, N12907, N46931, N55816, N34424, N49048, N123028, N44109, N56451, N44264, N80770, N127593, N120147, N20553, N32433, N2297, N51048, N107893, N117695, N88596, N67588, N113680, N78508, N20312, N83707, N20250, N103772, N99529, N46223, N110603, N104990, N24735, N96840, N102499, N51163, N26122, N54845, N29477, N55761, N59866, N41737, N23748, N45724, N51569, N32863, N122559, N13761, N38861, N114449, N35236, N81570, N61964, N18070, N121758, N5496, N122150, N40795, N91390, N92199, N64785, N9447, N25756, N4232, N34629, N83374, N63342, N92449, N72485, N69201, N28821, N92077, N79044, N123120, N76189, N45410, N27862, N32419, N75305, N76665, N3623, N33011, N126134, N48205, N65446, N14804, N110439, N75646, N33702, ...]",4,97,3.949444


### News

In [11]:
def clean_news(news):
    """
    Cleans the news dataset by handling missing values and duplicates.
    
    Parameters:
        news (pd.DataFrame): DataFrame containing the news dataset.
        
    Returns:
        pd.DataFrame: Cleaned news dataset.
    """
    # Remove rows with missing values (articles without an abstract)
    news.dropna(inplace=True)
    
    # Remove duplicate news articles
    news.drop_duplicates(inplace=True)

    return news

def preprocess_text(news):
    """
    Processes text fields in the news dataset by:
    - Splitting concatenated words in Category/SubCategory
    - Converting text to lowercase
    - Removing punctuation
    - Removing stopwords
    
    Parameters:
        news (pd.DataFrame): DataFrame containing the news dataset.
        
    Returns:
        pd.DataFrame: Updated news dataset with processed text.
    """
    def split_and_replace(word):
        return ' '.join(wordninja.split(word))

    # Apply function to Category and SubCategory
    news['category'] = news['category'].apply(split_and_replace)
    news['subcategory'] = news['subcategory'].apply(split_and_replace)

    # Convert text fields to lowercase
    columns_to_lower = ['category', 'subcategory', 'title', 'abstract']
    news[columns_to_lower] = news[columns_to_lower].applymap(lambda x: x.lower() if isinstance(x, str) else x)

    # Create 'Content' column by merging multiple text fields
    news['content'] = news[['category', 'subcategory', 'title', 'abstract']].apply(' '.join, axis=1)

    # Remove punctuation
    news['content'] = news['content'].str.replace(f"[{string.punctuation}]", "", regex=True)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    news['content'] = news['content'].apply(lambda text: " ".join(
        [word for word in text.split() if word.lower() not in stop_words]
    ))

    return news

def extract_word_count(news):
    """
    Adds a new feature 'Content_WC' that represents the word count in the content field.
    
    Parameters:
        news (pd.DataFrame): DataFrame containing the news dataset.
        
    Returns:
        pd.DataFrame: Updated news dataset with word count feature.
    """
    news['content_word_count'] = news['content'].str.split().str.len()
    return news

def extract_entities(news):
    """
    Extracts Wikidata entity IDs from the 'Title Entities' and 'Abstract Entities' columns.
    
    Parameters:
        news (pd.DataFrame): DataFrame containing the news dataset.
        
    Returns:
        pd.DataFrame: Updated news dataset with extracted entity IDs.
    """
    # Convert string representations of lists to actual lists
    news['title_entities'] = news['title_entities'].apply(ast.literal_eval)
    news['abstract_entities'] = news['abstract_entities'].apply(ast.literal_eval)

    # Extract Wikidata IDs from title and abstract entities
    news['title_wikidata_id'] = news['title_entities'].apply(lambda x: ' '.join([d['WikidataId'] for d in x]))
    news['abstract_wikidata_id'] = news['abstract_entities'].apply(lambda x: ' '.join([d['WikidataId'] for d in x]))

    # Combine both into a single column
    news['all_wikidata_ids'] = news['title_wikidata_id'] + ' ' + news['abstract_wikidata_id']
    
    return news

def extract_relations(news):
    """
    Extracts Wikidata relation (property) IDs from the 'Title Entities' and 'Abstract Entities' columns.

    Parameters:
        news (pd.DataFrame): DataFrame containing the news dataset.

    Returns:
        pd.DataFrame: Updated news dataset with extracted relation IDs.
    """
    news['title_relations'] = news['title_entities'].apply(lambda x: ' '.join([d['Type'] for d in x]))
    news['abstract_relations'] = news['abstract_entities'].apply(lambda x: ' '.join([d['Type'] for d in x]))

    # Combine into a single column
    news['all_relation_ids'] = news['title_relations'] + ' ' + news['abstract_relations']
    
    return news

In [12]:
def preprocess_news(news):
    """
    Full preprocessing pipeline for the news dataset.
    
    Parameters:
        news (pd.DataFrame): DataFrame containing the news dataset.
        embedding_file (str): Path to the entity embedding file.
        
    Returns:
        pd.DataFrame: Fully processed news dataset.
    """
    news = clean_news(news)
    news = preprocess_text(news)
    news = extract_word_count(news)
    news = extract_entities(news)
    news = extract_relations(news)

    return news

In [13]:
# Run the full preprocessing pipeline
processed_news = preprocess_news(news)

  news[columns_to_lower] = news[columns_to_lower].applymap(lambda x: x.lower() if isinstance(x, str) else x)


### Entity Embeddings

In [15]:
def compute_entity_vectors(news, entity_embeddings):
    """
    Computes the average embedding vector for each news article based on extracted entity IDs.

    Parameters:
        news (pd.DataFrame): DataFrame containing the news dataset.
        entity_embeddings (dict): Dictionary mapping Wikidata IDs to embedding vectors.

    Returns:
        pd.DataFrame: Updated news dataset with 'average_vector' feature.
    """
    def calculate_average_vector(vector_ids, entity_embeddings):
        """
        Helper function to compute the mean vector for a given list of entity IDs.

        Parameters:
            vector_ids (str): Space-separated string of entity IDs.
            entity_embeddings (dict): Dictionary of pre-loaded entity embeddings.

        Returns:
            np.ndarray or np.nan: Averaged vector if entities exist, otherwise NaN.
        """
        vector_ids = vector_ids.split() if isinstance(vector_ids, str) else []
        vectors = [entity_embeddings.get(entity) for entity in vector_ids if entity in entity_embeddings]

        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.nan  # Keep NaN for missing values

    # Apply function to each row in 'All Wikidata IDs' column
    news['entity_vector'] = news['all_wikidata_ids'].apply(lambda x: calculate_average_vector(x, entity_embeddings))

    return news


In [16]:
# Call the function
processed_news = compute_entity_vectors(processed_news, entity_embeddings)

### Relation Embeddings

In [17]:
def compute_relation_vectors(news, relation_embeddings):
    """
    Computes the average relation embedding vector for each news article.

    Parameters:
        news (pd.DataFrame): DataFrame containing the news dataset.
        relation_embeddings (dict): Dictionary mapping property IDs (Pxxx) to vectors.

    Returns:
        pd.DataFrame: Updated news dataset with 'relation_vector' feature.
    """
    def calculate_relation_vector(relation_ids, relation_embeddings):
        relation_ids = relation_ids.split() if isinstance(relation_ids, str) else []
        vectors = [relation_embeddings.get(r) for r in relation_ids if r in relation_embeddings]

        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.nan  # Keep NaN for missing values

    # Apply function to each row in 'All Relation IDs' column
    news['relation_vector'] = news['all_relation_ids'].apply(lambda x: calculate_relation_vector(x, relation_embeddings))

    return news

In [18]:
# Call the function
processed_news = compute_relation_vectors(processed_news, relation_embeddings)

In [19]:
processed_news.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,content,content_word_count,title_wikidata_id,abstract_wikidata_id,all_wikidata_ids,title_relations,abstract_relations,all_relation_ids,entity_vector,relation_vector
0,N88753,lifestyle,lifestyle royals,"the brands queen elizabeth, prince charles, and prince philip swear by","shop the notebooks, jackets, and more that the royals can't live without.",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{'Label': 'Prince Philip, Duke of Edinburgh', 'Type': 'P', 'WikidataId': 'Q80976', 'Confidence': 1.0, 'OccurrenceOffsets': [48], 'SurfaceForms': ['Prince Philip']}, {'Label': 'Charles, Prince of Wales', 'Type': 'P', 'WikidataId': 'Q43274', 'Confidence': 1.0, 'OccurrenceOffsets': [28], 'SurfaceForms': ['Prince Charles']}, {'Label': 'Elizabeth II', 'Type': 'P', 'WikidataId': 'Q9682', 'Confidence': 0.97, 'OccurrenceOffsets': [11], 'SurfaceForms': ['Queen Elizabeth']}]",[],lifestyle lifestyle royals brands queen elizabeth prince charles prince philip swear shop notebooks jackets royals cant live without,18,Q80976 Q43274 Q9682,,Q80976 Q43274 Q9682,P P P,,P P P,"[0.0040573333, -0.03991733, -0.008374, 0.07914233, -0.020229666, -0.022212999, 0.040847663, 0.0014350001, 0.06182866, -0.031630997, 0.023040334, 0.04310767, 0.023592, -0.061693, 0.015099, -0.079108335, 0.028382666, 0.106527664, -0.008534334, -0.019020667, 0.011595999, -0.092747994, 0.07943733, -0.082109995, -0.070289664, -0.104519, 0.011144333, -0.009591, 0.044645667, -0.081025, -0.063390665, -0.020261666, 0.081290334, 0.031477336, -0.07450067, 0.012981, -0.0034046664, 0.010823333, -0.065875, 0.017982, -0.055929665, 0.08072767, -0.07246833, 0.025114998, -0.110455334, 0.041048, -0.011875999, -0.00434, -0.0027070008, -0.0019773333, 0.022897668, 0.066106, 0.010144333, 0.04570767, 0.044172335, 0.07795266, 0.014355333, 0.07451, -0.03834967, -0.011252667, 0.09210467, -0.077347666, 0.035256002, -0.058462333, 0.061819002, 0.08261534, 0.029722666, 0.029846, -0.035560668, 0.052448332, -0.0043619997, 0.026619999, -0.054742664, 0.00033166757, -0.019636666, -0.0045693335, 0.033881333, -0.021329336, 0.0013953336, 0.082415, 0.069087334, -0.0113659995, -0.00054833293, 0.013892, -0.011874001, -0.063751005, 0.07637099, 0.037085667, -0.0019800004, -0.027107665, -0.073342, 0.0146160005, 0.055785332, 0.024075666, 0.026349334, -0.052181665, 0.046933997, -0.024919666, 0.07169867, -0.036127668]",
1,N45436,news,news science and technology,walmart slashes prices on last-generation ipads,apple's new ipad releases bring big deals on last year's models.,https://assets.msn.com/labs/mind/AABmf2I.html,"[{'Label': 'IPad', 'Type': 'J', 'WikidataId': 'Q2796', 'Confidence': 0.999, 'OccurrenceOffsets': [42], 'SurfaceForms': ['iPads']}, {'Label': 'Walmart', 'Type': 'O', 'WikidataId': 'Q483551', 'Confidence': 1.0, 'OccurrenceOffsets': [0], 'SurfaceForms': ['Walmart']}]","[{'Label': 'IPad', 'Type': 'J', 'WikidataId': 'Q2796', 'Confidence': 0.999, 'OccurrenceOffsets': [12], 'SurfaceForms': ['iPad']}, {'Label': 'Apple Inc.', 'Type': 'O', 'WikidataId': 'Q312', 'Confidence': 0.999, 'OccurrenceOffsets': [0], 'SurfaceForms': ['Apple']}]",news news science technology walmart slashes prices lastgeneration ipads apples new ipad releases bring big deals last years models,19,Q2796 Q483551,Q2796 Q312,Q2796 Q483551 Q2796 Q312,J O,J O,J O J O,"[0.011153751, 0.0022815005, 0.033238, -0.0060794977, -0.0118165, -0.0076985005, 0.02201475, -0.0164965, -0.01626675, 0.085608505, 0.0712205, 0.091341, 0.066110745, 0.016277503, -0.03425325, -0.020384248, -0.029653748, 0.05821325, -0.0339095, -0.048551247, -0.04525, -0.0260525, 0.0038565006, -0.025450751, 0.011129252, -0.01636625, -0.040598, -0.0368445, 0.029721, 0.027449999, -0.035932, 0.04201025, 0.03802625, 0.02801875, 0.03152425, -0.082879, 0.030481748, -0.068886, -0.04187625, 0.043128, -0.07453974, 0.07487525, 0.0071719997, 0.03655, -0.0570635, -0.09142325, 0.0098495, 0.037899252, -0.0417815, 0.083532505, -0.0044905003, 0.03357475, -0.00652825, 0.078316, 0.00019525003, 0.0044232495, -0.047806002, -0.018328, 0.020638, 0.0199645, -0.055554498, 0.08263625, 0.02033625, 0.0599715, 0.028447751, -0.02271225, -0.080792, -0.05614975, 0.0227995, -0.03574375, -0.04488375, 0.019835997, 0.056202497, 0.078902245, -0.13049924, -0.03696525, 0.0008089999, -0.013265001, 0.0336465, 0.039681498, 0.03683575, -0.006379, -0.014397751, 0.0501765, 0.0016087494, -0.061637748, 0.0369365, 0.0082395, 0.045153752, -0.0655035, 0.0232625, -0.0011995006, -0.026635502, -0.05911775, 0.03674875, -0.082116, 0.051999748, -0.039464504, 0.016650751, -0.015847249]",
2,N23144,health,weight loss,50 worst habits for belly fat,these seemingly harmless habits are holding you back and keeping you from shedding that unwanted belly fat for good.,https://assets.msn.com/labs/mind/AAB19MK.html,"[{'Label': 'Adipose tissue', 'Type': 'C', 'WikidataId': 'Q193583', 'Confidence': 1.0, 'OccurrenceOffsets': [20], 'SurfaceForms': ['Belly Fat']}]","[{'Label': 'Adipose tissue', 'Type': 'C', 'WikidataId': 'Q193583', 'Confidence': 1.0, 'OccurrenceOffsets': [97], 'SurfaceForms': ['belly fat']}]",health weight loss 50 worst habits belly fat seemingly harmless habits holding back keeping shedding unwanted belly fat good,19,Q193583,Q193583,Q193583 Q193583,C,C,C C,"[-0.013597, -0.009758, 0.01712, -0.051993, 0.037963, 0.045238, 0.077176, -0.033402, 0.032126, 0.000231, -0.041168, 0.006686, -0.042137, 0.003012, -0.007492, 0.004021, -0.044446, 0.060362, 0.036527, 0.000915, -0.0881, 0.013638, 0.00664, 0.023534, -0.006483, -0.030089, -0.01769, -0.076201, 0.010523, -0.015482, -0.029464, -0.000577, 0.011122, 0.020931, 0.019831, -0.020024, 0.046285, -0.087435, -0.014035, -0.032755, 0.000882, 0.058146, -0.063754, -0.01612, -0.028519, -0.006055, 0.043857, 0.012439, 0.008123, 0.068035, -0.025732, -0.082524, 0.012157, 0.005102, 0.050669, -0.029057, -0.097563, 0.01637, 0.104147, -0.013962, -0.085338, 0.002964, 0.002371, 0.033141, -0.094885, -0.041572, -0.083391, -0.017752, -0.021868, 0.003052, 0.065026, 0.018371, -0.009355, 0.078348, 0.028948, -0.043366, 0.007608, 0.031137, 0.054513, 0.030671, -0.021131, -0.048048, -0.013872, -0.000741, -0.059595, -0.109701, 0.046302, 0.016114, 0.05821, -0.015325, -0.085929, -0.059981, 0.004588, -0.028985, -0.059973, -0.035562, 0.106053, -0.10042, 0.051723, 0.001144]",
4,N93187,news,news world,the cost of trump's aid freeze in the trenches of ukraine's war,"lt. ivan molchanets peeked over a parapet of sand bags at the front line of the war in ukraine. next to him was an empty helmet propped up to trick snipers, already perforated with multiple holes.",https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{'Label': 'Ukraine', 'Type': 'G', 'WikidataId': 'Q212', 'Confidence': 0.946, 'OccurrenceOffsets': [87], 'SurfaceForms': ['Ukraine']}]",news news world cost trumps aid freeze trenches ukraines war lt ivan molchanets peeked parapet sand bags front line war ukraine next empty helmet propped trick snipers already perforated multiple holes,31,,Q212,Q212,,G,G,"[-0.065324, -0.088163, -0.015203, -0.031949, 0.091263, -0.228807, -0.005629, -0.199308, 0.158042, 0.157506, 0.112573, 0.11353, -0.05826, -0.000931, -0.182976, 0.095299, 0.011588, 0.081224, 0.041808, -0.114242, 0.050847, 0.000613, -0.025367, 0.104747, -0.019483, -0.044524, 0.014938, -0.020598, 0.074194, 0.143272, -0.090134, 0.024378, 0.016504, 0.073597, -0.000753, 0.014563, -0.086061, -0.097182, -0.176203, 0.091886, -0.090757, -0.057736, 0.070559, -0.00314, -0.208355, 0.07014, -0.089793, 0.102981, -0.055766, 0.080064, -0.171543, 0.084593, -0.021262, -0.070932, 0.094243, 0.094618, -0.060624, -0.103646, 0.016896, -0.022969, -0.06224, 0.083993, 0.050859, 0.170461, -0.040692, -0.231475, -0.054997, -0.139666, -0.182166, -0.154107, -0.103189, -0.191087, 0.114531, 0.107967, 0.049184, -0.003082, -0.130202, 0.236282, 0.037841, -0.041989, 0.095108, -8.8e-05, 0.004326, 0.030816, -0.018787, -0.009449, -0.073514, 0.034427, -0.075651, -0.108221, -0.147075, -0.088895, -0.102027, -0.058871, -0.156666, -0.099688, 0.020397, -0.086153, -0.046413, 0.066968]",
5,N75236,health,voices,i was an nba wife. here's how it affected my mental health.,"i felt like i was a fraud, and being an nba wife didn't help that. in fact, it nearly destroyed me.",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{'Label': 'National Basketball Association', 'Type': 'O', 'WikidataId': 'Q155223', 'Confidence': 1.0, 'OccurrenceOffsets': [40], 'SurfaceForms': ['NBA']}]",health voices nba wife heres affected mental health felt like fraud nba wife didnt help fact nearly destroyed,18,,Q155223,Q155223,,O,O,"[0.003752, -0.061771, -0.037073, 0.02677, -0.090658, 0.012813, -0.092285, 0.074664, -0.066141, 0.035619, 0.001939, 0.118046, 0.106249, -0.11127, -0.00702, 0.034989, 0.029654, 0.008367, -0.050007, -0.030704, -0.065894, 0.08123, -0.04539, -0.040012, 0.090774, 0.067699, -0.028125, -0.026579, -0.079355, 0.059613, 0.020613, 0.042591, 0.009983, -0.037203, 0.038242, -0.09384, -0.154576, -0.004471, -0.140145, -0.110083, 0.083281, 0.048921, 0.051276, 0.108326, 0.007419, -0.085701, -0.047435, 0.114793, -0.048882, -0.002836, 0.038695, -0.034756, -0.000584, 0.052104, -0.003848, 0.034386, 0.05696, -0.023321, -0.03635, -0.004535, 0.006874, 0.076202, -0.050935, 0.000466, -0.039141, -0.017551, -0.077054, -0.000394, -0.054463, -0.058872, -0.053088, 0.023609, 0.09061, 0.046973, -0.114502, -0.05578, -0.056306, 0.013576, -0.008754, 0.129473, 0.010713, 0.107132, 0.053372, 0.049199, 0.025105, -0.046849, -0.022099, -0.007698, 0.069537, -0.060824, -0.007802, 0.051335, 0.028594, -0.113466, -0.007955, -0.051726, 0.01268, -0.058242, -0.035847, 0.010902]",


### Export Processed Data to Parquet

In [None]:
# Save to Parquet format
processed_news.to_parquet("processed_news.parquet", index=False)
processed_behaviours.to_parquet("processed_behaviours.parquet", index=False)