# MIND Dataset Preprocessing

## Libraries, Imports, Setup

In [1]:
import pandas as pd
import numpy as np
import random
import string
import ast
import wordninja
import warnings
from scipy import sparse
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christopherstephan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/christopherstephan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading Datasets

In [None]:
# Creating the path to the training dataset
behaviours_train_val_path = "/Users/christopherstephan/Documents/IE/Term 3/CAPSTONE/Datasets/MINDlarge_train/behaviors.tsv"  
news_train_val_path = "/Users/christopherstephan/Documents/IE/Term 3/CAPSTONE/Datasets/MINDlarge_train/news.tsv"

#to the testing dataset
behaviours_test_path = "/Users/christopherstephan/Documents/IE/Term 3/CAPSTONE/Datasets/MINDlarge_dev/behaviors.tsv"  
news_test_path = "/Users/christopherstephan/Documents/IE/Term 3/CAPSTONE/Datasets/MINDlarge_dev/news.tsv"

# Loading TSV file from the specified path
behaviours_train_val = pd.read_csv(behaviours_train_val_path, sep="\t", header=None)
news_train = pd.read_csv(news_train_val_path, sep="\t", header=None)

#test files
behaviours_test = pd.read_csv(behaviours_test_path, sep="\t", header=None)
news_test = pd.read_csv(news_test_path, sep="\t", header=None)

In [None]:
# Define the Column Names for the datasets
behaviours_train_val.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news_train.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

behaviours_test.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news_test.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

In [4]:
# Path to files
entity_embedding_path = "/Users/christopherstephan/Documents/IE/Term 3/CAPSTONE/Datasets/MINDlarge_train/entity_embedding.vec"  
relation_embedding_path = "/Users/christopherstephan/Documents/IE/Term 3/CAPSTONE/Datasets/MINDlarge_train/relation_embedding.vec"

with open(entity_embedding_path, "r") as f:
    for _ in range(5):  # Read first 5 lines
        print(f.readline())

with open(relation_embedding_path, "r") as f:
    for _ in range(5):  # Read first 5 lines
        print(f.readline())

Q34433	0.017808	-0.073256	0.102521	-0.059926	-0.060665	0.027027	-0.091728	-0.003057	-0.170798	0.111819	0.006821	-0.049873	-0.050532	-0.003127	-0.074472	-0.115891	-0.067093	-0.095272	0.019178	-0.083725	-0.060890	0.017744	0.049417	-0.026014	-0.048549	0.017528	0.044163	0.022111	-0.081519	0.046278	-0.183939	-0.063143	-0.014518	-0.080644	-0.099994	0.085905	-0.083003	-0.092844	-0.216481	0.125441	0.179819	0.036735	-0.085375	0.021276	-0.154971	0.039009	0.016059	0.067725	-0.148213	0.158773	-0.028527	0.125790	0.006361	0.067541	0.077552	0.060792	-0.044511	-0.005862	-0.068080	-0.063204	-0.094127	0.115441	-0.016472	0.106616	0.047839	-0.151805	-0.111083	-0.142330	-0.120680	-0.050393	-0.073787	0.017424	-0.081620	0.062599	-0.022102	-0.102688	-0.128149	-0.075895	0.095134	0.000984	0.010143	-0.068552	-0.026573	0.019735	-0.000981	-0.126635	0.008300	0.170557	0.002250	-0.157175	-0.077962	0.013433	0.045894	-0.071253	0.086445	-0.120466	0.059235	-0.071865	0.058854	0.024765	

Q41	-0.063388	-0.181451	0.057501	-0

In [5]:
# Load entity embeddings
entity_embeddings = {}
with open(entity_embedding_path, "r") as f:
    for line in f:
        values = line.strip().split()
        entity = values[0]  # First value is entity name
        vector = np.array(values[1:], dtype=np.float32)  # Rest are vector values
        entity_embeddings[entity] = vector


# Load relation embeddings
relation_embeddings = {}
with open(relation_embedding_path, "r") as f:
    for line in f:
        values = line.strip().split()
        entity = values[0]  # First value is entity name
        vector = np.array(values[1:], dtype=np.float32)  # Rest are vector values
        relation_embeddings[entity] = vector

## Train, Val, Test

In [None]:
# Convert the time column to datetime format
behaviours_train_val["time"] = pd.to_datetime(behaviours_train_val["time"], format="%m/%d/%Y %I:%M:%S %p")
behaviours_test["time"] = pd.to_datetime(behaviours_test["time"], format="%m/%d/%Y %I:%M:%S %p")

In [None]:
# create a function to split data into train and val
def split_behaviours(behaviours_train_val, train_ratio=0.82):
    # Ensure the DataFrame is sorted by time before splitting
    behaviours_train_val = behaviours_train_val.sort_values(by='time')  # Replace 'timestamp' with the actual time column
    
    # Calculate split index
    split_index = int(len(behaviours_train_val) * train_ratio)

    # Split the dataset
    behaviours_train = behaviours_train_val.iloc[:split_index]
    behaviours_val = behaviours_train_val.iloc[split_index:]

    return behaviours_train, behaviours_val

# Example usage
# behaviours = pd.read_csv("your_data.csv")  # Load your dataset
behaviours_train, behaviours_val = split_behaviours(behaviours_train_val)

# Displaying the split sizes
print(f"Training set size: {len(behaviours_train)}, Validation set size: {len(behaviours_val)}, and the Test set size: {len(behaviours_test)}")

In [None]:
print(behaviours_train['time'].min())
print(behaviours_train['time'].max())
print(behaviours_val['time'].min())
print(behaviours_val['time'].max())
print(behaviours_test['time'].min())
print(behaviours_test['time'].max())

## Cleaning, Preprocessing, Feature Engineering

### Behaviours

In [None]:
def clean_behaviours(behaviours):
    """
    Cleans the behaviors dataset by handling missing values and removing duplicates.
    """
    behaviours["history"].fillna("No_History", inplace=True)
    behaviours.drop_duplicates(inplace=True)
    return behaviours

def convert_time(behaviours):
    """
    Converts the 'Time' column to datetime format and extracts the hour.
    """
    behaviours["time"] = pd.to_datetime(behaviours["time"], format="%m/%d/%Y %I:%M:%S %p", errors="coerce")
    behaviours["hour"] = behaviours["time"].dt.hour
    return behaviours

def extract_num_clicks(behaviours):
    """
    Extracts the number of clicks per session based on the 'Impressions' column.
    """
    def count_clicks(impressions):
        if not isinstance(impressions, str) or impressions.strip() == "":
            return 0
        return sum([int(i.split('-')[1]) for i in impressions.split() if '-' in i])
    
    behaviours["num_clicks"] = behaviours["impressions"].apply(count_clicks)
    return behaviours

def extract_history_length(behaviours):
    """
    Extracts the length of the user's reading history.
    """
    behaviours["history_length"] = behaviours["history"].apply(lambda x: 0 if x == "No_History" else len(x.split()))
    return behaviours

def split_clicked_nonclicked(behaviours):
    """
    Splits the impressions column into separate clicked and non-clicked news lists.
    """
    def split_impressions(impressions):
        clicked, non_clicked = [], []
        if isinstance(impressions, str) and impressions.strip():
            for item in impressions.split():
                parts = item.split('-')
                if len(parts) == 2:
                    news_id, clicked_status = parts
                    if clicked_status == "1":
                        clicked.append(news_id)
                    else:
                        non_clicked.append(news_id)
        return clicked, non_clicked

    behaviours[["clicked_news", "non_clicked_news"]] = behaviours["impressions"].apply(lambda x: pd.Series(split_impressions(x)))
    return behaviours

def convert_user_id(behaviours, user_mapping=None):
    """
    Converts 'User ID' into a numerical index for easier model use.
    """
    if user_mapping is None:
        user_mapping = {uid: idx for idx, uid in enumerate(behaviours["user_id"].unique())}
    behaviours["user_index"] = behaviours["user_id"].map(user_mapping)
    return behaviours, user_mapping

def compute_user_avg_clicks(behaviours):
    """
    Computes the average number of clicks per user.
    """
    user_click_counts = behaviours.groupby("user_id")["num_clicks"].sum()
    behaviours["user_avg_clicks"] = behaviours["user_id"].map(user_click_counts)
    return behaviours

def compute_recency(behaviours):
    """
    Computes recency of last interaction by measuring the time since the last recorded interaction.
    """
    max_time = behaviours["time"].max()
    behaviours["recency"] = (max_time - behaviours["time"]).dt.total_seconds() / 3600
    return behaviours

In [None]:
def preprocess_behaviours(behaviours_train, behaviours_val, behaviours_test):
    """
    Full preprocessing pipeline for the behaviors dataset.
    """
    for dataset in [behaviours_train, behaviours_val, behaviours_test]:
        dataset = clean_behaviours(dataset)
        dataset = convert_time(dataset)
        dataset = extract_num_clicks(dataset)
        dataset = extract_history_length(dataset)
        dataset = split_clicked_nonclicked(dataset)
        dataset = compute_user_avg_clicks(dataset)
        dataset = compute_recency(dataset)
    
    behaviours_train, user_mapping = convert_user_id(behaviours_train)
    behaviours_val, _ = convert_user_id(behaviours_val, user_mapping)
    behaviours_test, _ = convert_user_id(behaviours_test, user_mapping)
    
    return behaviours_train, behaviours_val, behaviours_test

In [None]:
# Run the full preprocessing pipeline
processed_behaviours_train, processed_behaviours_val, processed_behaviours_test  = preprocess_behaviours(behaviours_train, behaviours_val, behaviours_test)

In [None]:
processed_behaviours_test.head(3)

### News

In [None]:
def clean_news(news):
    """
    Cleans the news dataset by handling missing values and duplicates.
    """
    news.dropna(inplace=True)
    news.drop_duplicates(inplace=True)
    return news

def preprocess_text(news):
    """
    Processes text fields in the news dataset by:
    - Splitting concatenated words in Category/SubCategory
    - Converting text to lowercase
    - Removing punctuation
    - Removing stopwords
    """
    def split_and_replace(word):
        return ' '.join(wordninja.split(word))

    news['category'] = news['category'].apply(split_and_replace)
    news['subcategory'] = news['subcategory'].apply(split_and_replace)
    
    columns_to_lower = ['category', 'subcategory', 'title', 'abstract']
    news[columns_to_lower] = news[columns_to_lower].applymap(lambda x: x.lower() if isinstance(x, str) else x)
    
    news['content'] = news[['category', 'subcategory', 'title', 'abstract']].apply(' '.join, axis=1)
    news['content'] = news['content'].str.replace(f"[{string.punctuation}]", "", regex=True)
    
    stop_words = set(stopwords.words('english'))
    news['content'] = news['content'].apply(lambda text: " ".join(
        [word for word in text.split() if word.lower() not in stop_words]
    ))
    
    return news

def extract_word_count(news):
    """
    Adds a new feature 'Content_WC' that represents the word count in the content field.
    """
    news['content_word_count'] = news['content'].str.split().str.len()
    return news

def extract_entities(news):
    """
    Extracts Wikidata entity IDs from the 'Title Entities' and 'Abstract Entities' columns.
    """
    news['title_entities'] = news['title_entities'].apply(ast.literal_eval)
    news['abstract_entities'] = news['abstract_entities'].apply(ast.literal_eval)
    
    news['title_wikidata_id'] = news['title_entities'].apply(lambda x: ' '.join([d['WikidataId'] for d in x]))
    news['abstract_wikidata_id'] = news['abstract_entities'].apply(lambda x: ' '.join([d['WikidataId'] for d in x]))
    
    news['all_wikidata_ids'] = news['title_wikidata_id'] + ' ' + news['abstract_wikidata_id']
    
    return news

def extract_relations(news):
    """
    Extracts Wikidata relation (property) IDs from the 'Title Entities' and 'Abstract Entities' columns.
    """
    news['title_relations'] = news['title_entities'].apply(lambda x: ' '.join([d['Type'] for d in x]))
    news['abstract_relations'] = news['abstract_entities'].apply(lambda x: ' '.join([d['Type'] for d in x]))
    
    news['all_relation_ids'] = news['title_relations'] + ' ' + news['abstract_relations']
    
    return news

In [None]:
def preprocess_news(news_train, news_test):
    """
    Full preprocessing pipeline for the news dataset.
    """
    for dataset in [news_train, news_test]:
        dataset = clean_news(dataset)
        dataset = preprocess_text(dataset)
        dataset = extract_word_count(dataset)
        dataset = extract_entities(dataset)
        dataset = extract_relations(dataset)
    
    return news_train, news_test

In [None]:
# Run the full preprocessing pipeline
processed_news_train, processed_news_test = preprocess_news(news_train, news_test)

  news[columns_to_lower] = news[columns_to_lower].applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [None]:
processed_news_train.head(3)

### Entity Embeddings

In [15]:
def compute_entity_vectors(news, entity_embeddings):
    """
    Computes the average embedding vector for each news article based on extracted entity IDs.

    Parameters:
        news (pd.DataFrame): DataFrame containing the news dataset.
        entity_embeddings (dict): Dictionary mapping Wikidata IDs to embedding vectors.

    Returns:
        pd.DataFrame: Updated news dataset with 'average_vector' feature.
    """
    def calculate_average_vector(vector_ids, entity_embeddings):
        """
        Helper function to compute the mean vector for a given list of entity IDs.

        Parameters:
            vector_ids (str): Space-separated string of entity IDs.
            entity_embeddings (dict): Dictionary of pre-loaded entity embeddings.

        Returns:
            np.ndarray or np.nan: Averaged vector if entities exist, otherwise NaN.
        """
        vector_ids = vector_ids.split() if isinstance(vector_ids, str) else []
        vectors = [entity_embeddings.get(entity) for entity in vector_ids if entity in entity_embeddings]

        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.nan  # Keep NaN for missing values

    # Apply function to each row in 'All Wikidata IDs' column
    news['entity_vector'] = news['all_wikidata_ids'].apply(lambda x: calculate_average_vector(x, entity_embeddings))

    return news


In [None]:
# Call the function
processed_news_train = compute_entity_vectors(processed_news_train, entity_embeddings)
processed_news_test = compute_entity_vectors(processed_news_test, entity_embeddings)

### Relation Embeddings

In [17]:
def compute_relation_vectors(news, relation_embeddings):
    """
    Computes the average relation embedding vector for each news article.

    Parameters:
        news (pd.DataFrame): DataFrame containing the news dataset.
        relation_embeddings (dict): Dictionary mapping property IDs (Pxxx) to vectors.

    Returns:
        pd.DataFrame: Updated news dataset with 'relation_vector' feature.
    """
    def calculate_relation_vector(relation_ids, relation_embeddings):
        relation_ids = relation_ids.split() if isinstance(relation_ids, str) else []
        vectors = [relation_embeddings.get(r) for r in relation_ids if r in relation_embeddings]

        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.nan  # Keep NaN for missing values

    # Apply function to each row in 'All Relation IDs' column
    news['relation_vector'] = news['all_relation_ids'].apply(lambda x: calculate_relation_vector(x, relation_embeddings))

    return news

In [None]:
# Call the function
processed_news_train = compute_relation_vectors(processed_news_train, relation_embeddings)
processed_news_test = compute_relation_vectors(processed_news_test, relation_embeddings)

In [None]:
processed_news_train.head(1)

### Export Processed Data to Parquet

In [None]:
# Save to Parquet format
processed_news_train.to_parquet("processed_news_train.parquet", index=False)
processed_news_test.to_parquet("processed_news_test.parquet", index=False)
processed_behaviours_train.to_parquet("processed_behaviours_train.parquet", index=False)
processed_behaviours_val.to_parquet("processed_behaviours_val.parquet", index=False)
processed_behaviours_test.to_parquet("processed_behaviours_test.parquet", index=False)