# Imports and Preparing the data of behavior.tsv and news.tsv #

In [1]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd
import re
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.decomposition import IncrementalPCA, PCA
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage
from collections import defaultdict


# Load the data
base_path = Path.cwd() / 'data'
news_path = base_path / 'news.tsv'
behaviors_path = base_path / 'behaviors.tsv'

news = pd.read_csv(news_path, sep='\t', names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'])
behaviors = pd.read_csv(behaviors_path, sep='\t', names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

# Preprocessing with Data Cleaning Remove duplicates and Text cleaning

In [2]:
# Data Cleaning
# Check if attributes such as Category, Subcategory, Title, and Abstract in news.tsv are complete.
# Remove news items with many missing values or replace them with:
# Category: unknown
# Subcategory: general
# Title and Abstract: a placeholder text like "Missing Data."
# For behaviors.tsv, remove users with missing or empty history.
news.fillna({'category': 'unknown', 'subcategory': 'general', 'title': 'Missing Title', 'abstract': 'Missing Abstract'}, inplace=True)
news.dropna(subset=['category', 'subcategory', 'title', 'abstract'], inplace=True)
# behaviors.dropna(subset=['history', 'impressions'], inplace=True)
behaviors.dropna(subset=['history'], inplace=True)

# Remove duplicates
# Remove news items in news.tsv that have the same values for Title and Abstract.
# Remove duplicate impressions (Impression ID) in behaviors.tsv
news.drop_duplicates(subset=['title', 'abstract'], inplace=True)
#behaviors.drop_duplicates(subset=['impression_id'], inplace=True)

# Text cleaning
# Break down Title and Abstract into tokens:
# Remove special characters, numbers, and HTML.
# Convert all words to lowercase.
# Remove stopwords (e.g., using nltk or spacy).
# Perform lemmatization to reduce words to their base form.
def clean_text(text):
    # Clean text by removing special characters, numbers, and converting to lowercase
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'d", " would", text)
    text = re.sub(r"'s", " is", text)
    
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text.lower()

def remove_stopwords_and_lemmatize(text):
    # Remove stopwords and perform basic stemming/lemmatization
    stopwords = set([
        'the', 'and', 'is', 'in', 'to', 'of', 'a', 'an', 'on', 'for', 'with', 'as', 'by', 'at', 'from', 'this',
        'that', 'it', 'or', 'but', 'not', 'be', 'are', 'was', 'were', 'can', 'will', 'would', 'should', 'has', 'have',
        'had', 'do', 'does', 'did', 'which', 'if', 'then', 'than', 'so', 'such', 'there', 'about', 'into', 'over',
        'after'
    ])

    def advanced_lemmatization(word):
        # Remove common suffixes like 'ed', 'ing', 'es', 'er', 'ly'
        suffixes = ['ed', 'ing', 'es', 'er', 'ly', 'able', 'ness']
        for suffix in suffixes:
            if word.endswith(suffix):
                return word[:-len(suffix)]
        return word

    # List of simple verbs
    irregular_verbs = {
        'ran': 'run',
        'ate': 'eat',
        'wrote': 'write',
        'went': 'go',
        'saw': 'see',
        'had': 'have',
        'was': 'be',
        'were': 'be'
    }

    def lemmatize_irregular_verbs(word):
        # Lemmatisiere unregelmäßige Verben
        return irregular_verbs.get(word, word)

    words = text.split()
    lemmatized = []

    for word in words:
        # Outsort verbs
        word = lemmatize_irregular_verbs(word)
        # Remove suffix
        word = advanced_lemmatization(word)
        lemmatized.append(word)

    return " ".join(word for word in lemmatized if word not in stopwords)

def preprocess_text(text):
    # Full preprocessing pipeline: clean text, remove stopwords, and lemmatize
    cleaned_text = clean_text(text)
    return remove_stopwords_and_lemmatize(cleaned_text)


def preprocess_text_parallel(texts):
    processed_texts = [preprocess_text(text) for text in texts]
    return processed_texts


# Clean Title and Abstract
news['clean_title'] = preprocess_text_parallel(news['title'])
news['clean_abstract'] = preprocess_text_parallel(news['abstract'])

# Data Preparation with TF-IDF, One-hot and PCA

In [3]:
#TF-IDF
# Convert clean_title and clean_abstract into numerical vectors
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_title = tfidf_vectorizer.fit_transform(news['clean_title'])
tfidf_abstract = tfidf_vectorizer.fit_transform(news['clean_abstract'])

# Combine sparse matrices
news_features = hstack([tfidf_title, tfidf_abstract])

# One-hot encode categories and subcategories
category_encoded = pd.get_dummies(news['category'])
subcategory_encoded = pd.get_dummies(news['subcategory'])

# Combine categorical and TF-IDF features
final_features = hstack([news_features, category_encoded.values, subcategory_encoded.values])

# PCA for faster processing instead of t-SNE for better visualization
pca = IncrementalPCA(n_components=50, batch_size=1000)
reduced_features = pca.fit_transform(news_features)

# KMeans Visualization with PCA

In [4]:
# Clustering with KMeans
optimal_k = 3

kmeans = KMeans(n_clusters=optimal_k, random_state=42)
news['cluster'] = kmeans.fit_predict(reduced_features)
news

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,clean_title,clean_abstract,cluster
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],brands queen elizabeth prince charl prince phi...,shop notebooks jackets more royals ca live wit...,1
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",worst habits bel fat,these seeming harmless habits hold you back ke...,0
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",cost trump aid freeze trench ukraine war,lt ivan molchanets peek ov parapet sand bags f...,1
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",i nba wife here how affect my mental health,i felt like i fraud nba wife help fact near de...,1
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",how get rid skin tags accord dermatologist,they seem harmless very good reason you ignore...,0
...,...,...,...,...,...,...,...,...,...,...,...
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...,https://assets.msn.com/labs/mind/BBWzQJK.html,"[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...","[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...",adapt learn soul search reflect woolsey fire,woolsey fire anniversary community forev chang...,1
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,Missing Abstract,https://assets.msn.com/labs/mind/BBWzQYV.html,"[{""Label"": ""Broadway theatre"", ""Type"": ""F"", ""W...",[],fami says year old broadway star di massive as...,miss abstract,2
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b...",https://assets.msn.com/labs/mind/BBWzQnK.html,[],[],st dominic socc play tri kick canc curb,sometim what happens sidelin even more importa...,1
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ...",https://assets.msn.com/labs/mind/BBWzQuK.html,"[{""Label"": ""MLS Cup"", ""Type"": ""U"", ""WikidataId...",[],how sounders won mls cup,mark jeremiah casey excit they postgame podcast,1


# Start with content based recommendation

In [5]:
# copy_news = news.copy()
# copy_news["cluster"]

# def find_cluster(news_df, news_id):
#     result = news_df[news_df['news_id'] == news_id]['cluster']
#     if not result.empty:
#         return result.iloc[0]  # Return the cluster value
#     else:
#         return None  # Return None if news_id not found
# #
# # Example usage
# news_id_to_find = "some_news_id"
# cluster = find_cluster(copy_news, "N41777")
# print(f"The cluster for news_id {news_id_to_find} is: {cluster}")



In [6]:
# print(copy_news[copy_news['news_id'] == "N41777"])

# Data Pre-Processing

In [7]:
# Split 'history' into lists of baskets based on ids
# Group by user and get rid of duplicates in the history
user_histories = []
user_data_for_df = []
for user_id, user_data in behaviors.groupby('user_id'):
    one_history_string = " ".join(user_data['history'])
    splitted_without_duplicates = set(one_history_string.split())
    articles_list_per_user = list(splitted_without_duplicates)
    user_histories.append(articles_list_per_user)
    user_data_for_df.append([user_id, articles_list_per_user])

In [8]:
news_title_dict = dict(zip(news['news_id'], news['title']))

# Count article reads and sort after highest count

In [9]:
# Count occurence of each article
articles_count = {}

for history in user_histories:
    for article in history:
        if article in articles_count:
            articles_count[article] += 1
        else:
            articles_count[article] = 1

# Append cluster to articles count and sort clusters
cluster_dict = defaultdict(list)
for _, row in news.iterrows():
    news_id = row['news_id']
    cluster = row['cluster']
    
    if news_id in articles_count:
        cluster_dict[cluster].append((articles_count[news_id], news_id))

for cluster in cluster_dict:
    cluster_dict[cluster] = sorted(cluster_dict[cluster], key=lambda x: x[0], reverse=True)

top_sorted_articles = {}

for cluster, articles in cluster_dict.items():
    top_sorted_articles[cluster] = [news_id for _, news_id in articles]

# Create Dataframe for dealing with top cluster per user

In [10]:
user_histories_df = pd.DataFrame(user_data_for_df, columns=['user_id', 'full_history'])
user_histories_df = behaviors[['user_id']].drop_duplicates().merge(user_histories_df, on='user_id')
user_histories_df

Unnamed: 0,user_id,full_history
0,U13740,"[N34694, N31801, N63302, N42782, N55189, N4579..."
1,U91836,"[N2511, N3142, N62285, N59359, N16617, N29802,..."
2,U73700,"[N25792, N24233, N47289, N26378, N21087, N1073..."
3,U34670,"[N45729, N41375, N31825, N29757, N33013, N871,..."
4,U8125,"[N33740, N14904, N10078, N56514]"
...,...,...
49103,U6794,"[N37920, N60184, N42458, N54416, N3595, N41777..."
49104,U23127,"[N60350, N13429, N64395, N5477, N59419, N51591..."
49105,U43157,"[N30410, N12988, N62285, N14006, N24721, N1725..."
49106,U66493,"[N56889, N50638, N34069, N26151, N22570, N5733..."


# Get cluster for every article in user history

In [11]:
user_histories_df["clusters"] = None
news_dict = dict(zip(news["news_id"], news["cluster"]))

for idx, row in user_histories_df.iterrows():
    user_id = row['user_id']
    full_history = row['full_history']
    cluster_list = []
    for article in full_history:
        if article in news_dict:
            cluster_list.append(news_dict[article])
    user_histories_df.at[idx, "clusters"] = cluster_list
user_histories_df

Unnamed: 0,user_id,full_history,clusters
0,U13740,"[N34694, N31801, N63302, N42782, N55189, N4579...","[1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,U91836,"[N2511, N3142, N62285, N59359, N16617, N29802,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, ..."
2,U73700,"[N25792, N24233, N47289, N26378, N21087, N1073...","[1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1]"
3,U34670,"[N45729, N41375, N31825, N29757, N33013, N871,...","[1, 1, 0, 1, 1, 1, 1, 1, 1, 1]"
4,U8125,"[N33740, N14904, N10078, N56514]","[0, 1, 1, 1]"
...,...,...,...
49103,U6794,"[N37920, N60184, N42458, N54416, N3595, N41777...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
49104,U23127,"[N60350, N13429, N64395, N5477, N59419, N51591...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, ..."
49105,U43157,"[N30410, N12988, N62285, N14006, N24721, N1725...","[1, 1, 1, 1, 1, 1, 1, 1]"
49106,U66493,"[N56889, N50638, N34069, N26151, N22570, N5733...","[1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1]"


# Important !!

# Some articles appear to not be in the cluster dataframe -> Look into it

In [12]:
empty_list_rows = user_histories_df[user_histories_df['clusters'].apply(lambda x: isinstance(x, list) and len(x) == 0)]
print(empty_list_rows)

      user_id full_history clusters
451     U9828     [N41777]       []
9659   U59528     [N31172]       []
12417  U73096     [N47020]       []
12617  U71157     [N17164]       []
16684  U86734     [N47020]       []
17532  U24863     [N56527]       []
23686  U14089      [N1920]       []
27946  U60814     [N33391]       []
28742  U45998     [N12584]       []
34457  U30155     [N47020]       []
41333  U12146     [N49824]       []
48233  U29489     [N16662]       []


# Count top cluster per user

In [13]:
for idx, row in user_histories_df.iterrows():
    cluster_counts = {}
    clusters = row['clusters']
    if clusters != []:
        for cluster in clusters:
            cluster_counts[cluster] = cluster_counts.get(cluster, 0) + 1
        if cluster_counts:
            top_cluster = max(cluster_counts, key=cluster_counts.get)
    else:
        top_cluster = None
    user_histories_df.at[idx, "top_cluster"] = top_cluster
# Get rid of this line if handling missing articles:
user_histories_df["top_cluster"] = user_histories_df["top_cluster"].astype("Int64")
##
user_histories_df

Unnamed: 0,user_id,full_history,clusters,top_cluster
0,U13740,"[N34694, N31801, N63302, N42782, N55189, N4579...","[1, 1, 1, 1, 1, 1, 1, 1, 1]",1
1,U91836,"[N2511, N3142, N62285, N59359, N16617, N29802,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, ...",1
2,U73700,"[N25792, N24233, N47289, N26378, N21087, N1073...","[1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1]",1
3,U34670,"[N45729, N41375, N31825, N29757, N33013, N871,...","[1, 1, 0, 1, 1, 1, 1, 1, 1, 1]",1
4,U8125,"[N33740, N14904, N10078, N56514]","[0, 1, 1, 1]",1
...,...,...,...,...
49103,U6794,"[N37920, N60184, N42458, N54416, N3595, N41777...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
49104,U23127,"[N60350, N13429, N64395, N5477, N59419, N51591...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, ...",1
49105,U43157,"[N30410, N12988, N62285, N14006, N24721, N1725...","[1, 1, 1, 1, 1, 1, 1, 1]",1
49106,U66493,"[N56889, N50638, N34069, N26151, N22570, N5733...","[1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1]",1


In [14]:
# Print occurence of top clusters
value_counts = user_histories_df["top_cluster"].value_counts()
print(value_counts)

top_cluster
1    48158
0      809
2      129
Name: count, dtype: Int64


In [15]:
rows_with_missing_values = user_histories_df[user_histories_df.isna().any(axis=1)]
print(rows_with_missing_values)

      user_id full_history clusters  top_cluster
451     U9828     [N41777]       []         <NA>
9659   U59528     [N31172]       []         <NA>
12417  U73096     [N47020]       []         <NA>
12617  U71157     [N17164]       []         <NA>
16684  U86734     [N47020]       []         <NA>
17532  U24863     [N56527]       []         <NA>
23686  U14089      [N1920]       []         <NA>
27946  U60814     [N33391]       []         <NA>
28742  U45998     [N12584]       []         <NA>
34457  U30155     [N47020]       []         <NA>
41333  U12146     [N49824]       []         <NA>
48233  U29489     [N16662]       []         <NA>


In [16]:
user_histories_df.dropna(subset=['top_cluster'], inplace=True)

# Get top cluster for any user then recommend the first top 10 articles in that cluster that haven't been read by the user

In [20]:
# Get top cluster and recommend top 10 articles (that haven't been read)
"""
For 0 for example:
For 1 for example: U23127
For 2 for example: U83901
"""
user_id = "U83901" #"U23127"
user_top_cluster = user_histories_df[user_histories_df["user_id"] == user_id]["top_cluster"].iloc[0]
user_history = user_histories_df[user_histories_df["user_id"] == user_id]["full_history"].iloc[0]

top_articles_from_cluster = top_sorted_articles[int(user_top_cluster)]

recommend_list = []
count = 0
for top_article in top_articles_from_cluster:
    if top_article not in article:
        recommend_list.append(top_article)
        count += 1
    if count == 10:
        break
print(recommend_list)

['N26364', 'N46513', 'N2735', 'N61319', 'N22260', 'N21317', 'N21984', 'N39041', 'N40555', 'N56742']


In [21]:
print("What user have read so far:")
for i in user_history:
    print(i)
print()
print("What is recommended to the user:")
for i in recommend_list:
    print(i)

What user have read so far:
N9457

What is recommended to the user:
N26364
N46513
N2735
N61319
N22260
N21317
N21984
N39041
N40555
N56742


# Print titles of the recommended articles

In [22]:
recommend_list_text = []
for article in recommend_list:
    title = str(news_title_dict[article])
    recommend_list_text.append(title)
    print(title)

Reports: LSU LB Michael Divinity removed from team days before Alabama game
Newly Signed Raven Makes Comeback After Losing Job, Ring
Jimmy Garoppolo addresses Erin Andrews interview by saying he uses 'baby' 500 times a game
After throwing a punch, Dabo Swinney made CB Andrew Booth ride the manager bus back to Clemson
Black cat visits field during Cowboys-Giants game on 'Monday Night Football'
New Mexico DE Nahje Flowers dies at 21
Girl, 7, critically wounded in shooting while trick-or-treating in Little Village on Southwest Side; suspect in custody
Six people are dead after a mass shooting in Puerto Rico
This Is What Happens When You Take Ibuprofen Too Often, According to a Doctor
The decisions that have backfired on the Yankees in the ALCS
