# Imports and Preparing the data of behavior.tsv and news.tsv #

In [25]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd
import re
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.decomposition import IncrementalPCA, PCA
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage
from collections import defaultdict


# Load the data
base_path = Path.cwd() / 'data'
news_path = base_path / 'news.tsv'
behaviors_path = base_path / 'behaviors.tsv'

news = pd.read_csv(news_path, sep='\t', names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'])
behaviors = pd.read_csv(behaviors_path, sep='\t', names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

# Preprocessing of Data

In [26]:
# Data Cleaning
# Check if attributes such as Category, Subcategory, Title, and Abstract in news.tsv are complete.
# Remove news items with many missing values or replace them with:
# Category: unknown
# Subcategory: general
# Title and Abstract: a placeholder text like "Missing Data."
# For behaviors.tsv, remove users with missing or empty history.
news.fillna({'category': 'unknown', 'subcategory': 'general', 'title': 'Missing Title', 'abstract': 'Missing Abstract'}, inplace=True)
news.dropna(subset=['category', 'subcategory', 'title', 'abstract'], inplace=True)
behaviors.dropna(subset=['history', 'impressions'], inplace=True)

# Identify duplicates in the news table based on Title and Abstract
duplicated_news = news[news.duplicated(subset=['title', 'abstract'], keep=False)].copy()

# Create a mapping of duplicate news IDs to their canonical ID
duplicated_news['original_id'] = (
    duplicated_news.groupby(['title', 'abstract'])['news_id']
    .transform('first')
)

# Filter out rows where the news_id is the same as the original_id (self-matches)
duplicate_pairs = duplicated_news[duplicated_news['news_id'] != duplicated_news['original_id']]

# Create a DataFrame with two columns: 'original_id' and 'duplicate_id'
duplicate_mapping_df = duplicate_pairs[['original_id', 'news_id']].rename(columns={'news_id': 'duplicate_id'})

# Convert the DataFrame to a list of dictionaries or any desired structure if needed
duplicate_mapping_list = duplicate_mapping_df.to_dict(orient='records')

# Count duplicates before removing them
news_duplicates_before = len(news) - len(news.drop_duplicates(subset=['title', 'abstract']))
behaviors_duplicates_before = len(behaviors) - len(behaviors.drop_duplicates(subset=['impression_id']))

# Remove duplicates
# Remove news items in news.tsv that have the same values for Title and Abstract.
# Remove duplicate impressions (Impression ID) in behaviors.tsv
news.drop_duplicates(subset=['title', 'abstract'], inplace=True)
behaviors.drop_duplicates(subset=['impression_id'], inplace=True)

# Count duplicates after removing them (should be zero)
news_duplicates_after = len(news) - len(news.drop_duplicates(subset=['title', 'abstract']))
behaviors_duplicates_after = len(behaviors) - len(behaviors.drop_duplicates(subset=['impression_id']))

# Print the results
print(f"Removed {news_duplicates_before} duplicate news articles.")
print(f"Removed {behaviors_duplicates_before} duplicate impressions.")
print(f"Remaining duplicate news articles after cleaning: {news_duplicates_after}.")
print(f"Remaining duplicate impressions after cleaning: {behaviors_duplicates_after}.")

# Erstelle einen String aus den bereinigten Titeln
title_text = " ".join(news['title'])
abstract_text = " ".join(news['abstract'])

# Text cleaning
# Break down Title and Abstract into tokens:
# Remove special characters, numbers, and HTML.
# Convert all words to lowercase.
# Remove stopwords (e.g., using nltk or spacy).
# Perform lemmatization to reduce words to their base form.
def clean_text(text):
    # Clean text by removing special characters, numbers, and converting to lowercase
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'d", " would", text)
    text = re.sub(r"'s", " is", text)
    
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    text = re.sub(r'\d+', '<NUMBER>', text)
    return text.lower()

def remove_stopwords_and_lemmatize(text):
    # Remove stopwords and perform basic stemming/lemmatization
    stopwords = set([
    'the', 'and', 'is', 'in', 'to', 'of', 'a', 'an', 'on', 'for', 'with', 'as', 'by', 'at', 'from', 'this', 'that', 'it', 'or', 'but', 'not', 'be', 'are', 'was', 'were', 'can', 'will', 'would', 'should', 'has', 'have', 'had', 'do', 'does', 'did', 'which', 'if', 'then', 'than', 'so', 'such', 'there', 'about', 'into', 'over',
    'after', 'abc', 'accord', 'news', 'report', 'release', 'authoriti', 'people', 'act', 'aft', 'add', 'new', 'after','appear', 'he', 'she', 'they', 'we', 'you', 'i', 'his', 'her', 'their', 'its', 'ours', 'theirs', 'your', 'my', 'mine', 'ourselves', 'yourselves', 'has', 'had', 'have', 'having', 'be', 'is', 'am', 'are', 'was', 'were', 'will', 'would', 'can', 'could', 'shall', 'should', 'might', 'must', 'do', 'does', 'did', 'doing', 'done', 'doesn’t', 'didn’t', 'above', 'below', 'between', 'under', 'within', 'against', 'during', 'before', 'after', 'under', 'among', 'throughout', 'along', 'around', 'across', 'besides', 'towards', 'despite', 'except', 'as', 'like', 'by', 'nor', 'because', 'until', 'while', 'on', 'off', 'up', 'down', 'over', 'under', 'with', 'without', 'very', 'really', 'just', 'quite', 'more', 'less', 'only', 'already', 'still', 'even', 'never', 'always', 'sometimes', 'usually', 'likely', 'so', 'however', 'therefore', 'too', 'then', 'instead', 'almost', 'again', 'further', 'also', 'together', 'finally', 'actually', 'necessarily', 'let', 'let’s', 'might', 'may', 'must', 'ought', 'shall', 'should', 'could', 'would', 'can', 'can\'t', 'didn\'t', 'don\'t', 'doesn\'t', 'wasn\'t', 'weren\'t', 'won\'t', 'didn\'t', 'cnn', 'bbc', 'apple', 'microsoft', 'google', 'amazon', 'twitter', 'facebook', 'microsoft', 'tesla', 'elon', 'johnson', 'clinton', 'trump', 'obama', 'biden', 'eu', 'un', 'nato', 'fifa', 'nba', 'mlb', 'spacex', 'said', 'says', 'states', 'reported', 'reports', 'released', 'reveals', 'discusses', 'confirmed', 'announced', 'according', 'according to', 'statement', 'revealed', 'details', 'claim', 'claims', 'said', 'said to', 'called', 'describes', 'explained', 'just', 'basically', 'literally', 'actually', 'seriously', 'absolutely', 'completely', 'totally', 'clearly', 'evidently', 'definitely', 'pretty', 'exactly', 'day', 'week', 'month', 'year', 'time', 'morning', 'evening', 'night', 'today', 'tomorrow', 'yesterday', 'hour', 'minute', 'second', 'one', 'two', 'three', 'four', 'five', 'hundred', 'thousand', 'million', 'billion', 'percent', 'kg', 'm', 'cm', 'inch', 'km', 'g', 'url', 'www', 'http', 'https', 'www', 'doc', 'pdf', 'jpg', 'jpeg', 'png', 'mp3', 'zip', 'ppt', 'xls', 'fact', 'such', 'as', 'in', 'even', 'case', 'of', 'with',
    # Add words from clusters
    'afc', 'north', 'south', 'aaron', 'rodger', 'adam', 'gase', 'angel', 'lakers', 'alex', 'bregman', 'alexandria', 'ocasio',  'rams', 'al', 'baghdadi', 'ukraine', 'ambassador', 'action', 'afternoon','age', 'admitt', 'addition', 'agency', 'ag', 'advance', 'academy', 'additional', 'aid', 'ago',  'affect', 'afford', 'active', 'accept', 'accus', 'agre', 'advis', 'account', 'access', 'abstract', 'adults',  'adult', 'advisory', 'abandon', 'accident', 'america', 'address', 'administration', 'acr', 'ahead', 'alleged',  'airlin', 'allen', 'abuse', 'activity', 'actors', 'actress', 'agents', 'agreement', 'alarm', 'alexand', 'andy',   'alleged', 'answers', 'actor', 'allow', 'administration', 'agency', 'al', 'air', 'aim', 'alert', 'aid', 'home', 'win', 'u', 'miss', 'go', 'team', 'monday', 'tuesday', 'first', 'last', 'man', 'top', 'back', 'best', 'open', 'us', 'wednesday', 'play', 'thursday', 'sign', 'sunday', 'now', 'start', 'look', 'season', 'game', 'watch', 'want', 's', 'make', 'old', 'friday', 'saturday', 'woman'
])

    
    def advanced_lemmatization(word):
        # Remove common suffixes like 'ed', 'ing', 'es', 'er', 'ly'
        suffixes = ['ed', 'ing', 'es', 'er', 'ly', 'able', 'ness']
        for suffix in suffixes:
            if word.endswith(suffix):
                return word[:-len(suffix)]
        return word

    # List of simple verbs
    irregular_verbs = {
        'ran': 'run',
        'ate': 'eat',
        'wrote': 'write',
        'went': 'go',
        'saw': 'see',
        'had': 'have',
        'was': 'be',
        'were': 'be'
    }

    def lemmatize_irregular_verbs(word):
        return irregular_verbs.get(word, word)

    words = text.split()
    lemmatized = []

    for word in words:
        if word.lower() not in stopwords:
            # Outsort verbs
            word = lemmatize_irregular_verbs(word)
            # Remove suffix
            word = advanced_lemmatization(word)
            lemmatized.append(word)

    return " ".join(word for word in lemmatized if word not in stopwords)

def preprocess_text(text):
    # Full preprocessing pipeline: clean text, remove stopwords, and lemmatize
    cleaned_text = clean_text(text)
    return remove_stopwords_and_lemmatize(cleaned_text)


def preprocess_text_parallel(texts):
    processed_texts = [preprocess_text(text) for text in texts]
    return processed_texts


# Clean Title and Abstract
news['clean_title'] = preprocess_text_parallel(news['title'])
news['clean_abstract'] = preprocess_text_parallel(news['abstract'])


from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Erstelle einen String aus den bereinigten Titeln
title_text = " ".join(news['clean_title'])
abstract_text = " ".join(news['clean_abstract'])

# TF-IDF for clean_title
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=2000, stop_words='english')
tfidf_title = tfidf_vectorizer.fit_transform(news['clean_title'])

# Print dimensions after transforming titles
print(f"TF-IDF Title Shape: {tfidf_title.shape}")  # Number of titles x 2000 (max_features)

# TF-IDF for clean_abstract
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=2000, stop_words='english')
tfidf_abstract = tfidf_vectorizer.fit_transform(news['clean_abstract'])

# Print dimensions after transforming abstracts
print(f"TF-IDF Abstract Shape: {tfidf_abstract.shape}")  # Number of abstracts x 2000 (max_features)

# Combine sparse matrices (titles + abstracts)
news_features = hstack([tfidf_title, tfidf_abstract])
print(f"Combined TF-IDF Features Shape: {news_features.shape}")  # Number of items x 4000 (2000 + 2000)

# One-hot encode categories and subcategories
category_encoded = pd.get_dummies(news['category'])
subcategory_encoded = pd.get_dummies(news['subcategory'])

# Print dimensions of one-hot encoded features
print(f"Category Encoding Shape: {category_encoded.shape}")  # Number of items x unique categories
print(f"Subcategory Encoding Shape: {subcategory_encoded.shape}")  # Number of items x unique subcategories

# Combine all features (TF-IDF + category + subcategory)
final_features = hstack([news_features, category_encoded.values, subcategory_encoded.values])
print(f"Final Combined Features Shape: {final_features.shape}")  # Number of items x (4000 + categories + subcategories)

# PCA for dimensionality reduction
pca = IncrementalPCA(n_components=100, batch_size=1000)
reduced_features = pca.fit_transform(news_features)

# Print dimensions after PCA
print(f"Reduced Features Shape (after PCA): {reduced_features.shape}")  # Number of items x 100


Removed 613 duplicate news articles.
Removed 0 duplicate impressions.
Remaining duplicate news articles after cleaning: 0.
Remaining duplicate impressions after cleaning: 0.
TF-IDF Title Shape: (50669, 2000)
TF-IDF Abstract Shape: (50669, 2000)
Combined TF-IDF Features Shape: (50669, 4000)
Category Encoding Shape: (50669, 17)
Subcategory Encoding Shape: (50669, 263)
Final Combined Features Shape: (50669, 4280)
Reduced Features Shape (after PCA): (50669, 100)


# KMeans Visualization with PCA

In [27]:
# Clustering with KMeans
optimal_k = 20

kmeans = KMeans(n_clusters=optimal_k, random_state=42)
news['cluster'] = kmeans.fit_predict(reduced_features)
news

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,clean_title,clean_abstract,cluster
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],brands queen elizabeth prince charl prince phi...,shop notebooks jackets royals ca live,3
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",worst habits bel fat,these seeming harmless habits hold keep shedd ...,3
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",cost freeze trench war,lt ivan molchanets peek parapet sand bags fron...,3
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",wife here how mental health,felt fraud wife help near destroy me,3
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",how get rid skin tags dermatologist,seem harmless good reason ignore them post how...,3
...,...,...,...,...,...,...,...,...,...,...,...
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...,https://assets.msn.com/labs/mind/BBWzQJK.html,"[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...","[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...",adapt learn soul search reflect woolsey fire,woolsey fire anniversary community forev chang...,3
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,Missing Abstract,https://assets.msn.com/labs/mind/BBWzQYV.html,"[{""Label"": ""Broadway theatre"", ""Type"": ""F"", ""W...",[],fami broadway star di massive asthma attack,,3
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b...",https://assets.msn.com/labs/mind/BBWzQnK.html,[],[],st dominic socc tri kick canc curb,what happens sidelin important what happens fi...,3
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ...",https://assets.msn.com/labs/mind/BBWzQuK.html,"[{""Label"": ""MLS Cup"", ""Type"": ""U"", ""WikidataId...",[],how sounders won mls cup,mark jeremiah casey excit postgame podcast,3


# Start with content based recommendation

In [5]:
# copy_news = news.copy()
# copy_news["cluster"]

# def find_cluster(news_df, news_id):
#     result = news_df[news_df['news_id'] == news_id]['cluster']
#     if not result.empty:
#         return result.iloc[0]  # Return the cluster value
#     else:
#         return None  # Return None if news_id not found
# #
# # Example usage
# news_id_to_find = "some_news_id"
# cluster = find_cluster(copy_news, "N41777")
# print(f"The cluster for news_id {news_id_to_find} is: {cluster}")



In [6]:
# print(copy_news[copy_news['news_id'] == "N41777"])

# Test if history cleaning, helps

In [28]:
# Create the duplicate mapping dictionary
duplicate_mapping_dict = dict(zip(duplicate_mapping_df['duplicate_id'], duplicate_mapping_df['original_id']))

# Clean the history in behaviors by replacing duplicate IDs with their original IDs
def clean_user_history(history):
    # Split the history column into a list of news IDs
    news_list = history.split()
    # Replace duplicate IDs with original IDs using the mapping dictionary
    replaced_news_list = [duplicate_mapping_dict.get(news_id, news_id) for news_id in news_list]
    # Join the cleaned list back into a single string
    return " ".join(replaced_news_list)

# Apply the cleaning function to the history column
behaviors['history'] = behaviors['history'].apply(clean_user_history)

# Check for remaining duplicate IDs in the history
all_history_ids = set(" ".join(behaviors['history']).split())
remaining_invalid_ids = all_history_ids & set(duplicate_mapping_dict.keys())
print(f"Remaining duplicate news IDs in behaviors history: {remaining_invalid_ids}")

Remaining duplicate news IDs in behaviors history: set()


# Data Pre-Processing

In [29]:
# Split 'history' into lists of baskets based on ids
# Group by user and get rid of duplicates in the history
user_histories = []
user_data_for_df = []
for user_id, user_data in behaviors.groupby('user_id'):
    one_history_string = " ".join(user_data['history'])
    splitted_without_duplicates = set(one_history_string.split())
    articles_list_per_user = list(splitted_without_duplicates)
    user_histories.append(articles_list_per_user)
    user_data_for_df.append([user_id, articles_list_per_user])

In [30]:
news_title_dict = dict(zip(news['news_id'], news['title']))

# Count article reads and sort after highest count

In [31]:
# Count occurence of each article
articles_count = {}

for history in user_histories:
    for article in history:
        if article in articles_count:
            articles_count[article] += 1
        else:
            articles_count[article] = 1

# Append cluster to articles count and sort clusters
cluster_dict = defaultdict(list)
for _, row in news.iterrows():
    news_id = row['news_id']
    cluster = row['cluster']
    
    if news_id in articles_count:
        cluster_dict[cluster].append((articles_count[news_id], news_id))

for cluster in cluster_dict:
    cluster_dict[cluster] = sorted(cluster_dict[cluster], key=lambda x: x[0], reverse=True)

top_sorted_articles = {}

for cluster, articles in cluster_dict.items():
    top_sorted_articles[cluster] = [news_id for _, news_id in articles]

# Create Dataframe for dealing with top cluster per user

In [32]:
user_histories_df = pd.DataFrame(user_data_for_df, columns=['user_id', 'full_history'])
user_histories_df = behaviors[['user_id']].drop_duplicates().merge(user_histories_df, on='user_id')
user_histories_df

Unnamed: 0,user_id,full_history
0,U13740,"[N10414, N45794, N63302, N55189, N18445, N1934..."
1,U91836,"[N16617, N2511, N25785, N7023, N5398, N35458, ..."
2,U73700,"[N21087, N26378, N24233, N47289, N62058, N6038..."
3,U34670,"[N43142, N33013, N41375, N31825, N51891, N2203..."
4,U8125,"[N33740, N56514, N10078, N14904]"
...,...,...
49103,U6794,"[N22058, N60184, N47847, N3595, N37920, N27448..."
49104,U23127,"[N57162, N16874, N34627, N18073, N44510, N6439..."
49105,U43157,"[N64775, N17254, N14006, N30410, N43086, N2472..."
49106,U66493,"[N57336, N34069, N50638, N26151, N22570, N4367..."


# Get cluster for every article in user history

In [33]:
user_histories_df["clusters"] = None
news_dict = dict(zip(news["news_id"], news["cluster"]))

for idx, row in user_histories_df.iterrows():
    user_id = row['user_id']
    full_history = row['full_history']
    cluster_list = []
    for article in full_history:
        if article in news_dict:
            cluster_list.append(news_dict[article])
    user_histories_df.at[idx, "clusters"] = cluster_list
user_histories_df

Unnamed: 0,user_id,full_history,clusters
0,U13740,"[N10414, N45794, N63302, N55189, N18445, N1934...","[3, 5, 3, 3, 3, 3, 3, 3, 3]"
1,U91836,"[N16617, N2511, N25785, N7023, N5398, N35458, ...","[3, 3, 3, 3, 3, 3, 3, 10, 10, 17, 7, 3, 3, 3, ..."
2,U73700,"[N21087, N26378, N24233, N47289, N62058, N6038...","[9, 3, 3, 9, 11, 3, 3, 3, 0, 3, 3, 3, 3, 15, 1..."
3,U34670,"[N43142, N33013, N41375, N31825, N51891, N2203...","[3, 3, 13, 3, 6, 13, 3, 3, 10, 3]"
4,U8125,"[N33740, N56514, N10078, N14904]","[3, 9, 3, 3]"
...,...,...,...
49103,U6794,"[N22058, N60184, N47847, N3595, N37920, N27448...","[3, 13, 3, 3, 3, 0, 13, 3, 3, 0, 0]"
49104,U23127,"[N57162, N16874, N34627, N18073, N44510, N6439...","[3, 4, 4, 3, 3, 3, 1, 5, 3, 11, 18, 3, 3, 10, ..."
49105,U43157,"[N64775, N17254, N14006, N30410, N43086, N2472...","[3, 3, 3, 3, 3, 3, 3, 4]"
49106,U66493,"[N57336, N34069, N50638, N26151, N22570, N4367...","[3, 1, 18, 3, 3, 3, 3, 3, 3, 13, 3, 3]"


# Important !!

# Some articles appear to not be in the cluster dataframe -> Look into it

In [34]:
missing_clusters = news[news['cluster'].isnull()]
print(f"News without Cluster: {missing_clusters}")


News without Cluster: Empty DataFrame
Columns: [news_id, category, subcategory, title, abstract, url, title_entities, abstract_entities, clean_title, clean_abstract, cluster]
Index: []


In [35]:
# All unique news_ids from the behaviors table
behavior_news_ids = set(" ".join(behaviors['history']).split())

# All unique news_ids from the news table
news_ids = set(news['news_id'])

# IDs that exist in behaviors but are missing in the news table
missing_news_ids = behavior_news_ids - news_ids
print(f"News IDs in behaviors but not in news: {missing_news_ids}")

missing_count = len(missing_news_ids)


print(f"Some many news are missing: {missing_count}")

News IDs in behaviors but not in news: set()
Some many news are missing: 0


In [36]:
empty_list_rows = user_histories_df[user_histories_df['clusters'].apply(lambda x: isinstance(x, list) and len(x) == 0)]
print(empty_list_rows)

Empty DataFrame
Columns: [user_id, full_history, clusters]
Index: []


# Count top cluster per user

In [37]:
for idx, row in user_histories_df.iterrows():
    cluster_counts = {}
    clusters = row['clusters']
    if clusters != []:
        for cluster in clusters:
            cluster_counts[cluster] = cluster_counts.get(cluster, 0) + 1
        if cluster_counts:
            top_cluster = max(cluster_counts, key=cluster_counts.get)
    else:
        top_cluster = None
    user_histories_df.at[idx, "top_cluster"] = top_cluster
# Get rid of this line if handling missing articles:
user_histories_df["top_cluster"] = user_histories_df["top_cluster"].astype("Int64")
##
user_histories_df

Unnamed: 0,user_id,full_history,clusters,top_cluster
0,U13740,"[N10414, N45794, N63302, N55189, N18445, N1934...","[3, 5, 3, 3, 3, 3, 3, 3, 3]",3
1,U91836,"[N16617, N2511, N25785, N7023, N5398, N35458, ...","[3, 3, 3, 3, 3, 3, 3, 10, 10, 17, 7, 3, 3, 3, ...",3
2,U73700,"[N21087, N26378, N24233, N47289, N62058, N6038...","[9, 3, 3, 9, 11, 3, 3, 3, 0, 3, 3, 3, 3, 15, 1...",3
3,U34670,"[N43142, N33013, N41375, N31825, N51891, N2203...","[3, 3, 13, 3, 6, 13, 3, 3, 10, 3]",3
4,U8125,"[N33740, N56514, N10078, N14904]","[3, 9, 3, 3]",3
...,...,...,...,...
49103,U6794,"[N22058, N60184, N47847, N3595, N37920, N27448...","[3, 13, 3, 3, 3, 0, 13, 3, 3, 0, 0]",3
49104,U23127,"[N57162, N16874, N34627, N18073, N44510, N6439...","[3, 4, 4, 3, 3, 3, 1, 5, 3, 11, 18, 3, 3, 10, ...",3
49105,U43157,"[N64775, N17254, N14006, N30410, N43086, N2472...","[3, 3, 3, 3, 3, 3, 3, 4]",3
49106,U66493,"[N57336, N34069, N50638, N26151, N22570, N4367...","[3, 1, 18, 3, 3, 3, 3, 3, 3, 13, 3, 3]",3


In [38]:
# Print occurence of top clusters
value_counts = user_histories_df["top_cluster"].value_counts()
print(value_counts)

top_cluster
3     46817
1       466
0       270
5       242
9       169
6       140
13      139
10      135
4       115
11      113
7        99
17       83
14       80
15       60
2        52
12       48
18       40
19       26
8        14
Name: count, dtype: Int64


In [39]:
rows_with_missing_values = user_histories_df[user_histories_df.isna().any(axis=1)]
print(rows_with_missing_values)

Empty DataFrame
Columns: [user_id, full_history, clusters, top_cluster]
Index: []


In [40]:
user_histories_df.dropna(subset=['top_cluster'], inplace=True)

# Get top cluster for any user then recommend the first top 10 articles in that cluster that haven't been read by the user

In [41]:
# Get top cluster and recommend top 10 articles (that haven't been read)
"""
For 0 for example:
For 1 for example: U23127
For 2 for example: U83901
"""
user_id = "U83901" #"U23127"
user_top_cluster = user_histories_df[user_histories_df["user_id"] == user_id]["top_cluster"].iloc[0]
user_history = user_histories_df[user_histories_df["user_id"] == user_id]["full_history"].iloc[0]

top_articles_from_cluster = top_sorted_articles[int(user_top_cluster)]

recommend_list = []
count = 0
for top_article in top_articles_from_cluster:
    if top_article not in article:
        recommend_list.append(top_article)
        count += 1
    if count == 10:
        break
print(recommend_list)

['N42620', 'N31801', 'N55189', 'N43142', 'N29177', 'N16715', 'N18870', 'N55743', 'N52551', 'N61864']


In [42]:
print("What user have read so far:")
for i in user_history:
    print(i)
print()
print("What is recommended to the user:")
for i in recommend_list:
    print(i)

What user have read so far:
N9457

What is recommended to the user:
N42620
N31801
N55189
N43142
N29177
N16715
N18870
N55743
N52551
N61864


# Print titles of the recommended articles

In [43]:
recommend_list_text = []
for article in recommend_list:
    title = str(news_title_dict[article])
    recommend_list_text.append(title)
    print(title)

Heidi Klum's 2019 Halloween Costume Transformation Is Mind-Blowing   But, Like, What Is It?
Joe Biden reportedly denied Communion at a South Carolina church because of his stance on abortion
'Wheel Of Fortune' Guest Delivers Hilarious, Off The Rails Introduction
Former NBA first-round pick Jim Farmer arrested in sex sting operation
Miguel Cervantes' Wife Reveals Daughter, 3, 'Died in My Arms' After Entering Hospice Care
Mitch McConnell snubbed by Elijah Cummings' pallbearer in handshake line at U.S. Capitol ceremony
Here Are the Biggest Deals We're Anticipating for Black Friday
17 photos that show the ugly truth of living in a tiny house
Pamela Anderson gets backlash after wearing a Native American headdress for Halloween
The News In Cartoons
