<a href="https://colab.research.google.com/github/NeSma237/Content-Based-News-Recommendation-System/blob/main/03_content_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Download the latest version of the MIND news dataset; `path` holds the
# location that kagglehub returns for the downloaded files.
DATASET_HANDLE = "arashnic/mind-news-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/mind-news-dataset


In [2]:
!pip install nltk



In [3]:
import pandas as pd
import os
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import string

# Fetch the NLTK stopword corpus, then keep the English entries in a set so
# that membership checks during text cleaning are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import os

# Reuse the directory reported by kagglehub.dataset_download() in the first
# cell instead of a hard-coded absolute path, so the notebook follows the
# dataset to wherever it was actually placed on this machine.
base_path = path
print("Contents of base path:")
print(os.listdir(base_path))


Contents of base path:
['MINDsmall_train', 'news.tsv']


In [5]:
# Location of the news metadata inside the training split.
news_path = os.path.join(base_path, 'MINDsmall_train', 'news.tsv')

# The file is read as header-less: every line is treated as data and the
# column names are taken from this list.
columns = [
    'NewsID', 'Category', 'SubCategory', 'Title',
    'Abstract', 'URL', 'TitleEntities', 'AbstractEntities',
]

news_df = pd.read_csv(
    news_path,
    sep='\t',
    header=None,
    names=columns,
    encoding='utf-8',
)
news_df.head()


Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,TitleEntities,AbstractEntities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [6]:
# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every clean_text() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text, stop_word_set=None):
    """Normalize a piece of text for TF-IDF vectorization.

    The text is lower-cased, split on whitespace, stripped of ASCII
    punctuation and filtered against a stop-word collection.

    Parameters
    ----------
    text : str or null-like
        Raw text. Null values (``None``/``NaN``) yield an empty string; any
        other non-string value is converted with ``str()`` first.
    stop_word_set : collection of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if pd.isnull(text):
        return ""
    if stop_word_set is None:
        stop_word_set = stop_words

    kept = []
    for token in str(text).lower().split():
        # Check the token with only its *surrounding* punctuation removed
        # first. Stop-word entries that contain an apostrophe (contractions
        # such as "don't") can only match before the apostrophe is deleted.
        if token.strip(string.punctuation) in stop_word_set:
            continue
        word = token.translate(_PUNCTUATION_TABLE)
        # Tokens made only of punctuation collapse to "" and are dropped.
        if word and word not in stop_word_set:
            kept.append(word)
    return ' '.join(kept)

# Add a cleaned counterpart for each of the two free-text columns.
for source_col, cleaned_col in (('Title', 'clean_title'), ('Abstract', 'clean_abstract')):
    news_df[cleaned_col] = news_df[source_col].apply(clean_text)



In [7]:
# One document per article: the cleaned title followed by the cleaned abstract.
title_text = news_df['clean_title']
abstract_text = news_df['clean_abstract']
news_df['combined_text'] = title_text + ' ' + abstract_text

# max_features limits the vocabulary size; adjust to trade detail for memory.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(news_df['combined_text'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (51282, 5000)


In [8]:
# Read the impression log (behaviors.tsv) from the training split.
behavior_path = os.path.join(base_path, 'MINDsmall_train', 'behaviors.tsv')  # adjust if needed

# The file is read as header-less; column names are supplied explicitly.
behavior_columns = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
behaviors_df = pd.read_csv(
    behavior_path,
    sep='\t',
    header=None,
    names=behavior_columns,
)
behaviors_df.head()

Unnamed: 0,ImpressionID,UserID,Time,History,Impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [9]:
# Positional lookup: NewsID -> row number in news_df, which is also the row of
# that article in tfidf_matrix (the matrix was fitted on news_df in row order).
from scipy.sparse import vstack

news_id_to_index = dict(zip(news_df['NewsID'], range(len(news_df))))


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The TF-IDF matrix fitted in the earlier cell is reused as-is; re-fitting an
# identical vectorizer here would only repeat the same work.
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# NOTE: the result is a DENSE (n_articles x n_articles) array. For the 51,282
# articles reported above that is ~21 GB in float64 and ~10.5 GB in float32.
# A single cosine_similarity(tfidf_matrix, tfidf_matrix) call also builds the
# whole product in one go, so the matrix is filled in row chunks instead and
# stored as float32. Switch SIMILARITY_DTYPE to np.float64 if full precision
# is required and the memory is available.
SIMILARITY_DTYPE = np.float32
SIMILARITY_CHUNK_ROWS = 1000  # rows per chunk; lower this to reduce peak memory

tfidf_for_similarity = tfidf_matrix.astype(SIMILARITY_DTYPE)
n_articles = tfidf_for_similarity.shape[0]

similarity_matrix = np.empty((n_articles, n_articles), dtype=SIMILARITY_DTYPE)
for chunk_start in range(0, n_articles, SIMILARITY_CHUNK_ROWS):
    chunk_stop = min(chunk_start + SIMILARITY_CHUNK_ROWS, n_articles)
    # Rows [chunk_start, chunk_stop) compared against every article.
    similarity_matrix[chunk_start:chunk_stop] = cosine_similarity(
        tfidf_for_similarity[chunk_start:chunk_stop],
        tfidf_for_similarity,
    )



TF-IDF matrix shape: (51282, 5000)


In [9]:
def get_similar_articles(article_index, top_n=5, similarity=None, articles=None):
    """Return the ``top_n`` articles most similar to the one at ``article_index``.

    Parameters
    ----------
    article_index : int
        Row position of the query article (negative positions are accepted).
    top_n : int, default 5
        Maximum number of neighbours to return; values <= 0 give an empty frame.
    similarity : 2-D array, optional
        Pairwise similarity scores indexed by row position. Defaults to the
        notebook-level ``similarity_matrix``.
    articles : pandas.DataFrame, optional
        Frame whose rows line up with ``similarity`` and that has the columns
        ``NewsID``, ``Title`` and ``Category``. Defaults to ``news_df``.

    Returns
    -------
    pandas.DataFrame
        ``NewsID``, ``Title`` and ``Category`` of the neighbours, most similar
        first. The query article itself is never included.

    Raises
    ------
    IndexError
        If ``article_index`` is outside the similarity matrix.
    """
    if similarity is None:
        similarity = similarity_matrix
    if articles is None:
        articles = news_df

    sim_scores = similarity[article_index]
    # Turn a negative position into its non-negative equivalent so it can be
    # compared with the positions produced by argsort.
    query_pos = article_index % len(sim_scores)

    # Stable sort on the negated scores: descending similarity, ties kept in
    # row order, which makes the ranking deterministic.
    ranked = (-sim_scores).argsort(kind="stable")
    # Drop the query explicitly rather than assuming it sorts first: duplicate
    # articles (ties at the top score) or an all-zero row break that assumption.
    ranked = ranked[ranked != query_pos][:max(top_n, 0)]

    return articles.iloc[ranked][['NewsID', 'Title', 'Category']]


In [10]:
# Demo: print the title of one article, then display its nearest neighbours.
index = 0  # row position of the query article (here: the first row)
print("Original Article:", news_df['Title'].iloc[index])
get_similar_articles(index, top_n=5)


Original Article: The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By


NameError: name 'similarity_matrix' is not defined

In [None]:
import numpy as np

# Write the full similarity matrix to disk in NumPy's .npy format.
np.save(file="article_similarity_matrix.npy", arr=similarity_matrix)
