In [7]:
# Third-party Library Imports
import pandas as pd                  # Data processing
import numpy as np                   # Math
from typing import List, Tuple, Dict # Type hinting
import ast                           # Literal evaluation

# Prediction making
from sklearn.metrics.pairwise import cosine_similarity      # Measuring the similarity
from sklearn.feature_extraction.text import TfidfVectorizer # Vectorizing the articles
from sklearn.preprocessing import normalize                 # Normalizing the TFIDF matrix for cosine similarity calculation

# Compute cosine similarity between articles
from nltk.corpus import stopwords                           # Removing stopwords (preprocessing)

In [8]:
articles_df = pd.read_csv("..//..//..//data/shared_articles.csv", nrows=1000)
articles_df = articles_df[(articles_df["eventType"] == "CONTENT SHARED") & (articles_df["lang"] == "en")]

<h2> Model Creation </h2>

In [9]:
stopwords_list = stopwords.words("english")

MAX_FEATURES = 1000

#Trains a model whose vectors size is MAX_FEATURES, composed by the main unigrams and bigrams found in the corpus, ignoring stopwords
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=MAX_FEATURES,
                     stop_words=stopwords_list)

item_ids     = articles_df["contentId"].tolist()                                         # Convert IDs to a list 
tfidf_matrix = vectorizer.fit_transform(articles_df["title"] + "" + articles_df["text"]) # Concatenate title and text for more information
tfidf_matrix

<781x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 104922 stored elements in Compressed Sparse Row format>

In [10]:
tfidf_matrix.shape

(781, 1000)

In [11]:
feature_names = np.array(vectorizer.get_feature_names_out())
dense_tfidf_matrix = tfidf_matrix.toarray()             

all_tags = []

# Iterate over each row (article) in the TF-IDF matrix
for i in range(dense_tfidf_matrix.shape[0]):
    tfidf_values = dense_tfidf_matrix[i] # Get the TF-IDF representation

    top_indices = np.argsort(tfidf_values)[-5:][::-1] # Get top 5 most popular feature names
    top_words = feature_names[top_indices] # Extract feature names for given article

    # Print the results for the current article
    print(f"Top 5 words for Article {i}: {top_words}")
    all_tags.append(f'{top_words.tolist()}')

Top 5 words for Article 0: ['ethereum' 'bitcoin' 'system' 'said' 'blockchain']
Top 5 words for Article 1: ['latest' 'check' 'rate' 'note' 'ceo']
Top 5 words for Article 2: ['center' 'video' 'google' 'data' 'google cloud']
Top 5 words for Article 3: ['ibm' 'blockchain' 'bitcoin' 'linux' 'technology']
Top 5 words for Article 4: ['blockchain' 'computing' 'cloud' 'event' 'conference']
Top 5 words for Article 5: ['fintech' 'banks' 'bitcoin' 'banking' 'financial']
Top 5 words for Article 6: ['blockchain' 'financial' 'trust' 'bitcoin' 'system']
Top 5 words for Article 7: ['bitcoin' 'organization' 'currency' 'organizations' 'interest']
Top 5 words for Article 8: ['ethereum' 'bitcoin' 'currency' 'said' 'value']
Top 5 words for Article 9: ['load' 'load balancing' 'http' 'balancing' 'backend']
Top 5 words for Article 10: ['ssl' 'proxy' 'health' 'instances' 'load']
Top 5 words for Article 11: ['billion' 'systems' 'company' 'markets' 'services']
Top 5 words for Article 12: ['sharing' 'uber' 'demand

In [12]:
normalized_tfidf_matrix = normalize(tfidf_matrix, norm='l2', axis=1) # Normalize matrix so cosine similarity can be accurately calculated

cosine_similarities = cosine_similarity(normalized_tfidf_matrix) # Compute cosine similarity between all the articles

# Iterate over each article
for i in range(cosine_similarities.shape[0]):
    # Get the indices of the top 5 most similar articles (excluding the article itself)
    top_indices = np.argsort(cosine_similarities[i])[-6:-1][::-1]

    # Print the results for the current article
    print(f"Top 5 most similar articles to Article {i}: {top_indices}")

Top 5 most similar articles to Article 0: [  8 424 157  71 194]
Top 5 most similar articles to Article 1: [273 769 159 276  41]
Top 5 most similar articles to Article 2: [121  82 207 439 732]
Top 5 most similar articles to Article 3: [191 221 307 276 265]
Top 5 most similar articles to Article 4: [274  63 218 192 304]
Top 5 most similar articles to Article 5: [682 308 238  40 175]
Top 5 most similar articles to Article 6: [727 315 276 220  17]
Top 5 most similar articles to Article 7: [ 25  96  98 175 156]
Top 5 most similar articles to Article 8: [  0 424 157  71 194]
Top 5 most similar articles to Article 9: [345  10 365 367 388]
Top 5 most similar articles to Article 10: [  9 345  72 388 367]
Top 5 most similar articles to Article 11: [ 89  91 330 240  77]
Top 5 most similar articles to Article 12: [ 42 495 532  30 288]
Top 5 most similar articles to Article 13: [ 77 126  42 257 659]
Top 5 most similar articles to Article 14: [  0   8 157 424  71]
Top 5 most similar articles to Arti