In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
import faiss
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

In [2]:
# Load the article table (author, claps, reading time, link, title, text)
ARTICLES_CSV = 'articles.csv'
articles = pd.read_csv(ARTICLES_CSV)

In [3]:
# Preview the first five rows to sanity-check the loaded columns
articles.head(n=5)

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so
# that the vocabulary and the weights are guaranteed to match.
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)


In [5]:
# Function to generate embeddings using BERT
def get_embeddings(text_list, batch_size=32):
    """Encode texts into BERT embeddings taken from the first ([CLS]) token.

    Texts are tokenized and run through the model in mini-batches so that
    memory use is bounded by ``batch_size`` rather than by the corpus size.

    Args:
        text_list: A list of strings, or a single string (treated as a
            one-element list).
        batch_size: Maximum number of texts per forward pass. Must be >= 1.

    Returns:
        A 2-D numpy array with one row per input text. An empty input yields
        an array of shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # A bare string must not be split into characters by list().
    texts = [text_list] if isinstance(text_list, str) else list(text_list)
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # padding=True pads to the longest text of this batch; inputs longer
        # than 512 tokens are truncated.
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 of the last hidden state is the first token of each sequence.
        batches.append(outputs.last_hidden_state[:, 0, :].numpy())
    return np.concatenate(batches, axis=0)

# Generate embeddings for the article titles
title_embeddings = get_embeddings(articles['title'].tolist())

# Dimensionality reduction.
# PCA cannot extract more components than min(n_samples, n_features), so cap
# the target of 50 for small corpora instead of raising. random_state pins
# the randomized SVD solver that PCA may auto-select, keeping the reduced
# space (and therefore the search results) reproducible across runs.
n_components = min(50, *title_embeddings.shape)
pca = PCA(n_components=n_components, random_state=42)
reduced_embeddings = pca.fit_transform(title_embeddings)

# Initialize FAISS index with reduced dimensions.
# IndexFlatL2 performs exact (brute-force) L2 search; FAISS expects
# C-contiguous float32 input.
dimension = reduced_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.ascontiguousarray(reduced_embeddings, dtype='float32'))

In [6]:
# Function to compute similarity
def compute_similarity(query_embedding, title_embeddings):
    """Return the cosine-similarity matrix between query and title embeddings.

    Both arguments are passed straight to sklearn's ``cosine_similarity``;
    the result has one row per query row and one column per title row.
    """
    return cosine_similarity(query_embedding, title_embeddings)

In [7]:
# Function to recommend articles based on a query
def recommend_articles(query, top_k=5):
    """Recommend the articles whose titles are closest to ``query``.

    Candidates are retrieved with an L2 search in the PCA-reduced space and
    then re-ranked by cosine similarity on the full-size embeddings.

    Args:
        query: Free-text query string.
        top_k: Maximum number of articles to return. Values larger than the
            number of indexed articles are clamped; values <= 0 yield an
            empty result.

    Returns:
        A DataFrame slice of ``articles`` with at most ``top_k`` rows,
        ordered from most to least similar.
    """
    # FAISS requires k >= 1 and pads missing neighbours with -1, which
    # DataFrame.iloc would silently interpret as "last row". Clamp first.
    top_k = min(int(top_k), index.ntotal)
    if top_k <= 0:
        return articles.iloc[[]]

    query_embedding = get_embeddings([query])
    query_embedding_reduced = pca.transform(query_embedding)

    # Exact nearest-neighbour search (IndexFlatL2 is brute force) in the reduced space
    _, neighbours = index.search(
        np.ascontiguousarray(query_embedding_reduced, dtype='float32'), top_k
    )
    candidate_ids = neighbours[0]
    # Defensive: drop any -1 padding FAISS may still have returned.
    candidate_ids = candidate_ids[candidate_ids >= 0]
    if candidate_ids.size == 0:
        return articles.iloc[[]]

    initial_results = articles.iloc[candidate_ids]
    cos_sim = compute_similarity(query_embedding, title_embeddings[candidate_ids])

    # Re-rank results by cosine similarity (descending)
    ranked_indices = np.argsort(-cos_sim.flatten())
    return initial_results.iloc[ranked_indices]

In [8]:
# Example usage: fetch the five best matches for a sample query and show
# only the columns a reader needs to pick an article.
query = "Future of Artificial Intelligence"
recommended_articles = recommend_articles(query, top_k=5)
display_columns = ['title', 'author', 'claps', 'reading_time', 'link']
print(recommended_articles[display_columns])

                                                 title               author  \
119  How Artificial Intelligence can improve online...         Espen Waldal   
334  Spiking Neural Networks, the Next Generation o...           Devin Soni   
251  An augmentation based deep neural network appr...          Vivek Yadav   
333  Artificial Intelligence, AI in 2018 and beyond...  Eugenio Culurciello   
168  Beethoven, Picasso, and Artificial Intelligenc...       Chris Kalahiki   

    claps  reading_time                                               link  
119    57             6  https://medium.com/bakken-b%C3%A6ck/how-artifi...  
334  5.8K             4  https://towardsdatascience.com/spiking-neural-...  
251   425            11  https://chatbotslife.com/using-augmentation-to...  
333  2.8K            13  https://towardsdatascience.com/artificial-inte...  
168    30            15  https://towardsdatascience.com/beethoven-picas...  
