In [11]:
import faiss
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

In [2]:
# Load the article table and discard every row that contains a missing value.
# The surviving rows keep their original labels (the .info() output below
# shows 42098 entries labelled 0 to 44428), so the labels are NOT contiguous;
# anything that addresses rows by position must use .iloc, not .loc.
metadata = pd.read_csv('final_data.csv').dropna()

In [3]:
# Preview the first rows to check the columns that were loaded.
metadata.head()

Unnamed: 0,category,headline,links,abstract,keywords,text,sentiment_score,sentiment_label
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons,143 mile 35 day lesson learn rest training i'v...,0.2732,neutral
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy,talk crazi crazi helpful think talk tool coach...,0.9022,positive
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug,crenezumab trial gaug alzheimer drug prevent s...,0.0258,neutral
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life,oh differ want busy tri perfect want happy foc...,0.8402,positive
4,WELLNESS,Bad Love Advice: 9 Lessons To Unlearn,https://www.huffingtonpost.com/entry/bad-love-...,"By Carey Moss for YouBeauty.com Love rom-coms,...",bad-love-advice-from-movies,bad love advice 9 lesson unlearn carey moss yo...,0.8779,positive


In [5]:
# Summarise column dtypes and non-null counts after the dropna() above.
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42098 entries, 0 to 44428
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category         42098 non-null  object 
 1   headline         42098 non-null  object 
 2   links            42098 non-null  object 
 3   abstract         42098 non-null  object 
 4   keywords         42098 non-null  object 
 5   text             42098 non-null  object 
 6   sentiment_score  42098 non-null  float64
 7   sentiment_label  42098 non-null  object 
dtypes: float64(1), object(7)
memory usage: 2.9+ MB


In [6]:
# Preprocess the text data: join headline and abstract, lower-case the result,
# and collapse every run of non-word characters into a single space.
# NOTE: this replaces the 'text' column that was loaded from the CSV.
# The vectorised .str methods do the same work as the former per-row lambdas
# in a single chained expression.
metadata['text'] = (
    (metadata['headline'] + " " + metadata['abstract'])
    .str.lower()
    .str.replace(r'\W+', ' ', regex=True)
)

In [7]:
# Initialize the HashingVectorizer.
# binary=True records only whether a token is present, not how often.
# The features are only ever fed to a FAISS index, which stores 32-bit floats,
# so emit float32 directly instead of the float64 default; this avoids a
# double-width dense intermediate when the matrix is densified later.
vectorizer = HashingVectorizer(n_features=1000, binary=True, dtype=np.float32)

In [8]:
# Generate the matrix of hashed features, one row per article in `metadata`
# (same order as metadata['text']).
# NOTE(review): despite the name, this is not a TF-IDF matrix; it is produced
# by the HashingVectorizer configured above. The name is kept because later
# cells refer to it.
tfidf_matrix = vectorizer.fit_transform(metadata['text'])

In [9]:
# Create a FAISS index
# d is the feature dimensionality (number of columns of the hashed matrix);
# IndexFlatL2 is built for vectors of exactly that width.
d = tfidf_matrix.shape[1]
index = faiss.IndexFlatL2(d)

In [10]:
# Add the vectors to the index.
# FAISS operates on dense, C-contiguous float32 matrices, so make the
# conversion explicit rather than relying on the wrapper to coerce (or reject)
# whatever dtype the vectorizer produced. Rows are added in `metadata` order,
# so the ids FAISS returns are row positions in `metadata`.
index.add(np.ascontiguousarray(tfidf_matrix.toarray(), dtype=np.float32))

In [12]:
def get_recommendations(query, vectorizer, index, metadata, k=10):
    """Return the articles nearest to a free-text query as a printable table.

    Args:
        query: Search text; it is vectorized as a single document.
        vectorizer: Object whose ``transform([text])`` returns a matrix that
            exposes ``toarray()``.
        index: Object whose ``search(vectors, k)`` returns a
            ``(distances, ids)`` pair with one row per query vector.
        metadata: DataFrame with 'headline', 'links', 'sentiment_score' and
            'sentiment_label' columns. Ids returned by ``index`` are used as
            row *positions* in this frame.
        k: Number of neighbours to request from the index.

    Returns:
        str: The hits rendered with ``DataFrame.to_string(index=False)`` under
        the columns 'Headline', 'Sentiment Score', 'Sentiment Label', 'Links',
        in the order the index returned them.
    """
    # FAISS expects a C-contiguous float32 matrix; cast explicitly so the
    # search does not depend on the vectorizer's output dtype.
    query_vector = np.ascontiguousarray(
        vectorizer.transform([query]).toarray(), dtype=np.float32
    )

    _, neighbor_ids = index.search(query_vector, k)
    neighbor_ids = np.asarray(neighbor_ids)[0]

    # When fewer than k vectors are available FAISS pads the id row with -1.
    # Passing -1 to .iloc would silently return the LAST article as a bogus
    # recommendation, so drop the padding before looking rows up.
    neighbor_ids = neighbor_ids[neighbor_ids >= 0]

    # One positional selection instead of four row lookups per hit.
    recommendations_df = (
        metadata.iloc[neighbor_ids][
            ['headline', 'sentiment_score', 'sentiment_label', 'links']
        ]
        .rename(columns={
            'headline': 'Headline',
            'sentiment_score': 'Sentiment Score',
            'sentiment_label': 'Sentiment Label',
            'links': 'Links',
        })
    )

    return recommendations_df.to_string(index=False)


# Ask the user for a search phrase and print the closest headlines for it.
query = x = input("Search: ")
# Despite its name, recommendations_df holds the pre-formatted table text
# returned by get_recommendations, not a DataFrame.
recommendations_df = get_recommendations(query, vectorizer, index, metadata)
print("Recommendations for:", query)
print(recommendations_df)

Recommendations for: selena gomez
                                                           Headline  Sentiment Score Sentiment Label                                                                                                                                  Links
       Kygo Just Teased A Brand New Collaboration With Selena Gomez           0.0000         neutral                                           https://www.huffingtonpost.com/entry/selena-gomez-kygo-new-music_us_589e1278e4b0ab2d2b1503d9
      Selena Gomez Shows Love For The Weeknd In Her Instagram Story           0.8555        positive                                               https://www.huffingtonpost.com/entry/selena-gomez-the-weeknd_us_58b44942e4b0780bac2ba469
               14 Toronto Film Festival Movies Worth Your Attention           0.2263         neutral                                https://www.huffingtonpost.com/entry/toronto-film-festival-best-movies-2016_us_57e2d379e4b08d73b82f180a
         Selena Gomez 