In [1]:
import faiss
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [17]:
# Read at most the first 47,547 data rows of the CSV, then discard every row
# that has a missing value in any column.
metadata = (
    pd.read_csv('sentinlinks.csv', nrows=47547)
    .dropna()
)

In [18]:
# Preview the first five rows of the loaded frame.
metadata.head(n=5)

Unnamed: 0,category,headline,links,abstract,keywords,text,sentiment_score,sentiment_label
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons,143 Miles in 35 Days: Lessons Learned Resting ...,-0.2846,negative
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy,Talking to Yourself: Crazy or Crazy Helpful? T...,0.8074,positive
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug,Crenezumab: Trial Will Gauge Whether Alzheimer...,0.4404,positive
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life,"Oh, What a Difference She Made If you want to ...",0.8402,positive
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods,"Green Superfoods First, the bad news: Soda bre...",0.4664,positive


In [19]:
def map_sentiment(score, threshold=0.3):
    """Bucket a numeric sentiment score into a text label.

    Parameters
    ----------
    score : float
        Sentiment score to classify.
    threshold : float, default 0.3
        Half-width of the neutral band. Scores strictly above ``threshold``
        are 'positive', scores strictly below ``-threshold`` are 'negative',
        and everything in between (both boundaries included) is 'neutral'.

    Returns
    -------
    str or None
        'positive', 'neutral' or 'negative'; ``None`` when ``score`` is
        missing (None / NaN).
    """
    # Every comparison with NaN is False, so without this guard a missing
    # score would fall through to the final branch and be labelled 'negative'.
    if pd.isna(score):
        return None
    if score > threshold:
        return 'positive'
    if score < -threshold:
        return 'negative'
    return 'neutral'

# Derive a label from every row's sentiment_score and store it in the
# 'sentiment_label' column (replacing any values already there).
metadata['sentiment_label'] = metadata['sentiment_score'].map(map_sentiment)


In [20]:
# Spot-check five randomly chosen rows. No random_state is passed, so the
# rows shown differ from run to run.
metadata.sample(n=5)

Unnamed: 0,category,headline,links,abstract,keywords,text,sentiment_score,sentiment_label
12400,ENTERTAINMENT,Calvin Harris Is Not Pleased With All Those 'H...,https://www.huffingtonpost.com/entry/calvin-ha...,More people should debunk rumors with puns.,calvin-harris-taylor-swift-breakup-rumors-happ...,Calvin Harris Is Not Pleased With All Those 'H...,0.3169,positive
28840,FOOD & DRINK,Maison Kayser: Is There Anything Better Than B...,https://www.huffingtonpost.com/entry/maison-ka...,"So, what was for lunch? And how does the newes...",maison-kayser-is-there-an,Maison Kayser: Is There Anything Better Than B...,0.8418,positive
41060,BUSINESS,Secret Santa's Inflation Hedges for 2016,https://www.huffingtonpost.com/entry/secret-sa...,I have something I can give you for the holida...,secret-santas-inflation-h,Secret Santa's Inflation Hedges for 2016 I hav...,0.872,positive
7890,POLITICS,HUFFPOST HILL - Bomb-Carrying Robocops Don't S...,https://www.huffingtonpost.com/entry/huffpost-...,"Despite its best efforts, America is slowly re...",huffpost-hill-bomb-carrying-robocops-dont-see-...,HUFFPOST HILL - Bomb-Carrying Robocops Don't S...,-0.8302,negative
29579,FOOD & DRINK,S'mores Pop-Tarts S'mores,https://www.huffingtonpost.com/entry/smores-po...,Why haven't we thought of this?,smores-poptarts-smores,S'mores Pop-Tarts S'mores Why haven't we thoug...,0.0,neutral


In [21]:
# Preprocess the text data: rebuild 'text' as "<headline> <abstract>",
# lower-cased, with every run of non-word characters collapsed to one space.
# Vectorised .str methods replace three separate per-row Python lambdas;
# the resulting column is the same.
metadata['text'] = (
    (metadata['headline'] + " " + metadata['abstract'])
    .str.lower()
    .str.replace(r'\W+', ' ', regex=True)
)

In [28]:
# Set up the HashingVectorizer: 1000 output columns, and binary=True so that
# non-zero entries are stored as 1 instead of token counts.
# NOTE(review): 1000 hash buckets looks small for a news corpus, so distinct
# words will share columns; confirm this trade-off is intended.
vectorizer = HashingVectorizer(binary=True, n_features=1000)

In [23]:
# Generate the matrix of hashed features: one row per entry of
# metadata['text'], in the same order as the rows of `metadata`.
# NOTE: despite the variable name, this holds the output of `vectorizer`
# (a HashingVectorizer in this notebook), not TF-IDF weights. The name is
# kept because later cells refer to it.
tfidf_matrix = vectorizer.fit_transform(metadata['text'])

In [24]:
# Create a flat L2-distance FAISS index whose dimensionality equals the
# number of feature columns in the matrix.
d = tfidf_matrix.shape[-1]
index = faiss.IndexFlatL2(d)

In [25]:
# Add the vectors to the index.
# FAISS operates on dense float32 matrices; the sparse matrix is cast to
# float32 *before* densifying so no wider dense copy is materialised.
# reset() empties the index first, so re-running this cell cannot add the
# same vectors twice.
index.reset()
index.add(tfidf_matrix.astype(np.float32).toarray())

# Search results are used as row positions into `metadata`, so the index
# must hold exactly one vector per metadata row.
assert index.ntotal == len(metadata), "FAISS index size does not match metadata rows"

In [26]:
# Display the frame as it stands after relabelling and text preprocessing.
metadata

Unnamed: 0,category,headline,links,abstract,keywords,text,sentiment_score,sentiment_label
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons,143 miles in 35 days lessons learned resting i...,-0.2846,neutral
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy,talking to yourself crazy or crazy helpful thi...,0.8074,positive
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug,crenezumab trial will gauge whether alzheimer ...,0.4404,positive
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life,oh what a difference she made if you want to b...,0.8402,positive
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods,green superfoods first the bad news soda bread...,0.4664,positive
...,...,...,...,...,...,...,...,...
47327,SPORTS,Norovirus Outbreak At Winter Olympics Prompts ...,https://www.huffingtonpost.com/entry/norovirus...,Troops will replace hundreds of civilian secur...,norovirus-outbreak-at-winter-olympics-prompts-...,norovirus outbreak at winter olympics prompts ...,0.5859,positive
47328,SPORTS,This Baseball Team Learned There's A Wrong Way...,https://www.huffingtonpost.com/entry/san-jose-...,Many fans were pissed after seeing the minor l...,san-jose-giants-japanese-heritage-night,this baseball team learned there s a wrong way...,-0.7650,negative
47329,SPORTS,Some Young Spurs Fan Dabbed 38 Times In A Sing...,https://www.huffingtonpost.com/entry/dab-kid-s...,"Never change, young man. Never change.",dab-kid-san-antonio-spurs,some young spurs fan dabbed 38 times in a sing...,0.3182,positive
47330,SPORTS,Rasheed Wallace Ejected From Knicks-Suns Game ...,https://www.huffingtonpost.com/entry/rasheed-w...,Wallace was hit with a first technical for a h...,rasheed-wallace-ejected-knicks-suns-ball-dont-lie,rasheed wallace ejected from knicks suns game ...,-0.4588,negative


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
import pandas as pd

def get_recommendations(query, vectorizer, index, metadata, k=10):
    """Return the articles nearest to a free-text query as a printable table.

    Parameters
    ----------
    query : str
        Free-text search string.
    vectorizer : object
        Text vectorizer; ``vectorizer.transform([query])`` must return an
        object exposing ``toarray()``.
    index : object
        FAISS-style index exposing ``search(vectors, k) -> (distances, ids)``.
        The returned ids are used as row *positions* in ``metadata``.
    metadata : pandas.DataFrame
        Frame with 'headline', 'links', 'sentiment_score' and
        'sentiment_label' columns, in the order the vectors were indexed.
    k : int, default 10
        Maximum number of neighbours to request.

    Returns
    -------
    str
        ``DataFrame.to_string(index=False)`` rendering of the matches with
        columns Headline, Sentiment Score, Sentiment Label, Links. The table
        has fewer than ``k`` rows when the index yields fewer valid ids, and
        is empty when the query vectorizes to all zeros.
    """
    # Source column -> display header, in output order.
    display_names = {
        'headline': 'Headline',
        'sentiment_score': 'Sentiment Score',
        'sentiment_label': 'Sentiment Label',
        'links': 'Links',
    }

    # FAISS expects a dense, C-contiguous float32 matrix.
    query_matrix = np.ascontiguousarray(
        vectorizer.transform([query]).toarray(), dtype=np.float32
    )

    if query_matrix.any():
        _distances, neighbor_ids = index.search(query_matrix, k)
        hits = np.asarray(neighbor_ids)[0]
        # FAISS pads the id list with -1 when it has fewer than k results;
        # passed to iloc, -1 would silently select the last row instead.
        hits = hits[hits >= 0]
    else:
        # An all-zero query carries no information, so any "nearest"
        # neighbours would be arbitrary: report no matches.
        hits = np.empty(0, dtype=np.int64)

    # One positional lookup for all hits instead of four per neighbour.
    recommendations = (
        metadata.iloc[hits][list(display_names)].rename(columns=display_names)
    )
    return recommendations.to_string(index=False)

# Example usage: read a search string from the user, look up its nearest
# articles, and print the formatted table.
x = query = input("Search: ")
# NOTE: despite its name, `recommendations_df` holds the formatted string
# returned by get_recommendations, not a DataFrame.
recommendations_df = get_recommendations(query, vectorizer, index, metadata)
print("Recommendations for:", query)
print(recommendations_df)

Recommendations for: messi
                                                                   Headline  Sentiment Score Sentiment Label                                                                                                                                   Links
               Gwen Stefani Rocked The Sheer Trend Back In The '90s (PHOTO)           0.0000         neutral                                            https://www.huffingtonpost.com/entry/no-doubt-gwen-stefani-photo_us_5b9c88d9e4b03a1dcc7f8c86
                                 The Fake People Behind Your Favorite Foods           0.4404        positive                                              https://www.huffingtonpost.com/entry/the-fake-people-behind-yo_us_5b9d796ce4b03a1dcc8878bd
            New Spending Agreement Repeals Obamacare's Mythical Death Panel           0.3945        positive                                 https://www.huffingtonpost.com/entry/spending-bill-obamacare-advisory-board_us_5a7b7af6e4b08dfc92