In [1]:
import torch

In [2]:
from transformers import pipeline

# Build a text-generation pipeline backed by the OPT-2.7B checkpoint and
# run a single prompt through it as a quick smoke test.
model_name = "facebook/opt-2.7b"
generator = pipeline('text-generation', model=model_name)

prompt = "What is Google?"
generator(prompt)

[{'generated_text': 'What is Google?\n\nGoogle is a search engine that allows you to search the web.'}]

In [None]:
# load env variables
from dotenv import load_dotenv
import os

# Read variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Fail fast with an actionable message when the key is missing, instead of
# silently passing None on to the API client created further down.
finnhub_api = os.getenv("FINNHUB_API")
if not finnhub_api:
    raise RuntimeError(
        "FINNHUB_API is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [None]:
import finnhub
import json
from datetime import datetime

# Client authenticated with the key loaded from the environment.
finnhub_client = finnhub.Client(api_key=finnhub_api)

# Company news for the GOOG ticker over the requested date window.
ticker, start_date, end_date = 'GOOG', "2024-01-01", "2024-12-31"
google_news = finnhub_client.company_news(ticker, _from=start_date, to=end_date)

# Number of news items returned by the API.
print(len(google_news))

245


In [5]:
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def preprocess_corpus(text):
    """Normalize a piece of text for similarity comparison.

    The text is lowercased and every character listed in
    ``string.punctuation`` (ASCII punctuation) is removed. Whitespace and
    all other characters are left untouched.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with ASCII punctuation stripped out.
    """
    punctuation_chars = frozenset(string.punctuation)
    return "".join(ch for ch in text.lower() if ch not in punctuation_chars)

from datetime import timezone

# Normalize every news item in place: clean the headline and turn the unix
# timestamp into a Year-month-day string (UTC).
for news_item in google_news:
    news_item['headline'] = preprocess_corpus(news_item['headline'])

    # Only convert while the value is still a numeric unix timestamp, so that
    # re-running this cell does not fail on an already-formatted date string.
    published = news_item['datetime']
    if isinstance(published, (int, float)):
        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp();
        # yields the same UTC calendar date.
        news_item['datetime'] = datetime.fromtimestamp(published, tz=timezone.utc).strftime('%Y-%m-%d')

# cosine similarity filtering
def cosine_similarity_fileration(news_items, threshold=0.5):
    """Filter out news items whose headline closely matches an earlier one.

    Headlines are vectorized with TF-IDF and compared pairwise by cosine
    similarity. Items are scanned in order, and an item is kept only when its
    similarity to every earlier headline is at most ``threshold``. Note that
    the comparison is against all earlier items, including ones that were
    themselves dropped.

    Args:
        news_items: Sequence of dicts, each providing a 'headline' string.
        threshold: Similarity strictly above this value marks an item as a
            duplicate of an earlier one. Defaults to 0.5.

    Returns:
        A new list with the retained items (the same dict objects), in their
        original order. An empty input yields an empty list.
    """
    # TfidfVectorizer cannot be fitted on zero documents; nothing to filter.
    if len(news_items) == 0:
        return []

    headlines = [item['headline'] for item in news_items]

    # Keep the TF-IDF matrix sparse: cosine_similarity accepts sparse input
    # and still returns a dense (n_items, n_items) array by default.
    tfidf_matrix = TfidfVectorizer().fit_transform(headlines)
    cosine_matrix = cosine_similarity(tfidf_matrix)

    unique_news_items = []
    for i, item in enumerate(news_items):
        # Row slice [:i] holds similarities to all earlier items; for i == 0
        # it is empty, so the first item is always kept.
        if not (cosine_matrix[i, :i] > threshold).any():
            unique_news_items.append(item)

    return unique_news_items

In [6]:
# Drop near-duplicate headlines using the function's default threshold (0.5).
# This rebinds google_news to the filtered list, so later cells see only the
# retained items; the printed value is the number of items that remain.
google_news = cosine_similarity_fileration(google_news)
print(len(google_news))

221


In [7]:
# Persist the news items to disk as pretty-printed (4-space indented) JSON.
with open("google_news.json", "w") as json_file:
    serialized_news = json.dumps(google_news, indent=4)
    json_file.write(serialized_news)