In [None]:
import json
import os
import string
from datetime import datetime, timezone

import finnhub
import nltk
import torch
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

In [None]:
# Build a text-generation pipeline backed by the facebook/opt-2.7b checkpoint.
generator = pipeline(task='text-generation', model="facebook/opt-2.7b")
# Example call, kept disabled:
# generator("What is Google?")

[{'generated_text': 'What is Google?\n\nGoogle is a search engine that allows you to search the web.'}]

In [None]:
# Load environment variables (e.g. from a local .env file) into the process.
load_dotenv()

# Read the Finnhub API key and fail fast with a clear message when it is
# missing; otherwise the client would be built with api_key=None and the
# problem would only surface later, at the first API call.
finnhub_api = os.getenv("FINNHUB_API")
if not finnhub_api:
    raise RuntimeError(
        "FINNHUB_API is not set; add it to your .env file or environment."
    )
finnhub_client = finnhub.Client(api_key=finnhub_api)

In [12]:
# Delete any previously saved news file so the next collection starts fresh.
# The removal is attempted directly rather than checking for existence first,
# which avoids a race between the check and the delete.
try:
    os.remove("google_news.json")
except FileNotFoundError:
    # Nothing to delete - the file was never written or is already gone.
    pass

In [None]:
def preprocess_corpus(text):
    """Normalize text for matching: lowercase it and strip ASCII punctuation.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with every character listed in
        ``string.punctuation`` removed. Whitespace is left untouched.
    """
    # Mapping each punctuation character to None tells translate() to drop it.
    punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(punctuation_remover)

def filter_headlines(news_items, keyword="google"):
    """Keep only the news items whose headline mentions ``keyword``.

    The match is a case-insensitive substring test, so the result is the same
    whether or not the headlines (or the keyword) have been lowercased first.

    Args:
        news_items: Iterable of dicts, each expected to carry a ``'headline'``
            string.
        keyword: Substring to look for. Defaults to ``"google"``.

    Returns:
        A new list containing the matching items in their original order.
        Items whose headline is missing or ``None`` never match.
    """
    needle = keyword.casefold()
    return [
        item for item in news_items
        if needle in (item.get('headline') or '').casefold()
    ]

def cosine_similarity_filteration(news_items, threshold=0.8):
    """Drop near-duplicate news items based on TF-IDF cosine similarity.

    Items are scanned in order. An item is kept only when the cosine
    similarity between its headline and every earlier headline (whether that
    earlier item was kept or not) is at most ``threshold``.

    Args:
        news_items: Sequence of dicts, each with a ``'headline'`` string.
        threshold: Similarity above which an item counts as a duplicate of an
            earlier one. Defaults to ``0.8``.

    Returns:
        A new list with the items considered unique, in their original order.
    """
    # With fewer than two items there is nothing to compare, and fitting
    # TfidfVectorizer on an empty list raises ValueError - return a copy.
    if len(news_items) < 2:
        return list(news_items)

    headlines = [item['headline'] for item in news_items]
    # cosine_similarity accepts the sparse TF-IDF matrix directly, so no
    # dense conversion is needed.
    tfidf_matrix = TfidfVectorizer().fit_transform(headlines)
    similarity = cosine_similarity(tfidf_matrix)

    unique_news_items = []
    for i, item in enumerate(news_items):
        # Row i, columns [0, i): similarity to each earlier headline.
        if not (similarity[i, :i] > threshold).any():
            unique_news_items.append(item)

    return unique_news_items

In [None]:
# Start from whatever an earlier run saved. Note that re-running this cell
# while the file exists appends to its contents rather than replacing them.
if os.path.exists("google_news.json"):
    with open("google_news.json", "r") as json_file:
        all_news = json.load(json_file)
else:
    all_news = []

# Query windows as {start date: end date}, one calendar month per entry.
date_dict = {
    # "2024-01-01": "2024-01-31",
    # "2024-02-01": "2024-02-29",  # 2024 is a leap year
    "2024-03-01": "2024-03-31",
    "2024-04-01": "2024-04-30",
    "2024-05-01": "2024-05-31",
    "2024-06-01": "2024-06-30",
    "2024-07-01": "2024-07-31",
    "2024-08-01": "2024-08-31",
}


for sd, ed in date_dict.items():
    google_news = finnhub_client.company_news('GOOG', _from=sd, to=ed)

    for news_item in google_news:
        # Convert the Unix timestamp to a UTC "YYYY-MM-DD" string. A
        # timezone-aware call is used because datetime.utcfromtimestamp is
        # deprecated since Python 3.12.
        news_item['datetime'] = datetime.fromtimestamp(
            news_item['datetime'], tz=timezone.utc
        ).strftime('%Y-%m-%d')
        # Lowercase the headline and strip punctuation.
        news_item['headline'] = preprocess_corpus(news_item['headline'])

    # Keep only headlines containing the keyword "google".
    google_news = filter_headlines(google_news)

    # A window with no matching headlines has nothing to de-duplicate, and
    # fitting TF-IDF on an empty list raises ValueError - skip it.
    if not google_news:
        continue

    # Drop near-duplicate headlines within this window.
    google_news = cosine_similarity_filteration(google_news)

    all_news.extend(google_news)

# Write all news items to the JSON file
with open("google_news.json", "w") as json_file:
    json.dump(all_news, json_file, indent=4)

print(f"# of news headlines: {len(all_news)}")

Total news items collected: 276
