In [10]:
import torch
import finnhub
import json
from datetime import datetime
import os
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
from transformers import pipeline
import yfinance as yf
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
generator = pipeline('text-generation', model="facebook/opt-2.7b")
#generator("What is Google?")

Device set to use cpu


In [13]:
# load env variables
load_dotenv()

finnhub_api = os.getenv("FINNHUB_API")
finnhub_client = finnhub.Client(api_key=finnhub_api)

In [14]:
if os.path.exists("google_news.json"):
    os.remove("google_news.json")

In [17]:
def preprocess_corpus(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def filter_headlines(news_items, keyword="google"):
    return [item for item in news_items if keyword in item['headline']]

def cosine_similarity_filteration(news_items, threshold=0.8):
    headlines = [item['headline'] for item in news_items]
    vectorizer = TfidfVectorizer().fit_transform(headlines)
    vectors = vectorizer.toarray()
    cosine_matrix = cosine_similarity(vectors)
    
    unique_news_items = []
    for i in range(len(news_items)):
        is_unique = True
        for j in range(i):
            if cosine_matrix[i][j] > threshold:
                is_unique = False
                break
        if is_unique:
            unique_news_items.append(news_items[i])
    
    return unique_news_items

In [None]:
# Check if the JSON file exists and read its content if it does
if os.path.exists("google_news.json"):
    with open("google_news.json", "r") as json_file:
        all_news = json.load(json_file)
else:
    all_news = []

date_dict = {
    # "2024-01-01": "2024-01-31",
    # "2024-02-01": "2024-02-28",
    "2024-03-01": "2024-03-31",
    "2024-04-01": "2024-04-30",
    "2024-05-01": "2024-05-31",
    "2024-06-01": "2024-06-30",
    "2024-07-01": "2024-07-31",
    "2024-08-01": "2024-08-31",
}


for sd, ed in date_dict.items():
    google_news = finnhub_client.company_news('GOOG', _from=sd, to=ed)

    for news_item in google_news:
        # Convert unix time to Year-month-day
        news_item['datetime'] = datetime.utcfromtimestamp(news_item['datetime']).strftime('%Y-%m-%d')
        # Preprocess the headline
        news_item['headline'] = preprocess_corpus(news_item['headline'])

    # Filter headlines containing the keyword "google"
    google_news = filter_headlines(google_news)

    # Apply cosine similarity filtering
    google_news = cosine_similarity_filteration(google_news)

    all_news.extend(google_news)

# Write all news items to the JSON file
with open("google_news.json", "w") as json_file:
    json.dump(all_news, json_file, indent=4)

print(f"# of news headlines: {len(all_news)}")

FinnhubAPIException: FinnhubAPIException(status_code: 401): Please use an API key.

In [40]:
# Fetch stock data using yfinance
symbol = 'GOOG'
start_date = '2024-01-01'
end_date = '2025-01-01'
stock_data = yf.download(symbol, start=start_date, end=end_date)

stock_data['Daily Return'] = stock_data['Close'].pct_change()
stock_data['Excess Return'] = stock_data['Daily Return'] - stock_data['Daily Return'].mean()
stock_data['3-Day Excess Return'] = stock_data['Excess Return'].rolling(window=3).sum()
stock_data = stock_data.dropna()

stock_data_train = stock_data['2024-01-01':'2024-09-01']
stock_data_test = stock_data['2024-09-01':'2025-01-01']

stock_data_train.to_csv('stock_returns_training.csv')
stock_data_test.to_csv('stock_returns.csv')

print(stock_data_test.head())

[*********************100%***********************]  1 of 1 completed

Price            Close        High         Low        Open    Volume  \
Ticker            GOOG        GOOG        GOOG        GOOG      GOOG   
Date                                                                   
2024-09-03  158.221985  162.980320  157.468827  162.915477  26533100   
2024-09-04  157.423935  160.007595  157.054845  157.688286  17410700   
2024-09-05  158.212006  160.621091  157.134646  157.394004  14139500   
2024-09-06  151.757843  158.830495  151.563313  158.301793  24999100   
2024-09-09  149.370529  154.464756  148.032051  153.455906  28057700   

Price      Daily Return Excess Return 3-Day Excess Return  
Ticker                                                     
Date                                                       
2024-09-03    -0.039368     -0.040773           -0.039806  
2024-09-04    -0.005044     -0.006449           -0.038163  
2024-09-05     0.005006      0.003601           -0.043622  
2024-09-06    -0.040794     -0.042200           -0.045048  
202


