## 1. Data Collection: Yahoo Finance

In [None]:
import yfinance as yf
import pandas as pd

# Function to fetch Yahoo Finance news
def fetch_yahoo_news(stock_symbol, news_count=10):
    """Fetch Yahoo Finance news items for a search term as a DataFrame.

    Args:
        stock_symbol: Query handed to ``yf.Search`` (e.g. ``"AAPL"``).
        news_count: Number of news items requested from the search.

    Returns:
        DataFrame with the columns ``title``, ``publisher``, ``link`` and
        ``providerPublishTime``; the latter is converted from epoch seconds
        to datetimes. When the search returns no news, an empty frame with
        the same columns is returned instead of raising ``KeyError``.
    """
    columns = ['title', 'publisher', 'link', 'providerPublishTime']

    search_result = yf.Search(stock_symbol, news_count=news_count)
    news_data = search_result.news  # Extract news list

    if not news_data:
        # Nothing to select from: build the expected schema by hand so the
        # caller can still print / save the result.
        empty = pd.DataFrame(columns=columns)
        empty['providerPublishTime'] = empty['providerPublishTime'].astype('datetime64[ns]')
        return empty

    # Copy the column subset so the assignment below modifies an independent
    # frame rather than a slice of the full one (avoids SettingWithCopyWarning).
    df = pd.DataFrame(news_data)[columns].copy()

    # Convert publish time to datetime format
    df['providerPublishTime'] = pd.to_datetime(df['providerPublishTime'], unit='s')

    return df

# Example: Fetch AAPL news
stock_symbol = "AAPL"
yahoo_news_df = fetch_yahoo_news(stock_symbol, news_count=10)

# Display first few rows
print(yahoo_news_df.head())

# Save to Parquet for future use
import os  # imported here so this step works even when run on its own

# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./news_data", exist_ok=True)
yahoo_news_df.to_parquet("./news_data/Yahoo_Finance_News.parquet", index=False)

## 2. Data Collection: Google News

In [None]:

from GoogleNews import GoogleNews
import pandas as pd
import dateparser

# Function to convert relative dates into proper timestamps
def convert_relative_time(relative_time):
    """Parse date text such as ``"2 hours ago"`` with ``dateparser``.

    Args:
        relative_time: Date text to parse. Values that are not strings
            (e.g. ``None`` or NaN from a missing cell) and blank strings
            are accepted and treated as "no date".

    Returns:
        The result of ``dateparser.parse(relative_time)`` for non-blank
        strings, otherwise ``None`` without calling the parser.
    """
    # Guard first: the parser is only meant for text, and a column coming out
    # of a scraped table can contain missing values.
    if not isinstance(relative_time, str) or not relative_time.strip():
        return None
    return dateparser.parse(relative_time)

# Function to fetch multiple pages of Google News headlines
def fetch_google_news(stock_symbol, pages=5, trusted_sources=None):
    """Fetch Google News headlines for a search term, limited to trusted outlets.

    Args:
        stock_symbol: Search term passed to ``GoogleNews.search``.
        pages: Number of extra result pages to request after the initial
            search; pages ``2`` through ``pages + 1`` are fetched.
        trusted_sources: Optional collection of outlet names to keep, matched
            exactly against the ``media`` column. Defaults to Bloomberg,
            CNBC, Reuters, Yahoo Finance, MarketWatch and WSJ.

    Returns:
        DataFrame with the columns ``title``, ``media``, ``link`` and
        ``date``, containing only rows whose ``media`` is a trusted source,
        with exact duplicate rows removed. An empty frame with those columns
        is returned when nothing was collected.
    """
    columns = ['title', 'media', 'link', 'date']
    if trusted_sources is None:
        trusted_sources = ["Bloomberg", "CNBC", "Reuters", "Yahoo Finance", "MarketWatch", "WSJ"]

    googlenews = GoogleNews(lang='en', region='US', period="1d")
    googlenews.search(stock_symbol)

    all_results = []

    # Iterate through multiple pages
    for i in range(2, pages + 2):  # Start from page 2 to avoid duplicates
        googlenews.getpage(i)
        all_results.extend(googlenews.result())

    if not all_results:
        # Selecting columns from a column-less frame would raise KeyError.
        return pd.DataFrame(columns=columns)

    # Convert to DataFrame and keep only relevant columns
    df = pd.DataFrame(all_results)[columns]

    # NOTE(review): result() appears to return everything fetched so far, not
    # just the latest page, so re-reading it each iteration repeats earlier
    # rows -- confirm against the installed GoogleNews version. Dropping exact
    # repeats is correct under either behaviour.
    df = df.drop_duplicates()

    # Filter only trusted news sources; copy so callers can assign new
    # columns without a SettingWithCopyWarning.
    return df[df['media'].isin(trusted_sources)].copy()

# Example: Fetch Google News for AAPL (5 pages of results)
google_news_df = fetch_google_news("AAPL", pages=5)
google_news_df['date'] = google_news_df['date'].apply(convert_relative_time)
# Drop duplicates based on title and link
google_news_df.drop_duplicates(subset=['title', 'link'], inplace=True)
# Newest headlines first
google_news_df.sort_values(by='date', ascending=False, inplace=True)
# Save to Parquet for future use
import os  # imported here so this step works even when run on its own

# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./news_data", exist_ok=True)
google_news_df.to_parquet("./news_data/Google_News.parquet", index=False)

## 3. Sentiment Analysis with NLTK VADER (Yahoo Finance & Google News)

In [None]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Download VADER if not already installed
try:
    # Look for the lexicon in the local NLTK data directories first so the
    # notebook does not need to reach the download server on every run.
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

In [None]:
# Read back the headline tables written by the two collection steps
yf_news_data = pd.read_parquet("./news_data/Yahoo_Finance_News.parquet")
google_news_data = pd.read_parquet("./news_data/Google_News.parquet")

# Preview the first rows of each table under its own heading
print("\nYF News Data:", yf_news_data.head(), sep="\n")
print("\nGoogle News Data:", google_news_data.head(), sep="\n")

In [None]:
# Function to compute sentiment score
def get_sentiment(text):
    """Return the compound polarity score of ``text`` from the ``sia`` analyzer.

    Args:
        text: Text to score (a news headline in this notebook). Values that
            are not strings (``None``, NaN) and blank strings are accepted.

    Returns:
        The ``'compound'`` entry of ``sia.polarity_scores(text)``. Missing or
        blank input is scored as a neutral ``0.0`` without calling the
        analyzer, so a single bad title cannot abort a whole column apply.
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0
    return sia.polarity_scores(text)['compound']

# Score every headline in both tables with get_sentiment
yf_news_data['sentiment_score'] = yf_news_data['title'].map(get_sentiment)
google_news_data['sentiment_score'] = google_news_data['title'].map(get_sentiment)

# Show a sample of headlines next to their scores
print("YF Score:\n", yf_news_data.head()[['title', 'sentiment_score']])
print("\nGoogle Score\n", google_news_data.head()[['title', 'sentiment_score']])

# Persist the scored tables next to the raw ones
yf_news_data.to_parquet(
    "./news_data/Yahoo_Finance_News_with_Sentiment.parquet",
    index=False,
)
google_news_data.to_parquet(
    "./news_data/Google_News_with_Sentiment.parquet",
    index=False,
)

In [None]:
yf_news_data.isna().sum()  # Number of missing values in each column

In [None]:
google_news_data.isna().sum()  # Number of missing values in each column