## 1. Data Collection: Yahoo Finance (YF)

In [20]:
from pathlib import Path

import pandas as pd
import yfinance as yf

# Function to fetch Yahoo Finance news
def fetch_yahoo_news(stock_symbol, news_count=10):
    """Fetch recent Yahoo Finance news headlines for a ticker.

    Parameters
    ----------
    stock_symbol : str
        Query handed to ``yf.Search`` (e.g. ``"AAPL"``).
    news_count : int, default 10
        Forwarded to ``yf.Search`` as ``news_count``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``publisher``, ``link`` and
        ``providerPublishTime`` (converted with ``unit='s'`` to datetimes).
        When the search yields no news the frame is empty but still carries
        these four columns; a field absent from the payload becomes NaN/NaT.
    """
    columns = ['title', 'publisher', 'link', 'providerPublishTime']

    search_result = yf.Search(stock_symbol, news_count=news_count)
    news_data = search_result.news or []  # tolerate None / empty results

    # Build the frame with an explicit column list instead of slicing a wider
    # frame: the schema stays stable for empty results (no KeyError) and the
    # frame is not a slice, so the assignment below cannot trigger
    # SettingWithCopyWarning.
    df = pd.DataFrame(news_data, columns=columns)

    # unit='s': the raw values are interpreted as Unix epoch seconds
    df['providerPublishTime'] = pd.to_datetime(df['providerPublishTime'], unit='s')

    return df

# Example: Fetch AAPL news
stock_symbol = "AAPL"
yahoo_news_df = fetch_yahoo_news(stock_symbol, news_count=10)

# Display first few rows
print(yahoo_news_df.head())

# Save to Parquet for future use.
# to_parquet does not create missing parent folders, so make sure the output
# directory exists first; otherwise a fresh checkout fails on this line.
Path("./news_data").mkdir(parents=True, exist_ok=True)
yahoo_news_df.to_parquet("./news_data/Yahoo_Finance_News.parquet", index=False)

                                               title            publisher  \
0  Is Apple Inc. (AAPL) the Best Magic Formula St...       Insider Monkey   
1  Analyst Explains Apple’s (AAPL) Biggest Challe...       Insider Monkey   
2  Jim Cramer Says Apple Inc. (AAPL) ‘Didn’t Have...       Insider Monkey   
3  3 things to know about Apple's lower-cost iPho...  Yahoo Finance Video   
4  Apple Inc. (AAPL) Unveils $500 Billion U.S. In...       Insider Monkey   

                                                link providerPublishTime  
0  https://finance.yahoo.com/news/apple-inc-aapl-... 2025-02-27 18:27:37  
1  https://finance.yahoo.com/news/analyst-explain... 2025-02-27 14:57:14  
2  https://finance.yahoo.com/news/jim-cramer-says... 2025-02-27 12:15:54  
3  https://finance.yahoo.com/video/3-things-know-... 2025-02-27 18:07:01  
4  https://finance.yahoo.com/news/apple-inc-aapl-... 2025-02-27 06:02:11  


## 2. Sentiment via NLTK (VADER) (YF)

In [21]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Fetch the VADER lexicon used for sentiment scoring; when the package is
# already present, nltk only reports that it is up-to-date
nltk.download('vader_lexicon')

# Shared analyzer instance; get_sentiment() in a later cell reads this global
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\naush\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [22]:
# Load the Yahoo Finance headlines that the data-collection section wrote to
# Parquet, so this section works from the file on disk
news_data = pd.read_parquet("./news_data/Yahoo_Finance_News.parquet")

# Print the first five rows as a quick check of the loaded columns
print(news_data.head())

                                               title            publisher  \
0  Is Apple Inc. (AAPL) the Best Magic Formula St...       Insider Monkey   
1  Analyst Explains Apple’s (AAPL) Biggest Challe...       Insider Monkey   
2  Jim Cramer Says Apple Inc. (AAPL) ‘Didn’t Have...       Insider Monkey   
3  3 things to know about Apple's lower-cost iPho...  Yahoo Finance Video   
4  Apple Inc. (AAPL) Unveils $500 Billion U.S. In...       Insider Monkey   

                                                link providerPublishTime  
0  https://finance.yahoo.com/news/apple-inc-aapl-... 2025-02-27 18:27:37  
1  https://finance.yahoo.com/news/analyst-explain... 2025-02-27 14:57:14  
2  https://finance.yahoo.com/news/jim-cramer-says... 2025-02-27 12:15:54  
3  https://finance.yahoo.com/video/3-things-know-... 2025-02-27 18:07:01  
4  https://finance.yahoo.com/news/apple-inc-aapl-... 2025-02-27 06:02:11  


In [23]:
# Function to compute sentiment score
def get_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    Parameters
    ----------
    text : str
        Text to score (in this notebook: a news headline).

    Returns
    -------
    float
        ``sia.polarity_scores(text)['compound']`` for string input.
        ``float('nan')`` for non-string input such as ``None`` or NaN from a
        missing title, so a single bad row does not abort a whole
        ``Series.apply`` and the gap remains visible to ``isnull()`` checks.
    """
    # Only strings are handed to the analyzer; anything else is reported as
    # "no score" rather than letting the analyzer raise mid-column.
    if not isinstance(text, str):
        return float('nan')
    return sia.polarity_scores(text)['compound']

# Score every headline and keep the result in a new column
news_data['sentiment_score'] = news_data['title'].map(get_sentiment)

# Preview the headline / score pairs
print(news_data.loc[:, ['title', 'sentiment_score']].head())

# Persist the scored headlines as Parquet for later sections
news_data.to_parquet("./news_data/Yahoo_Finance_News_with_Sentiment.parquet", index=False)

                                               title  sentiment_score
0  Is Apple Inc. (AAPL) the Best Magic Formula St...           0.6369
1  Analyst Explains Apple’s (AAPL) Biggest Challe...           0.0772
2  Jim Cramer Says Apple Inc. (AAPL) ‘Didn’t Have...           0.0000
3  3 things to know about Apple's lower-cost iPho...           0.0000
4  Apple Inc. (AAPL) Unveils $500 Billion U.S. In...           0.4019


In [24]:
# Count missing values per column (isna is the same check as isnull)
news_data.isna().sum()

title                  0
publisher              0
link                   0
providerPublishTime    0
sentiment_score        0
dtype: int64

## 3. Data Collection: Google News

In [25]:
from GoogleNews import GoogleNews
import pandas as pd
import dateparser

# Function to convert relative dates into proper timestamps
def convert_relative_time(relative_time):
    """Turn a date string (e.g. a relative timestamp) into a parsed date.

    Thin wrapper: ``relative_time`` is handed to ``dateparser.parse`` and
    whatever that call produces is returned unchanged.
    """
    parsed_timestamp = dateparser.parse(relative_time)
    return parsed_timestamp

# Function to fetch multiple pages of Google News headlines
def fetch_google_news(stock_symbol, pages=5):
    """Fetch Google News headlines for a ticker, limited to trusted publishers.

    Parameters
    ----------
    stock_symbol : str
        Search term passed to ``GoogleNews.search`` (e.g. ``"AAPL"``).
    pages : int, default 5
        Number of follow-up pages requested with ``getpage`` after the
        initial ``search`` call (page numbers ``2 .. pages + 1``).

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``media``, ``link`` and ``date`` (``date`` is left
        exactly as delivered by the client), one row per distinct article
        whose ``media`` is in the trusted list, with a fresh 0..n-1 index.
        Empty, but with the same four columns, when nothing is found.
    """
    columns = ['title', 'media', 'link', 'date']
    trusted_sources = ["Bloomberg", "CNBC", "Reuters", "Yahoo Finance", "MarketWatch", "WSJ"]

    googlenews = GoogleNews(lang='en', region='US', period="1d")
    googlenews.search(stock_symbol)

    all_results = []

    # Iterate through the follow-up pages (page numbering starts at 2 here)
    for page in range(2, pages + 2):
        googlenews.getpage(page)
        all_results.extend(googlenews.result())

    # An explicit column list keeps the schema stable when there are no
    # results at all (slicing an empty frame would raise KeyError).
    df = pd.DataFrame(all_results, columns=columns)

    # NOTE(review): result() appears to return everything the client has
    # accumulated so far, not just the latest page - confirm against the
    # installed GoogleNews version. Reading it once per page therefore
    # re-adds earlier articles, so exact repeats are dropped here.
    df = df.drop_duplicates()

    # Filter only trusted news sources
    df = df[df['media'].isin(trusted_sources)]

    # reset_index returns a new, independent frame, so callers can add or
    # overwrite columns without SettingWithCopyWarning.
    return df.reset_index(drop=True)

# Example: Fetch Google News for AAPL (5 pages of results)
# The frame is built in one chain instead of being mutated in place:
# assigning a column onto the filtered frame returned by fetch_google_news
# can raise SettingWithCopyWarning, and inplace=True brings no benefit.
google_news_df = (
    fetch_google_news("AAPL", pages=5)
    # Replace the textual 'date' values with what convert_relative_time returns
    .assign(date=lambda frame: frame['date'].apply(convert_relative_time))
    # Newest articles first
    .sort_values(by='date', ascending=False)
)

# Save to Parquet for future use
google_news_df.to_parquet("./news_data/Google_News.parquet", index=False)