In [12]:
import numpy as np
import pandas as pd
import time
import twython
import requests
import nltk
import warnings
from newspaper import Article
from htmldate import find_date
from tqdm import tqdm
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Make sure the VADER lexicon used by SentimentIntensityAnalyzer is available
# locally (nltk.download is the same callable as nltk.downloader.download).
nltk.download('vader_lexicon')

# Silence every Python warning for the remainder of the session.
warnings.simplefilter('ignore')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ctori\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [13]:
import sys

# NOTE(review): this prepends a Linux chromedriver location to Python's module
# search path; the captured nltk output shows a Windows machine, so this entry
# presumably has no effect there -- confirm whether it is still needed.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException

# Chrome launch flags for running without a visible window.
# NOTE(review): `options` is only consumed by the commented-out driver line
# below; nothing in this cell passes it to a live webdriver.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-gpu'):
    options.add_argument(chrome_flag)
# driver = webdriver.Chrome(options=options)

In [14]:
def get_article_links(company, page_number):
    """Collect the article links shown on one investing.com listing page.

    Opens ``https://investing.com/equities/<company>/<page_number>`` in a new
    Chrome session, scrolls until the scroll position stops changing (so that
    lazily loaded entries are rendered), and gathers every anchor inside the
    first element with CSS class ``mb-4``.

    Parameters
    ----------
    company : str
        URL slug of the listing, e.g. ``"apple-computer-inc-opinion"``.
    page_number : int
        1-based page index appended to the listing URL.

    Returns
    -------
    numpy.ndarray or list
        Sorted, de-duplicated array of link strings on success; an empty list
        when Selenium raises a ``WebDriverException`` (the error is printed).

    Notes
    -----
    Links whose URL contains ``"comments"`` are skipped. A fresh browser is
    started for every call and is always shut down before returning.
    """
    # Local import so this cell does not depend on edits to the import cell.
    from urllib.parse import urljoin

    base_url = "https://investing.com/equities"
    url = f"{base_url}/{company}/{page_number}"

    # JavaScript snippets reused by the scroll loop below.
    read_scroll_position_js = (
        "return (window.pageYOffset !== undefined) ? "
        "window.pageYOffset : (document.documentElement || "
        "document.body.parentNode || document.body);"
    )
    scroll_to_bottom_js = (
        "var scrollingElement = (document.scrollingElement || "
        "document.body);scrollingElement.scrollTop = "
        "scrollingElement.scrollHeight;"
    )

    driver = None
    try:
        driver = webdriver.Chrome()
        driver.get(url)

        # Scroll to the bottom repeatedly until the position stops changing.
        old_scroll_position = 0
        new_scroll_position = None
        while new_scroll_position != old_scroll_position:
            old_scroll_position = driver.execute_script(read_scroll_position_js)
            time.sleep(1)
            driver.execute_script(scroll_to_bottom_js)
            new_scroll_position = driver.execute_script(read_scroll_position_js)

        # Anchors inside the container that holds the article list.
        article_div_element = driver.find_element(By.CLASS_NAME, "mb-4")
        article_links = article_div_element.find_elements(By.TAG_NAME, "a")

        cleaned_links = []
        for article_link in article_links:
            partial_link = article_link.get_attribute("href")
            if not partial_link or "comments" in partial_link:
                continue
            if "https" in partial_link:
                cleaned_links.append(partial_link)
            elif partial_link.startswith("/"):
                # Root-relative hrefs must be resolved against the site root,
                # not appended to ".../equities" (which produced invalid
                # ".../equities/analysis/..." URLs).
                cleaned_links.append(urljoin(url, partial_link))
        return np.unique(cleaned_links)
    except WebDriverException as e:
        print(f"An error occurred while accessing the website: {e}")
        return []
    finally:
        # `driver` is still None when webdriver.Chrome() itself failed;
        # calling quit() on it would raise AttributeError and hide the
        # original error.
        if driver is not None:
            driver.quit()

In [20]:
# Listing-page slug on investing.com for each company of interest.
companies = {"apple":"apple-computer-inc-opinion", "microsoft":"microsoft-corp-opinion", "amazon":"amazon-com-inc-opinion"}

# Pages 1..MAX_PAGES are requested for every company.
MAX_PAGES = 99
# Give up on a company after this many empty pages in a row. An empty result
# means either the listing has ended or the page failed to load, and a failed
# load can block for minutes, so walking all remaining pages is very costly.
MAX_CONSECUTIVE_EMPTY_PAGES = 3

# One result list per company; keys are derived from `companies` so the two
# dictionaries cannot drift apart.
article_urls = {company: [] for company in companies}

for company, listing_slug in companies.items():
    consecutive_empty_pages = 0
    for page in range(1, MAX_PAGES + 1):
        results = get_article_links(listing_slug, page)
        print(f"Company: {company}, Page: {page}, Results: {len(results)} articles found")
        article_urls[company].extend(results)

        if len(results) == 0:
            consecutive_empty_pages += 1
            if consecutive_empty_pages >= MAX_CONSECUTIVE_EMPTY_PAGES:
                print(f"Company: {company}, stopping after {consecutive_empty_pages} consecutive empty pages")
                break
        else:
            consecutive_empty_pages = 0

    # The same link can appear on several listing pages; keep the first
    # occurrence only so each URL is downloaded once later on.
    article_urls[company] = list(dict.fromkeys(article_urls[company]))

Company: apple, Page: 1, Results: 19 articles found
Company: apple, Page: 2, Results: 18 articles found
Company: apple, Page: 3, Results: 18 articles found
Company: apple, Page: 4, Results: 18 articles found
Company: apple, Page: 5, Results: 20 articles found
Company: apple, Page: 6, Results: 19 articles found
Company: apple, Page: 7, Results: 19 articles found
Company: apple, Page: 8, Results: 20 articles found
Company: apple, Page: 9, Results: 19 articles found
Company: apple, Page: 10, Results: 17 articles found
Company: apple, Page: 11, Results: 18 articles found
Company: apple, Page: 12, Results: 17 articles found
Company: apple, Page: 13, Results: 18 articles found
Company: apple, Page: 14, Results: 16 articles found
Company: apple, Page: 15, Results: 18 articles found
Company: apple, Page: 16, Results: 17 articles found
An error occurred while accessing the website: Message: timeout: Timed out receiving message from renderer: 298.110
  (Session info: chrome=119.0.6045.160)
Stack

In [21]:
from newspaper import Article, Config
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import time

# Original data
# NOTE(review): 'APPL' is not Apple's exchange ticker (that is 'AAPL'). It is
# left unchanged here because the string is written to the 'ticker' column and
# is used to build the CSV file names; confirm downstream usage before renaming.
input_tickers = ['APPL', 'MSFT', 'AMZN']

# Dictionary to store article sentiments for each ticker
ticker_sentiments = {}

# Mapping between tickers and company names (the keys of `article_urls`)
ticker_company_name_data = {'APPL': "apple", 'MSFT': "microsoft", 'AMZN': "amazon"}

# Download settings shared by every newspaper Article.
config = Config()
config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
config.request_timeout = 15

# Column order of every per-ticker DataFrame.
SENTIMENT_COLUMNS = [
    'ticker', 'publish_date', 'title', 'body_text', 'url',
    'neg', 'neu', 'pos', 'compound',
]
MAX_DOWNLOAD_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 5

# The analyzer holds no per-article state, so one instance is enough.
sid = SentimentIntensityAnalyzer()


def _download_article(link):
    """Download and parse `link`, retrying on failure.

    Returns the parsed ``Article``, or ``None`` when every attempt failed.
    """
    article = Article(link, config=config)
    for attempt in range(MAX_DOWNLOAD_ATTEMPTS):
        try:
            article.download()
            article.parse()
            return article
        except Exception as e:
            print(f"Retry {attempt+1}: Error while downloading article {link}: {e}")
            # No point in waiting after the final attempt.
            if attempt + 1 < MAX_DOWNLOAD_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)
    print(f"Failed to download article after multiple attempts: {link}")
    return None


def _extract_publish_date(article, link):
    """Return the publication date found in the downloaded HTML, or ``None``.

    The HTML already fetched by newspaper is reused so that no second request
    is issued per article; `link` is passed along as a hint for URL-based
    date detection.
    """
    try:
        return find_date(article.html, url=link)
    except Exception as e:
        print(f"Could not determine publish date for {link}: {e}")
        return None


for input_ticker in input_tickers:
    # Rows are collected in a plain list and turned into a DataFrame once;
    # concatenating inside the loop copies the whole frame on every article.
    article_records = []

    for link in article_urls[ticker_company_name_data[input_ticker]]:
        article = _download_article(link)
        if article is None:
            continue

        # Article information followed by the neg / neu / pos / compound scores.
        article_data = {
            'ticker': input_ticker,
            'publish_date': _extract_publish_date(article, link),
            'title': article.title,
            'body_text': article.text,
            'url': link,
        }
        article_data.update(sid.polarity_scores(article.text))
        article_records.append(article_data)

    # `columns=` keeps the column order and yields a correctly shaped empty
    # frame when no article could be downloaded for this ticker.
    ticker_sentiments[input_ticker] = pd.DataFrame(article_records, columns=SENTIMENT_COLUMNS)

Retry 1: Error while downloading article https://www.investing.com/members/contributors/225786149: Article `download()` failed with HTTPSConnectionPool(host='www.investing.com', port=443): Read timed out. (read timeout=15) on URL https://www.investing.com/members/contributors/225786149
Retry 1: Error while downloading article https://www.investing.com/members/contributors/233480972: Article `download()` failed with HTTPSConnectionPool(host='www.investing.com', port=443): Read timed out. (read timeout=15) on URL https://www.investing.com/members/contributors/233480972
Retry 1: Error while downloading article https://www.investing.com/members/contributors/217129592: Article `download()` failed with HTTPSConnectionPool(host='www.investing.com', port=443): Read timed out. (read timeout=15) on URL https://www.investing.com/members/contributors/217129592
Retry 1: Error while downloading article https://www.investing.com/analysis/intels-shares-up-42-ytd-as-foundry-plans-spark-optimism-2006416

In [None]:
# Save one CSV per ticker under "Raw Data/csv/".
from pathlib import Path

output_dir = Path("Raw Data/csv")
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
output_dir.mkdir(parents=True, exist_ok=True)

for ticker in input_tickers:
    ticker_sentiments[ticker].to_csv(output_dir / f"{ticker}_sentiments_data.csv", sep=',', encoding='utf-8', header=True)