# Webscraping 4 Months of Business Insider Articles and Ebay Stock Data

## Page 1 Articles

In [6]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

# URL for page 1 on Business Insider's eBay stock news page
url_1 = 'https://markets.businessinsider.com/news/ebay-stock'

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

driver.get(url_1)

article_URLs_1 = set()  # Using a set to avoid duplicate URLs

time.sleep(5)  # Wait for 5 seconds

page_source = driver.page_source

soup = BeautifulSoup(page_source, 'html.parser')

# Find all 'news-link'
news_links = soup.find_all('a', class_='news-link')

for tag in news_links:
    if tag.has_attr('href'):
        href = tag['href']
       
        # Check if 'href' already contains a full URL
        link = href if href.startswith('https://') else 'https://markets.businessinsider.com' + href
        article_URLs_1.add(link)
        print("Found new link:", link)

print("Total number of unique links collected:", len(article_URLs_1))

time.sleep(5)


driver.quit()

Found new link: https://markets.businessinsider.com/news/stocks/the-3-most-undervalued-e-commerce-stocks-to-buy-in-march-2024-1033170412
Found new link: https://markets.businessinsider.com/news/stocks/bernstein-remains-a-hold-on-ebay-ebay-1033150390
Found new link: https://markets.businessinsider.com/news/stocks/analysts-opinions-are-mixed-on-these-consumer-cyclical-stocks-ebay-ebay-and-european-wax-center-ewcz-1033149392
Found new link: https://seekingalpha.com/news/4076911-catalyst-watch-oracle-earnings-arm-ipo-lockup-and-triple-witching-day?utm_source=businessinsider&utm_medium=referral&feed_item_type=news
Found new link: https://markets.businessinsider.com/news/stocks/ebay-s-q4-earnings-triumph-sparks-golden-cross-can-it-last-1033147008
Found new link: https://markets.businessinsider.com/news/stocks/navigating-9-analyst-ratings-for-ebay-1033119361
Found new link: https://markets.businessinsider.com/news/stocks/ebay-ebay-receives-a-rating-update-from-a-top-analyst-1033119126
Found n

In [8]:
import requests
import os

save_dir = '/Users/ian/Desktop/BAX 422/Group Project/Articles'
file_counter = 1  # Starting at 51

# Loop through each URL in the set
for url in article_URLs_1:
    try:
        # Fetch the page's HTML content
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses
        page_html = response.text

        # Construct the file name and path
        file_name = f'article {file_counter}.html'  # Saving as .html
        file_path = os.path.join(save_dir, file_name)

        # Save the HTML content to a file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(page_html)

        print(f"Saved: {file_name}")  # Optional: Print confirmation

        file_counter += 1  # Increment the counter for the next file name

    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")


Saved: article 1.html
Saved: article 2.html
Saved: article 3.html
Saved: article 4.html
Saved: article 5.html
Saved: article 6.html
Failed to fetch https://seekingalpha.com/news/4072590-ebay-non-gaap-eps-of-1_07-beats-0_04-revenue-of-2_56b-beats-50m?utm_source=businessinsider&utm_medium=referral&feed_item_type=news: 403 Client Error: Forbidden for url: https://seekingalpha.com/news/4072590-ebay-non-gaap-eps-of-1_07-beats-0_04-revenue-of-2_56b-beats-50m?utm_source=businessinsider&utm_medium=referral&feed_item_type=news
Failed to fetch https://seekingalpha.com/news/4060253-paypal-starts-round-of-layoffs-in-cost-cutting-drive-report?utm_source=businessinsider&utm_medium=referral&feed_item_type=news: 403 Client Error: Forbidden for url: https://seekingalpha.com/news/4060253-paypal-starts-round-of-layoffs-in-cost-cutting-drive-report?utm_source=businessinsider&utm_medium=referral&feed_item_type=news
Saved: article 7.html
Saved: article 8.html
Saved: article 9.html
Saved: article 10.html
Fai

### The articles that failed to save are forbidden by the 3rd party site. 

## Page 1 Article Dates

In [9]:
# URL for page 1 on Business Insider's eBay stock news page
url_1 = 'https://markets.businessinsider.com/news/ebay-stock'

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

driver.get(url_1)

time.sleep(5)  

page_source = driver.page_source

driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')

# Store dates for page 1
dates_1 = []

# Finding all 'latest-news_story'
story_divs_1 = soup.find_all('div', class_='latest-news__story')

for story_div in story_divs_1:
    time_tag = story_div.find('time', class_='latest-news__date')
    if time_tag and time_tag.has_attr('datetime'):
        
        # Extracting the date from the 'datetime' attribute
        datetime_str_1 = time_tag['datetime']
        dates_1.append(datetime_str_1)  # Add the date to the list

for date in dates_1:
    print("Found date:", date)

Found date: 3/17/2024 10:00:00 PM
Found date: 3/11/2024 8:37:06 AM
Found date: 3/11/2024 4:43:43 AM
Found date: 3/8/2024 8:00:00 PM
Found date: 3/8/2024 4:39:27 PM
Found date: 2/29/2024 5:00:15 PM
Found date: 2/29/2024 12:21:54 PM
Found date: 2/29/2024 10:02:02 AM
Found date: 2/28/2024 8:08:13 PM
Found date: 2/28/2024 8:02:03 PM
Found date: 2/28/2024 5:37:59 PM
Found date: 2/28/2024 5:17:23 PM
Found date: 2/28/2024 2:39:53 PM
Found date: 2/28/2024 2:09:02 PM
Found date: 2/28/2024 12:30:45 PM
Found date: 2/28/2024 12:20:36 PM
Found date: 2/28/2024 10:32:20 AM
Found date: 2/28/2024 10:30:26 AM
Found date: 2/28/2024 8:40:39 AM
Found date: 2/28/2024 7:15:48 AM
Found date: 2/28/2024 6:10:24 AM
Found date: 2/28/2024 5:40:05 AM
Found date: 2/27/2024 10:09:05 PM
Found date: 2/27/2024 9:18:51 PM
Found date: 2/27/2024 9:11:29 PM
Found date: 2/27/2024 9:06:18 PM
Found date: 2/27/2024 8:18:05 PM
Found date: 2/27/2024 3:10:23 PM
Found date: 2/27/2024 2:45:00 PM
Found date: 2/27/2024 2:19:00 PM
Foun

## Joining page 1 links with dates

In [10]:
for link, date in zip(article_URLs_1, dates_1):
    print("Found new link:", link)
    print("Found date:", date)
    print()


Found new link: https://markets.businessinsider.com/news/stocks/next-big-thing-3-potential-ipos-with-more-promise-than-reddit-1033050368
Found date: 3/17/2024 10:00:00 PM

Found new link: https://markets.businessinsider.com/news/stocks/the-7-most-undervalued-large-cap-stocks-to-buy-in-february-2024-1033034243
Found date: 3/11/2024 8:37:06 AM

Found new link: https://markets.businessinsider.com/news/stocks/why-bumble-shares-are-trading-lower-by-12-here-are-other-stocks-moving-in-wednesday-s-mid-day-session-1033114603
Found date: 3/11/2024 4:43:43 AM

Found new link: https://markets.businessinsider.com/news/stocks/ebay-layoffs-2024-what-to-know-about-the-latest-ebay-job-cuts-1032992677
Found date: 3/8/2024 8:00:00 PM

Found new link: https://markets.businessinsider.com/news/stocks/analysts-have-conflicting-sentiments-on-these-consumer-cyclical-companies-cracker-barrel-cbrl-norwegian-cruise-line-nclh-and-ebay-ebay-1033113887
Found date: 3/8/2024 4:39:27 PM

Found new link: https://markets

## Page 2 Articles

In [12]:
save_dir = '/Users/ian/Desktop/BAX 422/Group Project/Articles'
file_counter = 40  # Starting at 40

# Looping through each URL in the set
for url in article_URLs_2:
    try:
        response = requests.get(url)
        response.raise_for_status()  
        page_html = response.text

        
        file_name = f'article {file_counter}.html'  # Saving as .html
        file_path = os.path.join(save_dir, file_name)

        
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(page_html)

        print(f"Saved: {file_name}")  

        file_counter += 1  # Increment the counter for the next file name

    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")

Saved: article 40.html
Saved: article 41.html
Saved: article 42.html
Saved: article 43.html
Saved: article 44.html
Saved: article 45.html
Saved: article 46.html
Saved: article 47.html
Saved: article 48.html
Saved: article 49.html
Saved: article 50.html
Saved: article 51.html
Saved: article 52.html
Saved: article 53.html
Saved: article 54.html
Saved: article 55.html
Saved: article 56.html
Saved: article 57.html
Saved: article 58.html
Saved: article 59.html
Saved: article 60.html
Saved: article 61.html
Saved: article 62.html
Saved: article 63.html
Saved: article 64.html
Saved: article 65.html
Saved: article 66.html
Saved: article 67.html
Saved: article 68.html
Saved: article 69.html
Saved: article 70.html
Failed to fetch https://seekingalpha.com/news/4057698-ebay-stock-rises-3-on-plans-to-cut-9-of-full-time-staff-scale-back-contracts?utm_source=businessinsider&utm_medium=referral&feed_item_type=news: 403 Client Error: Forbidden for url: https://seekingalpha.com/news/4057698-ebay-stock-ri

### The articles that failed to save are forbidden by the 3rd party site. 

## Page 2 Article Dates

In [13]:
# Page 2 for Business Insider's eBay stock news page
url_2 = 'https://markets.businessinsider.com/news/ebay-stock?p=2'

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

driver.get(url_2)

time.sleep(5)  

page_source = driver.page_source

driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')

# Store dates for page 2
dates_2 = []

# Page 2 tag for 'latest-news_story'
story_divs_2 = soup.find_all('div', class_='latest-news__story')

for story_div in story_divs_2:
    time_tag = story_div.find('time', class_='latest-news__date')
    if time_tag and time_tag.has_attr('datetime'):
        
        # Extracting the date from the 'datetime' attribute
        datetime_str_2 = time_tag['datetime']
        dates_2.append(datetime_str_2)  # Add the date to the list

for date in dates_2:
    print("Found date:", date)

Found date: 1/24/2024 12:40:16 PM
Found date: 1/24/2024 11:56:01 AM
Found date: 1/24/2024 11:25:49 AM
Found date: 1/24/2024 11:16:46 AM
Found date: 1/24/2024 3:04:02 AM
Found date: 1/23/2024 10:03:29 PM
Found date: 1/16/2024 9:30:00 PM
Found date: 1/12/2024 10:19:37 PM
Found date: 1/12/2024 3:26:21 PM
Found date: 1/11/2024 12:26:07 PM
Found date: 1/9/2024 9:00:49 PM
Found date: 1/9/2024 6:03:49 AM
Found date: 1/4/2024 6:10:01 PM
Found date: 1/3/2024 12:59:29 PM
Found date: 12/12/2023 4:07:00 PM
Found date: 12/11/2023 11:40:00 AM
Found date: 12/6/2023 4:19:37 AM
Found date: 12/5/2023 9:30:00 PM
Found date: 12/5/2023 9:53:40 AM
Found date: 11/30/2023 5:24:07 PM
Found date: 11/29/2023 10:05:00 PM
Found date: 11/25/2023 2:00:00 PM
Found date: 11/22/2023 3:06:50 AM
Found date: 11/21/2023 9:30:00 PM
Found date: 11/21/2023 5:48:00 PM
Found date: 11/18/2023 2:00:00 PM
Found date: 11/17/2023 9:30:00 PM
Found date: 11/16/2023 9:30:00 PM
Found date: 11/14/2023 9:30:00 PM
Found date: 11/9/2023 9:1

## Joining page 2 links with dates

In [14]:
for link, date in zip(article_URLs_2, dates_2):
    print("Found new link:", link)
    print("Found date:", date)
    print()

Found new link: https://markets.businessinsider.com/news/stocks/analysts-offer-insights-on-consumer-cyclical-companies-deckers-outdoor-deck-and-ebay-ebay-1032993176
Found date: 1/24/2024 12:40:16 PM

Found new link: https://markets.businessinsider.com/news/stocks/3-stocks-that-are-about-to-get-absolutely-crushed-in-q4-1032802653
Found date: 1/24/2024 11:56:01 AM

Found new link: https://markets.businessinsider.com/news/stocks/analysts-conflicted-on-these-technology-names-upwork-upwk-ebay-ebay-and-axon-enterprise-axon-1032796220
Found date: 1/24/2024 11:25:49 AM

Found new link: https://markets.businessinsider.com/news/stocks/ebay-supports-voluntary-offer-to-acquire-adevinta-1032840138
Found date: 1/24/2024 11:16:46 AM

Found new link: https://markets.businessinsider.com/news/stocks/analysts-have-conflicting-sentiments-on-these-technology-companies-ebay-ebay-toast-inc-tost-and-nanostring-tech-nstg-1032796874
Found date: 1/24/2024 3:04:02 AM

Found new link: https://markets.businessinsid

## Joining page 1 and page 2 together

In [15]:
combined_article_URLs = list(article_URLs_1) + list(article_URLs_2)
combined_dates = list(dates_1) + list(dates_2)

# Printing and iterating over the combined lists
for link, date in zip(combined_article_URLs, combined_dates):
    print("Found new link:", link)
    print("Found date:", date)
    print()  # Space for better readability

Found new link: https://markets.businessinsider.com/news/stocks/next-big-thing-3-potential-ipos-with-more-promise-than-reddit-1033050368
Found date: 3/17/2024 10:00:00 PM

Found new link: https://markets.businessinsider.com/news/stocks/the-7-most-undervalued-large-cap-stocks-to-buy-in-february-2024-1033034243
Found date: 3/11/2024 8:37:06 AM

Found new link: https://markets.businessinsider.com/news/stocks/why-bumble-shares-are-trading-lower-by-12-here-are-other-stocks-moving-in-wednesday-s-mid-day-session-1033114603
Found date: 3/11/2024 4:43:43 AM

Found new link: https://markets.businessinsider.com/news/stocks/ebay-layoffs-2024-what-to-know-about-the-latest-ebay-job-cuts-1032992677
Found date: 3/8/2024 8:00:00 PM

Found new link: https://markets.businessinsider.com/news/stocks/analysts-have-conflicting-sentiments-on-these-consumer-cyclical-companies-cracker-barrel-cbrl-norwegian-cruise-line-nclh-and-ebay-ebay-1033113887
Found date: 3/8/2024 4:39:27 PM

Found new link: https://markets

In [16]:
# Confirming that both pages were correctly merged together
print(len(combined_article_URLs))

100


## Parsing Article Details from Accessible HTMLs

In [17]:
from bs4 import BeautifulSoup
import os

save_dir = '/Users/ian/Desktop/BAX 422/Group Project/Articles'

html_files = [file for file in os.listdir(save_dir) if file.endswith('.html')]

# Defining a file counter variable for naming purposes
file_counter = 1

for html_file in html_files:
    file_path = os.path.join(save_dir, html_file)

    with open(file_path, 'r', encoding='utf-8') as file:
        page_html = file.read()

    soup = BeautifulSoup(page_html, 'html.parser')

    # Extracting all paragraphs within each article
    paragraphs = [element.text.strip() for element in soup.find_all('p')]

    txt_file_name = f'article {file_counter}.txt'
    txt_file_path = os.path.join(save_dir, txt_file_name)

    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write('\n\n'.join(paragraphs))

    print(f"Processed and saved: {txt_file_name}")

    # Increment for the file counter name
    file_counter += 1


Processed and saved: article 1.txt
Processed and saved: article 2.txt
Processed and saved: article 3.txt
Processed and saved: article 4.txt
Processed and saved: article 5.txt
Processed and saved: article 6.txt
Processed and saved: article 7.txt
Processed and saved: article 8.txt
Processed and saved: article 9.txt
Processed and saved: article 10.txt
Processed and saved: article 11.txt
Processed and saved: article 12.txt
Processed and saved: article 13.txt
Processed and saved: article 14.txt
Processed and saved: article 15.txt
Processed and saved: article 16.txt
Processed and saved: article 17.txt
Processed and saved: article 18.txt
Processed and saved: article 19.txt
Processed and saved: article 20.txt
Processed and saved: article 21.txt
Processed and saved: article 22.txt
Processed and saved: article 23.txt
Processed and saved: article 24.txt
Processed and saved: article 25.txt
Processed and saved: article 26.txt
Processed and saved: article 27.txt
Processed and saved: article 28.txt
P

## Stock Data

In [9]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import csv

# Business Insider Stock Data URL
url = 'https://markets.businessinsider.com/stocks/ebay-stock'

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

driver.get(url)

time.sleep(15)  

page_source = driver.page_source

soup = BeautifulSoup(page_source, 'html.parser')

# To hold all the data
data = []

# Finding all table rows with the class 'table__tr'
table_rows = soup.find_all('tr', class_='table__tr')

for row in table_rows:
    # Find all cell tags in this row
    cells = row.find_all('td', class_='table__td')

    # Checking that number of cells in the row is 6
    if len(cells) == 6:
        row_data = {
            'date': cells[0].text.strip(),
            'open': cells[1].text.strip(),
            'high': cells[2].text.strip(),
            'low': cells[3].text.strip(),
            'close': cells[4].text.strip(),
            'volume': cells[5].text.strip().replace(',', '')  # Remove commas from volume
        }

        data.append(row_data)

driver.quit()

filename = '/Users/ian/Desktop/BAX 422/Group Project/daily_stock_data.csv'  # Change this to a valid path on your system

# Opening the file
with open(filename, mode='w', newline='') as file:
    # Create a CSV writer object
    writer = csv.DictWriter(file, fieldnames=data[0].keys())

    # Write the header
    writer.writeheader()

    # Write the data rows
    for entry in data:
        writer.writerow(entry)

# Printing the path to the saved CSV file
print(f"Data has been saved to {filename}")

Data has been saved to /Users/ian/Desktop/BAX 422/Group Project/daily_stock_data.csv


## Saving to Database

In [1]:
import pymongo
from pymongo import MongoClient

In [2]:
mo_c = MongoClient()
client = MongoClient('localhost', 27017)

In [3]:
db = client["BI_Articles"]

In [4]:
collection = db['BI_Reddit_Data']

In [22]:
## Took two approaches as txt files did not initially get inserted. This first attempt was only partially successful.

save_dir = '/Users/ian/Desktop/BAX 422/Group Project/Articles' 

def get_paragraphs_from_file(file_path):
    paragraphs = []
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            paragraphs = file.read().strip().split('\n\n')
    else:
        print(f"File not found: {file_path}")
    return paragraphs

for index, (url, date) in enumerate(zip(combined_article_URLs, combined_dates), start=1):
    file_name = f'article {index}.txt'
    file_path = os.path.join(save_dir, file_name)
    
    paragraphs = get_paragraphs_from_file(file_path)
    if not paragraphs:
        print(f"No content found {file_name}. Skipping.")
        continue

    article_text = "\n\n".join(paragraphs)
    
    document = {
        "article": url,
        "article text": article_text,
        "date": date
    }
    
    try:
        collection.insert_one(document)
        print(f"Inserted document for URL {url} with date {date}")
    except Exception as e:
        print(f"Failed to insert document for URL {url}: {e}")

Inserted document for URL https://markets.businessinsider.com/news/stocks/next-big-thing-3-potential-ipos-with-more-promise-than-reddit-1033050368 with date 3/17/2024 10:00:00 PM
Inserted document for URL https://markets.businessinsider.com/news/stocks/the-7-most-undervalued-large-cap-stocks-to-buy-in-february-2024-1033034243 with date 3/11/2024 8:37:06 AM
Inserted document for URL https://markets.businessinsider.com/news/stocks/why-bumble-shares-are-trading-lower-by-12-here-are-other-stocks-moving-in-wednesday-s-mid-day-session-1033114603 with date 3/11/2024 4:43:43 AM
Inserted document for URL https://markets.businessinsider.com/news/stocks/ebay-layoffs-2024-what-to-know-about-the-latest-ebay-job-cuts-1032992677 with date 3/8/2024 8:00:00 PM
Inserted document for URL https://markets.businessinsider.com/news/stocks/analysts-have-conflicting-sentiments-on-these-consumer-cyclical-companies-cracker-barrel-cbrl-norwegian-cruise-line-nclh-and-ebay-ebay-1033113887 with date 3/8/2024 4:39:27

In [38]:
print("All articles, text, and dates have been inserted into the database.")

All articles, text, and dates have been inserted into the database.


In [23]:
for i in range(1, 89):  # From article 1 to 88
    file_name = f'article {i}.txt'
    file_path = os.path.join(save_dir, file_name)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            article_text = file.read()
            
            document = {
                "article_text": article_text,
                "article_number": i,  
                "article": url,
                "date": date
            }
            
            # Inserting into collection
            result = collection.insert_one(document)
            print(f"Inserted document for article {i} with _id: {result.inserted_id}")
            
    except FileNotFoundError:
        print(f"The file {file_path} was not found.")
    except Exception as e:
        print(f"An error occurred while processing {file_name}: {e}")

Inserted document for article 1 with _id: 65fa67a4dc62e56865f22ac0
Inserted document for article 2 with _id: 65fa67a4dc62e56865f22ac1
Inserted document for article 3 with _id: 65fa67a4dc62e56865f22ac2
Inserted document for article 4 with _id: 65fa67a4dc62e56865f22ac3
Inserted document for article 5 with _id: 65fa67a4dc62e56865f22ac4
Inserted document for article 6 with _id: 65fa67a4dc62e56865f22ac5
Inserted document for article 7 with _id: 65fa67a4dc62e56865f22ac6
Inserted document for article 8 with _id: 65fa67a4dc62e56865f22ac7
Inserted document for article 9 with _id: 65fa67a4dc62e56865f22ac8
Inserted document for article 10 with _id: 65fa67a4dc62e56865f22ac9
Inserted document for article 11 with _id: 65fa67a4dc62e56865f22aca
Inserted document for article 12 with _id: 65fa67a4dc62e56865f22acb
Inserted document for article 13 with _id: 65fa67a4dc62e56865f22acc
Inserted document for article 14 with _id: 65fa67a4dc62e56865f22acd
Inserted document for article 15 with _id: 65fa67a4dc62e5

### Testing to ensure text data was inserted into database... it was

In [25]:
from bson.objectid import ObjectId

document_id = '65fa67a4dc62e56865f22ac3'

document = collection.find_one({"_id": ObjectId(document_id)})

if document:
    print("Found document:")
    print(document["article_text"])  
else:
    print("Document not found.")

Found document:
Mark Kelley, an analyst from Stifel Nicolaus, maintained the Hold rating on Ebay (EBAY – Research Report). The associated price target is $45.00.

Mark Kelley has given his Hold rating due to a combination of factors regarding Ebay’s recent performance and future prospects. The main factor is the mixed results from Ebay’s 3Q23, which showed some positive growth areas such as the focus category growth and the advertising business. However, these bright spots were somewhat offset by weakening consumer trends that started in September and continue to the present. This has led to expectations for a muted holiday season, particularly due to weakness in consumer demand from the EU, specifically the UK and Germany, and to a lesser extent, the U.S.

The second set of factors contributing to the Hold rating are the challenges facing eBay in the near term. Although the growth in focus categories is encouraging, Mark is cautious about its potential to be a major growth catalyst, e

In [5]:
sample_document = collection.find_one()

if sample_document:
    # Print the field names ("headers") of the document
    field_names = sample_document.keys()
    print("Field names in the document:", list(field_names))
else:
    print("No documents found in the collection.")

Field names in the document: ['_id', 'article', 'article text', 'date']
