In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
from datetime import datetime
import time
import os

# Initialize the Chrome driver
driver = webdriver.Chrome()

# All browser work happens inside try/finally so the Chrome session is always
# shut down, even when navigation, parsing, or file writing raises.
try:
    # Navigate to the subreddit search results for 'ebay'
    driver.get('https://www.reddit.com/r/wallstreetbets/search/?q=ebay&restrict_sr=1')

    # Scroll to the end of the page to load all results: keep scrolling until
    # the document height is unchanged between two consecutive scrolls.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Allow the page to load more content
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse the loaded page with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all the posts by looking for 'time' elements and post titles
    time_elements = soup.find_all('time', {'datetime': True})
    post_links = soup.find_all('a', {'data-testid': 'post-title'})

    # Ensure the directory exists where we will save the HTML files.
    # exist_ok=True replaces the separate exists() check followed by makedirs().
    save_folder = 'reddit_posts_html'
    os.makedirs(save_folder, exist_ok=True)

    # Loop through each post, print details, and save the HTML content.
    # NOTE(review): zip() pairs the i-th <time> element with the i-th title link
    # and stops at the shorter list; this assumes both lists are in the same
    # order with exactly one entry per post -- confirm against the page markup.
    for index, (time_element, post_link) in enumerate(zip(time_elements, post_links)):
        # Extract and format the post date (raises ValueError if the attribute
        # does not match this exact timestamp layout).
        dt_string = time_element['datetime']
        dt_object = datetime.strptime(dt_string, "%Y-%m-%dT%H:%M:%S.%fZ")
        formatted_date = dt_object.strftime("%A, %B %d, %Y")

        # Extract the post title and link; relative links are made absolute
        title = post_link.text.strip()
        href = post_link['href']
        if not href.startswith('http'):
            href = 'https://www.reddit.com' + href

        # Print the title, date, and link
        print("Title:", title)
        print("Date:", formatted_date)
        print("Link:", href)

        # Visit the post link to save its HTML content
        driver.get(href)
        time.sleep(3)  # Wait for the post to load

        # Save the HTML content of the post as post_<1-based index>.html
        page_source = driver.page_source
        file_path = os.path.join(save_folder, 'post_' + str(index + 1) + '.html')
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(page_source)

        print('Saved post', index + 1, 'as HTML.\n')
finally:
    # Close the browser
    driver.quit()

print('Finished processing all posts.')


Title: EBAY (DON’T READ if you’re squeamish!)
Date: Sunday, January 14, 2024
Link: https://www.reddit.com/r/wallstreetbets/comments/196276m/ebay_dont_read_if_youre_squeamish/
Saved post 1 as HTML.

Title: Jim Cramer says this legacy retail company’s (eBay) stock is done. Looks like it is time for calls on eBay!
Date: Thursday, November 09, 2023
Link: https://www.reddit.com/r/wallstreetbets/comments/17rdu33/jim_cramer_says_this_legacy_retail_companys_ebay/
Saved post 2 as HTML.

Title: eBay to pay $3 million after employees sent foetal pig to bloggers
Date: Friday, January 12, 2024
Link: https://www.reddit.com/r/wallstreetbets/comments/194lxdh/ebay_to_pay_3_million_after_employees_sent_foetal/
Saved post 3 as HTML.

Title: EBAY GOING DOWN THE TUBES IN A HURRY
Date: Monday, September 26, 2022
Link: https://www.reddit.com/r/wallstreetbets/comments/xosgtc/ebay_going_down_the_tubes_in_a_hurry/
Saved post 4 as HTML.

Title: EBAY earnings play July 15th -- buy the rumor AND the news
Date: Tue

In [12]:
from bs4 import BeautifulSoup
from datetime import datetime
import os

# Directory containing the saved HTML files
directory = '/Users/ian/Desktop/BAX 422/Group Project/reddit_posts_html'

# Get a sorted list of filenames in numerical order.
# Only names of the exact form post_<digits>.html are kept, so a stray .html
# file in the folder is skipped instead of crashing the int() sort key.
file_list = [
    f for f in os.listdir(directory)
    if f.startswith('post_') and f.endswith('.html')
    and f[len('post_'):-len('.html')].isdecimal()
]
file_list.sort(key=lambda f: int(f[len('post_'):-len('.html')]))

# Loop through each sorted file in the directory
for filename in file_list:
    file_path = os.path.join(directory, filename)

    # Open and read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(content, 'html.parser')

    # Scrape the title
    title_tag = soup.find('title')
    title = title_tag.text.strip() if title_tag else 'Title not found'

    # Scrape the date from the first <time> element that carries a datetime attribute
    time_element = soup.find('time', {'datetime': True})
    formatted_date = 'Date not found'
    if time_element:
        dt_string = time_element['datetime']
        dt_object = datetime.strptime(dt_string, "%Y-%m-%dT%H:%M:%S.%fZ")
        formatted_date = dt_object.strftime("%A, %B %d, %Y")

    # Scrape the content (all matching body containers joined with spaces)
    content_tags = soup.select('div.text-neutral-content[slot="text-body"]')
    content_text = ' '.join(tag.text.strip() for tag in content_tags) if content_tags else 'Content not found'

    # Print the file name, title, date, and content
    print('File:', filename)
    print('Title:', title)
    print('Date:', formatted_date)
    print('Content:', content_text)

    # Find all the comments (at most the first 20 in the document)
    comments = soup.find_all('shreddit-comment', limit=20)

    # Iterate over the comments and print their text and scores
    for comment in comments:
        # The score is an attribute of the comment's action row. Either the row
        # or the attribute can be missing, so both cases are checked explicitly
        # rather than hidden behind a catch-all exception handler.
        action_row = comment.find('shreddit-comment-action-row')
        score = action_row.get('score') if action_row is not None else None
        if score is None:
            print('Comment or score not found')
        else:
            comment_text = ' '.join(p.text for p in comment.find_all('p'))
            print('Comment:', comment_text)
            print('Score:', score)
        print('-------' *20)

    print('\n' + '--------------------------------------------- END OF POST ---------------------------------------------------------' + '\n')

# Inform the user that all files have been processed.
print('All files have been processed.')


File: post_1.html
Title: EBAY (DON’T READ if you’re squeamish!) : r/wallstreetbets
Date: Sunday, January 14, 2024
Content: I am actually impressed with Ebay for doing something I couldn't even imagine a crack head creating in his worst nightmares...
  
    EBay was fined $3m after executives sent live spiders, live cockroaches, dead bloody pigs, fetal pigs, pig masks, and spouse suicide books to bloggers who were criticising them.  Executives at Ebay also published the bloggers addresses on Craigslist and made ads inviting people to the bloggers house for cheap sex.
  
    The real question is, is the cheap sex legit? Asking for a friend....
  
    This deserves to be trading at 0.
   




    Edit: Forgot to mention Ebay sent funeral wreaths as well as threatened to personally visit the bloggers in their home... I have no knowledge of if this "visit" would be for the cheap sex potential but not ruling anything out.
  
    Edit 2: My spelling. it is soooo clucking bad...
  



      Re

In [None]:
from bs4 import BeautifulSoup
from datetime import datetime
import os
import mysql.connector

# Function to connect to the MySQL database
def connect_to_database(user='root', password='161198', database='DDR_Group_Project'):
    """Open and return a new MySQL connection.

    Called with no arguments this connects exactly as before; the keyword
    parameters let a caller point at a different account or schema without
    editing this function.

    NOTE(review): the default password is stored in plain text in the source.
    Consider supplying it from outside the notebook (for example an
    environment variable) before sharing this file.

    Args:
        user: MySQL account name.
        password: Password for ``user``.
        database: Schema selected for the connection.

    Returns:
        The connection object returned by ``mysql.connector.connect``.

    Raises:
        Any error raised by ``mysql.connector.connect`` propagates unchanged.
    """
    return mysql.connector.connect(
        user=user,
        password=password,
        database=database,
    )

# Function to insert post data into reddit_posts table
def insert_post_data(cursor, file_name, title, post_date, content):
    """Insert one row into ``reddit_posts`` and return the new row's id.

    Args:
        cursor: Database cursor used to run the statement.
        file_name: Name of the HTML file the post was read from.
        title: Post title.
        post_date: Post date value stored in the ``post_date`` column.
        content: Post body text.

    Returns:
        ``cursor.lastrowid`` as reported after the insert.
    """
    statement = """
    INSERT INTO reddit_posts (file_name, title, post_date, content)
    VALUES (%s, %s, %s, %s)
    """
    # Values are bound through placeholders, never formatted into the SQL text.
    row_values = (file_name, title, post_date, content)
    cursor.execute(statement, row_values)
    new_post_id = cursor.lastrowid
    return new_post_id

# Function to insert comment data into reddit_comments table
def insert_comment_data(cursor, post_id, comment_text, score):
    """Insert one row into ``reddit_comments`` linked to a post.

    Args:
        cursor: Database cursor used to run the statement.
        post_id: Id of the parent row in ``reddit_posts``.
        comment_text: Text of the comment.
        score: Comment score, passed through to the driver as given.

    Returns:
        None.
    """
    statement = """
    INSERT INTO reddit_comments (post_id, comment_text, score)
    VALUES (%s, %s, %s)
    """
    # Values are bound through placeholders, never formatted into the SQL text.
    row_values = (post_id, comment_text, score)
    cursor.execute(statement, row_values)

# Connect to the database
cnx = connect_to_database()

# The connection and cursor are released in finally blocks so they are closed
# even if reading, parsing, or inserting one of the files raises.
try:
    cursor = cnx.cursor()
    try:
        # Directory containing the saved HTML files, processed in numeric order
        directory = 'reddit_posts_html'
        file_list = sorted(
            [f for f in os.listdir(directory) if f.endswith(".html")],
            key=lambda f: int(f.replace('post_', '').replace('.html', ''))
        )

        # Loop through each sorted file in the directory
        for filename in file_list:
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            soup = BeautifulSoup(content, 'html.parser')

            # Scrape and format post details; each lookup is evaluated once
            title_tag = soup.find('title')
            title = title_tag.text.strip() if title_tag else 'Title not found'

            time_element = soup.find('time', {'datetime': True})
            # Keep only the date part in front of the 'T' separator
            post_date = time_element['datetime'].split('T')[0] if time_element else None

            body_tags = soup.select('div.text-neutral-content[slot="text-body"]')
            content_text = ' '.join(tag.text.strip() for tag in body_tags) if body_tags else 'Content not found'

            # Insert post data into database
            post_id = insert_post_data(cursor, filename, title, post_date, content_text)

            # Scrape, format, and insert comment details (first 20 comments at most)
            comments = soup.find_all('shreddit-comment', limit=20)
            for comment in comments:
                # Best effort per comment: a comment that cannot be parsed or
                # inserted is reported and skipped so the rest of the post loads.
                try:
                    score = comment.find('shreddit-comment-action-row')['score']
                    comment_text = ' '.join(p.text for p in comment.find_all('p'))
                    insert_comment_data(cursor, post_id, comment_text, score)
                except Exception as e:
                    print('Error inserting comment:', e)

            # Commit transactions for the post and its comments
            cnx.commit()
            print('Post and comments inserted for file:', filename)
    finally:
        cursor.close()
finally:
    # Close database connection
    cnx.close()

print('All files have been processed and data inserted into the database.')


Post and comments inserted for file: post_1.html
Post and comments inserted for file: post_2.html
Error inserting comment: 'score'
Post and comments inserted for file: post_3.html
Post and comments inserted for file: post_4.html
Post and comments inserted for file: post_5.html
Post and comments inserted for file: post_6.html
Post and comments inserted for file: post_7.html
Post and comments inserted for file: post_8.html
Post and comments inserted for file: post_9.html
Post and comments inserted for file: post_10.html
Post and comments inserted for file: post_11.html
Post and comments inserted for file: post_12.html
Post and comments inserted for file: post_13.html
Error inserting comment: 'score'
Error inserting comment: 'score'
Error inserting comment: 'score'
Post and comments inserted for file: post_14.html
Post and comments inserted for file: post_15.html
Error inserting comment: 'score'
Post and comments inserted for file: post_16.html
Post and comments inserted for file: post_17

## MongoDB Approach

In [1]:
import pymongo
from pymongo import MongoClient
import os

In [2]:
# Create two MongoDB clients: `mo_c` with the driver's default arguments and
# `client` with an explicit host and port. Only `client` is referenced in the
# cells visible here; `mo_c` is not used again in this part of the notebook.
mo_c = MongoClient()
client = MongoClient('localhost', 27017)

In [3]:
# Handle to the "BI_Articles" database on the connected client.
db = client["BI_Articles"]

In [4]:
# Handle to the 'BI_Reddit_Data' collection inside that database.
collection = db['BI_Reddit_Data']

In [5]:
# Folder the post_<n>.html files are read from (machine-specific absolute path).
save_dir = '/Users/ian/Desktop/BAX 422/Group Project/reddit_posts_html' 

def get_date_for_post(post_number):
    """Return the date string to store with a post.

    ``post_number`` is accepted but not used: every post gets the same
    placeholder value.
    """
    # Placeholder date being used for formatting reasons.
    placeholder_date = "01-31-2024"
    return placeholder_date

# Store the raw HTML of post_1.html through post_243.html, one document per file.
for i in range(1, 244):  # Assuming range of post numbers you have
    file_name = f'post_{i}.html'
    file_path = os.path.join(save_dir, file_name)

    try:
        # Read the whole file; the handle is released before the insert below.
        with open(file_path, 'r', encoding='utf-8') as html_file:
            post_text = html_file.read()

        post_date = get_date_for_post(i)
        document = {
            "post_text": post_text,
            "post_number": i,
            "date": post_date
        }

        # Inserting into collection to join Business Insider article data
        result = collection.insert_one(document)
        print(f"Inserted document for post {i} with _id: {result.inserted_id}")

    except FileNotFoundError:
        # A missing post file is reported and the loop moves on to the next number.
        print(f"The file {file_path} was not found.")
    except Exception as e:
        # Any other failure (read or insert) is reported per file without stopping the run.
        print(f"An error occurred while processing {file_name}: {e}")

Inserted document for post 1 with _id: 65fa686b55c1113824683b4a
Inserted document for post 2 with _id: 65fa686b55c1113824683b4b
Inserted document for post 3 with _id: 65fa686b55c1113824683b4c
Inserted document for post 4 with _id: 65fa686b55c1113824683b4d
Inserted document for post 5 with _id: 65fa686b55c1113824683b4e
Inserted document for post 6 with _id: 65fa686b55c1113824683b4f
Inserted document for post 7 with _id: 65fa686b55c1113824683b50
Inserted document for post 8 with _id: 65fa686b55c1113824683b51
Inserted document for post 9 with _id: 65fa686b55c1113824683b52
Inserted document for post 10 with _id: 65fa686b55c1113824683b53
Inserted document for post 11 with _id: 65fa686b55c1113824683b54
Inserted document for post 12 with _id: 65fa686b55c1113824683b55
Inserted document for post 13 with _id: 65fa686b55c1113824683b56
Inserted document for post 14 with _id: 65fa686b55c1113824683b57
Inserted document for post 15 with _id: 65fa686b55c1113824683b58
Inserted document for post 16 with