Main code

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json

# `start_of_scraping` and `number_of_days` are compared for exact string
# equality with the timestamp text shown on a message, so they must be written
# in one of these formats:
#   Xm                          - X minutes, e.g. "30m"; this works only for the last one hour
#   XX:XX AM/PM                 - e.g. "6:15 AM"; this works only for today (the day of scraping)
#   Yesterday XX:XX AM/PM       - e.g. "Yesterday 11:55 PM"; for yesterday (the day before the day of scraping)
#   Month DD, YYYY XX:XX AM/PM  - e.g. "May 25, 2024 6:15 AM"; for any day in the past except today and yesterday

# Timestamp to fast-scroll to before scraping begins.
# If you want to scrape from the beginning, set this to None.
start_of_scraping = '30m'
# Timestamp at which scraping stops (despite the name, this is a timestamp
# string in one of the formats above, not a count of days).
number_of_days = 'Yesterday 11:55 PM'
# Negative slice start: only the last 100 message elements on the page are
# inspected on each scrape pass. Adjust to trade speed against coverage.
check_last_x = -100

def save_messages_to_json(messages, filename):
    """Append messages to *filename* in JSON Lines form.

    Each item of *messages* is serialized as one JSON document followed by a
    newline. The file is opened in append mode, so earlier content is kept
    and the file is created if it does not exist yet.

    Args:
        messages: Iterable of JSON-serializable objects.
        filename: Path of the output file.
    """
    with open(filename, mode='a', encoding='utf-8') as sink:
        for record in messages:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(record, sink, ensure_ascii=False)
            sink.write('\n')

def log_into_account(driver, username, password):
    """Fill in the sign-in form and wait until the browser leaves the login page.

    Args:
        driver: Selenium WebDriver already showing the sign-in page.
        username: Value typed into the field named ``login``.
        password: Value typed into the field named ``password``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the login field does
            not become visible, or the URL does not change, within 10 seconds.
    """
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located((By.NAME, 'login'))).send_keys(username)
    driver.find_element(By.NAME, 'password').send_keys(password)

    # Remember the URL *before* submitting. Reading it after the click races
    # with the navigation: if the page has already moved on, the wait would be
    # armed with the new URL and time out even though login succeeded.
    login_url = driver.current_url
    driver.find_element(By.XPATH, '//button[@data-testid="log-in-submit"]').click()
    wait.until(EC.url_changes(login_url))

def fast_scroll(driver):
    """Report whether the last loaded message carries the start timestamp.

    Looks at the last ``div.StreamMessage_container__omTCg`` element currently
    in the page, prints its timestamp text as progress output, and compares
    that text with the module-level ``start_of_scraping`` value.

    NOTE: the comparison is an exact string match, so the target is only
    detected if a message showing precisely that timestamp is the last one
    loaded when this function is called.

    Args:
        driver: Selenium WebDriver showing the message stream.

    Returns:
        True if the last message's timestamp text equals ``start_of_scraping``;
        False otherwise, including when no message is loaded yet.
    """
    # Slicing with [-1:] yields an empty list (instead of raising IndexError)
    # when no messages are present.
    last_message = driver.find_elements(By.CSS_SELECTOR, 'div.StreamMessage_container__omTCg')[-1:]
    for element in last_message:
        timestamp = element.find_element(By.CSS_SELECTOR, 'time.StreamMessage_timestamp__VVDmF').text
        print(timestamp)
        if timestamp == start_of_scraping:
            return True

    # Previously a bare `False` expression sat in a for/else clause, which did
    # nothing and made the function return None; return the boolean explicitly.
    return False

def scroll_and_scrape(driver, filename, saved_message_ids):
    """Scrape the most recently loaded messages and append new ones to a file.

    Only the trailing slice of message elements selected by the module-level
    ``check_last_x`` is inspected. Messages whose id is already in
    ``saved_message_ids`` are skipped; the rest are written to *filename* via
    ``save_messages_to_json``.

    Args:
        driver: Selenium WebDriver showing the message stream.
        filename: Path of the JSON Lines output file.
        saved_message_ids: Set of ids already saved. It is updated in place
            with the id of every message scraped by this call.

    Returns:
        True if any newly scraped message has a timestamp text exactly equal
        to the module-level ``number_of_days`` value, otherwise False.
    """
    # Protection to speed things up (you can try adjusting check_last_x).
    messages = driver.find_elements(By.CSS_SELECTOR, 'div.StreamMessage_container__omTCg')[check_last_x:]
    new_messages = []
    found_desired_timestamp = False

    for message in messages:
        try:
            test_id = message.get_attribute('data-testid')
            if test_id is None:
                # Element without an id attribute: nothing to key it on.
                continue
            message_id = test_id.replace('message-', '')  # Extract the unique message id
            if message_id in saved_message_ids:  # Skip duplicates
                continue

            # Extracting important information
            message_text = message.find_element(By.CSS_SELECTOR, 'div.RichTextMessage_body__4qUeP').text
            timestamp = message.find_element(By.CSS_SELECTOR, 'time.StreamMessage_timestamp__VVDmF').text
            likes_elem = message.find_elements(By.CSS_SELECTOR, 'span.StreamMessageLabelCount_labelCount__dWyPL')
            likes = likes_elem[0].text if likes_elem else '0'
        except Exception:
            # Best effort: an element that is stale or missing a sub-element
            # is skipped for this pass. Its id is deliberately NOT recorded,
            # so it gets another chance on the next pass.
            continue

        # Record the id only once extraction has succeeded. Marking it earlier
        # would permanently drop any message that failed to parse once.
        saved_message_ids.add(message_id)

        if timestamp == number_of_days:
            found_desired_timestamp = True

        new_messages.append({
            'id': message_id,
            'timestamp': timestamp,
            'likes': likes,
            'text': message_text,
        })

    if new_messages:
        save_messages_to_json(new_messages, filename)  # Save new messages

    return found_desired_timestamp

def remove_old_messages(driver):
    """Delete message elements that have scrolled above the viewport.

    Runs a JavaScript snippet in the page that removes every
    ``div.StreamMessage_container__omTCg`` element whose bounding rectangle
    ends above the top of the viewport. Intended to save RAM and keep
    scraping fast during long scrolling sessions.

    Args:
        driver: Selenium WebDriver showing the message stream.
    """
    # JavaScript executed in the browser: drop messages already scrolled past.
    prune_offscreen_js = """
    var messages = document.querySelectorAll('div.StreamMessage_container__omTCg');
    var viewportHeight = window.innerHeight;
    messages.forEach(function(message) {
        var rect = message.getBoundingClientRect();
        if (rect.bottom < 0) {
            message.parentNode.removeChild(message);
        }
    });
    """
    driver.execute_script(prune_offscreen_js)

def main():
    """Log in, scroll the trending stream and save messages to ``messages.json``.

    Steps:
      1. Read the username and password from ``usrpass.txt`` (first two
         non-blank lines).
      2. Start headless Chrome, sign in and open the trending stream.
      3. Load ids already present in ``messages.json`` so they are not saved
         again.
      4. If ``start_of_scraping`` is set, scroll without scraping until
         ``fast_scroll`` reports that timestamp.
      5. Scroll and scrape until ``scroll_and_scrape`` reports the
         ``number_of_days`` timestamp.

    The browser is always closed, even when an error interrupts the run.

    Raises:
        FileNotFoundError: If ``usrpass.txt`` does not exist.
        ValueError: If ``usrpass.txt`` holds fewer than two non-blank lines.
    """
    # Read credentials first so a missing/invalid file does not start a browser.
    with open('usrpass.txt', 'r') as file:
        credentials = [line.strip() for line in file if line.strip()]
    if len(credentials) < 2:
        raise ValueError(
            "usrpass.txt must contain the username on the first line "
            "and the password on the second"
        )
    username, password = credentials[:2]

    options = Options()
    # `options.headless = True` was deprecated and then removed from Selenium
    # (4.13+), where the assignment is silently ignored. Passing the Chrome
    # flag directly works on every Selenium version.
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get('https://stocktwits.com/signin?next=/login')
        log_into_account(driver, username, password)
        driver.get('https://stocktwits.com/stream/trending')

        json_filename = 'messages.json'
        saved_message_ids = set()

        # Resume support: remember ids saved by earlier runs.
        try:
            with open(json_filename, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():  # tolerate blank lines in the file
                        saved_message_ids.add(json.loads(line)['id'])
        except FileNotFoundError:
            pass

        fast_scroll_count = 0
        if start_of_scraping:
            print("Fast scrolling to the desired timestamp.")
            while not fast_scroll(driver):
                driver.execute_script("window.scrollBy(0, window.innerHeight);")
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.StreamMessage_container__omTCg'))
                )
                fast_scroll_count += 1
                if fast_scroll_count % 5 == 0:
                    remove_old_messages(driver)
            remove_old_messages(driver)

        scroll_count = 0
        while True:
            if scroll_and_scrape(driver, json_filename, saved_message_ids):
                print("Reached desired timestamp.")
                break
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            scroll_count += 1

            # Remove old messages every few scrolls to reduce memory usage
            if scroll_count % 5 == 0:
                remove_old_messages(driver)

            # Wait for new messages to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.StreamMessage_container__omTCg'))
            )

        print("Scraping completed.")
    finally:
        # Always release the browser process, including on errors/timeouts.
        driver.quit()

if __name__ == "__main__":
    main()