In [72]:
# gather_tweets(credentials_file='login.json', stopDate='2025-04-10')

In [73]:
import undetected_chromedriver as uc
import json
import datetime as dt
import pandas as pd
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import os
from dotenv import load_dotenv
import requests
from selenium.common import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import time
from selenium.webdriver.chrome.options import Options

load_dotenv()


def login_chrome(credentials_file):
    # Open json file that contains login_chrome info
    try:
        with open(credentials_file, 'r') as f:
            creds = json.load(f)
        email = creds['email']
        password = creds['password']
    except FileNotFoundError:
        print(f"Error: Credentials file not found at {credentials_file}")
        return None
    except KeyError as e:
        print(f"Error: Missing key {e} in credentials file {credentials_file}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {credentials_file}")
        return None

    # print(creds)
    # Setup webdriver

    # Set up Chrome options
    driver = uc.Chrome()

    # Login to Google
    driver.get('https://accounts.google.com/signin')
    # wait a second for login_chrome page to load
    time.sleep(2)
    driver.find_element(By.XPATH, '//*[@id="identifierId"]').send_keys(email)
    driver.find_element(By.XPATH, '//*[@id="identifierNext"]/div/button/span').click()
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="password"]/div[1]/div/div[1]/input'))
    )
    driver.find_element(By.XPATH, '//*[@id="password"]/div[1]/div/div[1]/input').send_keys(password)
    driver.find_element(By.XPATH, '//*[@id="passwordNext"]/div/button/span').click()
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.XPATH, "//a[@aria-label='Google Account settings']"))
    )

    return driver


def login_twitter(credentials_file):
    driver = login_chrome(credentials_file)
    if driver is None:
        return None # Propagate the error if login_chrome failed

    # Login to Twitter
    driver.get('https://x.com/')

    # Wait for sign in to appear
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.TAG_NAME, 'iframe'))
    )
    curWindow = driver.current_window_handle
    driver.find_element(By.TAG_NAME, 'iframe').click()
    wait = WebDriverWait(driver, 10)
    wait.until(EC.number_of_windows_to_be(2))

    for window in driver.window_handles:
        if window != curWindow:
            driver.switch_to.window(window)

    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.LINK_TEXT, 'Following'))
    )        

    accList = driver.find_element(By.LINK_TEXT, 'Following')
    accList.find_element(By.TAG_NAME, 'div').click()    
    driver.switch_to.window(curWindow)
    return driver


def bulk_gather_tweets(credentials_file, tweets_to_scrape=50):
    driver = login_twitter(credentials_file)
    if driver is None:
        return pd.DataFrame() # Return empty DataFrame if login failed

    # Wait to log into Twitter
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//span[text()='Following']"))
    )
    driver.find_element(By.XPATH, "//span[text()='Following']").click()
    # Wait for timeline to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@aria-label='Timeline: Your Home Timeline']"))
    )

    # Initialize scrolling height, tweets set, and stop variable
    numTweets = 0
    curHeight = driver.execute_script('return window.pageYOffset;')
    tweetList = []
    tweetSet = set()
    keepScrolling = True

    while keepScrolling:

        timeline = driver.find_element(By.XPATH, "//div[@aria-label='Timeline: Your Home Timeline']")
        posts = timeline.find_elements(By.XPATH, ".//article[@data-testid='tweet']")

        for post in posts:
            try:
                source = post.find_element(By.XPATH, ".//div[@data-testid='User-Name']")
                username = source.find_element(By.TAG_NAME, 'a')
                timestamp = post.find_element(By.TAG_NAME, 'time').get_attribute('datetime')
                tweetText = post.find_element(By.XPATH, ".//div[@data-testid='tweetText']")
                # print(f'Source: {username.text}\nTime: {timestamp.text}\n Tweet: {tweetText.text}\n\n')

                tweetDict = {'Source': username.text, 'timestamp': timestamp, 'Tweet': tweetText.text}

                # Add unique entries into a dictionary
                if tweetText.text not in tweetSet:
                    numTweets += 1
                    tweetSet.add(tweetText.text)
                    tweetList.append(tweetDict)
            except:
                print(f'No tweet text found for {username.text} on: {timestamp}')
                continue

            if numTweets >= tweets_to_scrape:
                keepScrolling = False
                break

            # if stopText in tweetText.text:
            #     keepScrolling = False
            #     break

        # Scroll to load more tweets
        driver.execute_script('window.scrollBy(0, 5000);')
        time.sleep(1)

    return pd.DataFrame(tweetList)


async def gather_tweets(credentials_file, stopDate, skipRefresh=False, login=False, driver=None,
                  stopAtHandle='@CacheCollegiate', mode='refresh', tweets_to_scrape=50000000000,
                  system_instructions=None, findSpecificUser=True, stopTexts=None, alreadyExecutedTweets=[]):
    if login:
        driver = login_twitter(credentials_file)
        if driver is None:
            return pd.DataFrame() # Return empty DataFrame if login failed
        time.sleep(5)
    # Setup for image saving
    image_folder = 'downloaded_images'
    if not os.path.exists(image_folder):
        os.makedirs(image_folder)

    # Initialize tweets set, and stop variable
    numTweets = 0
    max_retries = 1  # Maximum number of retries
    tweetList = []
    tweetSet = set()
    continueGatheringTweets = True
    print("Navigating to following tab")

    # Following tab
    element = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, "//*[text()='Following']"))
        )
    element.click()

   

    while continueGatheringTweets:
        retry_count = 0
        print('Continuing...')
        # time.sleep(1.5)  # sleep to avoid getting banned on twitter.
        while retry_count < max_retries:
            try:
                # Wait for the element to be present
                WebDriverWait(driver, 12).until(
                    EC.presence_of_element_located((By.XPATH, "//div[@aria-label='Home timeline']"))
                )
                print("Element found!")
                break  # Exit the loop if the element is found
            except TimeoutException:
                print(f"Element not found. Retrying... ({retry_count + 1}/{max_retries})")
                retry_count += 1
                driver.refresh()  # Refresh the page
                print('sleeping 10 seconds')
                time.sleep(60)
        else:
            driver.close()
            print('black screen of death, closing and reopening new window after wait')
            time.sleep(5)
            driver = login_twitter(credentials_file) # Also use credentials file on re-login
            if driver is None:
                 return pd.DataFrame(tweetList) # Return collected tweets if re-login fails
            time.sleep(5)
            continue

        print("home timeline loaded")

        try:
            timeline = driver.find_element(By.XPATH, "//div[@aria-label='Home timeline']")
        except:
            timeline = driver.find_element(By.XPATH, "//div[@aria-label='Timeline: Search timeline']")

        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//span[text()='Following']"))
        )
        driver.find_element(By.XPATH, "//span[text()='Following']").click()

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, ".//article[@data-testid='tweet']"))
        )
        posts = timeline.find_elements(By.XPATH, ".//article[@data-testid='tweet']")

        print("{} posts found".format(len(posts)))

        return posts



    return pd.DataFrame(tweetList)


In [74]:
# driver = login_chrome('login.json')

In [75]:

import asyncio

async def login_twitter(driver):
    # Login to Twitter
    driver.get('https://x.com/')

    # Wait for sign in to appear
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.TAG_NAME, 'iframe'))
    )
    curWindow = driver.current_window_handle
    driver.find_element(By.TAG_NAME, 'iframe').click()
    wait = WebDriverWait(driver, 10)
    wait.until(EC.number_of_windows_to_be(3))

    for window in driver.window_handles:
        if window != curWindow:
            driver.switch_to.window(window)

        try:
            driver.find_element(By.CSS_SELECTOR, '.VV3oRb.YZVTmd.SmR8').click()
        except Exception as e:
            print(f"Error clicking element: {e}")

    # Find the input field with the specified classes
    # Switch back to the original window
    driver.switch_to.window(curWindow)
    print(f"Switched back to original window: {curWindow}")
    try:
        input_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'input'))
        )
        input_field.send_keys("5stack5")
        
        # Click the Next button
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//span[text()='Next']"))
        )
        next_button.click()
        
        print("Successfully entered text and clicked Next button")

        return driver
    except Exception as e:
        print(f"Error interacting with elements: {e}")


In [77]:
driver = login_chrome('login.json')
driver = login_twitter(driver)
gather_tweets(driver=driver, credentials_file='login.json', stopDate='2025-04-10')


Error clicking element: Message: no such element: Unable to locate element: {"method":"css selector","selector":".VV3oRb.YZVTmd.SmR8"}
  (Session info: chrome=135.0.7049.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00BE8073+60707]
	GetHandleVerifier [0x00BE80B4+60772]
	(No symbol) [0x00A10683]
	(No symbol) [0x00A58660]
	(No symbol) [0x00A589FB]
	(No symbol) [0x00AA1022]
	(No symbol) [0x00A7D094]
	(No symbol) [0x00A9E824]
	(No symbol) [0x00A7CE46]
	(No symbol) [0x00A4C5D3]
	(No symbol) [0x00A4D424]
	GetHandleVerifier [0x00E2BB53+2435075]
	GetHandleVerifier [0x00E270F3+2416035]
	GetHandleVerifier [0x00E4349C+2531660]
	GetHandleVerifier [0x00BFF145+155125]
	GetHandleVerifier [0x00C05AED+182173]
	GetHandleVerifier [0x00BEF948+91640]
	GetHandleVerifier [0x00BEFAF0+92064]
	GetHandleVerifier [0x00BDA5B0+4704]
	BaseThreadInitThunk [0x7613FCC9+25]
	RtlGetApp

[<undetected_chromedriver.webelement.WebElement (session="c494214e5628ad5927d40eedcf93f74e", element="f.11C1470E4E65617955E7C55FED60A3B8.d.83516310A5660A85125002873FC2A111.e.294")>,
 <undetected_chromedriver.webelement.WebElement (session="c494214e5628ad5927d40eedcf93f74e", element="f.11C1470E4E65617955E7C55FED60A3B8.d.83516310A5660A85125002873FC2A111.e.298")>,
 <undetected_chromedriver.webelement.WebElement (session="c494214e5628ad5927d40eedcf93f74e", element="f.11C1470E4E65617955E7C55FED60A3B8.d.83516310A5660A85125002873FC2A111.e.299")>,
 <undetected_chromedriver.webelement.WebElement (session="c494214e5628ad5927d40eedcf93f74e", element="f.11C1470E4E65617955E7C55FED60A3B8.d.83516310A5660A85125002873FC2A111.e.300")>,
 <undetected_chromedriver.webelement.WebElement (session="c494214e5628ad5927d40eedcf93f74e", element="f.11C1470E4E65617955E7C55FED60A3B8.d.83516310A5660A85125002873FC2A111.e.301")>,
 <undetected_chromedriver.webelement.WebElement (session="c494214e5628ad5927d40eedcf93f74e