# Twitter Scrapper (Test Zone)

## Cargando librerías

In [48]:
import os
import json
import sys 
from sys import exit 
import numpy as np

In [None]:
import time
import urllib.request
import ssl
from dotenv import load_dotenv
from unicodedata import name
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse, parse_qs
import urllib.parse



## Cargando variables del ambiente (api keys, contraseñas)

In [50]:
load_dotenv()
twitter_user = os.getenv("twitter_user")
twitter_password = os.getenv("twitter_password")
twitter_username = os.getenv("twitter_username")
detect_language_api_key= os.getenv("DETECT_LANGUAGE_API_KEY")
youtube_api_key = os.getenv("YOUTUBE_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")    

## Funciones principales

In [97]:
import os 
import google.generativeai as genai 

def askGemini(prompt, column_text, api_key=None):
    time.sleep(10)
    if api_key is None:
        print("Gemini API key not set. Please set the GEMINI_API_KEY environment variable.")
        return None
    model = "gemini-1.5-turbo"

    genai.configure(api_key=api_key)
    model_llm = genai.GenerativeModel('gemini-1.5-flash')
    prompt = f"{prompt} +\n{column_text}"
    generic_response_es = "No hay respuesta"
    try:
        response = model_llm.generate_content(prompt)

        # Check if the response has text and is not just whitespace
        if hasattr(response, 'text') and response.text and response.text.strip():
            return response.text
        else:
            # Handle cases where response.text might be missing, empty, or only whitespace
            # This also catches if the model couldn't generate a proper candidate.
            if response.prompt_feedback and \
               response.prompt_feedback.block_reason != genai.types.BlockReason.UNSPECIFIED:
                print(f"Advertencia: La respuesta fue bloqueada. Razón: {response.prompt_feedback.block_reason_message}")
            return generic_response_es
            
    except Exception as e:
        print(f"Ocurrió un error al consultar a Gemini: {e}")
        return generic_response_es


In [98]:
def generate_x_search_url(query: str, from_user: str = None, type= 'top') -> str:
    """
    Genera la query de x para su búsqueda sencillita
    https://x.com/search?q=Query%20Text%20(from%3Ausername)&src=typed_query

    Args:
        query: The main search query string (e.g., "Crypto secured loan").
        from_user: Optional. The X.com username to filter tweets from (e.g., "cdixon").

    Returns:
        A string containing the fully formed X.com search URL.
    """
    base_url = "https://x.com/search"
    
    search_components = [query]
    if from_user:
        search_components.append(f"(from:{from_user})")
        
    full_query_string = " ".join(search_components)
    
    encoded_query = urllib.parse.quote(full_query_string, safe='()')
    search_url = f"{base_url}?q={encoded_query}&src=typed_query"
    if type=='latest':
        search_url = search_url + '&f=live'
    return search_url

# Example Usage:
# if __name__ == "__main__":
#     url1 = generate_x_search_url("Crypto secured loan", "cdixon")
#     print(f"Generated URL 1: {url1}")
#
#     url2 = generate_x_search_url("AI in education")
#     print(f"Generated URL 2: {url2}")

In [99]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import re 

# It's good practice to configure a logger instead of using print for production scripts
# import logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def scrape_tweets_from_page(
    url: str,
    button_css_selector: str,
    tweet_article_selector: str = 'article[data-testid="tweet"]',
    tweet_data_testid: str = 'tweetText',
    max_scroll_attempts: int = 20,
    no_new_tweets_threshold: int = 5, # Reduced threshold for faster exit if no new content
    wait_time_after_action: float = 3.0, # Time to wait after scroll or click
    webdriver_options=None,
    driver = None,
    max_results: int = 10
):
    """
    Scrapes tweets from a given X.com URL by repeatedly trying to click a "show more"
    button or scrolling down the page.

    Args:
        url (str): The URL of the X.com page to scrape.
        button_css_selector (str): The CSS selector for the "show more" or similar button.
                                   WARNING: This is often brittle. Consider more robust selectors.
        tweet_data_testid (str): The value of the 'data-testid' attribute for tweet text containers.
        max_scroll_attempts (int): Maximum number of attempts to scroll/click.
        no_new_tweets_threshold (int): Stop if no new tweets are found after this many attempts.
        wait_time_after_action (float): Seconds to wait after a scroll or button click.
        max_results (int, optional): Maximum number of tweets to scrape. Defaults to None (no limit).
        webdriver_options: Optional Selenium WebDriver options (e.g., for headless mode).

    Returns:
        pandas.DataFrame: A DataFrame containing the scraped tweet texts, or an empty
                          DataFrame if an error occurs or no tweets are found.
    """
   
   #if webdriver_options is None:
   #     # Basic example, you might want to configure this further (e.g., path to chromedriver)
   #     options = webdriver.ChromeOptions()
   #     # options.add_argument('--headless') # Uncomment for headless browsing
   #     # options.add_argument('--disable-gpu')
   #     driver = webdriver.Chrome(options=options)
   # else:
   #     driver = webdriver.Chrome(options=webdriver_options)
    if driver is None:
        raise ValueError("A Selenium WebDriver instance must be provided via the 'driver' argument.")

    all_tweets_data = []
    scroll_attempt = 0
    previous_tweet_count = 0
    no_new_tweets_counter = 0

    print(f"Attempting to scrape tweets from: {url}")
    driver.get(url)
    time.sleep(wait_time_after_action) # Initial wait for page to load

    try:
        while scroll_attempt < max_scroll_attempts:
            scroll_attempt += 1
            print(f'Load attempt number: {scroll_attempt}')

            try:
                # Try to find and click the button
                # IMPORTANT: The provided button_css_selector is likely very fragile.
                # Consider using XPath with text content for more robustness, e.g.:
                # xpath_selector = "//button[.//span[contains(text(), 'Show more replies') or contains(text(), 'Show')]]"
                # boton_mostrar_mas = WebDriverWait(driver, 10).until(
                #     EC.element_to_be_clickable((By.XPATH, xpath_selector))
                # )
                boton_mostrar_mas = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, button_css_selector))
                )
                print(f'Found and clicked button using selector: {button_css_selector}')
                boton_mostrar_mas.click()
                time.sleep(wait_time_after_action)
            except TimeoutException:
                print('Button not found or not clickable, attempting to scroll.')
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(wait_time_after_action)
            except Exception as e:
                print(f"Error clicking button or scrolling: {e}")
                # Fallback to scroll if click fails for other reasons
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(wait_time_after_action)

            html_content = driver.page_source
            soup = BeautifulSoup(html_content, 'html.parser')
            #tweet_text_elements = soup.find_all('div', {'data-testid': tweet_data_testid})
            
            #current_tweet_count = len(tweet_text_elements)
            tweet_article_elements = soup.select(tweet_article_selector)
            current_tweet_count = len(tweet_article_elements)

            if current_tweet_count > previous_tweet_count:
                new_tweets_found = current_tweet_count - previous_tweet_count
                print(f"Found {new_tweets_found} new tweets.")
                #for tweet_element in tweet_text_elements[previous_tweet_count:]:
                #    tweet_text = tweet_element.get_text(separator=' ', strip=True)
                #    all_tweets_data.append({'Texto del Tweet': tweet_text})
                for article_element in tweet_article_elements[previous_tweet_count:]:
                    tweet_text=None
                    username = None
                    timestamp = None
                    tweet_id = None
                    text_element = article_element.find('div', {'data-testid': tweet_data_testid})
                    if text_element:
                        tweet_text = text_element.get_text(separator=' ', strip=True)
                    
                    user_name_div = article_element.find('div', attrs={'data-testid': 'User-Name'})

                    if user_name_div:
                        user_link_tag = user_name_div.find('a', href=True, string=lambda t: t and t.startswith('@'))
                        if user_link_tag:
                            username = user_link_tag.text.strip()
                    time_tag =  article_element.find('time', datetime=True)
                    if time_tag:
                         timestamp = time_tag['datetime']
                    
                    # Extract Tweet ID (from permalink like /<user>/status/<id>)
                    permalink_tags = article_element.find_all('a', href=True)
                    for tag in permalink_tags:
                        href = tag.get('href', '')
                        if '/status/' in href:
                            try:
                                potential_id = href.split('/status/')[-1].split('?')[0].split('/')[0]
                                if potential_id.isdigit():
                                    tweet_id = potential_id
                                    break 
                            except (IndexError, AttributeError):
                                continue # Problem parsing this href

                    all_tweets_data.append({
                        'tweet_text': tweet_text,
                        'username': username,
                        'timestamp': timestamp, 
                        'tweet_id': tweet_id
                    })
                    if max_results is not None and len(all_tweets_data) >= max_results:
                        print(f"Reached maximum results limit of {max_results}. Stopping scrape.")
                        break
                if max_results is not None and len(all_tweets_data) >= max_results:
                    print(f"Reached maximum results limit of {max_results}. Stopping scrape.")
                previous_tweet_count = current_tweet_count
                no_new_tweets_counter = 0
            else:
                no_new_tweets_counter += 1
                print(f"No new tweets found in this attempt ({no_new_tweets_counter}/{no_new_tweets_threshold}).")
                if no_new_tweets_counter >= no_new_tweets_threshold:
                    print("Reached tweet loading limit or end of content.")
                    break
            if max_results is not None and len(all_tweets_data) >= max_results:
                print(f"Reached maximum results limit of {max_results}. Stopping scrape.")
                break
    except Exception as e:
        print(f"An critical error occurred: {e}")
    #finally:
    #    driver.quit()

    df_tweets = pd.DataFrame(all_tweets_data)
    df_tweets['original_url'] = url
    df_tweets['tweet_link'] = 'https://x.com/'+   df_tweets['username'] +'/status/' + df_tweets['tweet_id']
    df_tweets['tweet_link'] = df_tweets['tweet_link'].str.replace('@', '', regex=False)
    print(f"\nScraped {len(df_tweets)} tweets in total.")
    return df_tweets


# Example Usage (you'll need to define 'your_target_url' and potentially adjust the selector):
# if __name__ == "__main__":
#     target_url = "https://x.com/search?q=some%20query&src=typed_query" # Replace with your actual URL
     # THIS IS THE BRITTLE SELECTOR FROM YOUR ORIGINAL SCRIPT.
     # IT IS HIGHLY RECOMMENDED TO REPLACE THIS WITH A MORE ROBUST ONE.
#     original_button_css = '#react-root > div > div > div.css-175oi2r.r-1f2l425.r-13qz1uu.r-417010.r-18u37iz > main > div > div > div > div > div > section > div > div > div:nth-child(20) > div.css-175oi2r.r-1adg3ll.r-1ny4l3l > div > button > div'
     # Example of a potentially more robust XPath (you'd need to adjust and test this):
     # robust_button_xpath = "//button[.//span[contains(text(), 'Show more replies') or contains(text(), 'Show')]]"
     # If using XPath, you'd modify the WebDriverWait to use By.XPATH

#     df = scrape_tweets_from_page(target_url, button_css_selector=original_button_css)
#     if not df.empty:
#         print(df.head())
        # df.to_csv('/home/user/scraped_tweets.csv', index=False)
        # print("Saved tweets to /home/user/scraped_tweets.csv")

In [100]:
import re

def get_youtube_channel_username(url_string: str) -> str | None:
    if not isinstance(url_string, str):
        return None

    # Pattern for URLs like /c/ChannelName
    # e.g., https://www.youtube.com/c/CoinBureau
    c_pattern = r"youtube\.com/c/([^/?]+)"
    match_c = re.search(c_pattern, url_string)
    if match_c:
        return match_c.group(1)

    # Pattern for URLs like /@ChannelName
    # e.g., https://www.youtube.com/@DiscoverCrypto_
    at_pattern = r"youtube\.com/@([^/?]+)"
    match_at = re.search(at_pattern, url_string)
    if match_at:
        return match_at.group(1)
        # We add the '@' symbol to match the typical handle format
    #    return "@" + match_at.group(1)

    return None


In [101]:
from googleapiclient.discovery import build
youtube = build('youtube', 'v3', developerKey=youtube_api_key)

def get_youtube_comments(video_url):
    if "v=" in video_url:
        video_id = video_url.split("v=")[-1].split("&")[0]
    elif "youtu.be/" in video_url:
        video_id = video_url.split("youtu.be/")[-1].split("?")[0]
    #if not video_id or not YOUTUBE_API_KEY:
    if not video_id or not youtube_api_key:
        return "Error: URL de YouTube no válida o clave API no configurada."
    try:
        comments = []
        response = youtube.commentThreads().list(
            part='snippet', 
            videoId=video_id,
            textFormat='plainText',
            maxResults=100
        ).execute()
        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            author = item['snippet']['topLevelComment']['snippet']['authorDisplayName']
            comments.append({'author': author, 'comment': comment})
        ## Esto será comentado, pero por ahora solo quiero los primeros 10 comentarios
        next_page_token = None
        while True: 
            response = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                textFormat='plainText',
                pageToken=next_page_token,
                maxResults=100
            ).execute()
            for item in response['items']:
                comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
                author = item['snippet']['topLevelComment']['snippet']['authorDisplayName']
                comments.append({'author': author, 'comment': comment})
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
        df = pd.DataFrame(comments)
        return df
    except Exception as e:
        return f"Error al obtener comentarios: {e}"



def youtube_search(youtube, **kwargs):
    return youtube.search().list(
        part="snippet",
        **kwargs
    ).execute()


In [102]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# It's good practice to configure a logger instead of using print for production scripts
# import logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def scrape_tweets_from_page2(
    url: str,
    button_css_selector: str,
    tweet_article_selector: str = 'article[data-testid="tweet"]', # Selector for the main tweet article
    tweet_data_testid: str = 'tweetText',
    max_scroll_attempts: int = 20,
    no_new_tweets_threshold: int = 5, # Reduced threshold for faster exit if no new content
    wait_time_after_action: float = 3.0, # Time to wait after scroll or click
    webdriver_options=None, # This parameter is kept for consistency but driver instance is preferred
    driver= None
):
    """
    Scrapes tweets from a given X.com URL by repeatedly trying to click a "show more"
    button or scrolling down the page.

    Args:
        url (str): The URL of the X.com page to scrape.
        button_css_selector (str): The CSS selector for the "show more" or similar button.
                                   Can be None if no such button is expected.
        tweet_article_selector (str): CSS selector for the main tweet article container.
        tweet_data_testid (str): The value of the 'data-testid' attribute for tweet text containers.
        max_scroll_attempts (int): Maximum number of attempts to scroll/click.
        no_new_tweets_threshold (int): Stop if no new tweets are found after this many attempts.
        wait_time_after_action (float): Seconds to wait after a scroll or button click.
        webdriver_options: Optional Selenium WebDriver options (e.g., for headless mode).
                           It's recommended to pass an initialized driver instance instead.
        driver: An initialized Selenium WebDriver instance.

    Returns:
        pandas.DataFrame: A DataFrame containing the scraped tweet data (text, username, ID,
                          timestamp, replies, retweets, likes), or an empty DataFrame.
    """
    if driver is None:
        raise ValueError("A Selenium WebDriver instance must be provided via the 'driver' argument.")

    all_tweets_data = []
    processed_tweet_ids = set()
    scroll_attempt = 0
    no_new_tweets_counter = 0

    print(f"Attempting to scrape tweets from: {url}")
    driver.refresh()
    driver.get(url)
    time.sleep(wait_time_after_action) # Initial wait for page to load

    try:
        while scroll_attempt < max_scroll_attempts:
            scroll_attempt += 1
            print(f'Load attempt number: {scroll_attempt}')

            action_taken = False
            if button_css_selector:
                try:
                    boton_mostrar_mas = WebDriverWait(driver, 5).until( # Shorter wait for button
                        EC.element_to_be_clickable((By.CSS_SELECTOR, button_css_selector))
                    )
                    print(f'Found and clicked button using selector: {button_css_selector}')
                    boton_mostrar_mas.click()
                    action_taken = True
                    time.sleep(wait_time_after_action)
                except TimeoutException:
                    print('Button not found or not clickable in this attempt.')
                except Exception as e:
                    print(f"Error clicking button: {e}")
            
            if not action_taken: # If button not clicked or no button selector, then scroll
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                print('Performing scroll.')
                time.sleep(wait_time_after_action)

            html_content = driver.page_source
            soup = BeautifulSoup(html_content, 'html.parser')
            tweet_article_elements = soup.select(tweet_article_selector)
            
            new_tweets_scraped_this_round = 0

            for article_element in tweet_article_elements:
                tweet_id = None
                permalink_tags = article_element.find_all('a', href=True)
                for tag in permalink_tags:
                    href = tag.get('href', '')
                    if '/status/' in href:
                        try:
                            potential_id = href.split('/status/')[-1].split('?')[0].split('/')[0]
                            if potential_id.isdigit():
                                tweet_id = potential_id
                                break
                        except (IndexError, AttributeError):
                            continue
                
                if tweet_id and tweet_id not in processed_tweet_ids:
                    tweet_text, username, timestamp, reply_count, retweet_count, like_count = (None,) * 6

                    text_element = article_element.find('div', {'data-testid': tweet_data_testid})
                    if text_element: tweet_text = text_element.get_text(separator=' ', strip=True)
                    
                    user_name_div = article_element.find('div', attrs={'data-testid': 'User-Name'})
                    if user_name_div:
                        user_link_tag = user_name_div.find('a', href=True)
                        if user_link_tag:
                            handle_text = user_link_tag.find(string=lambda t: t and t.startswith('@'))
                            if handle_text: username = handle_text.strip()
                            elif user_link_tag.text.strip().startswith('@'): username = user_link_tag.text.strip()
                    
                    time_tag = article_element.find('time', datetime=True)
                    if time_tag: timestamp = time_tag['datetime']

                    for metric_div in article_element.find_all('div', {'data-testid': ['reply', 'retweet', 'like']}):
                        if metric_div.has_attr('aria-label'):
                            count = metric_div['aria-label'].split(' ')[0]
                            if metric_div['data-testid'] == 'reply': reply_count = count
                            elif metric_div['data-testid'] == 'retweet': retweet_count = count
                            elif metric_div['data-testid'] == 'like': like_count = count

                    all_tweets_data.append({'tweet_text': tweet_text, 'username': username, 'tweet_id': tweet_id,
                                            'timestamp': timestamp, 'replies': reply_count, 'retweets': retweet_count, 'likes': like_count})
                    processed_tweet_ids.add(tweet_id)
                    new_tweets_scraped_this_round += 1
            
            if new_tweets_scraped_this_round > 0:
                print(f"Found and processed {new_tweets_scraped_this_round} new tweets in this round.")
                no_new_tweets_counter = 0
            else:
                no_new_tweets_counter += 1
                print(f"No new tweets found in this attempt ({no_new_tweets_counter}/{no_new_tweets_threshold}).")
                if no_new_tweets_counter >= no_new_tweets_threshold:
                    print("Reached tweet loading limit or end of content.")
                    break

    except Exception as e:
        print(f"A critical error occurred during scraping: {e}")

    df_tweets = pd.DataFrame(all_tweets_data)
    print(f"\nScraped {len(df_tweets)} tweets in total.")
    return df_tweets

In [103]:
from deep_translator import GoogleTranslator, single_detection
def detectLanguage(text, api_key):
    language_detected = single_detection(text, api_key=api_key)
    return language_detected


def TranslateText(text, source, target):
    try:
        translatedText = GoogleTranslator(source = source, target = target).translate(text)
    except Exception as e:
        translatedText= None
    return translatedText 

def convertDfLanguage(df, column, api_key, target ='es', source = None):
    new_name = column + '_es'
    if source is None:
        df['original_language'] = df[column].apply(lambda x : detectLanguage(x, api_key=api_key))
        df[new_name] = df[['original_language', column]].apply(lambda x: TranslateText(text = x[column], source= x['original_language'], 
                                                                                       target=target ) if x['original_language']!='es' else x[column], axis=1)
    else:
        df['original_language'] = source
        df[new_name] = df[['original_language', column]].apply(lambda x: TranslateText(text = x[column], source= x['original_language'], 
                                                                                       target=target ) if x['original_language']!='es' else x[column], axis=1)

    return df 

In [104]:
#service = Service('E:/Users/1167486/Local/Drivers_web/chromedriver-win64/chromedriver.exe')
#options = webdriver.ChromeOptions()
#options.binary_location = r'E:/Users/1167486/Local/Drivers_web/chrome-win64/chrome.exe'
#options.add_argument('--disable-blink-features=AutomationControlled')


In [105]:
#driver = webdriver.Chrome(service=service, options=options)
#driver = webdriver.Chrome(service=webdriver.chrome.service.Service(ChromeDriverManager().install()), options=options)



In [106]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

print("Attempting to start Chrome...")
try:
    options = webdriver.ChromeOptions()
    options.binary_location = r'E:/Users/1167486/Local/Drivers_web/chrome-win64/chrome.exe'
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--no-sandbox') # Often helpful in restricted environments
    options.add_argument('--disable-dev-shm-usage') # Overcomes resource limits
    options.add_argument('--headless') # Try headless to rule out display/GUI issues
    options.add_argument('--disable-gpu') # Often needed with headless

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    print("Driver started. Navigating to Google...")
    driver.get("https://www.google.com")
    print(f"Page title: {driver.title}")
    driver.quit()
    print("Driver quit successfully.")
except Exception as e:
    print(f"An error occurred: {e}")

Attempting to start Chrome...
Driver started. Navigating to Google...
Page title: Google
Driver quit successfully.


In [107]:
options = webdriver.ChromeOptions()
options.binary_location = r'E:/Users/1167486/Local/Drivers_web/chrome-win64/chrome.exe'
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage') 
#options.add_argument('--headless') 
#options.add_argument('--disable-gpu') 
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), 
                          options=options)


In [108]:
#driver.quit()

In [109]:
driver.maximize_window()
driver.get('https://x.com/i/flow/login')
time.sleep(5)

In [110]:
#username = driver.find_element("css selector", "input[autocomplete='username']")
#username.clear()
#username.send_keys(twitter_user)
#username.send_keys(Keys.ENTER)
#time.sleep(5)


In [111]:
"""
if  driver.find_element("css selector", "input[autocomplete='on']"):
    xpath ='//*[@id="react-root"]/div/div/div/main/div/div/div/div[2]/div[2]/div[1]/div/div/div[2]/label/div/div[2]/div/input'
    #double_name = driver.find_element("css selector", "input[autocomplete='on']")
    double_name = driver.find_element("css selector", "input[data-testid='ocfEnterTextTextInput']")

    #double_name = driver.find_element(By.NAME, "text")
    #double_name = driver.find_element(By.XPATH, xpath)

    double_name.clear()
    double_name.send_keys(twitter_username)
    double_name.send_keys(Keys.ENTER)
    time.sleep(3)
    password = driver.find_element("css selector", "input[autocomplete='current-password']")
    password.clear()
    password.send_keys(twitter_password)
    password.send_keys(Keys.ENTER)
else:
    time.sleep(3)
    password = driver.find_element("css selector", "input[autocomplete='current-password']")
    password.clear()
    password.send_keys(twitter_password)
    password.send_keys(Keys.ENTER)

time.sleep(2)
"""


'\nif  driver.find_element("css selector", "input[autocomplete=\'on\']"):\n    xpath =\'//*[@id="react-root"]/div/div/div/main/div/div/div/div[2]/div[2]/div[1]/div/div/div[2]/label/div/div[2]/div/input\'\n    #double_name = driver.find_element("css selector", "input[autocomplete=\'on\']")\n    double_name = driver.find_element("css selector", "input[data-testid=\'ocfEnterTextTextInput\']")\n\n    #double_name = driver.find_element(By.NAME, "text")\n    #double_name = driver.find_element(By.XPATH, xpath)\n\n    double_name.clear()\n    double_name.send_keys(twitter_username)\n    double_name.send_keys(Keys.ENTER)\n    time.sleep(3)\n    password = driver.find_element("css selector", "input[autocomplete=\'current-password\']")\n    password.clear()\n    password.send_keys(twitter_password)\n    password.send_keys(Keys.ENTER)\nelse:\n    time.sleep(3)\n    password = driver.find_element("css selector", "input[autocomplete=\'current-password\']")\n    password.clear()\n    password.send_ke

In [None]:
wait = WebDriverWait(driver, 10)
try:
    print('Intentando ingresar el nombre de usuario/correo')
    username = driver.find_element("css selector", "input[autocomplete='username']")
    username.send_keys(twitter_user)
    username.send_keys(Keys.ENTER)
    print('Encontró el nombre de usuario y lo ingresó')
    time.sleep(5)
    # Wait for the username/email input field using data-testid
    print('Intentando ingresar el nombre de usuario como medida de seguridad')
    username_input = wait.until(
        #EC.element_to_be_clickable((By.CSS_SELECTOR, "input[data-testid='ocfEnterTextTextInput']"))
        EC.element_to_be_clickable((By.XPATH, "//input[@data-testid='ocfEnterTextTextInput']"))
    )
    #username_input.clear()
    username_input.send_keys(twitter_username)
    username_input.send_keys(Keys.ENTER)
    print('Encontró el nombre de usuario de seguridad y lo ingresó')
    time.sleep(4)
    print('Intentando ingresar la contraseña')
    password_input = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[autocomplete='current-password']"))
    )
    password_input.clear()
    password_input.send_keys(twitter_password)
    password_input.send_keys(Keys.ENTER)
    print("Encontró la contraseña y la ingresó")
except Exception as e:
    print(f"An error occurred during login steps: {e}")
    # Depending on your script, you might want to quit the driver here
    driver.quit()


Intentando ingresar el nombre de usuario/correo
Encontró el nombre de usuario y lo ingresó
Intentando ingresar el nombre de usuario como medida de seguridad
Encontró el nombre de usuario de seguridad y lo ingresó
Intentando ingresar la contraseña
Encontró la contraseña y la ingresó


In [113]:
#driver.quit()

In [114]:
#password = driver.find_element("css selector", "input[autocomplete='current-password']")
#password.clear()
#password.send_keys(twitter_password)
#password.send_keys(Keys.ENTER)



## Probando las funciones

In [74]:
#tweet_test = 'https://x.com/anabelhoficial/status/1906888988192887030'
#tweet_test = 'https://x.com/Juan_OrtizMX/status/1907289700101935494'
tweet_test = 'https://x.com/RicardoBSalinas/status/1914398783624126967'


In [75]:
query1 = 'Crypto secured loan'
query2 = 'Crypto backed loan'
query3 = 'Loan with cryptos as a guarantee'

In [76]:
test1 = generate_x_search_url(query1)
test2 = generate_x_search_url(query1, 'cdixon')

In [77]:
print(test1)
print(test2)


https://x.com/search?q=Crypto%20secured%20loan&src=typed_query
https://x.com/search?q=Crypto%20secured%20loan%20(from%3Acdixon)&src=typed_query


In [78]:
driver.refresh()
original_button_css = '#react-root > div > div > div.css-175oi2r.r-1f2l425.r-13qz1uu.r-417010.r-18u37iz > main > div > div > div > div > div > section > div > div > div:nth-child(20) > div.css-175oi2r.r-1adg3ll.r-1ny4l3l > div > button > div'
testscrape = scrape_tweets_from_page(test1, max_scroll_attempts=1, 
                                     button_css_selector=original_button_css, driver= driver, max_results=2)

Attempting to scrape tweets from: https://x.com/search?q=Crypto%20secured%20loan&src=typed_query
Load attempt number: 1
Button not found or not clickable, attempting to scroll.
Found 7 new tweets.
Reached maximum results limit of 2. Stopping scrape.
Reached maximum results limit of 2. Stopping scrape.
Reached maximum results limit of 2. Stopping scrape.

Scraped 2 tweets in total.


In [79]:
#testscrape2 = scrape_tweets_from_page2(test1, max_scroll_attempts=1, button_css_selector=original_button_css, driver= driver)

In [80]:
testscrape

Unnamed: 0,tweet_text,username,timestamp,tweet_id,original_url,tweet_link
0,"For anyone curious, this is what my $BTC -back...",@BitMasterK,2025-05-20T17:08:13.000Z,1924874818815398041,https://x.com/search?q=Crypto%20secured%20loan...,https://x.com/BitMasterK/status/19248748188153...
1,Quartz is an on-chain credit card that lets yo...,@GetPyra,2025-03-17T16:45:29.000Z,1901676272041763069,https://x.com/search?q=Crypto%20secured%20loan...,https://x.com/GetPyra/status/1901676272041763069


In [81]:
#testscrape2

In [82]:
convertDfLanguage(testscrape, column='tweet_text', api_key=detect_language_api_key, target ='es')

Unnamed: 0,tweet_text,username,timestamp,tweet_id,original_url,tweet_link,original_language,tweet_text_es
0,"For anyone curious, this is what my $BTC -back...",@BitMasterK,2025-05-20T17:08:13.000Z,1924874818815398041,https://x.com/search?q=Crypto%20secured%20loan...,https://x.com/BitMasterK/status/19248748188153...,en,"Para cualquier persona curiosa, así es como se..."
1,Quartz is an on-chain credit card that lets yo...,@GetPyra,2025-03-17T16:45:29.000Z,1901676272041763069,https://x.com/search?q=Crypto%20secured%20loan...,https://x.com/GetPyra/status/1901676272041763069,en,Quartz es una tarjeta de crédito en la cadena ...


## Análisis principal

In [None]:
listUsersTwitter = [
             'https://x.com/cdixon?roistat_visit=2540096',
             'https://x.com/rogerkver?roistat_visit=2540096',
             'https://x.com/balajis?roistat_visit=2540096',
             'https://x.com/ErikVoorhees?roistat_visit=2540096',
             'https://x.com/pmarca?roistat_visit=2540096',
             'https://x.com/CryptoGodJohn?roistat_visit=2540096',
             'https://x.com/VitalikButerin?roistat_visit=2540096',
             'https://x.com/elonmusk?roistat_visit=2540096',
             'https://x.com/aantonop?roistat_visit=2540096',
             'https://x.com/IvanOnTech?roistat_visit=2540096',
             'https://x.com/SatoshiLite?roistat_visit=2540096',
             'https://x.com/saylor?roistat_visit=2540096'
             ]

youtubeChannelsList=[
    'https://www.youtube.com/c/CoinBureau?roistat_visit=2540096',
    'https://www.youtube.com/@DiscoverCrypto_/videos?roistat_visit=2540096',
    'https://www.youtube.com/c/AltcoinDaily?roistat_visit=2540096',
    'https://www.youtube.com/c/BrianJungy/videos?roistat_visit=2540096',
    'https://www.youtube.com/c/MaxMaher?roistat_visit=2540096',
    'https://www.youtube.com/c/TheMoonCarl?roistat_visit=2540096'

]

questions = [
    'Crypto secured loan', 
    'Crypto backed loan',
    'Loan with cryptos as a guarantee'
]



### Twitter extracción

#### Influencers extracción

In [None]:
listDf = []
for user in listUsersTwitter:
    time.sleep(20)
    user = user.replace('https://x.com/', '')
    user = re.sub(r'\?roistat_visit=\d+', '', user)
    for question in questions:
        query = generate_x_search_url(from_user=user, query=question,  type='top')
        print(f"Scraping this user: {user}\n Making this query: {query}")
        time.sleep(10)
        original_button_css = '#react-root > div > div > div.css-175oi2r.r-1f2l425.r-13qz1uu.r-417010.r-18u37iz > main > div > div > div > div > div > section > div > div > div:nth-child(20) > div.css-175oi2r.r-1adg3ll.r-1ny4l3l > div > button > div'
        tweetDfScrap = scrape_tweets_from_page(query, 
                                               max_scroll_attempts=2, 
                                               button_css_selector = original_button_css, 
                                               driver= driver, 
                                               max_results=10)
        tweetDfScrap['influencer_name'] = user 
        tweetDfScrap['twitter_query'] = question
        listDf.append(tweetDfScrap)



Scraping this user: cdixon
 Making this query: https://x.com/search?q=Crypto%20secured%20loan%20(from%3Acdixon)&src=typed_query
Attempting to scrape tweets from: https://x.com/search?q=Crypto%20secured%20loan%20(from%3Acdixon)&src=typed_query
Load attempt number: 1
Button not found or not clickable, attempting to scroll.
Found 6 new tweets.
Load attempt number: 2
Button not found or not clickable, attempting to scroll.
No new tweets found in this attempt (1/5).

Scraped 6 tweets in total.
Scraping this user: cdixon
 Making this query: https://x.com/search?q=Crypto%20backed%20loan%20(from%3Acdixon)&src=typed_query
Attempting to scrape tweets from: https://x.com/search?q=Crypto%20backed%20loan%20(from%3Acdixon)&src=typed_query
Load attempt number: 1
Button not found or not clickable, attempting to scroll.
Found 8 new tweets.
Load attempt number: 2
Button not found or not clickable, attempting to scroll.
No new tweets found in this attempt (1/5).

Scraped 8 tweets in total.
Scraping this 

In [None]:
listDf

[                                          tweet_text username  \
 0  I’m excited to announce that we’re leading the...  @cdixon   
 1  Excited to announce that we’re leading a seed ...  @cdixon   
 2  The House of Representatives made history toda...  @cdixon   
 3  The UK is on the right path for crypto regulat...  @cdixon   
 4  We’ve invested in a number of UK-based crypto ...  @cdixon   
 5  8/ Why crypto ? You could theoretically have d...  @cdixon   
 
                   timestamp             tweet_id  \
 0  2025-02-11T17:16:03.000Z  1889362777455927416   
 1  2023-09-21T15:56:21.000Z  1704887297680822574   
 2  2024-05-22T22:21:03.000Z  1793406750671974651   
 3  2023-06-11T23:05:10.000Z  1668031650028986368   
 4  2023-06-11T23:05:11.000Z  1668031654273622017   
 5  2021-08-22T23:26:39.000Z  1429585839147806723   
 
                                         original_url  \
 0  https://x.com/search?q=Crypto%20secured%20loan...   
 1  https://x.com/search?q=Crypto%20secured%20loa

In [None]:
dfTweetsInfluencers = pd.concat(listDf, ignore_index=True)
dfTweetsInfluencers = dfTweetsInfluencers.drop_duplicates(subset=['tweet_text'], 
                                                          keep='first')

In [None]:
dfTweetsInfluencers.shape

(130, 8)

In [None]:
dfTweetsInfluencers.to_csv('influencers_tweets_bitcoin_project.csv', 
                           index=False)

#### Extracción de respuestas a partir de tweets de influencers

In [None]:
testDf = dfTweetsInfluencers.iloc[0:1]
testDf



Unnamed: 0,tweet_text,username,timestamp,tweet_id,original_url,tweet_link,influencer_name,twitter_query
0,I’m excited to announce that we’re leading the...,@cdixon,2025-02-11T17:16:03.000Z,1889362777455927416,https://x.com/search?q=Crypto%20secured%20loan...,https://x.com/cdixon/status/1889362777455927416,cdixon,Crypto secured loan


In [None]:
## Filtro 
dfFilteredInfluencers = dfTweetsInfluencers[dfTweetsInfluencers['tweet_text'].str.contains('loan|backed|secured|backed loan|crypto loan', case=False, na=False)]
print(dfFilteredInfluencers.shape)

(21, 8)


In [None]:
generalTweetList = []
for index, row in dfFilteredInfluencers.iterrows():
#for index, row in testDf.iterrows():   
    time.sleep(35)
    print(row['tweet_link'])
    tweetLink = row['tweet_link']
    tweetDfScrap = scrape_tweets_from_page(tweetLink,
                                            max_scroll_attempts=2, 
                                            button_css_selector=original_button_css, 
                                            driver= driver, 
                                            max_results=10)
    tweetDfScrap['influencer_name'] = row['influencer_name']
    tweetDfScrap['twitter_query'] = row['twitter_query']
    tweetDfScrap['influencer_twitter_text'] = row['tweet_text']
    tweetDfScrap['influencer_tweet_link'] = row['tweet_link']
    tweetDfScrap['influencer_tweet_timestamp'] = row['timestamp']
    generalTweetList.append(tweetDfScrap)


https://x.com/balajis/status/1865901424238047549
Attempting to scrape tweets from: https://x.com/balajis/status/1865901424238047549
Load attempt number: 1
Button not found or not clickable, attempting to scroll.
Found 16 new tweets.
Reached maximum results limit of 10. Stopping scrape.
Reached maximum results limit of 10. Stopping scrape.
Reached maximum results limit of 10. Stopping scrape.

Scraped 10 tweets in total.
https://x.com/balajis/status/1642002430174056448
Attempting to scrape tweets from: https://x.com/balajis/status/1642002430174056448
Load attempt number: 1
Button not found or not clickable, attempting to scroll.
Found 14 new tweets.
Reached maximum results limit of 10. Stopping scrape.
Reached maximum results limit of 10. Stopping scrape.
Reached maximum results limit of 10. Stopping scrape.

Scraped 10 tweets in total.
https://x.com/ErikVoorhees/status/1701396004522643695
Attempting to scrape tweets from: https://x.com/ErikVoorhees/status/1701396004522643695
Load attem

In [None]:
dfTweetsComments = pd.concat(generalTweetList, ignore_index=True)
dfTweetsComments = dfTweetsComments.drop_duplicates(subset=['tweet_text'],
                                                    keep='first')


In [None]:
dfTweetsComments.shape

(175, 11)

In [None]:
dfTweetsComments

Unnamed: 0,tweet_text,username,timestamp,tweet_id,original_url,tweet_link,influencer_name,twitter_query,influencer_twitter_text,influencer_tweet_link,influencer_tweet_timestamp
0,CheckID eth lens is available @6529Collections...,@mintfaced,2024-12-08T23:50:26.000Z,1865906816988610824,https://x.com/balajis/status/1865901424238047549,https://x.com/mintfaced/status/186590681698861...,balajis,Crypto secured loan,Two perhaps obvious qualifiers:\n\n1) Bitfinex...,https://x.com/balajis/status/1865901424238047549,2024-12-08T23:29:00.000Z
1,P2P lending is great and a defining use case f...,@gootecks,2024-12-08T23:55:45.000Z,1865908153847836713,https://x.com/balajis/status/1865901424238047549,https://x.com/gootecks/status/1865908153847836713,balajis,Crypto secured loan,Two perhaps obvious qualifiers:\n\n1) Bitfinex...,https://x.com/balajis/status/1865901424238047549,2024-12-08T23:29:00.000Z
2,Nice try diddy.,@CardanoHumpback,2024-12-08T23:29:54.000Z,1865901646825525276,https://x.com/balajis/status/1865901424238047549,https://x.com/CardanoHumpback/status/186590164...,balajis,Crypto secured loan,Two perhaps obvious qualifiers:\n\n1) Bitfinex...,https://x.com/balajis/status/1865901424238047549,2024-12-08T23:29:00.000Z
3,doubly cool that centralized Fed-dictated lend...,@jusnothermfer,2024-12-09T02:58:08.000Z,1865954053391626665,https://x.com/balajis/status/1865901424238047549,https://x.com/jusnothermfer/status/18659540533...,balajis,Crypto secured loan,Two perhaps obvious qualifiers:\n\n1) Bitfinex...,https://x.com/balajis/status/1865901424238047549,2024-12-08T23:29:00.000Z
4,decentralized order book with time-based p2p l...,@useteller,2024-12-09T09:08:41.000Z,1866047303163007366,https://x.com/balajis/status/1865901424238047549,https://x.com/useteller/status/186604730316300...,balajis,Crypto secured loan,Two perhaps obvious qualifiers:\n\n1) Bitfinex...,https://x.com/balajis/status/1865901424238047549,2024-12-08T23:29:00.000Z
...,...,...,...,...,...,...,...,...,...,...,...
190,"When bitcoin is $5000 or lower, you’ll know th...",@askslim,2021-06-25T00:39:48.000Z,1408223366780231680,https://x.com/saylor/status/1408024745392680971,https://x.com/askslim/status/1408223366780231680,saylor,Loan with cryptos as a guarantee,"If you paid off your mortgage last year, you s...",https://x.com/saylor/status/1408024745392680971,2021-06-24T11:30:33.000Z
191,If you haven't spent your parents 401k and mov...,@WinfieldSmart,2021-06-24T21:40:16.000Z,1408178185691348992,https://x.com/saylor/status/1408024745392680971,https://x.com/WinfieldSmart/status/14081781856...,saylor,Loan with cryptos as a guarantee,"If you paid off your mortgage last year, you s...",https://x.com/saylor/status/1408024745392680971,2021-06-24T11:30:33.000Z
192,Anyone who thinks real estate should be owned ...,@olvelez007,2021-06-25T13:16:55.000Z,1408413899066650624,https://x.com/saylor/status/1408024745392680971,https://x.com/olvelez007/status/14084138990666...,saylor,Loan with cryptos as a guarantee,"If you paid off your mortgage last year, you s...",https://x.com/saylor/status/1408024745392680971,2021-06-24T11:30:33.000Z
193,"Also - If you bought Bitcoin a month ago, you’...",@AssetAngus,2021-06-24T11:32:16.000Z,1408025175166160896,https://x.com/saylor/status/1408024745392680971,https://x.com/AssetAngus/status/14080251751661...,saylor,Loan with cryptos as a guarantee,"If you paid off your mortgage last year, you s...",https://x.com/saylor/status/1408024745392680971,2021-06-24T11:30:33.000Z


In [None]:
dfTweetsComments = convertDfLanguage(dfTweetsComments, column="tweet_text", 
                                     api_key=None, target ='es', source = 'en')


In [None]:
desired_columns_tweets = ['tweet_id',  'tweet_text', 'tweet_text_es', 'username','timestamp',
                    'original_url', 'tweet_link', 'influencer_name', 'twitter_query',
                    'influencer_twitter_text', 'influencer_tweet_link',
                    'influencer_tweet_timestamp']

In [None]:
dfTweetsComments= dfTweetsComments[desired_columns_tweets]

In [None]:
dfTweetsComments['filter_to_llm'] = dfTweetsComments['tweet_text'].str.contains('loan|backed|secured', case=False, na=False)


In [None]:
dfTweetsComments.head()

Unnamed: 0,tweet_id,tweet_text,tweet_text_es,username,timestamp,original_url,tweet_link,influencer_name,twitter_query,influencer_twitter_text,influencer_tweet_link,influencer_tweet_timestamp,filter_to_llm
0,1865906816988610824,CheckID eth lens is available @6529Collections...,Checkid ETH Lens está disponible @6529Collecti...,@mintfaced,2024-12-08T23:50:26.000Z,https://x.com/balajis/status/1865901424238047549,https://x.com/mintfaced/status/186590681698861...,balajis,Crypto secured loan,Two perhaps obvious qualifiers:\n\n1) Bitfinex...,https://x.com/balajis/status/1865901424238047549,2024-12-08T23:29:00.000Z,False
1,1865908153847836713,P2P lending is great and a defining use case f...,P2P Lending es excelente y un caso de uso defi...,@gootecks,2024-12-08T23:55:45.000Z,https://x.com/balajis/status/1865901424238047549,https://x.com/gootecks/status/1865908153847836713,balajis,Crypto secured loan,Two perhaps obvious qualifiers:\n\n1) Bitfinex...,https://x.com/balajis/status/1865901424238047549,2024-12-08T23:29:00.000Z,False
2,1865901646825525276,Nice try diddy.,Buen intento Diddy.,@CardanoHumpback,2024-12-08T23:29:54.000Z,https://x.com/balajis/status/1865901424238047549,https://x.com/CardanoHumpback/status/186590164...,balajis,Crypto secured loan,Two perhaps obvious qualifiers:\n\n1) Bitfinex...,https://x.com/balajis/status/1865901424238047549,2024-12-08T23:29:00.000Z,False
3,1865954053391626665,doubly cool that centralized Fed-dictated lend...,¡Doblablemente genial que la tasa de préstamos...,@jusnothermfer,2024-12-09T02:58:08.000Z,https://x.com/balajis/status/1865901424238047549,https://x.com/jusnothermfer/status/18659540533...,balajis,Crypto secured loan,Two perhaps obvious qualifiers:\n\n1) Bitfinex...,https://x.com/balajis/status/1865901424238047549,2024-12-08T23:29:00.000Z,False
4,1866047303163007366,decentralized order book with time-based p2p l...,Libro de pedidos descentralizado con préstamos...,@useteller,2024-12-09T09:08:41.000Z,https://x.com/balajis/status/1865901424238047549,https://x.com/useteller/status/186604730316300...,balajis,Crypto secured loan,Two perhaps obvious qualifiers:\n\n1) Bitfinex...,https://x.com/balajis/status/1865901424238047549,2024-12-08T23:29:00.000Z,True


In [None]:
dfTweetsComments.to_csv('users_tweets_comments_bitcoin_project.csv', index=False)

### Yt extracción

#### Extracción de videos de influencers

In [None]:
#get_youtube_channel_username(youtubeChannelsList[1])
print(questions[0] + ' from ' + get_youtube_channel_username(youtubeChannelsList[1]))
#youtube_search(youtube, q= questions[0], maxResults=5, order='date')

Crypto secured loan from DiscoverCrypto_


In [None]:
listDfYoutubeVideo = []
for channel in youtubeChannelsList:
    channel = get_youtube_channel_username(channel)
    print(channel)
    for question in questions:
        print(question + ' from ' + channel)
        search_response = youtube_search(youtube, q= question + ' from ' + channel, maxResults=10,
                                          order='date')
        items = search_response.get('items')
        if len(items) == 0:
            print('No hay resultados para esta búsqueda')
            continue
        for item in items:
            video_id = item['id']['videoId']
            publishedAt = item['snippet']['publishedAt']
            video_url = f"https://www.youtube.com/watch?v={video_id}"
            video_title = item['snippet']['title']
            print(f"Video URL: {video_url}")
            dicttodf = {'video_url': video_url, 
                        'question': question, 'channel': channel, 'video_id': video_id, 'publishedAt': publishedAt,
                            'video_title': video_title}
            df = pd.DataFrame(dicttodf, index=[0])
            listDfYoutubeVideo.append(df)
        print(search_response)
        time.sleep(15)

CoinBureau
Crypto secured loan from CoinBureau
Video URL: https://www.youtube.com/watch?v=tLWx2upCdkU
Video URL: https://www.youtube.com/watch?v=zizUqOOxE_0
{'kind': 'youtube#searchListResponse', 'etag': 'i_KCfP1yFPNWC5LL2H636Cr4kDc', 'regionCode': 'MX', 'pageInfo': {'totalResults': 2030, 'resultsPerPage': 2}, 'items': [{'kind': 'youtube#searchResult', 'etag': '3kCsPobgFlZH2AlD-50UrFqlWqE', 'id': {'kind': 'youtube#video', 'videoId': 'tLWx2upCdkU'}, 'snippet': {'publishedAt': '2025-04-29T15:35:09Z', 'channelId': 'UCqK_GSMbpiV8spgD3ZGloSw', 'title': 'DeFi vs CeFi Lending: Key Trends You Need to Watch in Crypto&#39;s Comeback', 'description': "Do you find DeFi confusing? Don't worry, you're not alone. The truth is, Decentralised Finance can be confusing for many ...", 'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/tLWx2upCdkU/default.jpg', 'width': 120, 'height': 90}, 'medium': {'url': 'https://i.ytimg.com/vi/tLWx2upCdkU/mqdefault.jpg', 'width': 320, 'height': 180}, 'high': {'u

In [None]:
dfVideosInfluencers  = pd.concat(listDfYoutubeVideo, ignore_index=True)

In [None]:
dfVideosInfluencers.shape

(6, 6)

In [None]:
dfVideosInfluencers.to_csv('influencers_youtube_videos_bitcoin_project.csv', index=False)

#### Extracción de comentarios a partir de videos de influencers

In [None]:
listDf = []
#for index, row in dfVideosInfluencers.iloc[0:1].iterrows():
for index, row in dfVideosInfluencers.iterrows():
    print('Comenzando a extraer los datos de este video')
    print(row['video_url'])
    print(row['video_title'])
    print(row['publishedAt'])
    print(row['channel'])
    print(row['question'])
    print('-------------------')
    time.sleep(10)
    dfComments = get_youtube_comments(row['video_url'])
    dfComments['influener_channel_name'] = row['channel']
    dfComments['original_video_url'] = row['video_url']
    dfComments['original_youtube_question'] = row['question']
    dfComments['original_youtube_video_name'] = row['video_title']
    dfComments['original_date_published_video'] = row['publishedAt']
    print(dfComments.shape)
    listDf.append(dfComments)
    time.sleep(30)

Comenzando a extraer los datos de este video
https://www.youtube.com/watch?v=tLWx2upCdkU
DeFi vs CeFi Lending: Key Trends You Need to Watch in Crypto&#39;s Comeback
2025-04-29T15:35:09Z
CoinBureau
Crypto secured loan
-------------------
(176, 7)
Comenzando a extraer los datos de este video
https://www.youtube.com/watch?v=zizUqOOxE_0
Nexo Review: Complete Guide to Crypto Loans
2020-02-28T17:23:19Z
CoinBureau
Crypto secured loan
-------------------
(302, 7)
Comenzando a extraer los datos de este video
https://www.youtube.com/watch?v=tLWx2upCdkU
DeFi vs CeFi Lending: Key Trends You Need to Watch in Crypto&#39;s Comeback
2025-04-29T15:35:09Z
CoinBureau
Crypto backed loan
-------------------
(176, 7)
Comenzando a extraer los datos de este video
https://www.youtube.com/watch?v=zizUqOOxE_0
Nexo Review: Complete Guide to Crypto Loans
2020-02-28T17:23:19Z
CoinBureau
Crypto backed loan
-------------------
(302, 7)
Comenzando a extraer los datos de este video
https://www.youtube.com/watch?v=tLWx2

In [None]:
dfVideosComments = pd.concat(listDf, ignore_index=True)

In [None]:
dfVideosComments.shape

(1434, 7)

In [None]:
dfVideosComments = convertDfLanguage(dfVideosComments, column="comment", 
                                     api_key=None, target ='es', source = 'en')

In [None]:
dfVideosComments.columns

Index(['author', 'comment', 'influener_channel_name', 'original_video_url',
       'original_youtube_question', 'original_youtube_video_name',
       'original_date_published_video', 'original_language', 'comment_es'],
      dtype='object')

In [None]:
desired_columns_youtube = ['author', 'comment', 'comment_es', 
               'influener_channel_name', 'original_video_url', 'original_youtube_question', 
                'original_youtube_video_name', 'original_date_published_video']

In [None]:
dfVideosComments = dfVideosComments[desired_columns_youtube]

In [None]:
dfVideosComments.shape

(1434, 8)

In [None]:

dfVideosComments['filter_to_llm'] = dfVideosComments['comment'].str.contains('loan|backed|secured', case=False, na=False)


In [None]:
dfVideosComments.tail()

Unnamed: 0,author,comment,comment_es,influener_channel_name,original_video_url,original_youtube_question,original_youtube_video_name,original_date_published_video,filter_to_llm
1429,@pepecow235,Celsius review please.. I took every project u...,Celsius Review Por favor. Tomé todos los proye...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Loan with cryptos as a guarantee,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,False
1430,@RichardFletcher,Mimi piano is a scam don't even read this scam...,"Mimi Piano es una estafa, ni siquiera lea esta...",CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Loan with cryptos as a guarantee,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,False
1431,@chungrokim2867,IMO BlockFi or Celsius to be preferred,Imo Blockfi o Celsius a preferir,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Loan with cryptos as a guarantee,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,False
1432,@richardgreenwood1033,Celsius is far superior in terms of interest p...,Celsius es muy superior en términos de interés...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Loan with cryptos as a guarantee,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,True
1433,@giak7525,Nexo is my favorite centralized crypto bank. E...,Nexo es mi banco de cifrado centralizado favor...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Loan with cryptos as a guarantee,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,False


In [None]:
dfVideosComments.to_csv('users_youtube_comments_bitcoin_project.csv', 
                        index=False)

### Usando un LLM para responder la pregunta de hipótesis

#### Pequeño test 

In [None]:
testlm = dfVideosComments[dfVideosComments['filter_to_llm']].head(2)

In [None]:
testlm

Unnamed: 0,author,comment,comment_es,influener_channel_name,original_video_url,original_youtube_question,original_youtube_video_name,original_date_published_video,filter_to_llm
18,@oohwha,"TL; DR: ""It turns out, about 99% of humans al...","Tl; DR: ""Resulta que alrededor del 99% de los ...",CoinBureau,https://www.youtube.com/watch?v=tLWx2upCdkU,Crypto secured loan,DeFi vs CeFi Lending: Key Trends You Need to W...,2025-04-29T15:35:09Z,True
29,@donking3316,$Loan protocol &XPR and Metal blockchain by M...,$ Protocolo de préstamo y XPR y Metal Blockcha...,CoinBureau,https://www.youtube.com/watch?v=tLWx2upCdkU,Crypto secured loan,DeFi vs CeFi Lending: Key Trends You Need to W...,2025-04-29T15:35:09Z,True


In [None]:
prompt = "En este texto, debemos responder la siguiente pregunta: ¿Qué es lo que el cliente se pregunta sobre nuestro producto (crypto secured loan) que lo convence a usarlo?, si existe relación, responde con una categoría de no más de 5 palabras que responda la pregunta, de lo contrario responde solamente 'No tengo respuesta específica', gracias"

testlm['llm_answer'] = testlm.apply(lambda x: askGemini(prompt, x['comment'], api_key=gemini_api_key) if x['filter_to_llm']==True else 'No tengo respuesta específica', axis=1)


In [None]:
testlm.head()

Unnamed: 0,author,comment,comment_es,influener_channel_name,original_video_url,original_youtube_question,original_youtube_video_name,original_date_published_video,filter_to_llm,llm_answer
18,@oohwha,"TL; DR: ""It turns out, about 99% of humans al...","Tl; DR: ""Resulta que alrededor del 99% de los ...",CoinBureau,https://www.youtube.com/watch?v=tLWx2upCdkU,Crypto secured loan,DeFi vs CeFi Lending: Key Trends You Need to W...,2025-04-29T15:35:09Z,True,Reducción de costos\n
29,@donking3316,$Loan protocol &XPR and Metal blockchain by M...,$ Protocolo de préstamo y XPR y Metal Blockcha...,CoinBureau,https://www.youtube.com/watch?v=tLWx2upCdkU,Crypto secured loan,DeFi vs CeFi Lending: Key Trends You Need to W...,2025-04-29T15:35:09Z,True,No tengo respuesta específica\n


#### Obteniendo todas las respuestas si aplica

In [None]:
prompt = """
En este texto, debemos responder la siguiente pregunta: 
¿Qué es lo que el cliente se pregunta sobre nuestro producto (crypto secured loan) que lo convence a usarlo?, 
si existe relación, responde con una categoría de no más de 5 palabras que responda la pregunta, 
de lo contrario responde solamente 'No tengo respuesta específica', gracias
"""


In [None]:
dfTweetsComments['llm_answer'] = dfTweetsComments.apply(lambda x: askGemini(prompt, x['tweet_text'], 
                                                                            api_key=gemini_api_key) if x['filter_to_llm']==True else 
                                                                            'No tengo respuesta específica', axis=1)


In [None]:
#dfTweetsComments[~dfTweetsComments['llm_answer'].isin(['No tengo respuesta específica', '"No tengo respuesta específica"'])]
dfTweetsComments[~dfTweetsComments['llm_answer'].str.contains('No tengo respuesta', regex=False, case=False)]

Unnamed: 0,tweet_id,tweet_text,tweet_text_es,username,timestamp,original_url,tweet_link,influencer_name,twitter_query,influencer_twitter_text,influencer_tweet_link,influencer_tweet_timestamp,filter_to_llm,llm_answer
41,1012529181056536576,Wonder how many folks are getting margin calls...,¿Me pregunto cuántas personas reciben llamadas...,@BlackmonTrader,2018-06-29T02:52:23.000Z,https://x.com/ErikVoorhees/status/101252380285...,https://x.com/BlackmonTrader/status/1012529181...,ErikVoorhees,Crypto secured loan,WSJ article on SALT Lending (get fiat loans ba...,https://x.com/ErikVoorhees/status/101252380285...,2018-06-29T02:31:01.000Z,True,Riesgo de llamadas de margen\n
124,1222881166207918083,If I lock some ETH in a CDP and get DAI to buy...,Si bloqueo un poco de ETH en un CDP y hago que...,@davidiach,2020-01-30T13:56:04.000Z,https://x.com/VitalikButerin/status/1222690397...,https://x.com/davidiach/status/122288116620791...,VitalikButerin,Crypto secured loan,"I know I'm in the minority on this issue, but ...",https://x.com/VitalikButerin/status/1222690397...,2020-01-30T01:18:02.000Z,True,Seguridad y acceso a liquidez\n


In [None]:
dfVideosComments['llm_answer'] = dfVideosComments.apply(lambda x: askGemini(prompt, x['comment'], api_key=gemini_api_key) if x['filter_to_llm']==True else 'No tengo respuesta específica', 
                                                        axis=1)


In [None]:
dfVideosComments[~dfVideosComments['llm_answer'].str.contains('No tengo respuesta',  regex=False, case=False)]

Unnamed: 0,author,comment,comment_es,influener_channel_name,original_video_url,original_youtube_question,original_youtube_video_name,original_date_published_video,filter_to_llm,llm_answer
18,@oohwha,"TL; DR: ""It turns out, about 99% of humans al...","Tl; DR: ""Resulta que alrededor del 99% de los ...",CoinBureau,https://www.youtube.com/watch?v=tLWx2upCdkU,Crypto secured loan,DeFi vs CeFi Lending: Key Trends You Need to W...,2025-04-29T15:35:09Z,True,Reducción de costos\n
106,@oohwha,"TL; DR: ""It turns out, about 99% of humans al...","Tl; DR: ""Resulta que alrededor del 99% de los ...",CoinBureau,https://www.youtube.com/watch?v=tLWx2upCdkU,Crypto secured loan,DeFi vs CeFi Lending: Key Trends You Need to W...,2025-04-29T15:35:09Z,True,Reducción de costos\n
211,@trentd4988,Just took out a loan against my Cardano. Didnt...,Acabo de tomar un préstamo contra mi Cardano. ...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Crypto secured loan,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,True,Rapidez y facilidad de uso\n
213,@Baruksinho,Excellent video.\r\nI only had one doubt. The ...,Excelente video.\r\nSolo tenía una duda. ¿El c...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Crypto secured loan,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,True,Generación de interés en garantía\n
215,@DiyEcoProjects,Hi there ~ Just wondering. Can you get a loan ...,Hola ~ solo preguntándome. ¿Puede obtener un p...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Crypto secured loan,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,True,Préstamo cripto sin vender activos\n
231,@Rad_B_OLand,You didn’t cover how difficult it is to get Ne...,No cubriste lo difícil que es conseguir tokens...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Crypto secured loan,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,True,Acceso a tasas de interés bajas\n
243,@sheldonred1,does anything happen to your loan if you borro...,¿Le sucede algo a su préstamo si pide prestado...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Crypto secured loan,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,True,Riesgo de caída de precio ETH\n
311,@trentd4988,Just took out a loan against my Cardano. Didnt...,Acabo de tomar un préstamo contra mi Cardano. ...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Crypto secured loan,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,True,Rapidez y facilidad de uso\n
313,@Baruksinho,Excellent video.\r\nI only had one doubt. The ...,Excelente video.\r\nSolo tenía una duda. ¿El c...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Crypto secured loan,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,True,Generación de intereses en garantía\n
315,@DiyEcoProjects,Hi there ~ Just wondering. Can you get a loan ...,Hola ~ solo preguntándome. ¿Puede obtener un p...,CoinBureau,https://www.youtube.com/watch?v=zizUqOOxE_0,Crypto secured loan,Nexo Review: Complete Guide to Crypto Loans,2020-02-28T17:23:19Z,True,"Préstamo cripto, impuestos, libre de impuestos\n"


In [None]:
list1 = dfTweetsComments[~dfTweetsComments['llm_answer'].str.contains('No tengo respuesta',  regex=False, case=False)]['llm_answer'].unique()   
list2 = dfVideosComments[~dfVideosComments['llm_answer'].str.contains('No tengo respuesta',  regex=False, case=False)]['llm_answer'].unique()   

pd.DataFrame({'original_category': np.concatenate((list1, list2), axis=0)}).to_csv('categories_bitcoin_project.csv', index=False)

In [None]:
dfTweetsComments.to_csv('users_tweets_comments_bitcoin_project_withllm_answer.csv', index=False)

In [None]:
dfVideosComments.to_csv('users_youtube_comments_bitcoin_project_withllm_answer.csv', index=False)

### Análisis rápido

In [None]:
import pandas as pd 
import numpy as np
dfTweetsComments = pd.read_csv('users_tweets_comments_bitcoin_project_withllm_answer.csv')

dfVideosComments = pd.read_csv('users_youtube_comments_bitcoin_project_withllm_answer.csv')
comments_edited = pd.read_csv('categories_bitcoin_project_edited.csv')


In [None]:
list1 = dfTweetsComments[~dfTweetsComments['llm_answer'].str.contains('No tengo respuesta',  regex=False, case=False)]['llm_answer'].unique()   
list2 = dfVideosComments[~dfVideosComments['llm_answer'].str.contains('No tengo respuesta',  regex=False, case=False)]['llm_answer'].unique()   

pd.DataFrame({'original_category': np.concatenate((list1, list2), axis=0)}).to_csv('categories_bitcoin_project.csv', index=False)

In [None]:
print(dfTweetsComments.columns)
print(dfVideosComments.columns)

Index(['tweet_id', 'tweet_text', 'tweet_text_es', 'username', 'timestamp',
       'original_url', 'tweet_link', 'influencer_name', 'twitter_query',
       'influencer_twitter_text', 'influencer_tweet_link',
       'influencer_tweet_timestamp', 'filter_to_llm', 'llm_answer'],
      dtype='object')
Index(['author', 'comment', 'comment_es', 'influener_channel_name',
       'original_video_url', 'original_youtube_question',
       'original_youtube_video_name', 'original_date_published_video',
       'filter_to_llm', 'llm_answer'],
      dtype='object')


In [None]:
dfTweetsSelected = dfTweetsComments[['tweet_text', 'tweet_text_es', 'twitter_query', 'timestamp',
                  'tweet_link', 'influencer_name', 'filter_to_llm', 'llm_answer' ]]

dfVideosSelected = dfVideosComments[['comment', 'comment_es', 'original_youtube_question', 'original_date_published_video',
                  'original_video_url', 'influener_channel_name', 'filter_to_llm', 'llm_answer' ]]

new_columns = ['comment_or_tweet_text', 'comment_or_tweet_text_es', 'original_youtube_question_or_twitter_query',  
           'timestamp', 'tweet_link_or_original_video_url', 'influencer_name_or_channel_name',
           'filter_to_llm', 'llm_answer', 'social_media_type']

dfTweetsSelected['social_media_type'] = 'twitter'
dfVideosSelected['social_media_type'] = 'youtube'
print(dfTweetsSelected.head(2))
print(dfVideosSelected.head(2))


                                          tweet_text  \
0  CheckID eth lens is available @6529Collections...   
1  P2P lending is great and a defining use case f...   

                                       tweet_text_es        twitter_query  \
0  Checkid ETH Lens está disponible @6529Collecti...  Crypto secured loan   
1  P2P Lending es excelente y un caso de uso defi...  Crypto secured loan   

                  timestamp  \
0  2024-12-08T23:50:26.000Z   
1  2024-12-08T23:55:45.000Z   

                                          tweet_link influencer_name  \
0  https://x.com/mintfaced/status/186590681698861...         balajis   
1  https://x.com/gootecks/status/1865908153847836713         balajis   

   filter_to_llm                     llm_answer social_media_type  
0          False  No tengo respuesta específica           twitter  
1          False  No tengo respuesta específica           twitter  
                                             comment  \
0  🤑 EXCLUSIVE Deals (Bonuse

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfTweetsSelected['social_media_type'] = 'twitter'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfVideosSelected['social_media_type'] = 'youtube'


In [None]:
dfTweetsSelected.columns = new_columns
dfVideosSelected.columns = new_columns

dfSocialMedia = pd.concat([dfTweetsSelected, dfVideosSelected], ignore_index=True)
print(dfSocialMedia.shape)
dfSocialMedia = dfSocialMedia[~dfSocialMedia['llm_answer'].str.contains('No tengo respuesta',  regex=False, case=False)]
print(dfSocialMedia.shape)


(1609, 9)
(48, 9)


In [None]:
dfSocialMedia['llm_answer_clean'] = dfSocialMedia['llm_answer'].astype(str).str.strip().str.lower()
comments_edited['original_category_clean'] = comments_edited['original_category'].astype(str).str.strip().str.lower()


In [None]:
comments_edited.head(10)

Unnamed: 0,original_category,simplified_category,plain_language_category
0,Riesgo de llamadas de margen,Riesgos asociados,Posibles pérdidas
1,Seguridad y acceso a liquidez,Beneficios financieros,Ventajas económicas
2,Reducción de costos,Beneficios financieros,Ventajas económicas
3,Rapidez y facilidad de uso,Facilidad de uso,Uso sencillo y rápido
4,Generación de interés en garantía,Beneficios financieros,Ganar intereses
5,Préstamo cripto sin vender activos,Beneficios financieros,Obtener dinero sin vender tus criptos
6,Acceso a tasas de interés bajas,Beneficios financieros,Tasas bajas
7,Riesgo de caída de precio ETH,Riesgos asociados,Posibles pérdidas
8,Generación de intereses en garantía,Beneficios financieros,Ganar intereses
9,"Préstamo cripto, impuestos, libre de impuestos",Beneficios financieros,Beneficios fiscales


In [None]:
#pd.merge(dfSocialMedia, comments_edited, how='left', left_on='llm_answer', right_on='original_category', 
#         suffixes=('', '_edited'))
dfSocialMedia = pd.merge(dfSocialMedia, comments_edited, how='left', left_on='llm_answer_clean', right_on='original_category_clean')

In [None]:
dfSocialMedia.to_csv('all_comments_tweets_yt_bitcoin_project.csv', index=False) 

## Análisis principal para Divisas

In [115]:
import pandas as pd 
import numpy as np
import re 



In [116]:
mainPath = 'E:/Users/1167486/Local/proyectos/divisas/Scraper_divisas'

In [117]:
xLinks = pd.read_csv(mainPath + '/Links_x_to_scrape.csv')

In [118]:
test = xLinks.iloc[0:2]
test

Unnamed: 0,link,app,social_media
0,https://x.com/RevolutApp/status/19153142986219...,Revolut,X
1,https://x.com/RevolutApp/status/19243908275869...,Revolut,X


In [138]:
import random
random.seed(42)



In [142]:
times = [20, 30, 35, 15, 25, 7, 10, 12, 5, 8, 18, 22, 28, 40]

In [151]:
listDf = []
#for index, row in test.iterrows():
for index, row in xLinks.iterrows():
    print(f'index: {index + 1}')
    print(row['link'])
    print(row['app'])
    x = random.sample(times, len(times))
    print(f'random time: {x[0]}')
    print('-------------------')
    time.sleep(x[0])
    if row['app']!='Wise':
        maxResults= 30
        scroll_attempts = 15
    else: 
        maxResults= 20
        scroll_attempts = 10 
    #print(f'Results required: {scroll_attempts}')
    #print(f'max scroll attempts: {scroll_attempts}')
    
    tweetDfScrap = scrape_tweets_from_page(row['link'], 
                                               max_scroll_attempts=scroll_attempts, 
                                               button_css_selector = original_button_css, 
                                               driver= driver, 
                                               max_results=maxResults)
    tweetDfScrap['app'] = row['app']
    tweetDfScrap['original_tweet_link'] = row['link']
    tweetDfScrap['original_social_media'] = row['social_media']
    listDf.append(tweetDfScrap)

index: 1
https://x.com/RevolutApp/status/1915314298621935998
Revolut
random time: 5
-------------------
Attempting to scrape tweets from: https://x.com/RevolutApp/status/1915314298621935998
Load attempt number: 1
Button not found or not clickable, attempting to scroll.
Found 20 new tweets.
Load attempt number: 2
Button not found or not clickable, attempting to scroll.
No new tweets found in this attempt (1/5).
Load attempt number: 3
Button not found or not clickable, attempting to scroll.
No new tweets found in this attempt (2/5).
Load attempt number: 4
Button not found or not clickable, attempting to scroll.
No new tweets found in this attempt (3/5).
Load attempt number: 5
Button not found or not clickable, attempting to scroll.
No new tweets found in this attempt (4/5).
Load attempt number: 6
Button not found or not clickable, attempting to scroll.
No new tweets found in this attempt (5/5).
Reached tweet loading limit or end of content.

Scraped 20 tweets in total.
index: 2
https://x

In [152]:
tweetsDivisas = pd.concat(listDf, ignore_index=True)

In [153]:
newFilePath = mainPath + '/x_scraped_data_divisas.csv'

In [156]:
tweetsDivisas.shape

(610, 9)

In [154]:
tweetsDivisas.head(20)

Unnamed: 0,tweet_text,username,timestamp,tweet_id,original_url,tweet_link,app,original_tweet_link,original_social_media
0,BBC News - Revolut: 'I was careful and followe...,@tawnyowlhillin1,2025-04-24T08:04:43.000Z,1915315958081523715,https://x.com/RevolutApp/status/19153142986219...,https://x.com/tawnyowlhillin1/status/191531595...,Revolut,https://x.com/RevolutApp/status/19153142986219...,X
1,With banking licence or like crypto payment sy...,@denisPashchenko,2025-04-24T08:36:05.000Z,1915323851728179666,https://x.com/RevolutApp/status/19153142986219...,https://x.com/denisPashchenko/status/191532385...,Revolut,https://x.com/RevolutApp/status/19153142986219...,X
2,WTF is happening with your app in Spain? I can...,@TimChaney598055,2025-05-11T08:40:46.000Z,1921485622251811233,https://x.com/RevolutApp/status/19153142986219...,https://x.com/TimChaney598055/status/192148562...,Revolut,https://x.com/RevolutApp/status/19153142986219...,X
3,"Hello, I am trying to raise a complaint, but t...",@rach3kids,2025-05-02T08:30:10.000Z,1918221465280827756,https://x.com/RevolutApp/status/19153142986219...,https://x.com/rach3kids/status/191822146528082...,Revolut,https://x.com/RevolutApp/status/19153142986219...,X
4,Tengo mi cuenta bloqueada desde hace días sin ...,@GameofLife_ok,2025-04-25T16:01:52.000Z,1915798422700843105,https://x.com/RevolutApp/status/19153142986219...,https://x.com/GameofLife_ok/status/19157984227...,Revolut,https://x.com/RevolutApp/status/19153142986219...,X
5,I use Revolut that is convenient for day to da...,@BaroinEdouard,2025-05-08T16:32:57.000Z,1920517287406117102,https://x.com/RevolutApp/status/19153142986219...,https://x.com/BaroinEdouard/status/19205172874...,Revolut,https://x.com/RevolutApp/status/19153142986219...,X
6,Und Sie werden mein Revolut-Konto nicht genehm...,@evrenuzayaldiz,2025-04-24T13:48:22.000Z,1915402439533928885,https://x.com/RevolutApp/status/19153142986219...,https://x.com/evrenuzayaldiz/status/1915402439...,Revolut,https://x.com/RevolutApp/status/19153142986219...,X
7,Que tenéis que decir de la queja de un cliente...,@CriptosTW,2025-05-09T15:21:30.000Z,1920861695464849459,https://x.com/RevolutApp/status/19153142986219...,https://x.com/CriptosTW/status/192086169546484...,Revolut,https://x.com/RevolutApp/status/19153142986219...,X
8,Are you insolvent? Have tried to transfer mone...,@Maximusprofitas,2025-05-02T08:43:35.000Z,1918224839841640798,https://x.com/RevolutApp/status/19153142986219...,https://x.com/Maximusprofitas/status/191822483...,Revolut,https://x.com/RevolutApp/status/19153142986219...,X
9,I need more information about a certain transa...,@wilswils456,2025-04-26T03:12:37.000Z,1915967221337071910,https://x.com/RevolutApp/status/19153142986219...,https://x.com/wilswils456/status/1915967221337...,Revolut,https://x.com/RevolutApp/status/19153142986219...,X


In [155]:
tweetsDivisas.to_csv(newFilePath, index=False)

## Datos principales para DEMO

## Code not used anymore

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support   import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import re
from bs4 import BeautifulSoup

In [None]:
## Ejecutar esto siempre al testear
link = tweet_test
driver.get(link)

In [None]:
#'<button role="button" class="css-175oi2r r-1777fci r-1pl7oy7 r-13qz1uu r-1loqt21 r-o7ynqc r-6416eg r-1ny4l3l" type="button"><div class="css-175oi2r r-16y2uox r-1wbh5a2 r-1777fci"><div dir="ltr" class="css-146c3p1 r-bcqeeo r-qvutc0 r-37j5jr r-q4m81j r-a023e6 r-rjixqe r-16dba41" style="color: rgb(29, 155, 240);"><span class="css-1jxf684 r-bcqeeo r-1ttztb7 r-qvutc0 r-poiln3">Show probable spam</span></div></div></button>'
#'#react-root > div > div > div.css-175oi2r.r-1f2l425.r-13qz1uu.r-417010.r-18u37iz > main > div > div > div > div > div > section > div > div > div:nth-child(18) > div.css-175oi2r.r-1adg3ll.r-1ny4l3l > div > button'

In [None]:
boton_xpath = '//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/section/div/div/div[12]/div[1]/div/button/div/div/span'
boton_xpath2 = '/html/body/div[1]/div/div/div[2]/main/div/div/div/div/div/section/div/div/div[22]/div[1]/div/button/div'
#'//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/section/div/div/div[20]/div[1]/div/button/div'
#'//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/section/div/div/div[20]/div[1]/div/button/div/div'
'/html/body/div[1]/div/div/div[2]/main/div/div/div/div/div/section/div/div/div[20]/div[1]/div/button/div/div/span'
boton_cssselector = '#react-root > div > div > div.css-175oi2r.r-1f2l425.r-13qz1uu.r-417010.r-18u37iz > main > div > div > div > div > div > section > div > div > div:nth-child(20) > div.css-175oi2r.r-1adg3ll.r-1ny4l3l > div > button > div > div > span'
boton_cssselector = '#react-root > div > div > div.css-175oi2r.r-1f2l425.r-13qz1uu.r-417010.r-18u37iz > main > div > div > div > div > div > section > div > div > div:nth-child(20) > div.css-175oi2r.r-1adg3ll.r-1ny4l3l > div > button > div'


In [None]:
#By.CSS_SELECTOR

In [None]:
'''
for i in range(12):
    print(f'Intento número: {i + 1}')
    try:
        #boton_mostrar_mas = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, boton_xpath)))
        #boton_mostrar_mas = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, boton_xpath2)))    
        boton_mostrar_mas = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, boton_cssselector)))    
        ### Una vez que se encuentra el botón, se hace clic en él
        boton_mostrar_mas.click()
        print('Encontró el botón "Probable Spam"')
        time.sleep(3)  # Espera a que se carguen más tweets después de hacer clic
    except:
        print('Scroll Normal')
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(3)
'''

'\nfor i in range(12):\n    print(f\'Intento número: {i + 1}\')\n    try:\n        #boton_mostrar_mas = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, boton_xpath)))\n        #boton_mostrar_mas = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, boton_xpath2)))    \n        boton_mostrar_mas = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, boton_cssselector)))    \n        ### Una vez que se encuentra el botón, se hace clic en él\n        boton_mostrar_mas.click()\n        print(\'Encontró el botón "Probable Spam"\')\n        time.sleep(3)  # Espera a que se carguen más tweets después de hacer clic\n    except:\n        print(\'Scroll Normal\')\n        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")\n        time.sleep(3)\n'

In [None]:
##//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/section/div/div/div[12]/div[1]/div/button/div/div/span

In [None]:
boton_xpath = '//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/section/div/div/div[12]/div[1]/div/button/div/div/span'
boton_cssselector = '#react-root > div > div > div.css-175oi2r.r-1f2l425.r-13qz1uu.r-417010.r-18u37iz > main > div > div > div > div > div > section > div > div > div:nth-child(20) > div.css-175oi2r.r-1adg3ll.r-1ny4l3l > div > button > div'
#'#react-root > div > div > div.css-175oi2r.r-1f2l425.r-13qz1uu.r-417010.r-18u37iz > main > div > div > div > div > div > section > div > div > div:nth-child(18) > div.css-175oi2r.r-1adg3ll.r-1ny4l3l > div > button > div > div > span'
### Tweets Counter 

try:
    link = tweet_test
    driver.get(link)
    all_tweets_data = []  # Lista para almacenar los datos de cada tweet
    max_scroll_attempts = 20  # Número máximo de intentos de scroll/clic
    scroll_attempt = 0
    previous_tweet_count = 0
    no_new_tweets_threshold = 20  # Si no se encuentran nuevos tweets después de N intentos, detener
    no_new_tweets_counter=0
    while scroll_attempt < max_scroll_attempts:
        scroll_attempt += 1
        print(f'Intento de carga número: {scroll_attempt}')

        try:
            # Intenta encontrar y hacer clic en el botón
            boton_mostrar_mas = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, boton_cssselector))
            )
            print('Encontró el botón "Probable Spam"')  
            boton_mostrar_mas.click()
            time.sleep(4)
        except:
            # Si el botón no se encuentra o no es clickable, intenta hacer scroll
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            print('Haciendo scroll normal.')  

            time.sleep(4)

        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')
        #tweet_text_elements = soup.find_all('div', {'data-testid': 'tweetText'})
        tweet_text_elements = soup.find_all('div', {'data-testid': 'tweetText'})
        #tweet_text_elements = soup.find_all('div', {'data-testid': 'tweet'})
         
        current_tweet_count = len(tweet_text_elements)

        if current_tweet_count > previous_tweet_count:
            print(f"Se encontraron {current_tweet_count - previous_tweet_count} nuevos tweets.")
            for tweet_element in tweet_text_elements[previous_tweet_count:]:
                tweet_text = tweet_element.text.strip().replace("\n", "")
                all_tweets_data.append({'Texto del Tweet': tweet_text})
            previous_tweet_count = current_tweet_count
            no_new_tweets_counter = 0  
        else:
            no_new_tweets_counter += 1
            print(f"No se encontraron nuevos tweets en este intento ({no_new_tweets_counter}/{no_new_tweets_threshold}).")
            if no_new_tweets_counter >= no_new_tweets_threshold:
                print("Parece que se ha llegado al final de los tweets cargados.")
                break

    # Crea el DataFrame de pandas a partir de la lista de diccionarios
    df_tweets = pd.DataFrame(all_tweets_data)

    # Imprime información sobre el DataFrame resultante
    print(f"\nSe descargaron {len(df_tweets)} tweets.")
    # print(df_tweets) # Descomentar para ver el DataFrame completo

except Exception as e:
    print(f"Ocurrió un error: {e}")
finally:
    driver.quit()

Intento de carga número: 1
Haciendo scroll normal.
Se encontraron 13 nuevos tweets.
Intento de carga número: 2
Haciendo scroll normal.
Se encontraron 5 nuevos tweets.
Intento de carga número: 3
Haciendo scroll normal.
Se encontraron 1 nuevos tweets.
Intento de carga número: 4
Haciendo scroll normal.
Se encontraron 1 nuevos tweets.
Intento de carga número: 5
Haciendo scroll normal.
No se encontraron nuevos tweets en este intento (1/20).
Intento de carga número: 6
Haciendo scroll normal.
Se encontraron 5 nuevos tweets.
Intento de carga número: 7
Haciendo scroll normal.
No se encontraron nuevos tweets en este intento (1/20).
Intento de carga número: 8
Haciendo scroll normal.
No se encontraron nuevos tweets en este intento (2/20).
Intento de carga número: 9
Haciendo scroll normal.
No se encontraron nuevos tweets en este intento (3/20).
Intento de carga número: 10
Haciendo scroll normal.
No se encontraron nuevos tweets en este intento (4/20).
Intento de carga número: 11
Haciendo scroll norm

In [None]:
'''
try:
    link = tweet_test
    driver.get(link)
    all_tweets_data = []  # Lista para almacenar los datos de cada tweet
    num_scrolls = 90  # Número máximo de scrolls
    number = 0
    for _ in range(num_scrolls):
        number += 1
        print(f'Número de scrolls: {number}')
        # Espera a que al menos un nuevo elemento 'tweetText' esté presente
        WebDriverWait(driver, 45).until(
            EC.presence_of_element_located((By.XPATH, '//div[@data-testid="tweetText"]'))
        )

        # Intenta encontrar y hacer clic en el botón
        try:
            boton_mostrar_mas = WebDriverWait(driver, 10).until(
                #EC.element_to_be_clickable((By.XPATH, boton_xpath))
                EC.element_to_be_clickable((By.CSS_SELECTOR, boton_cssselector))                
            )
            boton_mostrar_mas.click()
            time.sleep(3)  # Espera a que se carguen más tweets después de hacer clic
        except:
            # Si el botón no se encuentra o no es clickable, continúa con el scroll
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(3)

        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')
        tweet_text_elements = soup.find_all('div', {'data-testid': 'tweetText'})

        for tweet_element in tweet_text_elements:
            tweet_text = tweet_element.text.strip().replace("\n", "")
            all_tweets_data.append({'Texto del Tweet': tweet_text})

    # Crea el DataFrame de pandas a partir de la lista de diccionarios
    df_tweets = pd.DataFrame(all_tweets_data)

    # Imprime el DataFrame (opcional)
    #print(df_tweets)

except Exception as e:
    print(f"Ocurrió un error: {e}")
finally:
    driver.quit()
'''

Número de scrolls: 1
Número de scrolls: 2
Número de scrolls: 3
Número de scrolls: 4
Número de scrolls: 5
Número de scrolls: 6
Número de scrolls: 7
Número de scrolls: 8
Número de scrolls: 9
Número de scrolls: 10
Número de scrolls: 11
Número de scrolls: 12
Número de scrolls: 13
Número de scrolls: 14
Número de scrolls: 15
Número de scrolls: 16
Número de scrolls: 17
Número de scrolls: 18
Número de scrolls: 19
Número de scrolls: 20
Número de scrolls: 21
Número de scrolls: 22
Número de scrolls: 23
Número de scrolls: 24
Número de scrolls: 25
Número de scrolls: 26
Número de scrolls: 27
Número de scrolls: 28
Número de scrolls: 29
Número de scrolls: 30
Número de scrolls: 31
Número de scrolls: 32
Número de scrolls: 33
Número de scrolls: 34
Número de scrolls: 35
Número de scrolls: 36
Número de scrolls: 37
Número de scrolls: 38
Número de scrolls: 39
Número de scrolls: 40
Número de scrolls: 41
Número de scrolls: 42
Número de scrolls: 43
Número de scrolls: 44
Número de scrolls: 45
Número de scrolls: 

In [None]:
'''
try:
    link = tweet_test
    driver.get(link)
    all_tweets_data = []  # Lista para almacenar los datos de cada tweet

    for _ in range(50):  # Ajusta el número de scrolls según sea necesario
        # Espera a que al menos un nuevo elemento 'tweetText' esté presente
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//div[@data-testid="tweetText"]'))
        )
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')
        tweet_text_elements = soup.find_all('div', {'data-testid': 'tweetText'})

        for tweet_element in tweet_text_elements:
            tweet_text = tweet_element.text.strip().replace("\n", "")
            all_tweets_data.append({'Texto del Tweet': tweet_text})

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(3) # Un pequeño tiempo de espera después del scroll

    # Crea el DataFrame de pandas a partir de la lista de diccionarios
    df_tweets = pd.DataFrame(all_tweets_data)

    # Imprime el DataFrame (opcional)
    #print(df_tweets)

except Exception as e:
    print(f"Ocurrió un error: {e}")
finally:
    driver.quit()
'''

In [None]:
df_tweets.shape

(27, 1)

In [None]:
df_tweets.tail()

Unnamed: 0,Texto del Tweet
22,"Lo que usted Tio me quiera regalar, para pagar..."
23,"Neither gold nor Bitcoin is inherently ""better..."
24,Oro
25,Bitcoin y tu?
26,No pongas tu mundo de cabeza. Con la app de in...


In [None]:
df_tweets.to_csv('tweets2.csv', index=False, encoding='utf-8-sig')

In [None]:
'''
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll hasta el fin 
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")

    # Espera a que la página se cargue 
    time.sleep(4)


    # Recalcula la altura de la página
    new_height = driver.execute_script("return document.body.scrollHeight")

    # Si ya no aumenta la altura, se termina el bucle
    if new_height == last_height:
        break
    last_height = new_height
'''

'\nlast_height = driver.execute_script("return document.body.scrollHeight")\nwhile True:\n    # Scroll hasta el fin \n    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")\n\n    # Espera a que la página se cargue \n    time.sleep(4)\n\n\n    # Recalcula la altura de la página\n    new_height = driver.execute_script("return document.body.scrollHeight")\n\n    # Si ya no aumenta la altura, se termina el bucle\n    if new_height == last_height:\n        break\n    last_height = new_height\n'

In [None]:
from bs4 import BeautifulSoup


In [None]:
'''
try:
    link = tweet_test
    driver.get(link)
    tweet_text_elements = []
    # Scroll 5 times to load more tweets
    for _ in range(45):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(5) 
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')
        # Find all elements with 'tweetText'
        tweet_text_elements.extend(soup.find_all('div', {'data-testid': 'tweetText'}))
    # first 10 tweet
    for i, tweet_text_element in enumerate(tweet_text_elements):
        tweet_text = tweet_text_element.text.strip().replace("\n", "")
        print(f"Tweet {i+1}: {tweet_text}")
finally:
    driver.quit()
'''

Tweet 1: ¿Que prefieren #Bitcoin u oro?
Tweet 2: ¿Que prefieren #Bitcoin u oro?
Tweet 3: ¿Que prefieren #Bitcoin u oro?
Tweet 4: ¿Que prefieren #Bitcoin u oro?
Tweet 5: ¿Que prefieren #Bitcoin u oro?
Tweet 6: ¿Que prefieren #Bitcoin u oro?
Tweet 7: ¿Que prefieren #Bitcoin u oro?
Tweet 8: ¿Que prefieren #Bitcoin u oro?
Tweet 9: ¿Que prefieren #Bitcoin u oro?
Tweet 10: ¿Que prefieren #Bitcoin u oro?


In [None]:
len(tweet_text_elements)

305

In [None]:
import pandas as pd 
df = pd.DataFrame({'col':tweet_text_elements})
print(df)

df.to_csv('tweets.csv', index=False, encoding='utf-8-sig')

                                                   col
0         [[¿Que prefieren ], [[#Bitcoin]], [ u oro?]]
1         [[¿Que prefieren ], [[#Bitcoin]], [ u oro?]]
2         [[¿Que prefieren ], [[#Bitcoin]], [ u oro?]]
3         [[¿Que prefieren ], [[#Bitcoin]], [ u oro?]]
4         [[¿Que prefieren ], [[#Bitcoin]], [ u oro?]]
..                                                 ...
300  [["El capitalismo desenfrenado"(NEOLIBERALISMO...
301                                            [[Oro]]
302                                            [[ORO]]
303                                            [[Oro]]
304                                            [[Oro]]

[305 rows x 1 columns]


In [None]:
import os 
os.currentdir = os.getcwd()
print(os.currentdir)

e:\Users\1167486\Local\scripts\Social_media_comments\test


In [None]:
for i, tweet_text_element in enumerate(tweet_text_elements):
    print(tweet_text_element.text.strip().replace("\n", ""))

¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que prefieren #Bitcoin u oro?
¿Que pre

In [None]:
from bs4 import BeautifulSoup
html_content = driver.page_source
soup = BeautifulSoup(html_content, 'html.parser')
x = soup.find_all('div', {'data-testid': 'tweetText'})



In [None]:
for i, tweet_text_element in enumerate(x):
    tweet_text = tweet_text_element.text.strip().replace("\n", "")
    print(f"Tweet {i+1}: {tweet_text}")

Tweet 1: Un millón de veces mejor el oro.
Tweet 2: Bien simple sólo paga y de ahí a chingar a su madre
Tweet 3: Bitcoin
Tweet 4: Oro
Tweet 5: "El capitalismo desenfrenado"(NEOLIBERALISMO)es un capitalismo delincuencial donde ya no hay ética ni principios humanos, lo mismo son secuestradores, corruptos, o evasores de impuestos, Eje: hermana de  @XochitlGalvez o @RicardoBSalinas@CarlosLoret @aristeguicnn @lopezdoriga
Tweet 6: ORO
Tweet 7: Oro
Tweet 8: Oro


In [None]:
'''
try:
    link = 'https://twitter.com/elonmusk'
    driver.get(link)

    tweet_text_elements = []

    # Scroll 5 times to load more tweets
    for _ in range(5):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(2) 

        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all elements with 'tweetText'
        tweet_text_elements.extend(soup.find_all('div', {'data-testid': 'tweetText'}))

    # first 10 tweet
    for i, tweet_text_element in enumerate(tweet_text_elements[:10]):
        tweet_text = tweet_text_element.text.strip().replace("\n", "")
        print(f"Tweet {i+1}: {tweet_text}")

finally:
    driver. Quit()
'''

In [None]:
comments = driver.find_elements("css selector", "div[data-testid='tweetText']")
comments[0].text
#driver.find_element(By.ID, 'tweetText')
len(comments)

8

In [None]:
tweets = driver.find_elements("css selector", "article[data-testid='tweet']")
tweets[0].text
len(tweets)


7

In [None]:
len(tweets)


3

In [None]:
len(tweets)

17

In [None]:
#names = driver.find_elements('css selector', "span[class='css-1jxf684 r-bcqeeo r-1ttztb7 r-qvutc0 r-poiln3']")
names = driver.find_elements("css selector", "div[data-testid='User-Name']")

#names = driver.find_elements(By.XPATH, 
#                             '//*[@id="id__8hzjlqcfwha"]/div[1]/div/a/div/div[1]/span/span')

In [None]:
for x in names:
    print(x.text)


Anabel Hernández
@anabelhoficial
xrp.gerry.xah
@gerryxrp
·
Mar 31
Grok
@grok
·
Mar 31
Rossana Villares
@RossanaVillares
·
22h
Tere
@Teehreh
·
16h
Armando Galindo Reyes
@armandgal65
·
2h
Gilberto Astorga F
@vzlaastor2025
·
13h
Rena Cornejo
@cornejo_rena
·
23h
Juan Carlos Cortés
@jccortes
·
Apr 1
Kubala
@cubalamx
·
12h
Victor manuel Vazquez guerrero
@Victorm06387347
·
12h
Astrophisic
@culiacan929
·
Mar 31
Nadia
@politicanadia
·
18h
Luz Maria Saltijeral
@Luzmasaltijeral
·
Apr 1


In [None]:
#comments = driver.find_elements("css selector", "div[id='tweetText']")


In [None]:
for comment in comments:
    print(comment.text)




Siempre estaré agradecida con cada uno de mis lectores, por aquellos que decidieron conocer la verdad de los hechos ante las situaciones que afectan al país. Gracias por apreciar mi trabajo en "La historia secreta: AMLO y el Cártel de Sinaloa".
Hey 
@grok
 los libros de 
@anabelhoficial
 son ciencia ficción?
No, los libros de Anabel Hernández no son ciencia ficción. Son obras de periodismo investigativo basadas en hechos reales, como "La Historia Secreta: AMLO y el Cártel de Sinaloa", que explora corrupción y narcotráfico en México con evidencia documental, no elementos
Anabel estás correspondida !
La periodista  Anavel si pasara a la historia  de México.
Excelente libro
#PobreBurra3D
Señora,
¿No le gustaría conocer a Jaime Maussan.
Pienso que tendrían mucho de que platicar.
                                    
Parece que se acabó el contrato con la editorial... ¿Es despedía? ¿De qué verdad habla?

Tu ciencia ficción esta muy fumada, chance deberíamos meternos lo que tu te metes para e

In [None]:
#username = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[autocomplete="username"]')))
#username.send_keys("your_username")
#username.send_keys(Keys.ENTER)


In [None]:
'''
import time
from selenium import webdriver
from selenium.webdriver import ChromeOptions, Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


options = ChromeOptions()
options.add_argument("--start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])

driver = webdriver.Chrome(options=options)
url = "https://twitter.com/i/flow/login"
driver.get(url)

username = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[autocomplete="username"]')))
username.send_keys("your_username")
username.send_keys(Keys.ENTER)

password = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[name="password"]')))
password.send_keys("your_password")
password.send_keys(Keys.ENTER)

time.sleep(10)
'''

In [None]:
x = 'https://x.com/i/flow/login'

In [None]:
import asyncio
from pyppeteer import launch
import nest_asyncio

nest_asyncio.apply()  # Allows asyncio to work in Flask/Gunicorn

async def scrape_tweets(handle, limit=20):
    """Scrapes recent tweets from a Twitter/X profile using headless Chrome."""
    try:
        browser = await launch(
            headless=True,
            args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-blink-features=AutomationControlled"]
        )

        page = await browser.newPage()
        await page.setUserAgent(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        profile_url = f'https://twitter.com/{handle}'
        await page.goto(profile_url, {'waitUntil': 'networkidle2', 'timeout': 60000})

        tweet_selector = 'div[data-testid="tweetText"]'
        await page.waitForSelector(tweet_selector, timeout=30000)

        tweets = []

        for _ in range(15):  # Scroll to load more tweets
            await page.evaluate("window.scrollBy(0, 4000);")
            await asyncio.sleep(2)

            new_tweets = await page.evaluate('''() => {
                return Array.from(document.querySelectorAll('div[data-testid="tweetText"]'))
                .map(tweet => tweet.innerText.trim());
            }''')

            tweets.extend(new_tweets)
            tweets = list(set(tweets))  # Remove duplicates

            if len(tweets) >= limit:
                break

        await browser.close()
        return tweets[:limit] if tweets else ["No tweets found or access blocked."]

    except Exception as e:
        return [f"🔥 Scraping Error: {str(e)}"]