In [None]:
import os
import csv
import time
import random
import logging
from typing import List, Dict
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains

class FacebookReplyScraper:
    def __init__(self, username: str, password: str, headless: bool = False):
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        self.logger = logging.getLogger(__name__)
        
        self.chrome_options = Options()
        self._setup_chrome_options(headless)
        self.driver = self._initialize_webdriver()
        self.wait = WebDriverWait(self.driver, 30)
        self.username = username
        self.password = password

    def _setup_chrome_options(self, headless):
        options = [
            "--disable-blink-features=AutomationControlled",
            "--disable-notifications",
            "--disable-popup-blocking",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            f"--window-size={random.randint(1200,1400)},{random.randint(800,1000)}",
            "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ]
        for option in options:
            self.chrome_options.add_argument(option)
        
        self.chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.chrome_options.add_experimental_option('useAutomationExtension', False)
        
        if headless:
            self.chrome_options.add_argument("--headless=new")

    def _initialize_webdriver(self, max_retries=3):
        for attempt in range(max_retries):
            try:
                service = Service(ChromeDriverManager().install())
                driver = webdriver.Chrome(service=service, options=self.chrome_options)
                driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                    "source": """
                        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
                        window.navigator.chrome = { runtime: {} };
                    """
                })
                return driver
            except Exception as e:
                self.logger.warning(f"WebDriver init attempt {attempt+1} failed: {e}")
                if attempt == max_retries - 1:
                    raise
                time.sleep(5)

    def human_interaction(self, element, action_type, text=None):
        try:
            if action_type == 'type':
                for char in text:
                    element.send_keys(char)
                    time.sleep(random.uniform(0.05, 0.2))
            elif action_type == 'click':
                ActionChains(self.driver).move_to_element(element).pause(
                    random.uniform(0.2, 0.5)).click().perform()
            return True
        except Exception as e:
            self.logger.warning(f"Human interaction failed: {e}")
            return False

    def login(self):
        try:
            self.driver.get("https://www.facebook.com")
            time.sleep(random.uniform(2.0, 3.0))
            
            try:
                if self.driver.find_elements(By.XPATH, "//div[@aria-label='Facebook']"):
                    self.logger.info("Already logged in")
                    return True
            except:
                pass
            
            max_attempts = 3
            for attempt in range(max_attempts):
                try:
                    email = self.wait.until(EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "input[name='email'], input[id='email'], input[type='email']")))
                    email.clear()
                    self.human_interaction(email, 'type', self.username)
                    
                    password = self.wait.until(EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "input[name='pass'], input[id='pass'], input[type='password']")))
                    password.clear()
                    self.human_interaction(password, 'type', self.password)
                    
                    login_btn = self.wait.until(EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, "button[name='login'], button[type='submit'], button[id='loginbutton']")))
                    self.human_interaction(login_btn, 'click')
                    
                    try:
                        self.wait.until(lambda driver: any([
                            "facebook.com/?sk=welcome" in driver.current_url,
                            len(driver.find_elements(By.XPATH, "//div[@aria-label='Facebook']")) > 0
                        ]))
                        self.logger.info("Login successful")
                        time.sleep(random.uniform(2.0, 3.0))
                        return True
                    except TimeoutException:
                        if attempt == max_attempts - 1:
                            raise
                        time.sleep(random.uniform(3, 5))
                        continue
                        
                except Exception as e:
                    if attempt == max_attempts - 1:
                        raise
                    time.sleep(random.uniform(3, 5))
                    continue
            
        except Exception as e:
            self.logger.error(f"Login failed: {e}")
            self.driver.save_screenshot("login_error.png")
            raise

    def scrape_replies_only(self, post_url: str):
        self.logger.info(f"Starting to scrape replies from: {post_url}")
        
        try:
            self.driver.get(post_url)
            time.sleep(random.uniform(5.0, 7.0))  # Increased wait time for post loading
            
            # Dismiss popups
            self._dismiss_popups()
            
            # Scroll to comments section
            self._scroll_to_comments_section()
            
            # Load all comments
            self._load_all_comments()
            
            # Get all top-level comments
            comments = self._get_top_level_comments()
            self.logger.info(f"Found {len(comments)} top-level comments")
            
            replies_data = []
            
            for i, comment in enumerate(comments):
                try:
                    self.logger.info(f"Processing comment {i+1}/{len(comments)}")
                    
                    # Scroll to comment
                    self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", comment)
                    time.sleep(random.uniform(1.0, 2.0))  # Increased wait time
                    
                    # Get comment text
                    comment_text = self._get_comment_text(comment)
                    if not comment_text:
                        self.logger.warning(f"Couldn't extract text from comment {i+1}")
                        continue
                        
                    self.logger.info(f"Comment {i+1} text: {comment_text[:50]}...")
                    
                    # Expand replies if available
                    replies_expanded = self._expand_replies(comment)
                    
                    if replies_expanded:
                        self.logger.info(f"Replies expanded for comment {i+1}")
                        time.sleep(random.uniform(2.0, 3.0))  # Wait for replies to load
                        
                        # Get all replies for this comment
                        replies = self._get_replies_for_comment(comment)
                        self.logger.info(f"Found {len(replies)} replies for comment {i+1}")
                        
                        for j, reply in enumerate(replies):
                            try:
                                self.logger.info(f"Processing reply {j+1}/{len(replies)}")
                                
                                # Scroll to reply
                                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", reply)
                                time.sleep(random.uniform(0.5, 1.0))
                                
                                reply_text = self._get_reply_text(reply)
                                if reply_text:
                                    self.logger.info(f"Reply {j+1} text: {reply_text[:50]}...")
                                    replies_data.append({
                                        'original_comment_text': comment_text,
                                        'reply_text': reply_text
                                    })
                                else:
                                    self.logger.warning(f"Couldn't extract text from reply {j+1}")
                            except Exception as e:
                                self.logger.warning(f"Error extracting reply {j+1}: {e}")
                    else:
                        self.logger.info(f"No replies found or couldn't expand replies for comment {i+1}")
                
                except Exception as e:
                    self.logger.warning(f"Error processing comment {i+1}: {e}")
            
            self.logger.info(f"Finished scraping. Found {len(replies_data)} replies")
            return replies_data
            
        except Exception as e:
            self.logger.error(f"Error scraping replies: {e}")
            self.driver.save_screenshot("scraping_error.png")
            return []

    def _dismiss_popups(self):
        try:
            # Cookie consent buttons - expanded selectors
            cookie_buttons = self.driver.find_elements(
                By.XPATH, '//div[contains(text(), "Allow all cookies") or contains(text(), "Accept all") or contains(text(), "Accept cookies") or contains(text(), "Allow cookies")]'
            )
            
            for button in cookie_buttons:
                try:
                    if button.is_displayed():
                        self.human_interaction(button, 'click')
                        time.sleep(random.uniform(1.0, 1.5))
                        break
                except:
                    continue
                    
            # Try button elements as well
            cookie_buttons = self.driver.find_elements(
                By.XPATH, '//button[contains(text(), "Allow all cookies") or contains(text(), "Accept all") or contains(text(), "Accept cookies") or contains(text(), "Allow")]'
            )
            
            for button in cookie_buttons:
                try:
                    if button.is_displayed():
                        self.human_interaction(button, 'click')
                        time.sleep(random.uniform(1.0, 1.5))
                        break
                except:
                    continue
        except:
            pass
        
        try:
            # Notification permission - expanded selectors
            notif_buttons = self.driver.find_elements(
                By.XPATH, '//div[contains(text(), "Not now") or contains(text(), "Close") or contains(text(), "Cancel") or contains(text(), "Maybe later")]'
            )
            
            for button in notif_buttons:
                try:
                    if button.is_displayed():
                        self.human_interaction(button, 'click')
                        time.sleep(random.uniform(1.0, 1.5))
                        break
                except:
                    continue
                    
            # Try button elements as well
            notif_buttons = self.driver.find_elements(
                By.XPATH, '//button[contains(text(), "Not now") or contains(text(), "Close") or contains(text(), "Cancel") or contains(text(), "Maybe later")]'
            )
            
            for button in notif_buttons:
                try:
                    if button.is_displayed():
                        self.human_interaction(button, 'click')
                        time.sleep(random.uniform(1.0, 1.5))
                        break
                except:
                    continue
        except:
            pass

    def _scroll_to_comments_section(self):
        try:
            # Scroll down in smaller increments to find comments
            for _ in range(5):  # Increased number of scrolls
                self.driver.execute_script("window.scrollBy(0, 500);")
                time.sleep(random.uniform(1.0, 2.0))
                
                # Look for comment indicators
                comment_indicators = self.driver.find_elements(
                    By.XPATH, '//span[contains(text(), "Comment") or contains(text(), "comment")]'
                )
                
                if comment_indicators:
                    for indicator in comment_indicators:
                        try:
                            if indicator.is_displayed():
                                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", indicator)
                                time.sleep(random.uniform(1.0, 2.0))
                                break
                        except:
                            continue
                    break
                    
            # Additional scroll to ensure we're at comments section
            self.driver.execute_script("window.scrollBy(0, 300);")
            time.sleep(random.uniform(2.0, 3.0))
            
        except Exception as e:
            self.logger.warning(f"Error scrolling to comments section: {e}")

    def _load_all_comments(self):
        max_attempts = 10  # Increased attempts
        max_scrolls = 15   # Added max scrolls
        scroll_count = 0
        
        for attempt in range(max_attempts):
            try:
                # Find "View more comments" or "Show more comments" buttons with expanded selectors
                more_comments_buttons = self.driver.find_elements(
                    By.XPATH, 
                    '//*[contains(text(), "View more comments") or contains(text(), "See more comments") or ' +
                    'contains(text(), "Show more comments") or contains(text(), "Show previous comments") or ' +
                    'contains(text(), "View") and contains(text(), "more") and contains(text(), "comment")]'
                )
                
                if not more_comments_buttons:
                    # Try to find "Most relevant" dropdown and change to "All comments"
                    relevance_dropdowns = self.driver.find_elements(
                        By.XPATH, '//*[contains(text(), "Most relevant") or contains(text(), "Most Relevant")]'
                    )
                    
                    for dropdown in relevance_dropdowns:
                        try:
                            if dropdown.is_displayed():
                                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", dropdown)
                                time.sleep(random.uniform(0.5, 1.0))
                                self.human_interaction(dropdown, 'click')
                                time.sleep(random.uniform(1.0, 2.0))
                                
                                # Try to click "All comments" option
                                all_comments_options = self.driver.find_elements(
                                    By.XPATH, '//*[contains(text(), "All comments") or contains(text(), "All Comments")]'
                                )
                                
                                for option in all_comments_options:
                                    try:
                                        if option.is_displayed():
                                            self.human_interaction(option, 'click')
                                            time.sleep(random.uniform(2.0, 3.0))
                                            break
                                    except:
                                        continue
                                
                                break
                        except:
                            continue
                    
                # If no more buttons found, scroll down to load more comments
                if not more_comments_buttons and scroll_count < max_scrolls:
                    self.driver.execute_script("window.scrollBy(0, 800);")
                    time.sleep(random.uniform(2.0, 3.0))
                    scroll_count += 1
                    continue
                
                # If max scrolls reached or no more buttons, break
                if not more_comments_buttons:
                    break
                
                # Click "View more comments" buttons
                for button in more_comments_buttons:
                    try:
                        if button.is_displayed():
                            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
                            time.sleep(random.uniform(0.5, 1.0))
                            self.human_interaction(button, 'click')
                            time.sleep(random.uniform(2.0, 3.0))  # Increased wait time
                    except:
                        continue
                
            except Exception as e:
                self.logger.warning(f"Error loading more comments (attempt {attempt+1}): {e}")
                time.sleep(random.uniform(2, 3))

    def _get_top_level_comments(self):
        try:
            # Updated selectors for top-level comments
            comment_selectors = [
                # Facebook's UFI2 structure
                '//div[@data-testid="UFI2Comment"]',
                
                # New Facebook layout with role="article" for comments
                '//div[@role="article" and not(ancestor::div[@role="article"])]',
                
                # Using common container classes
                '//div[contains(@class, "x1yztbdb")]//div[@role="article"]',
                
                # Targeting the specific class mentioned in your request
                '//div[contains(@class, "x16hk5td") and contains(@class, "x12rz0ws")]//div[@role="article"]',
                
                # More generic approach - find elements with comment-like characteristics
                '//div[.//div[@aria-label="Like" or @aria-label="Reply" or contains(@aria-label, "comment")]]',
                
                # Looking for specific reply divs with the mentioned class
                '//div[contains(@class, "html-div") and contains(@class, "xdj266r") and contains(@class, "x11i5rnm")]',
                
                # Try to find elements that have both text content and reply/like buttons
                '//div[.//span[@dir="auto"] and .//div[@aria-label="Like" or @aria-label="Reply"]]'
            ]
            
            comments = []
            for selector in comment_selectors:
                try:
                    # The key is finding selectors that specifically get top-level comments only
                    elements = self.driver.find_elements(By.XPATH, selector)
                    if elements:
                        self.logger.info(f"Found {len(elements)} comments with selector: {selector}")
                        comments = elements
                        break
                except Exception as e:
                    self.logger.warning(f"Error with selector {selector}: {e}")
            
            # Additional filtering if needed
            if not comments:
                self.logger.warning("Could not find comments with any selector, trying last resort approach")
                # Last resort - find all potential comment containers and filter
                potential_comments = self.driver.find_elements(
                    By.XPATH, '//div[contains(@class, "x1yztbdb") or contains(@class, "x78zum5") or contains(@class, "x16hk5td")]'
                )
                
                for element in potential_comments:
                    try:
                        # Check if this looks like a comment (has like/reply buttons)
                        has_buttons = len(element.find_elements(
                            By.XPATH, './/div[@aria-label="Like" or @aria-label="Reply"]'
                        )) > 0
                        
                        # Check if it has text content
                        has_text = element.text and len(element.text) > 10
                        
                        if has_buttons and has_text:
                            comments.append(element)
                    except:
                        continue
            
            self.logger.info(f"Identified {len(comments)} comments after filtering")
            return comments
            
        except Exception as e:
            self.logger.warning(f"Error finding top-level comments: {e}")
            return []

    def _expand_replies(self, comment_element):
        try:
            # First take a screenshot to help with debugging
            try:
                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", comment_element)
                time.sleep(1)
                comment_element.screenshot("comment.png")
                self.logger.info("Took screenshot of comment")
            except:
                pass
            
            # Specifically target the View/See replies buttons with the exact class mentioned
            # and the "View all X replies" pattern
            view_replies_selectors = [
                # Using the class names you provided
                './/div[contains(@class, "x16hk5td") and contains(@class, "x12rz0ws")]',
                
                # Target the specific div pattern with those classes and contains view replies text
                './/div[contains(@class, "x16hk5td") and contains(@class, "x12rz0ws") and contains(text(), "View") and contains(text(), "repl")]',
                
                # Target the exact class combo you mentioned
                './/div[contains(@class, "html-div") and contains(@class, "xdj266r") and contains(@class, "x11i5rnm") and contains(@class, "xat24cr")]',
                
                # More specific selectors for the reply count text
                './/div[contains(@class, "x16hk5td") and contains(text(), "View all")]',
                './/div[contains(text(), "View all") and contains(text(), "replies")]',
                './/span[contains(text(), "View all") and contains(text(), "replies")]',
                
                # Even more specific to match "View all 5 replies" pattern
                './/div[text()[contains(., "View all") and contains(., "replies")]]',
                './/span[text()[contains(., "View all") and contains(., "replies")]]',
                
                # General reply button text patterns
                './/div[contains(text(), "View replies") or contains(text(), "See replies")]',
                './/div[contains(text(), "View") and contains(text(), "reply") or contains(text(), "replies")]',
                './/span[contains(text(), "View") and contains(text(), "reply") or contains(text(), "replies")]'
            ]
            
            # Try each selector
            view_replies_button = None
            used_selector = None
            
            for selector in view_replies_selectors:
                try:
                    buttons = comment_element.find_elements(By.XPATH, selector)
                    
                    for button in buttons:
                        try:
                            button_text = button.text.strip().lower()
                            # Only consider if it contains reply-related text
                            if any(keyword in button_text for keyword in ['view all', 'view', 'see', 'reply', 'replies']):
                                self.logger.info(f"Found reply button with text: '{button_text}' using selector: {selector}")
                                view_replies_button = button
                                used_selector = selector
                                break
                        except:
                            continue
                        
                    if view_replies_button:
                        break
                except Exception as e:
                    self.logger.warning(f"Error with selector {selector}: {e}")
            
            # If no button found, try to search for reply indicators across the entire comment
            if not view_replies_button:
                self.logger.info("No specific reply button found, searching for reply indicators in the entire comment text")
                
                # Look for reply indicators in the comment's text
                comment_text = comment_element.text.lower()
                
                if any(pattern in comment_text for pattern in ['view all', 'view replies', 'see replies']):
                    self.logger.info(f"Comment contains reply indicators in text: {comment_text[:100]}")
                    
                    # Try to find clickable elements that might be the reply button
                    potential_buttons = comment_element.find_elements(By.XPATH, './/div[not(contains(@class, "hidden"))]')
                    
                    for button in potential_buttons:
                        try:
                            button_text = button.text.strip().lower()
                            if any(pattern in button_text for pattern in ['view all', 'view replies', 'see replies']):
                                self.logger.info(f"Found potential reply button with text: '{button_text}'")
                                view_replies_button = button
                                break
                        except:
                            continue
            
            # If a button was found, try to click it
            if view_replies_button:
                try:
                    # Log details about the button
                    button_text = view_replies_button.text.strip() if view_replies_button.text else "No text"
                    self.logger.info(f"Attempting to click view replies button: '{button_text}' found with selector: {used_selector}")
                    
                    # Scroll to ensure the button is in view
                    self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", view_replies_button)
                    time.sleep(random.uniform(1.0, 2.0))
                    
                    # First attempt: Try traditional click
                    try:
                        self.human_interaction(view_replies_button, 'click')
                        self.logger.info("Regular click performed on reply button")
                        time.sleep(random.uniform(2.0, 3.0))
                    except Exception as e:
                        self.logger.warning(f"Regular click failed: {e}, trying JavaScript click")
                        
                        # Second attempt: Try JavaScript click
                        try:
                            self.driver.execute_script("arguments[0].click();", view_replies_button)
                            self.logger.info("JavaScript click performed on reply button")
                            time.sleep(random.uniform(2.0, 3.0))
                        except Exception as e:
                            self.logger.warning(f"JavaScript click failed: {e}, trying alternative approach")
                            
                            # Last attempt: Try sending click event
                            try:
                                self.driver.execute_script("""
                                    var element = arguments[0];
                                    var clickEvent = new MouseEvent('click', {
                                        'view': window,
                                        'bubbles': true,
                                        'cancelable': true
                                    });
                                    element.dispatchEvent(clickEvent);
                                """, view_replies_button)
                                self.logger.info("Click event dispatched to reply button")
                                time.sleep(random.uniform(2.0, 3.0))
                            except Exception as e:
                                self.logger.warning(f"Click event dispatch failed: {e}")
                                return False
                    
                    # Check if replies actually expanded
                    time.sleep(random.uniform(2.0, 3.0))  # Wait for potential replies to load
                    
                    # Take another screenshot to see if replies expanded
                    try:
                        self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", comment_element)
                        time.sleep(1)
                        self.driver.execute_script("window.scrollBy(0, 100);")
                        time.sleep(1)
                        comment_element.screenshot("comment_after_click.png")
                        self.logger.info("Took screenshot after clicking reply button")
                    except:
                        pass
                    
                    return True
                except Exception as e:
                    self.logger.warning(f"Failed to click view replies button: {e}")
                    return False
            
            self.logger.info("No reply button found for this comment")
            return False
            
        except Exception as e:
            self.logger.warning(f"Error expanding replies: {e}")
            return False

        def _get_replies_for_comment(self, comment_element):
         try:
            # Wait for replies to fully load
            time.sleep(random.uniform(2.0, 3.0))
            
            # Take screenshot of the area to help with debugging
            try:
                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", comment_element)
                time.sleep(1)
                # Scroll a bit to show potential replies
                self.driver.execute_script("window.scrollBy(0, 100);")
                time.sleep(1)
                comment_element.screenshot("comment_with_replies.png")
                self.logger.info("Took screenshot of comment with replies area")
            except:
                pass
            
            # Try to find replies with the specific class mentioned
            specific_reply_selectors = [
                # Using the specific class you mentioned
                './/div[contains(@class, "html-div") and contains(@class, "xdj266r") and contains(@class, "x11i5rnm") and contains(@class, "xat24cr")]',
                
                # Sibling or child elements that match the reply pattern
                './following-sibling::div//div[contains(@class, "xdj266r")]',
                './div[position() > 1]//div[contains(@class, "xdj266r")]',
                
                # Using indentation as a hint (replies are usually indented)
                './/div[contains(@class, "x78zum5") or contains(@class, "x1d52u69")]//div[@role="article"]',
                
                # Looking for specific reply patterns
                './/div[contains(@class, "x16hk5td")]//div[@role="article"]'
            ]
            
            potential_replies = []
            
            # Try each specific selector first
            for selector in specific_reply_selectors:
                try:
                    elements = comment_element.find_elements(By.XPATH, selector)
                    if elements:
                        self.logger.info(f"Found {len(elements)} potential replies with selector: {selector}")
                        potential_replies.extend(elements)
                except Exception as e:
                    self.logger.warning(f"Error with specific selector {selector}: {e}")
            
            # Try to find reply containers that might hold multiple replies
            if not potential_replies:
                self.logger.info("No replies found with specific selectors, trying to find reply containers")
                
                # Facebook often has a container for replies
                container_selectors = [
                    './/div[contains(@class, "x168nmei") or contains(@class, "x13lgxp2")]',
                    './/div[.//div[contains(@class, "xdj266r")] and position() > 1]',
                    './following-sibling::div[.//div[contains(@class, "xdj266r")]]'
                ]
                
                for selector in container_selectors:
                    try:
                        containers = comment_element.find_elements(By.XPATH, selector)
                        
                        for container in containers:
                            try:
                                # Look for reply elements inside the container
                                elements = container.find_elements(By.XPATH, './/div[contains(@class, "xdj266r") or @role="article"]')
                                if elements:
                                    self.logger.info(f"Found {len(elements)} potential replies in container")
                                    potential_replies.extend(elements)
                            except:
                                continue
                                
                        if potential_replies:
                            break
                    except Exception as e:
                        self.logger.warning(f"Error with container selector {selector}: {e}")
                        continue
            
            # If still no replies found, try a more general approach
            if not potential_replies:
                self.logger.info("No replies found with container selectors, trying general approach")
                
                # Look for elements that are likely replies (indented, have reply buttons)
                general_selectors = [
                    './/div[contains(@class, "x1yztbdb") and contains(@style, "margin-left")]',
                    './/div[contains(@class, "x1lq5wgf") and contains(@class, "x1qjc9v5")]',
                    './/div[contains(@class, "x1n2onr6") and contains(@class, "x1ja2u2z")]'
                ]
                
                for selector in general_selectors:
                    try:
                        elements = comment_element.find_elements(By.XPATH, selector)
                        if elements:
                            self.logger.info(f"Found {len(elements)} potential replies with general selector: {selector}")
                            potential_replies.extend(elements)
                    except Exception as e:
                        self.logger.warning(f"Error with general selector {selector}: {e}")
            
            # Filter potential replies to only include actual replies
            replies = []
            for element in potential_replies:
                try:
                    # Check if element has reply characteristics
                    has_text = element.text and len(element.text) > 10
                    has_like_button = len(element.find_elements(
                        By.XPATH, './/div[@aria-label="Like" or @aria-label="Reply"]'
                    )) > 0
                    
                    if has_text and has_like_button:
                        replies.append(element)
                except:
                    continue
            
            self.logger.info(f"Found {len(replies)} actual replies after filtering")
            return replies
            
         except Exception as e:
            self.logger.warning(f"Error finding replies: {e}")
            return []
        
    def _get_comment_text(self, comment_element):
        try:
            # Try multiple selectors for comment text
            text_selectors = [
                './/div[@data-ad-comet-preview="message" or @data-ad-preview="message"]',
                './/div[contains(@class, "x1iorvi4") or contains(@class, "x1pi30zi")]',
                './/span[@dir="auto"]',
                './/div[contains(@class, "xdj266r")]//span[@dir="auto"]',
                './/div[contains(@class, "x11i5rnm")]'
            ]
            
            for selector in text_selectors:
                try:
                    text_elements = comment_element.find_elements(By.XPATH, selector)
                    if text_elements:
                        # Return the text from the first matching element
                        return text_elements[0].text.strip()
                except:
                    continue
            
            # Fallback to the entire comment text if specific selectors fail
            return comment_element.text.strip()
        except Exception as e:
            self.logger.warning(f"Error getting comment text: {e}")
            return ""

    def _get_reply_text(self, reply_element):
        try:
            # Similar to comment text but might have different structure
            text_selectors = [
                './/div[@data-ad-comet-preview="message" or @data-ad-preview="message"]',
                './/div[contains(@class, "x1iorvi4") or contains(@class, "x1pi30zi")]',
                './/span[@dir="auto"]',
                './/div[contains(@class, "x1n2onr6")]//span[@dir="auto"]',
                './/div[contains(@class, "x11i5rnm")]'
            ]
            
            for selector in text_selectors:
                try:
                    text_elements = reply_element.find_elements(By.XPATH, selector)
                    if text_elements:
                        return text_elements[0].text.strip()
                except:
                    continue
            
            # Fallback to the entire reply text
            return reply_element.text.strip()
        except Exception as e:
            self.logger.warning(f"Error getting reply text: {e}")
            return ""

    def save_to_csv(self, data: List[Dict], filename: str):
        try:
            if not data:
                self.logger.warning("No data to save")
                return
            
            # Ensure directory exists
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['original_comment_text', 'reply_text']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                
                writer.writeheader()
                for row in data:
                    writer.writerow(row)
            
            self.logger.info(f"Data saved to {filename}")
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {e}")

    def close(self):
        try:
            self.driver.quit()
            self.logger.info("Browser closed successfully")
        except Exception as e:
            self.logger.warning(f"Error closing browser: {e}")

    def __del__(self):
        self.close()


# Example usage
if __name__ == "__main__":
    # Initialize scraper
    scraper = FacebookReplyScraper(
        username="gmail",
        password="password",
        headless=False  # Set to True for headless mode
    )
    
    try:
        # Login to Facebook
        scraper.login()
        
        # Scrape replies from a post
        post_url = "https://www.facebook.com/PiumiSrinayaka/videos/508592312022519"
        replies_data = scraper.scrape_replies_only(post_url)
        
        # Save data to CSV
        scraper.save_to_csv(replies_data, "facebook_replies.csv")
        
    except Exception as e:
        scraper.logger.error(f"An error occurred: {e}")
    finally:
        # Close the browser
        scraper.close()

2025-03-29 14:45:12 - INFO: Get LATEST chromedriver version for google-chrome
2025-03-29 14:45:12 - INFO: Get LATEST chromedriver version for google-chrome
2025-03-29 14:45:13 - INFO: Driver [/Users/pasindumalinda/.wdm/drivers/chromedriver/mac64/134.0.6998.165/chromedriver-mac-arm64/chromedriver] found in cache
2025-03-29 14:45:45 - INFO: Login successful
2025-03-29 14:45:47 - INFO: Starting to scrape replies from: https://www.facebook.com/PiumiSrinayaka/videos/508592312022519
2025-03-29 14:46:53 - INFO: Found 72 comments with selector: //div[@role="article" and not(ancestor::div[@role="article"])]
2025-03-29 14:46:53 - INFO: Identified 72 comments after filtering
2025-03-29 14:46:53 - INFO: Found 72 top-level comments
2025-03-29 14:46:53 - INFO: Processing comment 1/72
2025-03-29 14:46:55 - INFO: Comment 1 text: Sankalpa Adikari
මොනා උනත් පොත්ත සුදු කෑල්ලක් දැකල...
2025-03-29 14:46:56 - INFO: Took screenshot of comment
2025-03-29 14:46:56 - INFO: No specific reply button found, search

In [None]:
import os
import csv
import time
import random
import re
from typing import List, Dict
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    StaleElementReferenceException,
    ElementClickInterceptedException
)
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
import logging

class FacebookCommentScraper:
    def __init__(self, username: str, password: str, headless: bool = False, log_level=logging.INFO):
        # Setup logging
        logging.basicConfig(
            level=log_level, 
            format='%(asctime)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        self.logger = logging.getLogger(__name__)

        # Chrome options for performance and stealth
        self.chrome_options = Options()
        
        # Enhanced stealth and performance options
        stealth_options = [
            "--disable-blink-features=AutomationControlled",
            "--disable-infobars",
            "--disable-notifications",
            "--disable-popup-blocking",
            "--disable-web-security",
            "--disable-extensions",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--log-level=3"
        ]
        
        for option in stealth_options:
            self.chrome_options.add_argument(option)
        
        # Randomize window size
        width = random.randint(1200, 1400)
        height = random.randint(800, 1000)
        self.chrome_options.add_argument(f"--window-size={width},{height}")
        
        if headless:
            self.chrome_options.add_argument("--headless=new")
        
        # Anti-detection settings
        self.chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.chrome_options.add_experimental_option('useAutomationExtension', False)
        
        # Initialize webdriver
        self.driver = self._initialize_webdriver()
        
        # Setup waits
        self.driver.implicitly_wait(10)
        self.wait = WebDriverWait(self.driver, 30)
        
        # Credentials and tracking
        self.username = username
        self.password = password
        self.comment_count = 0
        self.reply_count = 0

    def _initialize_webdriver(self, max_retries=3):
        """Initialize Chrome WebDriver with retry mechanism"""
        for attempt in range(max_retries):
            try:
                service = Service(ChromeDriverManager().install())
                driver = webdriver.Chrome(service=service, options=self.chrome_options)
                return driver
            except Exception as e:
                self.logger.warning(f"WebDriver initialization attempt {attempt + 1} failed: {e}")
                if attempt == max_retries - 1:
                    raise
                time.sleep(2)

    def human_type(self, element, text, speed=0.1):
        """Simulate human-like typing"""
        for character in text:
            element.send_keys(character)
            time.sleep(random.uniform(speed/2, speed*1.5))

    def human_click(self, element, attempts=3):
        """Simulate human-like clicking with retry"""
        for attempt in range(attempts):
            try:
                ActionChains(self.driver).move_to_element(element).pause(
                    random.uniform(0.2, 0.5)).perform()
                element.click()
                return True
            except Exception as e:
                self.logger.warning(f"Click attempt {attempt + 1} failed: {e}")
                if attempt == attempts - 1:
                    return False
                time.sleep(1)
        return False

    def login(self):
        """Enhanced login with multiple login strategies"""
        login_url = "https://www.facebook.com/login"
        
        try:
            self.driver.get(login_url)
            time.sleep(random.uniform(2.0, 4.0))
            
            # Wait for email field to be present
            try:
                email_field = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='email'], input[id='email']"))
                )
                email_field.clear()
                self.human_type(email_field, self.username)
                time.sleep(random.uniform(0.5, 1.5))
            except Exception as e:
                self.logger.error("Could not find email field")
                raise

            # Find password field
            try:
                password_field = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='pass'], input[id='pass']"))
                )
                password_field.clear()
                self.human_type(password_field, self.password)
                time.sleep(random.uniform(0.5, 1.5))
            except Exception as e:
                self.logger.error("Could not find password field")
                raise

            # Click login button
            try:
                login_button = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "button[name='login'], button[type='submit'], #loginbutton"))
                )
                self.human_click(login_button)
            except Exception as e:
                # Try submitting with Enter key as fallback
                try:
                    password_field.send_keys(Keys.RETURN)
                except Exception as e:
                    self.logger.error("Could not find login button")
                    raise

            # Wait for successful login
            try:
                self.wait.until(
                    EC.presence_of_element_located((By.XPATH, "//div[@role='main'] | //div[@aria-label='Facebook']"))
                )
                self.logger.info("Login successful!")
                time.sleep(random.uniform(3.0, 5.0))
                return True
            except TimeoutException:
                # Check for security checks
                try:
                    security_check = self.driver.find_element(
                        By.XPATH, "//*[contains(text(), 'Enter Login Code') or contains(text(), 'Security Check')]"
                    )
                    self.logger.warning("Security check detected. Please complete manually.")
                    input("Press Enter after completing security check...")
                    return True
                except NoSuchElementException:
                    self.logger.error("Login verification failed")
                    raise
        
        except Exception as e:
            self.logger.error(f"Critical login error: {e}")
            self.driver.save_screenshot("login_error.png")
            raise

    def click_view_more_comments(self):
        """Click all 'View more comments' buttons"""
        view_more_selectors = [
            '//div[contains(text(), "View more comments")]',
            '//div[contains(text(), "View more") and contains(text(), "comment")]',
            '//span[contains(text(), "View more comments")]',
            '//div[@aria-label="View more comments"]'
        ]
        
        for selector in view_more_selectors:
            try:
                buttons = self.driver.find_elements(By.XPATH, selector)
                for button in buttons:
                    try:
                        if button.is_displayed():
                            self.human_click(button)
                            time.sleep(random.uniform(0.5, 1.5))
                    except Exception:
                        continue
            except Exception:
                continue

    def click_view_replies(self, comment_element):
        """Click 'View replies' buttons within a comment"""
        reply_button_selectors = [
            './/div[contains(text(), "View all") and contains(text(), "replies")]',
            './/div[contains(text(), "View replies")]',
            './/span[contains(text(), "View replies")]',
            './/a[contains(text(), "View replies")]'
        ]
        
        for selector in reply_button_selectors:
            try:
                reply_buttons = comment_element.find_elements(By.XPATH, selector)
                for button in reply_buttons:
                    if button.is_displayed():
                        self.human_click(button)
                        time.sleep(random.uniform(1.0, 2.0))
            except Exception:
                continue

    def scrape_comments(self, video_url: str, max_comments: int = 8000):
        """Scrape comments from a video"""
        self.logger.info(f"Starting to scrape comments from: {video_url}")
        
        try:
            # Navigate to video
            self.driver.get(video_url)
            time.sleep(random.uniform(3.0, 5.0))
            
            # Scroll and load comments
            scroll_attempts = 0
            last_loaded_count = 0
            stale_count = 0
            
            while scroll_attempts < 200 and stale_count < 20:
                # Random scroll
                scroll_distance = random.randint(300, 800)
                self.driver.execute_script(f"window.scrollBy(0, {scroll_distance});")
                time.sleep(random.uniform(1.5, 3.0))
                
                # Click view more comments
                self.click_view_more_comments()
                
                # Estimate comments
                try:
                    comment_elements = self.driver.find_elements(
                        By.XPATH, '//div[@data-testid="UFI2Comment"] | //div[@role="article"]'
                    )
                    current_count = len(comment_elements)
                except Exception:
                    current_count = 0
                
                # Progress tracking
                if current_count == last_loaded_count:
                    stale_count += 1
                else:
                    stale_count = 0
                    last_loaded_count = current_count
                
                scroll_attempts += 1
                
                # Log progress
                if scroll_attempts % 5 == 0:
                    self.logger.info(f"Scrolled {scroll_attempts} times, loaded {current_count} comments")
                
                # Exit if max reached
                if current_count >= max_comments:
                    break
            
            # Find comment elements
            comment_elements = self.driver.find_elements(
                By.XPATH, '//div[@data-testid="UFI2Comment"] | //div[@role="article"]'
            )
            
            self.logger.info(f"Found {len(comment_elements)} comment elements")
            
            # Scrape comments
            comments_data = []
            for idx, comment in enumerate(comment_elements):
                if len(comments_data) >= max_comments:
                    break
                
                try:
                    comment_data = self._extract_comment_data(comment)
                    if comment_data:
                        comments_data.append(comment_data)
                    
                    # Progress logging
                    if (idx + 1) % 10 == 0:
                        self.logger.info(f"Processed {idx + 1} comments (found {len(comments_data)})")
                
                except Exception as e:
                    self.logger.warning(f"Error processing comment {idx + 1}: {e}")
            
            self.logger.info(f"Finished scraping comments. Total comments: {len(comments_data)}")
            return comments_data
        
        except Exception as e:
            self.logger.error(f"Error scraping video {video_url}: {e}")
            return []

    def scrape_replies(self, comments_data: List[Dict]):
        """Scrape replies for each comment"""
        self.logger.info("Starting to scrape replies...")
        
        replies_data = []
        for comment_idx, comment in enumerate(comments_data, 1):
            try:
                # Find comment element again
                try:
                    comment_text = comment.get('text', '')
                    comment_element = self._find_comment_element_by_text(comment_text)
                    
                    if not comment_element:
                        self.logger.warning(f"Could not find element for comment {comment_idx}")
                        continue
                    
                    # Expand replies
                    self.click_view_replies(comment_element)
                    
                    # Click all "See more" in replies
                    self._expand_see_more_in_replies(comment_element)
                    
                    # Extract replies
                    reply_elements = comment_element.find_elements(
                        By.XPATH, './/div[@data-testid="UFI2Comment"] | .//div[@role="article"]'
                    )
                    
                    for reply_idx, reply in enumerate(reply_elements, 1):
                        try:
                            reply_text_element = reply.find_element(
                                By.XPATH, './/div[@dir="auto"] | .//div[contains(@class, "xdj266r")]'
                            )
                            reply_text = reply_text_element.text.strip()
                            
                            if reply_text:
                                replies_data.append({
                                    'original_comment_text': comment_text,
                                    'reply_text': reply_text
                                })
                                
                                # Periodic logging
                                if len(replies_data) % 10 == 0:
                                    self.logger.info(f"Processed {len(replies_data)} replies")
                        
                        except Exception as e:
                            self.logger.warning(f"Error extracting reply {reply_idx} from comment {comment_idx}: {e}")
                
                except Exception as e:
                    self.logger.warning(f"Error processing comment {comment_idx} for replies: {e}")
            
            except Exception as e:
                self.logger.error(f"Unexpected error processing comment {comment_idx}: {e}")
        
        self.logger.info(f"Finished scraping replies. Total replies: {len(replies_data)}")
        return replies_data

    def _find_comment_element_by_text(self, comment_text, max_attempts=3):
        """Find comment element by its text content"""
        if not comment_text or len(comment_text) < 10:
            return None
            
        for attempt in range(max_attempts):
            try:
                # Use XPath to find comment containing the specific text
                comment_elements = self.driver.find_elements(
                    By.XPATH, f'//*[contains(text(), "{comment_text[:30]}")]'
                )
                
                for element in comment_elements:
                    if comment_text in element.text:
                        return element
            
            except Exception as e:
                self.logger.warning(f"Attempt {attempt + 1} to find comment failed: {e}")
                time.sleep(1)
        
        return None

    def _expand_see_more_in_replies(self, comment_element):
        """Expand all 'See more' options in replies"""
        try:
            see_more_buttons = comment_element.find_elements(
                By.XPATH, './/div[contains(text(), "See more")]'
            )
            
            for button in see_more_buttons:
                try:
                    if button.is_displayed():
                        self.human_click(button)
                        time.sleep(random.uniform(0.3, 0.7))
                except Exception:
                    continue
        
        except Exception as e:
            self.logger.warning(f"Error expanding 'See more' in replies: {e}")

    def _extract_comment_data(self, comment_element):
        """Extract data from a single comment element"""
        try:
            # Expand full comment text
            try:
                see_more = comment_element.find_element(
                    By.XPATH, './/div[contains(text(), "See more")]'
                )
                if see_more.is_displayed():
                    self.human_click(see_more)
                    time.sleep(random.uniform(0.5, 1.0))
            except NoSuchElementException:
                pass
            
            # Get comment text
            try:
                comment_text_element = comment_element.find_element(
                    By.XPATH, './/div[@dir="auto"] | .//div[contains(@class, "xdj266r")]'
                )
                comment_text = comment_text_element.text.strip()
            except NoSuchElementException:
                comment_text = comment_element.text.strip()
            
            if not comment_text:
                return None
            
            # Get comment metadata
            comment_author = ""
            comment_time = ""
            try:
                author_element = comment_element.find_element(
                    By.XPATH, './/a[contains(@href, "user") or contains(@href, "profile")]'
                )
                comment_author = author_element.text.strip()
            except NoSuchElementException:
                pass
            
            try:
                time_element = comment_element.find_element(
                    By.XPATH, './/a[contains(@href, "permalink")]'
                )
                comment_time = time_element.text.strip()
            except NoSuchElementException:
                pass
            
            return {
                'author': comment_author,
                'time': comment_time,
                'text': comment_text
            }
        
        except Exception as e:
            self.logger.warning(f"Error extracting comment: {e}")
            return None

    def save_to_csv(self, data: List[Dict], output_file: str, data_type: str = 'comments'):
        """Save scraped data to CSV"""
        try:
            os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
            
            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                if data_type == 'comments':
                    writer = csv.writer(csvfile)
                    writer.writerow(['Author', 'Timestamp', 'Comment'])
                    for comment in data:
                        writer.writerow([
                            comment.get('author', ''),
                            comment.get('time', ''),
                            comment.get('text', '')
                        ])
                
                elif data_type == 'replies':
                    writer = csv.writer(csvfile)
                    writer.writerow(['Original Comment', 'Reply'])
                    for reply in data:
                        writer.writerow([
                            reply.get('original_comment_text', ''),
                            reply.get('reply_text', '')
                        ])
            
            self.logger.info(f"Data saved to: {output_file}")
        
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {e}")
            raise

    def close(self):
        """Close the browser"""
        try:
            self.driver.quit()
        except Exception:
            pass

def main():
    # Configuration
    USERNAME = 'gmail'  # Replace with your credentials
    PASSWORD = 'password'  # Replace with your password
    VIDEO_URL = 'https://www.facebook.com/PiumiSrinayaka/videos/508592312022519'  # Replace with your video URL
    COMMENTS_OUTPUT_FILE = 'facebook_comments.csv'
    REPLIES_OUTPUT_FILE = 'facebook_replies.csv'
    MAX_COMMENTS = 8000  # Target number of comments to scrape
    
    # Initialize scraper
    scraper = None
    try:
        print("Starting Facebook Comment Scraper...")
        scraper = FacebookCommentScraper(USERNAME, PASSWORD, headless=False)
        
        # Login
        scraper.login()
        
        # Scrape comments first
        comments_data = scraper.scrape_comments(VIDEO_URL, max_comments=MAX_COMMENTS)
        
        # Save comments to CSV
        if comments_data:
            scraper.save_to_csv(comments_data, COMMENTS_OUTPUT_FILE, 'comments')
            
            # Then scrape replies
            replies_data = scraper.scrape_replies(comments_data)
            
            # Save replies to CSV
            if replies_data:
                scraper.save_to_csv(replies_data, REPLIES_OUTPUT_FILE, 'replies')
            else:
                print("No replies were scraped.")
        else:
            print("No comments were scraped.")
    
    except Exception as e:
        print(f"Fatal error: {e}")
        if scraper:
            scraper.driver.save_screenshot("error_screenshot.png")
    
    finally:
        if scraper:
            scraper.close()

if __name__ == '__main__':
    main()


# Facebook data mining 

In [None]:
import os
import csv
import time
import random
import re
from typing import List, Dict
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    StaleElementReferenceException,
    ElementClickInterceptedException
)
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
import logging

class FacebookCommentScraper:
    def __init__(self, username: str, password: str, headless: bool = False, log_level=logging.INFO):
        # Setup logging
        logging.basicConfig(
            level=log_level, 
            format='%(asctime)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        self.logger = logging.getLogger(__name__)

        # Chrome options for performance and stealth
        self.chrome_options = Options()
        
        # Enhanced stealth and performance options
        stealth_options = [
            "--disable-blink-features=AutomationControlled",
            "--disable-infobars",
            "--disable-notifications",
            "--disable-popup-blocking",
            "--disable-web-security",
            "--disable-extensions",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--log-level=3"
        ]
        
        for option in stealth_options:
            self.chrome_options.add_argument(option)
        
        # Randomize window size
        width = random.randint(1200, 1400)
        height = random.randint(800, 1000)
        self.chrome_options.add_argument(f"--window-size={width},{height}")
        
        if headless:
            self.chrome_options.add_argument("--headless=new")
        
        # Anti-detection settings
        self.chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.chrome_options.add_experimental_option('useAutomationExtension', False)
        
        # Initialize webdriver
        self.driver = self._initialize_webdriver()
        
        # Setup waits
        self.driver.implicitly_wait(10)
        self.wait = WebDriverWait(self.driver, 30)
        
        # Credentials and tracking
        self.username = username
        self.password = password
        self.comment_count = 0

    def _initialize_webdriver(self, max_retries=3):
        """Initialize Chrome WebDriver with retry mechanism"""
        for attempt in range(max_retries):
            try:
                service = Service(ChromeDriverManager().install())
                driver = webdriver.Chrome(service=service, options=self.chrome_options)
                return driver
            except Exception as e:
                self.logger.warning(f"WebDriver initialization attempt {attempt + 1} failed: {e}")
                if attempt == max_retries - 1:
                    raise
                time.sleep(2)

    def human_type(self, element, text, speed=0.1):
        """Simulate human-like typing"""
        for character in text:
            element.send_keys(character)
            time.sleep(random.uniform(speed/2, speed*1.5))

    def human_click(self, element, attempts=3):
        """Simulate human-like clicking with retry and element interception handling"""
        for attempt in range(attempts):
            try:
                # Scroll element into view first
                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
                time.sleep(random.uniform(0.3, 0.7))
                
                # Dismiss any popups that might be blocking
                self.dismiss_popups()
                
                # Use JavaScript click as fallback
                try:
                    ActionChains(self.driver).move_to_element(element).pause(
                        random.uniform(0.2, 0.5)).click().perform()
                    return True
                except Exception:
                    self.driver.execute_script("arguments[0].click();", element)
                    return True
                
            except Exception as e:
                self.logger.warning(f"Click attempt {attempt + 1} failed: {e}")
                if attempt == attempts - 1:
                    return False
                time.sleep(1)
        return False

    def dismiss_popups(self):
        """Try to dismiss any popups that might be blocking interaction"""
        try:
            # Try to close any popups
            close_buttons = self.driver.find_elements(
                By.XPATH, '//div[@aria-label="Close" or @aria-label="Close dialog"]'
            )
            for button in close_buttons:
                try:
                    if button.is_displayed():
                        self.driver.execute_script("arguments[0].click();", button)
                        time.sleep(0.5)
                except:
                    continue
        except:
            pass

    def wait_for_page_ready(self, timeout=30):
        """Wait for page to be fully loaded"""
        try:
            self.wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
            time.sleep(random.uniform(0.5, 1.5))
        except Exception as e:
            self.logger.warning(f"Page ready check failed: {e}")

    def login(self):
        """Enhanced login with multiple login strategies"""
        login_url = "https://www.facebook.com/login"
        
        try:
            self.driver.get(login_url)
            self.wait_for_page_ready()
            time.sleep(random.uniform(2.0, 4.0))
            
            # Wait for email field to be present
            try:
                email_field = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='email'], input[id='email']"))
                )
                email_field.clear()
                self.human_type(email_field, self.username)
                time.sleep(random.uniform(0.5, 1.5))
            except Exception as e:
                self.logger.error("Could not find email field")
                raise

            # Find password field
            try:
                password_field = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='pass'], input[id='pass']"))
                )
                password_field.clear()
                self.human_type(password_field, self.password)
                time.sleep(random.uniform(0.5, 1.5))
            except Exception as e:
                self.logger.error("Could not find password field")
                raise

            # Click login button
            try:
                login_button = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "button[name='login'], button[type='submit'], #loginbutton"))
                )
                self.human_click(login_button)
            except Exception as e:
                # Try submitting with Enter key as fallback
                try:
                    password_field.send_keys(Keys.RETURN)
                except Exception as e:
                    self.logger.error("Could not find login button")
                    raise

            # Wait for successful login
            try:
                self.wait.until(
                    EC.presence_of_element_located((By.XPATH, "//div[@role='main'] | //div[@aria-label='Facebook']"))
                )
                self.logger.info("Login successful!")
                time.sleep(random.uniform(3.0, 5.0))
                return True
            except TimeoutException:
                # Check for security checks
                try:
                    security_check = self.driver.find_element(
                        By.XPATH, "//*[contains(text(), 'Enter Login Code') or contains(text(), 'Security Check')]"
                    )
                    self.logger.warning("Security check detected. Please complete manually.")
                    input("Press Enter after completing security check...")
                    return True
                except NoSuchElementException:
                    self.logger.error("Login verification failed")
                    raise
        
        except Exception as e:
            self.logger.error(f"Critical login error: {e}")
            self.driver.save_screenshot("login_error.png")
            raise

    def click_view_more_comments(self):
        """Click all 'View more comments' buttons with better handling"""
        view_more_selectors = [
            '//div[contains(text(), "View more comments")]',
            '//div[contains(text(), "View more") and contains(text(), "comment")]',
            '//span[contains(text(), "View more comments")]',
            '//div[@aria-label="View more comments"]'
        ]
        
        for selector in view_more_selectors:
            try:
                buttons = self.driver.find_elements(By.XPATH, selector)
                for button in buttons:
                    try:
                        if button.is_displayed():
                            # Scroll to button first
                            self.driver.execute_script(
                                "arguments[0].scrollIntoView({block: 'center'});", 
                                button
                            )
                            time.sleep(random.uniform(0.5, 1.5))
                            
                            # Dismiss any popups
                            self.dismiss_popups()
                            
                            # Try clicking with multiple methods
                            try:
                                button.click()
                            except:
                                self.driver.execute_script("arguments[0].click();", button)
                                
                            time.sleep(random.uniform(1.0, 2.0))
                    except Exception:
                        continue
            except Exception:
                continue

    def scrape_comments(self, video_url: str, max_comments: int = 8000):
        """Scrape comments from a video"""
        self.logger.info(f"Starting to scrape comments from: {video_url}")
        
        try:
            # Navigate to video
            self.driver.get(video_url)
            self.wait_for_page_ready()
            time.sleep(random.uniform(3.0, 5.0))
            
            # Scroll and load comments
            scroll_attempts = 0
            last_loaded_count = 0
            stale_count = 0
            
            while scroll_attempts < 200 and stale_count < 20:
                # Random scroll
                scroll_distance = random.randint(300, 800)
                self.driver.execute_script(f"window.scrollBy(0, {scroll_distance});")
                time.sleep(random.uniform(1.5, 3.0))
                
                # Click view more comments
                self.click_view_more_comments()
                
                # Estimate comments
                try:
                    comment_elements = self.driver.find_elements(
                        By.XPATH, '//div[@data-testid="UFI2Comment"] | //div[@role="article"]'
                    )
                    current_count = len(comment_elements)
                except Exception:
                    current_count = 0
                
                # Progress tracking
                if current_count == last_loaded_count:
                    stale_count += 1
                else:
                    stale_count = 0
                    last_loaded_count = current_count
                
                scroll_attempts += 1
                
                # Log progress
                if scroll_attempts % 5 == 0:
                    self.logger.info(f"Scrolled {scroll_attempts} times, loaded {current_count} comments")
                
                # Exit if max reached
                if current_count >= max_comments:
                    break
            
            # Find comment elements
            comment_elements = self.driver.find_elements(
                By.XPATH, '//div[@data-testid="UFI2Comment"] | //div[@role="article"]'
            )
            
            self.logger.info(f"Found {len(comment_elements)} comment elements")
            
            # Scrape comments
            comments_data = []
            for idx, comment in enumerate(comment_elements):
                if len(comments_data) >= max_comments:
                    break
                
                try:
                    comment_data = self._extract_comment_data(comment)
                    if comment_data:
                        comments_data.append(comment_data)
                    
                    # Progress logging
                    if (idx + 1) % 10 == 0:
                        self.logger.info(f"Processed {idx + 1} comments (found {len(comments_data)})")
                
                except Exception as e:
                    self.logger.warning(f"Error processing comment {idx + 1}: {e}")
            
            self.logger.info(f"Finished scraping comments. Total comments: {len(comments_data)}")
            return comments_data
        
        except Exception as e:
            self.logger.error(f"Error scraping video {video_url}: {e}")
            return []

    def _extract_comment_data(self, comment_element):
        """Extract data from a single comment element with better error handling"""
        try:
            # Expand full comment text with improved click handling
            try:
                see_more = WebDriverWait(comment_element, 5).until(
                    EC.presence_of_element_located((By.XPATH, './/div[contains(text(), "See more")]'))
                )
                if see_more.is_displayed():
                    self.human_click(see_more)
                    time.sleep(random.uniform(0.5, 1.0))
            except (NoSuchElementException, TimeoutException):
                pass
            
            # Get comment text with more robust selection
            comment_text = ""
            try:
                comment_text_element = WebDriverWait(comment_element, 5).until(
                    EC.presence_of_element_located((By.XPATH, './/div[@dir="auto"] | .//div[contains(@class, "xdj266r")]'))
                )
                comment_text = comment_text_element.text.strip()
            except (NoSuchElementException, TimeoutException):
                comment_text = comment_element.text.strip()
            
            if not comment_text:
                return None
            
            # Get comment metadata with better error handling
            comment_author = ""
            comment_time = ""
            try:
                author_element = WebDriverWait(comment_element, 3).until(
                    EC.presence_of_element_located((By.XPATH, './/a[contains(@href, "user") or contains(@href, "profile")]'))
                )
                comment_author = author_element.text.strip()
            except (NoSuchElementException, TimeoutException):
                pass
            
            try:
                time_element = WebDriverWait(comment_element, 3).until(
                    EC.presence_of_element_located((By.XPATH, './/a[contains(@href, "permalink")]'))
                )
                comment_time = time_element.text.strip()
            except (NoSuchElementException, TimeoutException):
                pass
            
            return {
                'author': comment_author,
                'time': comment_time,
                'text': comment_text
            }
        
        except Exception as e:
            self.logger.warning(f"Error extracting comment: {e}")
            return None

    def save_to_csv(self, data: List[Dict], output_file: str):
        """Save scraped comments to CSV"""
        try:
            os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
            
            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Author', 'Timestamp', 'Comment'])
                for comment in data:
                    writer.writerow([
                        comment.get('author', ''),
                        comment.get('time', ''),
                        comment.get('text', '')
                    ])
            
            self.logger.info(f"Comments saved to: {output_file}")
        
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {e}")
            raise

    def close(self):
        """Close the browser"""
        try:
            self.driver.quit()
        except Exception:
            pass

def main():
    # Configuration
    USERNAME = 'gmail'  # Replace with your credentials
    PASSWORD = 'password'  # Replace with your password
    VIDEO_URL = 'https://www.facebook.com/PiumiSrinayaka/videos/508592312022519'  # Replace with your video URL
    COMMENTS_OUTPUT_FILE = 'facebook_comments.csv'
    MAX_COMMENTS = 8000  # Target number of comments to scrape
    
    # Initialize scraper
    scraper = None
    try:
        print("Starting Facebook Comment Scraper...")
        scraper = FacebookCommentScraper(USERNAME, PASSWORD, headless=False)
        
        # Login
        scraper.login()
        
        # Scrape comments
        comments_data = scraper.scrape_comments(VIDEO_URL, max_comments=MAX_COMMENTS)
        
        # Save comments to CSV
        if comments_data:
            scraper.save_to_csv(comments_data, COMMENTS_OUTPUT_FILE)
            print(f"Successfully scraped {len(comments_data)} comments.")
        else:
            print("No comments were scraped.")
    
    except Exception as e:
        print(f"Fatal error: {e}")
        if scraper:
            scraper.driver.save_screenshot("error_screenshot.png")
    
    finally:
        if scraper:
            scraper.close()

if __name__ == '__main__':
    main()




Starting Facebook Comment Scraper...


2025-03-30 11:02:16 - INFO: Get LATEST chromedriver version for google-chrome
2025-03-30 11:02:16 - INFO: Get LATEST chromedriver version for google-chrome
2025-03-30 11:02:16 - INFO: Driver [/Users/pasindumalinda/.wdm/drivers/chromedriver/mac64/134.0.6998.165/chromedriver-mac-arm64/chromedriver] found in cache
2025-03-30 11:02:56 - INFO: Login successful!
2025-03-30 11:03:00 - INFO: Starting to scrape comments from: https://www.facebook.com/PiumiSrinayaka/videos/508592312022519
2025-03-30 11:07:08 - INFO: Scrolled 5 times, loaded 52 comments
2025-03-30 11:10:52 - INFO: Scrolled 10 times, loaded 102 comments
2025-03-30 11:14:37 - INFO: Scrolled 15 times, loaded 152 comments
2025-03-30 11:18:21 - INFO: Scrolled 20 times, loaded 202 comments
2025-03-30 11:22:06 - INFO: Scrolled 25 times, loaded 252 comments
2025-03-30 11:25:51 - INFO: Scrolled 30 times, loaded 302 comments
2025-03-30 11:29:36 - INFO: Scrolled 35 times, loaded 352 comments
2025-03-30 11:33:24 - INFO: Scrolled 40 times, lo

Successfully scraped 2002 comments.


# Combine scraped data togather and save it as new csv file

In [16]:
import pandas as pd
import os

def combine_csv_files(file1_path, file2_path, output_path, target_column='Comment'):
    """
    Combine two CSV files by stacking the second dataset after the first one,
    handling the case where the second file has no proper headers.
    
    Args:
        file1_path (str): Path to the first CSV file
        file2_path (str): Path to the second CSV file
        output_path (str): Path where the combined CSV will be saved
        target_column (str): The column name to use for the combined dataset
    
    Returns:
        pandas.DataFrame: The combined DataFrame if successful
    """
    try:
        # Check if input files exist
        if not os.path.exists(file1_path):
            print(f"Error: First input file '{file1_path}' does not exist")
            return None
            
        if not os.path.exists(file2_path):
            print(f"Error: Second input file '{file2_path}' does not exist")
            return None
        
        # Read the first CSV file normally
        print(f"Reading first file: {file1_path}")
        df1 = pd.read_csv(file1_path)
        
        # Read the second CSV file with no header, treating first row as data
        print(f"Reading second file: {file2_path}")
        df2 = pd.read_csv(file2_path, header=None)
        
        # Check if the files have data
        if df1.empty:
            print("Warning: First file is empty")
        if df2.empty:
            print("Warning: Second file is empty")
            
        # Print information for debugging
        print(f"Columns in first dataset: {df1.columns.tolist()}")
        print(f"Column count in second dataset (no headers): {len(df2.columns)}")
        
        # Verify the target column exists in the first dataset
        if target_column not in df1.columns:
            print(f"Warning: Target column '{target_column}' not found in first dataset.")
            print(f"Available columns are: {df1.columns.tolist()}")
            print("Using the first column as the target column.")
            target_column = df1.columns[0]
        
        # For the second dataset, if it has only one column, rename it to match the target column
        if len(df2.columns) == 1:
            print(f"Second dataset has one column. Renaming it to '{target_column}'")
            df2.columns = [target_column]
        else:
            # If there are multiple columns in df2, we need to determine which one contains the comments
            print(f"Second dataset has {len(df2.columns)} columns. Using the first column as '{target_column}'")
            # Create a new DataFrame with only the relevant column
            df2 = pd.DataFrame({target_column: df2.iloc[:, 0]})
        
        # Extract only the target column from df1
        df1_filtered = df1[[target_column]].copy()
        
        # Combine the DataFrames by appending df2 after df1
        print("Combining datasets...")
        combined_df = pd.concat([df1_filtered, df2], ignore_index=True)
        
        # Create directory for output if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
            
        # Save the combined DataFrame to a new CSV file
        print(f"Saving combined dataset to: {output_path}")
        combined_df.to_csv(output_path, index=False)
        
        print(f"Successfully combined {len(df1)} rows from first file and {len(df2)} rows from second file.")
        print(f"Total rows in combined file: {len(combined_df)}")
        print(f"Final columns: {combined_df.columns.tolist()}")
        
        return combined_df
        
    except pd.errors.EmptyDataError:
        print("Error: One of the files is empty or not formatted correctly")
        return None
    except pd.errors.ParserError:
        print("Error: Unable to parse one of the CSV files. Please check the file format")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None

# Example usage in Jupyter notebook
file1_path = '/Volumes/KODAK/folder 02/web scrapping/facebook_data.csv'
file2_path = '/Volumes/KODAK/folder 02/web scrapping/comments_data_03.csv'
output_path = '/Volumes/KODAK/folder 02/web scrapping/comment_data05.csv'

# Call the function with the specific column name from the first CSV
combined_data = combine_csv_files(file1_path, file2_path, output_path, target_column='Comment')

# If successful, you can work with the combined data directly
if combined_data is not None:
    # Display the first few rows of the combined dataset
    combined_data.head()
    
    # Get basic statistics
    combined_data.describe()
    
    # Check the shape of the combined data
    print(f"Combined data shape: {combined_data.shape}")

Reading first file: /Volumes/KODAK/folder 02/web scrapping/facebook_data.csv
Reading second file: /Volumes/KODAK/folder 02/web scrapping/comments_data_03.csv
Columns in first dataset: ['Comment']
Column count in second dataset (no headers): 1
Second dataset has one column. Renaming it to 'Comment'
Combining datasets...
Saving combined dataset to: /Volumes/KODAK/folder 02/web scrapping/comment_data05.csv
Successfully combined 2002 rows from first file and 448 rows from second file.
Total rows in combined file: 2450
Final columns: ['Comment']
Combined data shape: (2450, 1)


In [5]:
data_01 = pd.read_csv(f'/Volumes/KODAK/folder 02/web scrapping/facebook_comments.csv')
data_01.head()

Unnamed: 0,Author,Timestamp,Comment
0,,,Sankalpa Adikari\nමොනා උනත් පොත්ත සුදු කෑල්ලක්...
1,,,Aqeelah Packeerally\nCard බැරිනං PickMe එකෙන් ...
2,,,"Deshan Liyanage\nඉතුරු ටික,"
3,,,"Dilshan Madhuranga\nනිළියක් කියලා, ඌ ගනන් ගන්න..."
4,,,Sameera Salinda Perera\nට්\n‍\nරිප් එක මැදදී ක...


In [13]:
data_01

Unnamed: 0,Comment
0,Sankalpa Adikari\nමොනා උනත් පොත්ත සුදු කෑල්ලක්...
1,Aqeelah Packeerally\nCard බැරිනං PickMe එකෙන් ...
2,"Deshan Liyanage\nඉතුරු ටික,"
3,"Dilshan Madhuranga\nනිළියක් කියලා, ඌ ගනන් ගන්න..."
4,Sameera Salinda Perera\nට්\n‍\nරිප් එක මැදදී ක...
...,...
1997,Naveen Tharusha\nබහින්න බැ නම් ගෙදර එක්ක පලයන්
1998,Shiyam Baii\nComish kapanawakiyanne upparimma bag
1999,Madushanka Veerasinghe\nMan wagenam sallith ep...
2000,Tharindu Nishan Samarasekara\nහැබැයි ඉතිං කෑෂ්...


In [14]:
data_01.to_csv('facebook_data.csv', index= False)

# Remiving duplicates data

In [18]:
import pandas as pd


df = pd.read_csv(f'/Volumes/KODAK/folder 02/web scrapping/comment_data05.csv')

# Check for duplicates in the single column
duplicates = df['Comment'].duplicated(keep=False)  # keep=False marks all duplicates as True

# Get all duplicated values (appearing more than once)
duplicated_values = df['Comment'][duplicates].value_counts()

print("Duplicate analysis:")
print("===================")
print(f"Total rows: {len(df)}")
print(f"Unique values: {df['Comment'].nunique()}")
print(f"Duplicate rows: {duplicates.sum()}")
print("\nDuplicated values and their counts:")
print(duplicated_values)

# Optional: Create a DataFrame showing all duplicate rows
duplicate_rows = df[df['Comment'].duplicated(keep=False)].sort_values('Comment')
print("\nAll duplicate rows:")
print(duplicate_rows)

Duplicate analysis:
Total rows: 2450
Unique values: 2288
Duplicate rows: 292

Duplicated values and their counts:
Comment
Honestly, I’m done with PickMe. Booked a PickMe Flash, and when the driver arrived, I handed him the parcel and told him payment was on card. He threw it right back at me and took off without a word! This isn’t the first time PickMe has treated customers like trash. If they can’t even show basic respect, why should anyone use their service? It’s time people stop supporting this app and find a delivery service that actually values its customers.#stopusingpickme                                                                                                                                              8
Jeff Julian Lappen is with PickMe and Treshan Weerasooriya Pereira.8 November 2024  · Shared with Public                                                                                                                                                                      

In [20]:
import pandas as pd

# Load your dataset (replace with your actual file path)
df = pd.read_csv('/Volumes/KODAK/folder 02/web scrapping/comment_data05.csv')  

# Assuming your column is named 'Category' - CHANGE THIS TO YOUR COLUMN NAME
column_name = 'Comment'  

# Remove duplicates (keeping first occurrence)
deduplicated_df = df.drop_duplicates(subset=[column_name], keep='first')

# Save the cleaned data to a new CSV file
output_path = '/Volumes/KODAK/folder 02/web scrapping/comment_data_06.csv'
deduplicated_df.to_csv(output_path, index=False, encoding='utf-8-sig')

# Print summary
print(f"Original rows: {len(df)}")
print(f"Unique rows after deduplication: {len(deduplicated_df)}")
print(f"Duplicates removed: {len(df) - len(deduplicated_df)}")
print(f"Cleaned data saved to: {output_path}")

Original rows: 2450
Unique rows after deduplication: 2288
Duplicates removed: 162
Cleaned data saved to: /Volumes/KODAK/folder 02/web scrapping/comment_data_06.csv


# Youtube data mining 

In [None]:
import time
import csv
import random
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import undetected_chromedriver as uc

def scrape_youtube_comments(url):
    """
    Scrape comments from a YouTube video using Selenium with undetected_chromedriver.
    
    Args:
        url (str): URL of the YouTube video
        
    Returns:
        list: List of dictionaries containing comment data
    """
    all_comments = []
    
    try:
        # Initialize undetected_chromedriver
        print("Initializing undetected Chrome browser...")
        options = uc.ChromeOptions()
        options.add_argument("--window-size=1920,1080")
        
        # Random user agent
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
        ]
        options.add_argument(f"user-agent={random.choice(user_agents)}")
        
        # Initialize driver
        driver = uc.Chrome(options=options)
        
        # Navigate to the YouTube video
        print(f"Navigating to {url}")
        driver.get(url)
        
        # Wait for video page to load
        time.sleep(random.uniform(5, 8))
        
        # Handle cookie consent if appears
        try:
            consent_buttons = driver.find_elements(By.XPATH, "//button[contains(text(), 'Accept all') or contains(text(), 'I agree')]")
            if consent_buttons:
                consent_buttons[0].click()
                print("Clicked cookie consent button")
                time.sleep(2)
        except Exception as e:
            print(f"No cookie consent needed or error: {str(e)}")
        
        # Scroll down to load comments section
        print("Scrolling to comments section...")
        driver.execute_script("window.scrollTo(0, window.scrollY + 700);")
        time.sleep(3)
        
        # Wait for comments to load
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "ytd-comments"))
            )
            print("Comments section loaded")
        except TimeoutException:
            print("Timed out waiting for comments section to load")
            driver.save_screenshot("youtube_no_comments.png")
            # Try scrolling more
            driver.execute_script("window.scrollTo(0, window.scrollY + 500);")
            time.sleep(5)
        
        # Function to scroll and load more comments
        def scroll_and_load_comments(max_scrolls=20, target_comment_count=500):
            print("Loading comments by scrolling...")
            
            # Initial count
            comments = driver.find_elements(By.CSS_SELECTOR, "ytd-comment-thread-renderer")
            initial_count = len(comments)
            print(f"Initially found {initial_count} comments")
            
            scrolls = 0
            last_count = initial_count
            
            # Scroll until we have enough comments or reach max scrolls
            while scrolls < max_scrolls and len(comments) < target_comment_count:
                # Scroll to the bottom of the comments section
                driver.execute_script(
                    "document.querySelector('ytd-comments').scrollIntoView(false);"
                )
                
                # Add random wait time between scrolls
                time.sleep(random.uniform(2, 4))
                
                # Get updated comment count
                comments = driver.find_elements(By.CSS_SELECTOR, "ytd-comment-thread-renderer")
                
                # Print progress
                new_count = len(comments)
                if new_count > last_count:
                    print(f"Now loaded {new_count} comments")
                    last_count = new_count
                
                # If no new comments are loaded after 3 scrolls, stop
                if scrolls >= 3 and new_count == initial_count:
                    print("No new comments loaded after multiple scrolls. Stopping.")
                    break
                
                scrolls += 1
            
            print(f"Finished loading comments after {scrolls} scrolls. Found {len(comments)} comments.")
            return comments
        
        # Load comments by scrolling
        comment_elements = scroll_and_load_comments()
        
        # Function to expand "Read more" buttons
        def expand_comment_texts():
            print("Expanding 'Read more' buttons...")
            try:
                # Find all "Read more" buttons using the class name provided
                expanders = driver.find_elements(By.CSS_SELECTOR, ".style-scope.ytd-expander[role='button'][aria-expanded='false']")
                expanded_count = 0
                
                for expander in expanders:
                    try:
                        if "more" in expander.text.lower():
                            # Scroll to the button
                            driver.execute_script("arguments[0].scrollIntoView(true);", expander)
                            time.sleep(0.5)
                            
                            # Click the expander
                            expander.click()
                            expanded_count += 1
                            
                            # Add small delay to prevent rate limiting
                            if expanded_count % 5 == 0:
                                time.sleep(random.uniform(0.5, 1))
                    except Exception as e:
                        # Continue with other expanders if one fails
                        continue
                
                print(f"Expanded {expanded_count} 'Read more' buttons")
            except Exception as e:
                print(f"Error expanding comments: {str(e)}")
        
        # Expand comments with "Read more" buttons
        expand_comment_texts()
        
        # Re-fetch comment elements to get the expanded ones
        comment_elements = driver.find_elements(By.CSS_SELECTOR, "ytd-comment-thread-renderer")
        print(f"Processing {len(comment_elements)} comments...")
        
        # Extract comment data
        for i, comment_element in enumerate(comment_elements):
            try:
                # Get author information
                try:
                    author_element = comment_element.find_element(By.CSS_SELECTOR, "#author-text")
                    author = author_element.text.strip()
                except NoSuchElementException:
                    author = f"User #{i+1}"
                
                # Get comment text (use the specific class name provided)
                try:
                    # Try first with the specified class
                    content_element = comment_element.find_element(By.CSS_SELECTOR, ".style-scope.ytd-expander")
                    content = content_element.text.strip()
                except NoSuchElementException:
                    # Fallback to other selectors
                    try:
                        content_element = comment_element.find_element(By.CSS_SELECTOR, "#content-text")
                        content = content_element.text.strip()
                    except:
                        content = "Content not found"
                
                # Get timestamp
                try:
                    time_element = comment_element.find_element(By.CSS_SELECTOR, ".published-time-text")
                    timestamp = time_element.text.strip()
                except NoSuchElementException:
                    timestamp = "Unknown time"
                
                # Get like count
                try:
                    like_element = comment_element.find_element(By.CSS_SELECTOR, "#vote-count-middle")
                    like_count = like_element.text.strip()
                    # Convert "1.2K" format to numbers
                    if like_count.lower().endswith('k'):
                        like_count = float(like_count[:-1]) * 1000
                except NoSuchElementException:
                    like_count = "0"
                
                # Check for replies
                reply_count = 0
                try:
                    reply_button = comment_element.find_elements(By.CSS_SELECTOR, "#replies ytd-button-renderer")
                    if reply_button:
                        reply_text = reply_button[0].text.strip()
                        if "reply" in reply_text.lower() or "replies" in reply_text.lower():
                            # Extract number from text like "View 5 replies"
                            import re
                            numbers = re.findall(r'\d+', reply_text)
                            if numbers:
                                reply_count = int(numbers[0])
                            else:
                                reply_count = 1
                except:
                    pass
                
                # Create comment object
                comment_obj = {
                    'author': author,
                    'content': content,
                    'timestamp': timestamp,
                    'likes': like_count,
                    'reply_count': reply_count,
                    'scraped_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                
                all_comments.append(comment_obj)
                
                # Print progress every 10 comments
                if (i + 1) % 10 == 0:
                    print(f"Processed {i + 1} comments...")
                
            except Exception as e:
                print(f"Error processing comment {i+1}: {str(e)}")
        
        print(f"Successfully extracted {len(all_comments)} comments")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        # Take screenshot to help debug
        try:
            driver.save_screenshot("youtube_error.png")
            print("Saved screenshot of error state")
        except:
            pass
    
    finally:
        try:
            driver.quit()
        except:
            pass
    
    return all_comments

def save_to_csv(comments, filename='youtube_comments.csv'):
    """
    Save the comments to a CSV file.
    
    Args:
        comments (list): List of comment dictionaries
        filename (str): Name of the output file
    """
    try:
        if not comments:
            print("No comments to save.")
            return
            
        # Create DataFrame from comments list
        df = pd.DataFrame(comments)
        
        # Save to CSV
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Successfully saved {len(comments)} comments to {filename}")
        
        # Also save a sample to console
        print("\nSample of first few comments:")
        print(df.head().to_string())
            
    except Exception as e:
        print(f"Error saving to CSV with pandas: {str(e)}")
        
        # Fallback to basic CSV writer if pandas fails
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                if comments:
                    fieldnames = comments[0].keys()
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    writer.writeheader()
                    for comment in comments:
                        writer.writerow(comment)
                    print(f"Successfully saved {len(comments)} comments to {filename} using csv module")
        except Exception as e2:
            print(f"Fallback CSV writer also failed: {str(e2)}")

def main():
    # URL of the YouTube video
    url = "https://www.youtube.com/watch?v=5BnNUUqkI6I&t=224s"
    
    print(f"Starting to scrape comments from: {url}")
    
    # Scrape the comments
    comments = scrape_youtube_comments(url)
    
    # Save the comments to a CSV file
    if comments:
        save_to_csv(comments)
    else:
        print("No comments were scraped.")

if __name__ == "__main__":
    main()

# Raddit data mining 

In [None]:
import time
import csv
import random
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
import undetected_chromedriver as uc

def scrape_reddit_comments_with_replies(url):
    """
    Scrape comments and replies from a Reddit post using undetected_chromedriver.
    Expands all 'show more' buttons to capture all content.
    
    Args:
        url (str): URL of the Reddit post
        
    Returns:
        list: List of dictionaries containing comment and reply data
    """
    all_comments = []
    
    try:
        # Initialize undetected_chromedriver instead of regular webdriver
        # This helps bypass bot detection
        print("Initializing undetected Chrome browser...")
        options = uc.ChromeOptions()
        options.add_argument("--window-size=1920,1080")
        
        # Disable images to speed up loading
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option("prefs", prefs)
        
        # Use random user agent to avoid detection
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
        ]
        options.add_argument(f"user-agent={random.choice(user_agents)}")
        
        # Initialize the driver
        driver = uc.Chrome(options=options)
        
        print(f"Navigating to {url}")
        driver.get(url)
        
        # Add random sleep to mimic human behavior
        time.sleep(random.uniform(3, 6))
        
        # Check if we need to handle any consent dialogs or overlays
        try:
            print("Checking for consent dialogs or overlays...")
            consent_buttons = driver.find_elements(By.XPATH, "//button[contains(text(), 'Accept') or contains(text(), 'I Agree') or contains(text(), 'Continue')]")
            for button in consent_buttons:
                try:
                    button.click()
                    print("Clicked a consent button")
                    time.sleep(2)
                except:
                    pass
        except:
            pass
        
        # Try to detect NSFW or age verification prompts
        try:
            nsfw_buttons = driver.find_elements(By.XPATH, "//button[contains(text(), 'Yes') or contains(text(), 'Continue') or contains(text(), 'Accept')]")
            for button in nsfw_buttons:
                try:
                    button.click()
                    print("Clicked an NSFW/age verification button")
                    time.sleep(2)
                except:
                    pass
        except:
            pass
        
        # Wait for comments to load
        print("Waiting for page content to load...")
        time.sleep(10)  # Give more time for the page to load completely
        
        # Check if we're getting a "Something went wrong" Reddit error page
        if "something went wrong" in driver.page_source.lower():
            print("Reddit is showing an error page. Taking a screenshot for debugging...")
            driver.save_screenshot("reddit_error.png")
            
            # Try refreshing the page
            driver.refresh()
            time.sleep(5)
        
        # Save page source for debugging
        with open("reddit_page_source.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print("Saved page source for debugging")
        
        # Try different selectors for comments
        print("Trying various selectors to find comments...")
        
        # Method 1: Direct Reddit API approach (more reliable)
        # Modify URL to get JSON data
        json_url = url.split('?')[0] + '.json'
        print(f"Attempting to fetch JSON data from: {json_url}")
        
        driver.get(json_url)
        time.sleep(3)
        
        # Parse the JSON data
        try:
            # Get the pre-formatted JSON content
            json_text = driver.find_element(By.TAG_NAME, "pre").text
            import json
            json_data = json.loads(json_text)
            
            # Process the JSON data to extract comments
            print("Successfully retrieved JSON data, extracting comments...")
            
            # Extract post details
            post_data = json_data[0]['data']['children'][0]['data']
            post_title = post_data.get('title', 'Unknown Title')
            post_author = post_data.get('author', 'Unknown Author')
            
            print(f"Post title: {post_title}")
            print(f"Post author: {post_author}")
            
            # Extract comments
            comments_data = json_data[1]['data']['children']
            
            # Function to recursively extract comments
            def extract_comments(comment_data, level=0, parent_author=None):
                comments_list = []
                
                if comment_data.get('kind') == 't1':  # Regular comment
                    data = comment_data.get('data', {})
                    
                    # Basic comment data
                    author = data.get('author', 'deleted')
                    content = data.get('body', '')
                    created_utc = data.get('created_utc', 0)
                    timestamp = datetime.fromtimestamp(created_utc).strftime('%Y-%m-%d %H:%M:%S')
                    score = data.get('score', 0)
                    
                    # Create comment object
                    comment_obj = {
                        'id': data.get('id', ''),
                        'author': author,
                        'timestamp': timestamp,
                        'content': content,
                        'score': score,
                        'is_reply': level > 0,
                        'reply_level': level,
                        'parent_author': parent_author,
                        'scraped_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }
                    
                    comments_list.append(comment_obj)
                    
                    # Process replies recursively
                    replies = data.get('replies', {})
                    if replies and 'data' in replies and 'children' in replies['data']:
                        for reply in replies['data']['children']:
                            comments_list.extend(extract_comments(reply, level+1, author))
                
                elif comment_data.get('kind') == 'more':
                    # These are collapsed comments we can't access without JS interaction
                    pass
                    
                return comments_list
            
            # Process all top-level comments
            for comment_data in comments_data:
                extracted = extract_comments(comment_data)
                all_comments.extend(extracted)
            
            print(f"Extracted {len(all_comments)} comments from JSON data")
            
            # If we got comments, return them
            if all_comments:
                return all_comments
                
        except Exception as e:
            print(f"Error processing JSON data: {str(e)}")
        
        # If JSON approach failed, go back to the original URL and try HTML scraping
        print("Falling back to HTML scraping approach")
        driver.get(url)
        time.sleep(5)
        
        # Try different comment selectors
        comment_selectors = [
            "div[data-testid='comment']",
            "div.Comment",
            "div[data-test-id='comment']",
            "div.t1_*",  # Reddit comment IDs start with t1_
            "div.md"
        ]
        
        # Try each selector until we find comments
        comment_containers = []
        for selector in comment_selectors:
            print(f"Trying selector: {selector}")
            comment_containers = driver.find_elements(By.CSS_SELECTOR, selector)
            if len(comment_containers) > 0:
                print(f"Found {len(comment_containers)} comments with selector: {selector}")
                break
        
        # If we still couldn't find comments, try XPath
        if len(comment_containers) == 0:
            print("Trying XPath to find comments...")
            try:
                comment_containers = driver.find_elements(By.XPATH, "//div[contains(@class, 'Comment') or contains(@class, 'comment')]")
                print(f"Found {len(comment_containers)} comments with XPath")
            except:
                print("XPath approach also failed")
        
        # If we still have no comments, take a screenshot and return empty list
        if len(comment_containers) == 0:
            print("Could not find any comments. Taking a screenshot for debugging...")
            driver.save_screenshot("reddit_no_comments.png")
            return []
        
        # Process each comment and its replies
        for i, container in enumerate(comment_containers):
            try:
                # Extract comment text with fallbacks
                content = "Content not found"
                author = f"User #{i+1}"
                timestamp = "Unknown Time"
                
                # Multiple attempts to get comment content
                content_selectors = [
                    ".md",
                    "[data-testid='comment-content']",
                    "p",
                    "div.RichTextJSON-root"
                ]
                
                for selector in content_selectors:
                    try:
                        content_element = container.find_element(By.CSS_SELECTOR, selector)
                        content = content_element.text
                        if content:
                            break
                    except:
                        continue
                
                # Multiple attempts to get author
                author_selectors = [
                    "[data-testid='comment_author']",
                    "a.author",
                    ".author"
                ]
                
                for selector in author_selectors:
                    try:
                        author_element = container.find_element(By.CSS_SELECTOR, selector)
                        author = author_element.text
                        if author:
                            break
                    except:
                        continue
                
                # Try to determine nesting level
                is_reply = False
                reply_level = 0
                parent_author = None
                
                # Create comment object
                comment_obj = {
                    'id': i + 1,
                    'author': author,
                    'timestamp': timestamp,
                    'content': content,
                    'is_reply': is_reply,
                    'reply_level': reply_level,
                    'parent_author': parent_author,
                    'scraped_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                
                all_comments.append(comment_obj)
                
            except Exception as e:
                print(f"Error processing comment {i+1}: {str(e)}")
        
        print(f"Successfully extracted {len(all_comments)} comments and replies")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    
    finally:
        try:
            driver.quit()
        except:
            pass
    
    return all_comments

def save_to_csv(comments, filename='reddit_comments.csv'):
    """
    Save the comments to a CSV file.
    
    Args:
        comments (list): List of comment dictionaries
        filename (str): Name of the output file
    """
    try:
        if not comments:
            print("No comments to save.")
            return
            
        # Create DataFrame from comments list
        df = pd.DataFrame(comments)
        
        # Save to CSV
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Successfully saved {len(comments)} comments to {filename}")
        
        # Also save a sample to console
        print("\nSample of first few comments:")
        print(df.head().to_string())
            
    except Exception as e:
        print(f"Error saving to CSV: {str(e)}")
        
        # Fallback to basic CSV writer if pandas fails
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                if comments:
                    fieldnames = comments[0].keys()
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    writer.writeheader()
                    for comment in comments:
                        writer.writerow(comment)
                    print(f"Successfully saved {len(comments)} comments to {filename} using csv module")
        except Exception as e2:
            print(f"Fallback CSV writer also failed: {str(e2)}")

def main():
    # URL of the Reddit post
    url = "https://www.reddit.com/r/srilanka/comments/150xxzl/why_uber_pickme_riders_hate_credit_cards/?rdt=51720"
    
    print(f"Starting to scrape comments from: {url}")
    
    # Scrape the comments
    comments = scrape_reddit_comments_with_replies(url)
    
    # Save the comments to a CSV file
    if comments:
        save_to_csv(comments)
    else:
        print("No comments were scraped.")

if __name__ == "__main__":
    main()

# Trip advisor data mining 

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random

def get_reviews_from_page(soup):
    """Scrape review data from a single page."""
    reviews = []
    
    # Find all review containers with the specified class
    review_containers = soup.select('.ui_column.is-9')
    
    # If no reviews found with that class, try alternative selectors
    if not review_containers:
        review_containers = soup.select('.review-container')
        
    for container in review_containers:
        try:
            # Extract review data
            review_data = {}
            
            # Review title
            title_elem = container.select_one('.title') or container.select_one('.noQuotes')
            review_data['title'] = title_elem.text.strip() if title_elem else "No Title"
            
            # Review text
            text_elem = container.select_one('.prw_reviews_text_summary_hsx') or container.select_one('.reviewSelector')
            if text_elem:
                review_text = text_elem.select_one('.partial_entry')
                review_data['text'] = review_text.text.strip() if review_text else "No Text"
            else:
                review_data['text'] = "No Text"
            
            # Rating
            rating_elem = container.select_one('.ui_bubble_rating')
            if rating_elem and 'class' in rating_elem.attrs:
                bubble_class = ' '.join(rating_elem['class'])
                rating_match = re.search(r'bubble_(\d+)', bubble_class)
                review_data['rating'] = int(rating_match.group(1))/10 if rating_match else "N/A"
            else:
                review_data['rating'] = "N/A"
            
            # Date of visit
            date_elem = (container.select_one('.prw_reviews_stay_date_hsx') or 
                         container.select_one('.ratingDate'))
            review_data['date_of_visit'] = date_elem.text.strip() if date_elem else "Not specified"
            
            # Reviewer name
            reviewer_elem = (container.select_one('.info_text') or 
                            container.select_one('.memberOverlayLink'))
            review_data['reviewer'] = reviewer_elem.text.strip() if reviewer_elem else "Anonymous"
            
            # Reviewer location
            location_elem = container.select_one('.userLoc')
            review_data['location'] = location_elem.text.strip() if location_elem else "Not specified"
            
            reviews.append(review_data)
        except Exception as e:
            print(f"Error extracting review: {e}")
    
    return reviews

def get_next_page_url(soup, base_url):
    """Get the URL for the next page using the 'Next' button."""
    try:
        next_button = soup.select_one('.nav.next.ui_button.primary')
        if next_button and 'href' in next_button.attrs:
            next_page_path = next_button['href']
            return f"https://www.tripadvisor.com{next_page_path}"
        else:
            print("No 'Next' button found. This might be the last page.")
            return None
    except Exception as e:
        print(f"Error finding 'Next' button: {e}")
        return None

def scrape_reviews_with_next_button(base_url, max_pages):
    """Scrape reviews by simulating clicks on the 'Next' button."""
    all_reviews = []
    current_url = base_url
    page_count = 0
    
    while page_count < max_pages:
        try:
            print(f"Scraping page {page_count + 1}...")
            print(f"URL: {current_url}")
            
            # Send a request to the current page
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Connection': 'keep-alive',
                'Referer': 'https://www.tripadvisor.com/'
            }
            response = requests.get(current_url, headers=headers)
            response.raise_for_status()
            
            # Parse the page content
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Scrape reviews from the current page
            reviews = get_reviews_from_page(soup)
            all_reviews.extend(reviews)
            
            # Check if we've reached the last page
            next_page_url = get_next_page_url(soup, base_url)
            if not next_page_url:
                print("Reached the last page.")
                break
            
            # Update the URL for the next iteration
            current_url = next_page_url
            page_count += 1
            
            # Add a random delay to avoid being blocked
            time.sleep(random.uniform(2.0, 4.0))
        except Exception as e:
            print(f"Error scraping page {page_count + 1}: {e}")
            break
    
    return all_reviews

def save_to_csv(reviews, filename="tripadvisor_reviews.csv"):
    """Save the reviews to a CSV file."""
    if not reviews:
        print("No reviews to save.")
        return
    
    # Get fields from the first review
    fieldnames = reviews[0].keys()
    
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(reviews)
    
    print(f"Successfully saved {len(reviews)} reviews to {filename}")

def main():
    base_url = "https://www.tripadvisor.com/ShowUserReviews-g293962-d10792982-r730596695-PickMe-Colombo_Western_Province.html#REVIEWS"
    
    # Ask user for the number of pages to scrape
    max_pages = int(input("Enter the number of pages to scrape (e.g., 30): "))
    
    print(f"Starting to scrape TripAdvisor reviews for {max_pages} pages...")
    all_reviews = scrape_reviews_with_next_button(base_url, max_pages)
    
    print(f"Total reviews collected: {len(all_reviews)}")
    
    # Save to CSV
    save_to_csv(all_reviews)

if __name__ == "__main__":
    main()