In [1]:
!pip install selenium beautifulsoup4 pandas openpyxl


Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting sniffio>=1.3.0 (from trio~=0.17->selenium)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting cffi>=1.14 (from trio~=0.17->selenium)
  Downloading cffi-1.17.1-cp39-cp39-win_amd64

In [13]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from tkinter import Tk, Label, Entry, Button, filedialog, messagebox, Text, Scrollbar
import time
import logging

# Configure the root logger at INFO level so the progress messages emitted by
# the scraping functions below are shown while a scrape runs
logging.basicConfig(level=logging.INFO)

# Configure WebDriver with anti-detection features
def setup_webdriver():
    """Create a headless Chrome WebDriver configured for scraping.

    The ChromeDriver binary path is obtained from
    ``ChromeDriverManager().install()``.

    Returns:
        A ``webdriver.Chrome`` instance. The caller owns it and must call
        ``quit()`` when finished.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run browser in headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-extensions")
    options.add_argument("--incognito")  # Use incognito mode for scraping
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
    )
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    # Log only once the browser has really been created; logging before the
    # constructor call reported success even when driver start-up failed.
    logging.info("WebDriver setup complete.")
    return driver

# Scroll down the page incrementally to trigger all dynamic loads
def scroll_to_bottom(driver, pause=3, max_scrolls=None):
    """Scroll to the bottom of the page repeatedly until its height stops growing.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium WebDriver).
        pause: Seconds to wait after each scroll so newly requested content
            can load. Defaults to the previously hard-coded 3 seconds.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps the original behaviour of scrolling until the
            page height is stable; pass a number to avoid looping forever on
            pages that keep growing.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        logging.info("Scrolling down...")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # Wait for content to load
        scrolls_done += 1
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was loaded.
            break
        last_height = new_height

# Scrape player-stats rows from a JavaScript-rendered page with Selenium
def scrape_dynamic(url):
    """Scrape player-stats rows from a dynamically rendered page with Selenium.

    Loads ``url`` in a fresh WebDriver, scrolls to the bottom, waits up to 15
    seconds for ``<ul>`` elements inside a ``div`` whose class contains
    ``player-stats-table``, and collects the text of each ``<li>`` per ``<ul>``.

    Args:
        url: Address of the page to scrape.

    Returns:
        A ``pandas.DataFrame`` with one row per non-empty ``<ul>``.

    Raises:
        ValueError: If the expected elements never appear, or if no row
            contained any ``<li>`` text.
    """
    driver = setup_webdriver()
    # try/finally guarantees the browser process is shut down on every path;
    # previously a failure in get(), scrolling or extraction leaked the driver.
    try:
        driver.get(url)
        logging.info(f"Accessed URL: {url}")

        # Scroll to the bottom to ensure all content loads
        scroll_to_bottom(driver)

        try:
            # Wait for player stats table to load
            logging.info("Waiting for player stats elements...")
            players = WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'player-stats-table')]//ul"))
            )
        except Exception as e:
            # Chain the original error so the underlying cause stays visible.
            raise ValueError(f"Could not locate player stats: {str(e)}") from e

        # Element text must be read while the browser session is still alive.
        logging.info("Extracting player stats...")
        data = []
        for player in players:
            row = [elem.text for elem in player.find_elements(By.TAG_NAME, "li")]
            if row:
                data.append(row)
    finally:
        driver.quit()

    if not data:
        raise ValueError("No relevant data found on the page.")

    logging.info(f"Successfully scraped {len(data)} rows.")
    return pd.DataFrame(data)

# Handling the scraping process with a user interface
def scrape_data():
    """Scrape the URL typed into ``url_entry`` and show the result in the GUI.

    On success the scraped DataFrame is rendered into ``text_widget`` and
    ``save_button`` is wired to a save dialog and made visible. Any failure is
    reported to the user through an error message box.
    """
    url = url_entry.get().strip()
    if not url:
        # Avoid launching a browser just to fail on an empty address.
        messagebox.showerror("Error", "An error occurred: Please enter a URL.")
        return

    try:
        # Use Selenium to scrape dynamic content
        data = scrape_dynamic(url)

        # Display the data in the GUI ('1.0' is the Text index of the first character)
        text_widget.delete('1.0', 'end')
        text_widget.insert('end', data.to_string())

        # Save the data as CSV or Excel
        def save_file():
            """Ask for a destination and write the scraped data there."""
            file_path = filedialog.asksaveasfilename(
                defaultextension=".csv",
                filetypes=[("CSV files", "*.csv"), ("Excel files", "*.xlsx")]
            )
            if not file_path:
                # Dialog was cancelled: nothing to write. Previously the empty
                # path fell through to to_excel() and raised inside the callback.
                return
            try:
                if file_path.lower().endswith('.csv'):
                    data.to_csv(file_path, index=False)
                else:
                    data.to_excel(file_path, index=False)
            except Exception as e:
                # Tk callbacks run outside the outer try block, so report here.
                messagebox.showerror("Error", f"An error occurred: {str(e)}")
                return
            messagebox.showinfo("Success", "Data saved successfully!")

        save_button.config(command=save_file)
        save_button.pack(pady=5)

    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {str(e)}")

# GUI setup: build the main window and the module-level widgets
# (url_entry, text_widget, save_button) that scrape_data() reads and updates.
app = Tk()
app.title("Advanced Web Scraper")

# URL input: a caption followed by the entry field scrape_data() reads from
Label(app, text="Enter URL:").pack()
url_entry = Entry(app, width=50)
url_entry.pack(pady=5)

# "Go" button: runs scrape_data() when clicked
Button(app, text="Go", command=scrape_data).pack(pady=5)

# Scraped data is shown in a Text widget linked to a vertical scrollbar.
# wrap='none' keeps each DataFrame row on a single line.
scrollbar = Scrollbar(app)
scrollbar.pack(side='right', fill='y')
text_widget = Text(app, wrap='none', yscrollcommand=scrollbar.set)
text_widget.pack(expand=True, fill='both')
scrollbar.config(command=text_widget.yview)

# Save button: created here but not packed, so it stays invisible until
# scrape_data() assigns its command and packs it after a successful scrape
save_button = Button(app, text="Save as CSV/Excel")

# Size the window and enter the Tk event loop; mainloop() does not return
# until the window is closed, so this cell stays busy while the app is open
app.geometry("800x600")
app.mainloop()


INFO:root:WebDriver setup complete.
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [C:\Users\devan\.wdm\drivers\chromedriver\win64\130.0.6723.69\chromedriver-win32/chromedriver.exe] found in cache
INFO:root:Accessed URL: https://fbref.com/en/comps/9/stats/Premier-League-Stats
INFO:root:Scrolling down...
INFO:root:Scrolling down...
INFO:root:Waiting for player stats elements...


In [1]:
!pip install webdriver-manager


Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.1 webdriver-manager-4.0.2


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def setup_webdriver():
    """Return a Chrome WebDriver whose ChromeDriver binary is resolved by webdriver-manager."""
    driver_path = ChromeDriverManager().install()
    chrome_service = Service(driver_path)
    return webdriver.Chrome(service=chrome_service)


In [None]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from tkinter import Tk, Label, Entry, Button, filedialog, messagebox, Text, Scrollbar
import time

def setup_webdriver():
    """Build a headless Chrome WebDriver using a ChromeDriver resolved by webdriver-manager."""
    chrome_options = webdriver.ChromeOptions()
    # Flags are applied in the same order as before: headless (no GUI),
    # no sandbox, and no reliance on /dev/shm.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def scrape_static(url, timeout=30):
    """Fetch a page with ``requests`` and parse it with BeautifulSoup.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely, which would
            freeze the GUI that calls this function.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx
            (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

def scrape_dynamic(url):
    """Load a page in Selenium and return its rendered HTML as BeautifulSoup.

    Args:
        url: Address of the page to load.

    Returns:
        A ``BeautifulSoup`` object parsed from the driver's ``page_source``
        captured 3 seconds after navigation.
    """
    driver = setup_webdriver()
    # try/finally ensures the browser is closed even when navigation fails;
    # previously an exception in get() left the Chrome process running.
    try:
        driver.get(url)
        time.sleep(3)  # Wait for the page to fully load
        page_content = driver.page_source
    finally:
        driver.quit()
    return BeautifulSoup(page_content, 'html.parser')

def extract_tables(soup):
    """Convert the first ``<table>`` found in ``soup`` into a DataFrame.

    Args:
        soup: Parsed document exposing ``find_all('table')``.

    Returns:
        A ``pandas.DataFrame`` built from the first table on the page.

    Raises:
        ValueError: If the document contains no ``<table>`` element.
    """
    # Local import keeps this function self-contained within its cell.
    from io import StringIO

    tables = soup.find_all('table')
    if not tables:
        raise ValueError("No tables found on the page.")
    # pandas deprecated passing literal HTML strings to read_html; wrapping the
    # markup in a file-like object is the supported form.
    first_table_html = StringIO(str(tables[0]))
    return pd.read_html(first_table_html)[0]  # Convert the first table to DataFrame

def scrape_data():
    """Scrape the URL typed into ``url_entry`` and show its first table in the GUI.

    A URL that starts with ``dynamic:`` is loaded through Selenium (with the
    prefix removed); any other URL is fetched statically. On success the
    table is rendered into ``text_widget`` and ``save_button`` is wired to a
    save dialog and made visible. Failures are shown in an error message box.
    """
    dynamic_prefix = 'dynamic:'
    url = url_entry.get().strip()
    if not url:
        # Nothing to fetch; report instead of issuing a request for "".
        messagebox.showerror("Error", "Please enter a URL.")
        return

    try:
        # Determine if the URL needs dynamic scraping. Only a leading marker
        # counts: the previous substring test plus replace() would also match
        # and mangle URLs that merely contain 'dynamic:' somewhere inside.
        if url.startswith(dynamic_prefix):
            soup = scrape_dynamic(url[len(dynamic_prefix):])
        else:
            soup = scrape_static(url)

        # Extract and display the data ('1.0' is the Text index of the first character)
        table_data = extract_tables(soup)
        text_widget.delete('1.0', 'end')
        text_widget.insert('end', table_data.to_string())

        # Save the data as CSV or Excel
        def save_file():
            """Ask for a destination and write the extracted table there."""
            file_path = filedialog.asksaveasfilename(
                defaultextension=".csv",
                filetypes=[("CSV files", "*.csv"), ("Excel files", "*.xlsx")]
            )
            if not file_path:
                # Dialog was cancelled: nothing to write. Previously the empty
                # path fell through to to_excel() and raised inside the callback.
                return
            try:
                if file_path.lower().endswith('.csv'):
                    table_data.to_csv(file_path, index=False)
                else:
                    table_data.to_excel(file_path, index=False)
            except Exception as e:
                # Tk callbacks run outside the outer try block, so report here.
                messagebox.showerror("Error", str(e))
                return
            messagebox.showinfo("Success", "Data saved successfully!")

        save_button.config(command=save_file)
        save_button.pack(pady=5)

    except Exception as e:
        messagebox.showerror("Error", str(e))

# GUI setup: build the main window and the module-level widgets
# (url_entry, text_widget, save_button) that scrape_data() reads and updates.
app = Tk()
app.title("Advanced Web Scraper")

# URL input: the caption documents the 'dynamic:' marker that scrape_data()
# checks for to choose Selenium over a plain HTTP request
Label(app, text="Enter URL (prefix 'dynamic:' for JavaScript-heavy sites):").pack()
url_entry = Entry(app, width=50)
url_entry.pack(pady=5)

# "Go" button: runs scrape_data() when clicked
Button(app, text="Go", command=scrape_data).pack(pady=5)

# Scraped data is shown in a Text widget linked to a vertical scrollbar.
# wrap='none' keeps each DataFrame row on a single line.
scrollbar = Scrollbar(app)
scrollbar.pack(side='right', fill='y')
text_widget = Text(app, wrap='none', yscrollcommand=scrollbar.set)
text_widget.pack(expand=True, fill='both')
scrollbar.config(command=text_widget.yview)

# Save button: created here but not packed, so it stays invisible until
# scrape_data() assigns its command and packs it after a successful scrape
save_button = Button(app, text="Save as CSV/Excel")

# Size the window and enter the Tk event loop; mainloop() does not return
# until the window is closed, so this cell stays busy while the app is open
app.geometry("800x600")
app.mainloop()


In [6]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from tkinter import Tk, Label, Entry, Button, filedialog, messagebox, Text, Scrollbar
import time

# Setup WebDriver with Options
def setup_webdriver():
    """Build a headless Chrome WebDriver using a ChromeDriver resolved by webdriver-manager."""
    chrome_flags = (
        "--headless",  # no browser UI
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",  # avoid detection
    )
    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

# Scroll Down Method
def scroll_to_bottom(driver, pause=2, max_scrolls=None):
    """Scroll down the page until its height stops growing.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium WebDriver).
        pause: Seconds to wait after each scroll so new content can load.
            Defaults to the previously hard-coded 2 seconds; raise it for
            slow pages.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps the original behaviour of scrolling until the
            page height is stable; pass a number to avoid looping forever on
            pages that keep growing.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0

    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        scrolls_done += 1

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was loaded.
            break
        last_height = new_height

# Scrape Player Stats using Selenium
def scrape_dynamic(url):
    """Scrape table-like ``div`` rows from a dynamically rendered page.

    Loads ``url`` in a fresh WebDriver, scrolls to the bottom, waits up to 15
    seconds for a ``div`` whose class contains ``statistics-detail``, then
    parses the rendered HTML and collects the text of every ``skl-cell`` in
    each ``skl-table-row`` under ``statistics-detail__skeleton`` containers.

    Args:
        url: Address of the page to scrape.

    Returns:
        A ``pandas.DataFrame`` with one row per non-empty ``skl-table-row``.

    Raises:
        ValueError: If the awaited element never appears, or if no row
            yielded any cell text.
    """
    driver = setup_webdriver()
    # try/finally guarantees the browser process is shut down on every path;
    # previously a failure in get() or while scrolling leaked the driver.
    try:
        driver.get(url)

        # Scroll to the bottom to ensure all content loads
        scroll_to_bottom(driver)

        # Wait for the player stats table to appear
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'statistics-detail')]"))
            )
        except Exception as e:
            # Chain the original error so the underlying cause stays visible.
            raise ValueError(f"Could not find the player stats table: {str(e)}") from e

        # page_source is a plain string, so it can be parsed after quit().
        page_source = driver.page_source
    finally:
        driver.quit()

    # Extract the relevant content
    # NOTE(review): the wait targets 'statistics-detail' but rows are read from
    # 'statistics-detail__skeleton' containers - confirm these hold real data.
    soup = BeautifulSoup(page_source, 'html.parser')
    stats_table = soup.find_all("div", class_="statistics-detail__skeleton")

    data = []
    for stat in stats_table:
        for row in stat.find_all("div", class_="skl-table-row"):
            player_data = [cell.get_text(strip=True) for cell in row.find_all("div", class_="skl-cell")]
            if player_data:
                data.append(player_data)

    # Convert the data into a DataFrame
    if not data:
        raise ValueError("No relevant data found on the page.")

    return pd.DataFrame(data)

# Handle Scraping Process
def scrape_data():
    """Scrape the URL typed into ``url_entry`` and show the result in the GUI.

    On success the scraped DataFrame is rendered into ``text_widget`` and
    ``save_button`` is wired to a save dialog and made visible. Any failure is
    reported to the user through an error message box.
    """
    url = url_entry.get().strip()
    if not url:
        # Avoid launching a browser just to fail on an empty address.
        messagebox.showerror("Error", "An error occurred: Please enter a URL.")
        return

    try:
        # Use Selenium for dynamic scraping
        data = scrape_dynamic(url)

        # Display the data in the GUI ('1.0' is the Text index of the first character)
        text_widget.delete('1.0', 'end')
        text_widget.insert('end', data.to_string())

        # Save Data Function
        def save_file():
            """Ask for a destination and write the scraped data there."""
            file_path = filedialog.asksaveasfilename(
                defaultextension=".csv",
                filetypes=[("CSV files", "*.csv"), ("Excel files", "*.xlsx")]
            )
            if not file_path:
                # Dialog was cancelled: nothing to write. Previously the empty
                # path fell through to to_excel() and raised inside the callback.
                return
            try:
                if file_path.lower().endswith('.csv'):
                    data.to_csv(file_path, index=False)
                else:
                    data.to_excel(file_path, index=False)
            except Exception as e:
                # Tk callbacks run outside the outer try block, so report here.
                messagebox.showerror("Error", f"An error occurred: {str(e)}")
                return
            messagebox.showinfo("Success", "Data saved successfully!")

        save_button.config(command=save_file)
        save_button.pack(pady=5)

    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {str(e)}")

# GUI setup: build the main window and the module-level widgets
# (url_entry, text_widget, save_button) that scrape_data() reads and updates.
app = Tk()
app.title("Advanced Web Scraper")

# URL input: a caption followed by the entry field scrape_data() reads from
Label(app, text="Enter URL:").pack()
url_entry = Entry(app, width=50)
url_entry.pack(pady=5)

# "Go" button: runs scrape_data() when clicked
Button(app, text="Go", command=scrape_data).pack(pady=5)

# Scraped data is shown in a Text widget linked to a vertical scrollbar.
# wrap='none' keeps each DataFrame row on a single line.
scrollbar = Scrollbar(app)
scrollbar.pack(side='right', fill='y')
text_widget = Text(app, wrap='none', yscrollcommand=scrollbar.set)
text_widget.pack(expand=True, fill='both')
scrollbar.config(command=text_widget.yview)

# Save button: created here but not packed, so it stays invisible until
# scrape_data() assigns its command and packs it after a successful scrape
save_button = Button(app, text="Save as CSV/Excel")

# Size the window and enter the Tk event loop; mainloop() does not return
# until the window is closed, so this cell stays busy while the app is open
app.geometry("800x600")
app.mainloop()


KeyboardInterrupt: 

In [7]:
!pip install --upgrade selenium




In [10]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from tkinter import Tk, Label, Entry, Button, filedialog, messagebox, Text, Scrollbar
import time

# Configure WebDriver
def setup_webdriver():
    """Build a headless Chrome WebDriver using a ChromeDriver resolved by webdriver-manager."""
    chrome_flags = (
        "--headless",  # no browser UI
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
    )
    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

# Scroll down the page to load all dynamic content
def scroll_to_bottom(driver, pause=3, max_scrolls=None):
    """Scroll down the page until its height stops growing.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium WebDriver).
        pause: Seconds to wait after each scroll so new content can load.
            Defaults to the previously hard-coded 3 seconds.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps the original behaviour of scrolling until the
            page height is stable; pass a number to avoid looping forever on
            pages that keep growing.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # Wait for content to load
        scrolls_done += 1
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was loaded.
            break
        last_height = new_height

# Scraping the player stats using Selenium
def scrape_dynamic(url):
    """Scrape player-stats rows from a dynamically rendered page with Selenium.

    Loads ``url`` in a fresh WebDriver, scrolls to the bottom, waits up to 15
    seconds for ``<ul>`` elements inside a ``div`` whose class is exactly
    ``player-stats-table``, and collects the text of each ``<li>`` per ``<ul>``.

    Args:
        url: Address of the page to scrape.

    Returns:
        A ``pandas.DataFrame`` with one row per non-empty ``<ul>``.

    Raises:
        ValueError: If the expected elements never appear, or if no row
            contained any ``<li>`` text.
    """
    driver = setup_webdriver()
    # try/finally guarantees the browser process is shut down on every path;
    # previously a failure in get(), scrolling or extraction leaked the driver.
    try:
        driver.get(url)

        # Scroll to the bottom to ensure all content loads
        scroll_to_bottom(driver)

        try:
            # Wait until the player stats table is present
            players = WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.XPATH, "//div[@class='player-stats-table']//ul"))
            )
        except Exception as e:
            # Chain the original error so the underlying cause stays visible.
            raise ValueError(f"Could not find the player stats table: {str(e)}") from e

        # Element text must be read while the browser session is still alive.
        data = []
        for player in players:
            row = [elem.text for elem in player.find_elements(By.TAG_NAME, "li")]
            if row:
                data.append(row)
    finally:
        driver.quit()

    if not data:
        raise ValueError("No relevant data found on the page.")

    return pd.DataFrame(data)

# Handling the scraping process
def scrape_data():
    """Scrape the URL typed into ``url_entry`` and show the result in the GUI.

    On success the scraped DataFrame is rendered into ``text_widget`` and
    ``save_button`` is wired to a save dialog and made visible. Any failure is
    reported to the user through an error message box.
    """
    url = url_entry.get().strip()
    if not url:
        # Avoid launching a browser just to fail on an empty address.
        messagebox.showerror("Error", "An error occurred: Please enter a URL.")
        return

    try:
        # Use Selenium for dynamic scraping
        data = scrape_dynamic(url)

        # Display the data in the GUI ('1.0' is the Text index of the first character)
        text_widget.delete('1.0', 'end')
        text_widget.insert('end', data.to_string())

        # Save the data as CSV or Excel
        def save_file():
            """Ask for a destination and write the scraped data there."""
            file_path = filedialog.asksaveasfilename(
                defaultextension=".csv",
                filetypes=[("CSV files", "*.csv"), ("Excel files", "*.xlsx")]
            )
            if not file_path:
                # Dialog was cancelled: nothing to write. Previously the empty
                # path fell through to to_excel() and raised inside the callback.
                return
            try:
                if file_path.lower().endswith('.csv'):
                    data.to_csv(file_path, index=False)
                else:
                    data.to_excel(file_path, index=False)
            except Exception as e:
                # Tk callbacks run outside the outer try block, so report here.
                messagebox.showerror("Error", f"An error occurred: {str(e)}")
                return
            messagebox.showinfo("Success", "Data saved successfully!")

        save_button.config(command=save_file)
        save_button.pack(pady=5)

    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {str(e)}")

# GUI setup: build the main window and the module-level widgets
# (url_entry, text_widget, save_button) that scrape_data() reads and updates.
app = Tk()
app.title("Advanced Web Scraper")

# URL input: a caption followed by the entry field scrape_data() reads from
Label(app, text="Enter URL:").pack()
url_entry = Entry(app, width=50)
url_entry.pack(pady=5)

# "Go" button: runs scrape_data() when clicked
Button(app, text="Go", command=scrape_data).pack(pady=5)

# Scraped data is shown in a Text widget linked to a vertical scrollbar.
# wrap='none' keeps each DataFrame row on a single line.
scrollbar = Scrollbar(app)
scrollbar.pack(side='right', fill='y')
text_widget = Text(app, wrap='none', yscrollcommand=scrollbar.set)
text_widget.pack(expand=True, fill='both')
scrollbar.config(command=text_widget.yview)

# Save button: created here but not packed, so it stays invisible until
# scrape_data() assigns its command and packs it after a successful scrape
save_button = Button(app, text="Save as CSV/Excel")

# Size the window and enter the Tk event loop; mainloop() does not return
# until the window is closed, so this cell stays busy while the app is open
app.geometry("800x600")
app.mainloop()
