In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin, urlparse

In [4]:
class TableScraper:
    """
    Download web pages and convert their HTML tables into pandas DataFrames.

    Pages are fetched through a shared ``requests.Session`` that sends a
    browser-like User-Agent. Consecutive requests made through that session
    are spaced at least ``delay`` seconds apart, and each request is bounded
    by ``timeout`` seconds.

    Limitations of the hand-written parser (``_parse_table``):
        * ``colspan`` / ``rowspan`` attributes are not interpreted; only the
          text of each cell is read.
        * ``find_all('tr')`` searches all descendants, so rows belonging to a
          table nested inside the scraped table are included as well.
    """

    def __init__(self, delay=1, timeout=30):
        """
        Initialize the scraper.

        Args:
            delay (float): Minimum number of seconds between two HTTP requests
                issued through this scraper's session. ``0`` disables the wait.
            timeout (float): Seconds to wait for the server before the request
                is aborted with a ``requests`` exception.
        """
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.delay = delay
        self.timeout = timeout
        # Monotonic timestamp of the most recent request; None until the first one.
        self._last_request_time = None

    def _wait_for_rate_limit(self):
        """
        Sleep just long enough that requests are at least ``self.delay`` apart.

        The very first request is never delayed. A monotonic clock is used so
        that system clock adjustments cannot produce a negative or huge wait.
        """
        if self._last_request_time is not None and self.delay:
            remaining = self.delay - (time.monotonic() - self._last_request_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_request_time = time.monotonic()

    def scrape_tables(self, url, table_selector=None, save_to_csv=False, filename=None):
        """
        Scrape all tables from a webpage

        Args:
            url (str): URL of the webpage
            table_selector (str): CSS selector for specific tables (optional).
                When omitted, every ``<table>`` element on the page is used.
            save_to_csv (bool): Whether to save tables as CSV files
            filename (str): Base filename for CSV files. Files are named
                ``<filename>_<n>.csv`` where ``n`` is the 1-based position of
                the table on the page; the base defaults to ``"table"``.

        Returns:
            list: List of pandas DataFrames containing table data. Tables that
            parse to nothing are skipped. An empty list is returned when no
            table matches or when any error occurs (the error is printed,
            not raised).
        """
        try:
            print(f"Fetching data from: {url}")
            # Politeness delay applies to network traffic, not to local parsing.
            self._wait_for_rate_limit()
            # A timeout prevents a stalled server from hanging the caller forever.
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find tables
            if table_selector:
                tables = soup.select(table_selector)
            else:
                tables = soup.find_all('table')

            if not tables:
                print("No tables found on the page")
                return []

            print(f"Found {len(tables)} table(s)")

            dataframes = []

            for i, table in enumerate(tables):
                print(f"Processing table {i+1}/{len(tables)}")

                # Extract table data
                df = self._parse_table(table)

                if df is not None and not df.empty:
                    dataframes.append(df)

                    # Save to CSV if requested
                    if save_to_csv:
                        csv_filename = f"{filename or 'table'}_{i+1}.csv"
                        df.to_csv(csv_filename, index=False)
                        print(f"Saved table {i+1} to {csv_filename}")

            return dataframes

        except requests.RequestException as e:
            print(f"Error fetching the webpage: {e}")
            return []
        except Exception as e:
            # Deliberate best-effort: report and return nothing rather than
            # aborting the caller's notebook run.
            print(f"An error occurred: {e}")
            return []

    def _parse_table(self, table):
        """
        Parse a BeautifulSoup table element into a pandas DataFrame

        The first row is treated as the header row only if it contains at
        least one ``<th>`` cell. Otherwise (or when every header cell is
        blank) columns are named ``Column_1 .. Column_n`` and the first row is
        kept as ordinary data, so it is never used as header and data at once.

        Rows shorter than the header are padded with empty strings; cells
        beyond the header width are dropped. Rows whose cells are all blank
        are skipped.

        Returns:
            pandas.DataFrame of strings, or None when the table has no rows,
            no non-blank data rows, or parsing raised an exception (the
            exception is printed).
        """
        try:
            # Find all rows
            rows = table.find_all('tr')
            if not rows:
                return None

            first_row = rows[0]
            header_cells = first_row.find_all(['th', 'td'])
            has_header_row = first_row.find('th') is not None

            # Only a row containing <th> supplies column names.
            headers = []
            if has_header_row:
                headers = [cell.get_text(strip=True) for cell in header_cells]

            # No header row, or nothing but blank header cells: use generic names.
            if not any(headers):
                headers = [f"Column_{i+1}" for i in range(len(header_cells))]

            width = len(headers)
            body_rows = rows[1:] if has_header_row else rows

            data = []
            for row in body_rows:
                row_data = [
                    cell.get_text(strip=True)
                    for cell in row.find_all(['td', 'th'])
                ]

                # Only add rows that have data
                if not any(row_data):
                    continue

                # Pad short rows, then clip to the header width so every
                # record has exactly one value per column.
                row_data.extend([''] * (width - len(row_data)))
                data.append(row_data[:width])

            if not data:
                return None

            return pd.DataFrame(data, columns=headers)

        except Exception as e:
            print(f"Error parsing table: {e}")
            return None

    def scrape_specific_table(self, url, table_index=0):
        """
        Scrape a specific table by index

        Args:
            url (str): URL of the webpage
            table_index (int): Position in the list returned by
                ``scrape_tables``. Negative values count from the end.

        Returns:
            The DataFrame at ``table_index``, or None when the index is out of
            range in either direction or no tables were scraped.
        """
        tables = self.scrape_tables(url)
        if tables and -len(tables) <= table_index < len(tables):
            return tables[table_index]
        return None

    def scrape_tables_with_pandas(self, url):
        """
        Alternative method using pandas read_html (simpler but less control)

        The URL is handed straight to ``pd.read_html``, so this scraper's
        session (User-Agent header), ``timeout`` and ``delay`` are not applied
        to the download.

        Returns:
            list: DataFrames produced by ``pd.read_html``, or an empty list if
            it raised (the error is printed).
        """
        try:
            print(f"Using pandas to scrape tables from: {url}")
            tables = pd.read_html(url)
            print(f"Found {len(tables)} table(s) using pandas")
            return tables
        except Exception as e:
            print(f"Error using pandas read_html: {e}")
            return []

In [5]:
# Example usage functions
def example_basic_scraping():
    """
    Demo: scrape every table on a page, save each as CSV, and preview it.

    CSV files are written with the base name ``sacred_seals``. For each
    returned DataFrame the first rows and the shape are printed.
    """
    # Example URL (replace with your target URL)
    page_url = "https://feheroes.fandom.com/wiki/Sacred_Seals#List_of_Sacred_Seals"

    scraped = TableScraper().scrape_tables(
        page_url,
        save_to_csv=True,
        filename="sacred_seals",
    )

    # Preview each table, numbering them from 1
    for number, frame in enumerate(scraped, start=1):
        print(f"\n--- Table {number} ---")
        print(frame.head())
        print(f"Shape: {frame.shape}")

def example_specific_table():
    """
    Demo: scrape only the tables matching a CSS selector.

    Matching tables are also saved as CSV files with the base name
    ``specific_tables``.

    Returns:
        list: The DataFrames returned by ``TableScraper.scrape_tables``.
    """
    page_url = "https://example.com/data-page"
    # CSS selector restricting which tables are scraped
    selector = "table.data-table"

    return TableScraper().scrape_tables(
        page_url,
        table_selector=selector,
        save_to_csv=True,
        filename="specific_tables",
    )

def example_pandas_method():
    """
    Demo: scrape tables through the pandas ``read_html`` based method.

    When at least one table is returned, the first one is written to
    ``gdp_data.csv``.

    Returns:
        list: The tables returned by ``TableScraper.scrape_tables_with_pandas``.
    """
    page_url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"

    # Using pandas method (simpler but less control)
    found = TableScraper().scrape_tables_with_pandas(page_url)

    # Nothing to save when no tables came back
    if not found:
        return found

    # Save first table
    found[0].to_csv("gdp_data.csv", index=False)
    print("GDP data saved to gdp_data.csv")
    return found


In [6]:
# Script entry point: interactive menu that runs one of the demos
if __name__ == "__main__":
    print("Web Table Scraper")
    print("=" * 50)

    # Show the available demos
    for menu_line in (
        "1. Basic scraping example",
        "2. Specific table with CSS selector",
        "3. Using pandas read_html method",
    ):
        print(menu_line)

    choice = input("Enter choice (1-3): ").strip()

    # Map each menu entry to the demo it launches
    handlers = {
        "1": example_basic_scraping,
        "2": example_specific_table,
        "3": example_pandas_method,
    }

    handler = handlers.get(choice)
    if handler is None:
        # Any unrecognised input falls back to the basic demo
        print("Running basic example...")
        handler = example_basic_scraping
    handler()

Web Table Scraper
1. Basic scraping example
2. Specific table with CSS selector
3. Using pandas read_html method


Enter choice (1-3):  1


Fetching data from: https://feheroes.fandom.com/wiki/Sacred_Seals#List_of_Sacred_Seals
Found 2 table(s)
Processing table 1/2
Saved table 1 to sacred_seals_1.csv
Processing table 2/2
Saved table 2 to sacred_seals_2.csv

--- Table 1 ---
  Icon          Name                                        Description   SP  \
0       Aerobatics 1  If unit's HP = 100%, unit can move to a space ...   60   
1       Aerobatics 2  If unit's HP ≥ 50%, unit can move to a space a...  120   
2       Aerobatics 3  Unit can move to a space adjacent to any infan...  240   
3       Air Orders 1  At start of turn, if unit's HP = 100%, grants ...   60   
4       Air Orders 2  At start of turn, if unit's HP ≥ 50%, grants t...  120   

  Badge Color  
0              
1              
2              
3              
4              
Shape: (926, 5)

--- Table 2 ---
                                     veSkills
0  WeaponsAssistsSpecialsPassivesSacred Seals
Shape: (1, 1)
