# This code scrapes the table at https://grg.org/Adams/C.HTM with the BeautifulSoup library and loads the data into a pandas DataFrame.

In [None]:
# Install the BeautifulSoup4 library if not already installed
!pip install beautifulsoup4

# Import necessary libraries
from bs4 import BeautifulSoup  # For parsing HTML and XML documents
from urllib.request import urlopen  # To fetch data from a URL
import pandas as pd  # For data manipulation and analysis

# URL of the target web page
url_base = 'https://grg.org/Adams/C_files/sheet001.htm'

# Fetch the raw bytes of the page. The `with` block closes the connection
# even when reading fails, which a separate close() call placed after the
# read/decode steps does not guarantee.
with urlopen(url_base) as u:
    html = u.read()

# Prefer UTF-8; if the bytes are not valid UTF-8, fall back to Latin-1.
# Latin-1 maps every possible byte to a character, so the fallback cannot
# raise and needs no error handler.
try:
    decoded_html = html.decode('utf-8')
except UnicodeDecodeError:
    decoded_html = html.decode('latin-1')

# Build a parse tree from the decoded HTML using Python's built-in parser
soup = BeautifulSoup(decoded_html, 'html.parser')

# Locate the first <table> element in the document. find() returns None when
# nothing matches, so fail here with a clear message instead of letting a
# later attribute access on None raise an opaque AttributeError.
Table = soup.find("table")
if Table is None:
    raise ValueError("No <table> element found in the downloaded page")

# Optional: Uncomment to print a prettified version of the table's HTML
# print(Table.prettify())

# Headers assigned to the 22 cells taken from each table row
columns = [
    "Num", "Birthplace", "Name", "Born", "Died", "Age_Years", "Age_Days",
    "Race", "Sex", "Deathplace", "WhenOldest_Years_Range",
    "WhenOldestAge_Range", "Lenght_of_Region_Years", "Lenght_of_Region_Days",
    "Region_Lenghts_inYear", "Age_at_Accession_Years",
    "Age_at_Accession_Days", "Case added to GRG Tablee",
    "C1", "C2", "C3", "C4",
]

# Only the <tr> elements at positions 8 through 72 (0-based, inclusive) are
# used; rows outside that window are skipped.
record_rows = Table.find_all('tr')[8:73]

# One inner list per row, holding the whitespace-trimmed text of every <td>
data = [
    [td.get_text().strip() for td in tr.find_all('td')]
    for tr in record_rows
]

# Assemble the extracted rows into a DataFrame labelled with the headers above
df = pd.DataFrame(data=data, columns=columns)

# Optional: Uncomment to preview the first 10 rows of the DataFrame
# print(df.head(10))

# Keep the leading 18 columns and drop everything after them
df = df.iloc[:, 0:18]

# Show the first five rows (head() defaults to 5) as a quick sanity check
print(df.head())

# Write the result to 'webscrape.csv' in the current working directory
df.to_csv(path_or_buf='webscrape.csv')


  Num    Birthplace                       Name           Born           Died  \
0   1  England (UK)                Betsy Baker  Aug. 20, 1842  Oct. 24, 1955   
1   2  England (UK)              Jennie Howell  Feb. 11, 1845  Dec. 16, 1956   
2   3       Denmark      Anne Marie Carstenson  Jan. 24, 1849  Mar. 30, 1958   
3   4     U.S. (IN)                 Nancy Ryan  Sept. 5, 1849  Oct. 17, 1958   
4   5   Netherlands  Christina Karnebeek-Backs  Oct.  2, 1849  Oct.  7, 1959   

  Age_Years Age_Days Race Sex   Deathplace WhenOldest_Years_Range  \
0       113       65    W   F    U.S. (NE)                 ?-1955   
1       111      309    W   F    U.S. (CA)              1955-1956   
2       109       65    W   F    U.S. (NE)              1956-1958   
3       109       42    W   F    U.S. (IN)                   1958   
4       110        5    W   F  Netherlands              1958-1959   

  WhenOldestAge_Range Lenght_of_Region_Years Lenght_of_Region_Days  \
0               ?-113             