Connected to Python 3.11.0

In [1]:
import pandas as pd
import numpy as np
import requests 
import h5py
import matplotlib.pyplot as plt
import datetime
from datetime import datetime 
import matplotlib.collections as mcol
from matplotlib.legend_handler import HandlerLineCollection, HandlerTuple
from matplotlib.lines import Line2D
import logging

#Logging setup
logging.basicConfig(filename='scraping.log', level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')    

# Step 1: Read URLs from CSV
csv_file = 'web_pages.csv'  # Replace with your CSV file path
urls = pd.read_csv(csv_file)

# Function to scrape data
def scrape_data(urls):
    scraped_tables = {}

    # Iterate over rows in the CSV file
    for _, row in urls.iterrows():  
        url = row['URL']

        # Get the first column name from CSV file
        first_column_name = row['start_column_name']  
        try:
            response = requests.get(url)
            tables_list = pd.read_html(response.text)
            for table in tables_list:

                # Check if the first column of the table matches the desired name
                if table.columns[0] == first_column_name:
                    scraped_tables[url] = table
                    logging.info(f"\nScraped table from: {url} \nfirst column: '{first_column_name} \nscrapped date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
                    
                    # Print the first few rows of the table
                    print(table.head())  
                    break
        except ValueError:
            logging.error(f"No tables found in {url}")
            print(f"No tables found in {url}")
        except Exception as e:
            print(f"Error occurred while scraping {url}: {e}")
    return scraped_tables

#Saving scraped data to HDF5 file
def save_to_hdf5(scraped_tables, hdf5_file, scrape_date):
    with pd.HDFStore(hdf5_file, 'a') as store:
        for url, (url, table) in enumerate(scraped_tables.items(), start=1):
            if isinstance(table, pd.DataFrame):
                group_name = f"URL_:{url}" + f"\nScrapped date: {scrape_date}"
                store.put(group_name, table)

                # Save URL and date as attributes
                store.get_storer(group_name).attrs.metadata = {f"url_{url}": url, 'date': scrape_date}
                logging.info(f"Saved table from: {url} scrapped date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            else:
                logging.error(f"Skipped saving non-DataFrame data from {url}")
                print(f"Skipped saving non-DataFrame data from {url}")
           

# HDF5 file name
hdf5_file = 'scraped_data.h5'
 
#scraping count
number_of_times = 1
for _ in range(number_of_times):
    scrape_date = datetime.now().strftime("%Y-%m-%d")
    scraped_data = scrape_data(urls)

    # Save to HDF5 and visualize after processing all URLs
    save_to_hdf5(scraped_data, hdf5_file, scrape_date)
    print(f"Saved scraped data to {hdf5_file}")

   In Kilometres Cape Town Durban Johannesburg
0   Aliwal North       963    722          597
1  Beaufort West       459   1147          934
2     Beitbridge      1948   1060          547
3      Bethlehem      1243    388          259
4   Bloemfontein       997    628          394
   #               Team  Pl   W  D  L   F   A  GD  Pts  Last 6
0  1            Arsenal  18  12  4  2  36  16  20   40     NaN
1  2          Liverpool  18  11  6  1  37  16  21   39     NaN
2  3        Aston Villa  18  12  3  3  38  22  16   39     NaN
3  4  Tottenham Hotspur  18  11  3  4  37  24  13   36     NaN
4  5    Manchester City  17  10  4  3  40  20  20   34     NaN
   #             Team  Pl   W  D  L   F   A  GD  Pts  Last 6
0  1      Real Madrid  18  14  3  1  39  11  28   45     NaN
1  2           Girona  18  14  3  1  42  21  21   45     NaN
2  3  Atletico Madrid  18  12  2  4  36  19  17   38     NaN
3  4        Barcelona  18  11  5  2  34  21  13   38     NaN
4  5  Athletic Bilbao  18  10  5  3

  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
