# Exploring data mining of the BCPC database

http://www.bcpcpesticidecompendium.org/index_cn_frame.html

In [None]:
from bs4 import BeautifulSoup
from IPython.display import display
import requests
from datetime import date, timedelta, datetime
import time
from datetime import datetime
import pandas as pd
import numpy as np
import os
import winsound

In [None]:
def try_get_column_value_else_nan(df: pd.DataFrame, col_name: str):
    """Return the value stored at index label 0 of column ``col_name``, or NaN.

    Scraped tables do not always contain every expected column. Rather than
    letting the resulting KeyError abort the scrape loop, the failed lookup is
    caught here and NaN is handed back as a placeholder.
    """
    try:
        return df.loc[0, col_name]
    except KeyError:
        # Lookup failed (e.g. the column is absent from this table)
        return np.nan

## Scraping

In [None]:
# HTTP request headers passed to requests.get when fetching the database index.
# The user-agent string is split over two adjacent literals purely for line length.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) "
        "Gecko/20100101 Firefox/115.0"
    ),
}

### BCPC database
http://www.bcpcpesticidecompendium.org/index_cn_frame.html

In [None]:
# Build `df_pesticide`: load today's saved scrape if it exists, otherwise scrape
# the BCPC index page plus every linked pesticide page and save the result.
url_database = "http://www.bcpcpesticidecompendium.org/index_cn.html"

file_name = "Pesticide_Database_BCPC"
file_location = "../data/raw/"
ENABLE_FINISHED_BEEP = True

date_today = datetime.today().strftime("%Y%m%d")

# One path for the existence check, the load and the export, so the three can
# never disagree. (Previously the check looked for a name without the date
# prefix and the .xlsx extension, so the saved file was never found.)
file_path = f"{file_location}{date_today}_{file_name}.xlsx"

# Load scraped database if it exists already
if os.path.isfile(file_path):
    df_pesticide = pd.read_excel(file_path)
    print("Dataframe loaded")
# Else: scrape database
else:
    # Initialisation
    # dictionary with one entry per pesticide, holding its scraped properties
    pesticides = {}
    # running pesticide index, used as key into the pesticides dictionary
    i_pest = 0
    # start time of the scraping => to display total scrape time
    time_start = time.time()
    # Play beep after scrape finished:
    duration = 750  # milliseconds
    freq = 4 * 440  # Hz

    # Fetch and parse the database index page.
    # The timeout prevents the cell from hanging forever on a stalled connection;
    # raise_for_status stops the run on an HTTP error instead of parsing an error page.
    response = requests.get(url_database, headers=headers, timeout=30)
    response.raise_for_status()
    # read encoding from webpage to ensure special characters are scraped correctly, if missing: None
    encoding = (
        response.encoding
        if "charset" in response.headers.get("content-type", "").lower()
        else None
    )
    soup = BeautifulSoup(response.content, "html.parser", from_encoding=encoding)

    # Item hrefs are appended to the index URL with the file name stripped off
    url_base = url_database.replace("index_cn.html", "")

    # Pesticides are alphabetically grouped: loop over all 'p' tags.
    # Note: there are 38 'p' tags, but only 27 contain 'a' tags == 26 letters + 1 numerical,
    # the rest are empty and are skipped inside the loop
    groups = soup.find_all("p")

    # Initialise progress display
    disp1 = display("BCPC Database Scrape has started:", display_id=True)
    disp2 = display("Elapsed Time:", display_id=True)

    # Loop over all groups, skipping the first one as this contains the header
    for group in groups[1:]:
        # Get pesticide items from this group
        items = group.select("a")
        n_item = len(items)

        if not items:  # group contains no item => skip group
            continue

        # the first letter of this group is used for displaying progress
        group_letter = items[0].text[:1].upper()

        # Loop over all items in one group
        for i_item, item in enumerate(items):
            # Extract name and url from the item
            name_pest = item.text
            url_pest = url_base + item.get("href")

            # let pandas extract the tables from the webpage; the first one is used
            df = pd.read_html(url_pest)[0]
            # turn the first table column into column labels, leaving a single row at index 0
            df = df.set_index(0).transpose().reset_index(drop=True)

            # Not all fields are always present: check them one by one and if missing => NaN
            casname = try_get_column_value_else_nan(df=df, col_name="CAS name:")
            casrn = try_get_column_value_else_nan(df=df, col_name="CAS Reg. No.:")
            iupac = try_get_column_value_else_nan(df=df, col_name="IUPAC name:")
            activity = try_get_column_value_else_nan(df=df, col_name="Activity:")
            if pd.notnull(activity):
                # comma-separate the individual activity entries
                activity = activity.replace(") ", "), ")

            # Store pesticide info in the dictionary
            # NOTE(review): the trailing colons in the last two keys are kept as-is so the
            # exported column names stay identical to earlier exports.
            pesticides[i_pest] = {
                "Pesticide Common Name": name_pest,
                "url": url_pest,
                "CAS name": casname,
                "CAS RN": casrn,
                "IUPAC name:": iupac,
                "Activity:": activity,
            }
            i_pest += 1  # update pesticide index

            # Update progress and time displays
            time_elapsed = time.time() - time_start
            disp1.update(
                f"Group: {group_letter} - Page: {i_item+1}/{n_item} - Pesticides scraped: {i_pest}"
            )
            disp2.update(f"Time elapsed: {round(time_elapsed/60, 1)} min")

            time.sleep(0.3)  # wait between each scrape to avoid ban

    # Dict to dataframe
    df_pesticide = pd.DataFrame.from_dict(pesticides, "index")

    # Export
    #   Store dataframe so you don't have to scrape every time.
    #   Create the target folder first: losing a long scrape to a missing folder is costly.
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
    df_pesticide.to_excel(file_path, index=False)
    print("Dataframe saved")

    # Scrape timer
    time_elapsed = time.time() - time_start
    # guard against division by zero when no pesticide was scraped
    avg_seconds = time_elapsed / i_pest if i_pest else 0.0
    disp2.update(
        f"Total Time: {round(time_elapsed/60, 1)} min. (avg. {round(avg_seconds, 1)} sec. per pesticide)"
    )

    if ENABLE_FINISHED_BEEP:
        winsound.Beep(freq, duration)


# Print df info:
print(df_pesticide.shape)
df_pesticide.head()

### Save requirements.txt 

In [None]:
!pipreqsnb --use-local --encoding=iso-8859-1 --ignore .venv --force ..
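# --use-local: use ONLY local package info instead of querying PyPI
# --encoding=iso-8859-1: read files with this encoding to avoid UTF-8 decoding errors
# --ignore .venv: skip the .venv directory
# --force: overwrite the existing requirements.txt
# ..: target the parent folder, so requirements.txt is saved there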