## Scrape the COE expiry information and persist it in the `../data/raw/` folder

Exercise caution before running this notebook: it contains computation-heavy operations and could slow down your system.

**Note:** The code below requires **dask**, which you can install with conda or pip.

In [None]:
%load_ext jupyterlab_notify

In [None]:
%%notify

import requests
import dask

from tqdm import tqdm
from bs4 import BeautifulSoup
from dask.distributed import Client


# Create the Dask client used to fan the scraping tasks out to workers.
# NOTE(review): no scheduler address is passed, so this presumably starts a
# local cluster on this machine -- confirm against the dask configuration.
client = Client(
    serializers=["dask", "pickle"],
    deserializers=["dask", "msgpack"],
)

# Ask the client's cluster to scale to 10 workers.
client.cluster.scale(10)


def scrape_coe_left(listing_id, timeout=30):
    """Scrape the COE text from a listing's sgcarmart page.

    Args:
        listing_id: Identifier interpolated into the ``ID`` query parameter
            of the sgcarmart used-car info URL.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block a worker indefinitely.

    Returns:
        A ``(listing_id, result)`` tuple. ``result`` is the text of the node
        found by walking fixed child positions under the element with id
        ``carInfo``, or ``""`` when the page does not have that structure.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
            This is deliberately not caught here, so the caller decides how
            failed listings are treated.
    """
    # Imported here so the function does not depend on `logging` having been
    # imported in some other notebook cell when it runs on a worker.
    import logging

    logging.info(
        "CoeStartDateFeatureCreator - scraping sgcarmart for listing_id - %s",
        listing_id,
    )

    response = requests.get(
        f"https://www.sgcarmart.com/used_cars/info.php?ID={listing_id}",
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, features="html.parser")

    try:
        node = soup.find(id="carInfo").contents[3].contents[-2].contents[-1]
    except (AttributeError, IndexError):
        # AttributeError: `carInfo` is missing (find() returned None) or a
        # child along the path has no `.contents`.
        # IndexError: a node along the path has fewer children than expected.
        return (listing_id, "")

    # Return a plain string rather than a BeautifulSoup node: the node keeps
    # references into the parsed document, which is needlessly heavy to send
    # back from a worker.
    return (listing_id, str(node))


In [None]:
def _scrape_and_save(listing_ids, output_path):
    """Scrape the COE text for every listing id and write it to a CSV file.

    Args:
        listing_ids: Iterable of listing ids to scrape.
        output_path: Path of the CSV file to write; it gets a ``listing_id``
            index column and a ``coe_text`` column.

    Returns:
        The list of ``(listing_id, coe_text)`` tuples gathered from the
        workers. Listings whose task failed are absent from it.
    """
    import logging

    futures = [
        client.submit(scrape_coe_left, listing_id)
        for listing_id in tqdm(listing_ids)
    ]

    # errors="skip" leaves failed tasks out of the gathered results instead
    # of raising, so a few bad listings do not abort the whole run.
    results = client.gather(futures, errors="skip")

    # Report dropped listings so an incomplete CSV does not go unnoticed.
    skipped = len(futures) - len(results)
    if skipped:
        logging.warning(
            "%d listing(s) failed to scrape and are missing from %s",
            skipped,
            output_path,
        )

    coe_text_info = pd.DataFrame(results, columns=["listing_id", "coe_text"])
    coe_text_info.set_index("listing_id").to_csv(output_path)
    return results


test_results = _scrape_and_save(test.index, "../data/raw/test_coe_text.csv")
train_results = _scrape_and_save(train.index, "../data/raw/train_coe_text.csv")

In [None]:
# Scraping is finished: shut the Dask client down so it does not stay
# running after the notebook is done.
client.shutdown()