In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import time
import random
import os
import pickle

In [None]:
import time
from functools import wraps

def retry_on_exception(max_retries=5, retry_delay=5):
    """Build a decorator that re-runs a function when it raises.

    Args:
        max_retries: Total number of attempts (the first call included).
            Values below 1 are treated as 1 so the wrapped function is
            always called at least once instead of silently returning None.
        retry_delay: Seconds to sleep between a failed attempt and the next.

    Returns:
        A decorator. The decorated function returns the first successful
        result, or re-raises the last exception once all attempts failed.
    """
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # `remaining` counts the attempts still available after this one.
            for remaining in range(attempts - 1, -1, -1):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print(f"Error running function {func.__name__}: {str(e)}")
                    if remaining == 0:
                        print(f"Max retries reached for function {func.__name__}.")
                        raise
                    print(f"Retrying in {retry_delay} seconds ({remaining} retries left)...")
                    time.sleep(retry_delay)
        return wrapper
    return decorator

import signal

class TimeoutError(Exception):
    """Raised by the SIGALRM handler when a decorated call runs too long.

    Note: this name shadows Python's built-in ``TimeoutError`` within this
    module; unlike the built-in, it derives directly from ``Exception``.
    """

def timeout(seconds):
    """Build a decorator that aborts a call after ``seconds`` using SIGALRM.

    The decorated function raises ``TimeoutError`` if the alarm fires while
    it is running. This relies on ``signal.alarm``, so it is only available
    on Unix and only works when called from the main thread.

    NOTE: a process has a single alarm. A nested ``@timeout`` call replaces
    the pending alarm of the outer one and cancels it on exit, so the outer
    limit is no longer enforced afterwards.

    Args:
        seconds: Whole number of seconds to allow before raising.

    Returns:
        A decorator preserving the wrapped function's name and docstring.
    """
    def decorator(func):
        def handler(signum, frame):
            raise TimeoutError()

        @wraps(func)
        def wrapper(*args, **kwargs):
            # Remember whatever handler was active so it can be put back.
            previous_handler = signal.signal(signal.SIGALRM, handler)
            signal.alarm(seconds)
            try:
                return func(*args, **kwargs)
            finally:
                signal.alarm(0)
                # signal.signal() returns None when the old handler was not
                # installed from Python; that value cannot be re-installed.
                if previous_handler is not None:
                    signal.signal(signal.SIGALRM, previous_handler)

        return wrapper

    return decorator


In [None]:
# Decorator order matters: the retry wrapper must be outermost so that every
# attempt arms its own 60 s alarm. With the timeout outermost, the one-shot
# alarm's TimeoutError was caught by the retry's `except Exception`, and the
# remaining attempts ran with no alarm armed at all.
@retry_on_exception(max_retries=5, retry_delay=5)
@timeout(60)
def get_html_content(url, request_timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address to fetch with an HTTP GET.
        request_timeout: Seconds passed to ``requests.get(timeout=...)`` so a
            stalled connection raises instead of blocking; this also applies
            where the SIGALRM-based limit cannot.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        Whatever the last failed attempt raised, once all retries are used.
    """
    response = requests.get(url, timeout=request_timeout)
    return BeautifulSoup(response.content, "html.parser")


@timeout(60)
@retry_on_exception(max_retries=5, retry_delay=5)
def scrape_categories(url):
    """Collect the links listed in the tab container of the page at ``url``.

    Looks up the first ``div`` whose class is ``tabs-content w-tab-content``
    and takes every ``<a>`` with an ``href`` inside it.

    Returns:
        ``{url: {"details": [...]}}`` where each entry is the anchor's
        ``href`` prefixed with ``https://dimensions.com``.
    """
    page = get_html_content(url)
    tab_container = page.find("div", {"class": "tabs-content w-tab-content"})
    anchors = tab_container.findAll("a", href=True)
    links = [f"https://dimensions.com{anchor['href']}" for anchor in anchors]
    return {url: {"details": links}}


@timeout(60)
@retry_on_exception(max_retries=5, retry_delay=5)
def scrape_individual_items(url):
    """Collect the item links found on the page at ``url``.

    Every ``<a>`` carrying both a ``data-link`` attribute and an ``href``
    is taken, wherever it appears in the document.

    Returns:
        ``{url: {'details': [...]}}`` where each entry is the anchor's
        ``href`` prefixed with ``https://dimensions.com``.
    """
    page = get_html_content(url)
    anchors = page.findAll("a", {"data-link": True}, href=True)
    links = [f"https://dimensions.com{anchor['href']}" for anchor in anchors]
    return {url: {'details': links}}


@timeout(60)
@retry_on_exception(max_retries=5, retry_delay=5)
def scrape_details(url):
    """Extract the title and the detail texts from the page at ``url``.

    The title is the text of the ``h1.text-page-title`` element. The details
    come from the direct children of the ``div.detail-content-wrapper``
    element: a child ``div`` contributes the text of each of its own
    children, any other child contributes its own text.

    Returns:
        ``{url: {"details": [...], "title": ...}}``.
    """
    page = get_html_content(url)
    title = page.find("h1", {"class": "text-page-title"}).text
    wrapper = page.find("div", {"class": "detail-content-wrapper"})

    texts = []
    for child in wrapper.children:
        if child.name != "div":
            texts.append(child.text)
            continue
        # Nested div: flatten one level and record each grandchild's text.
        for grandchild in child:
            texts.append(grandchild.text)

    return {url: {"details": texts, "title": title}}


def write_index(filename, index):
    """Overwrite ``filename`` with the string form of ``index``."""
    with open(filename, mode="w") as index_file:
        index_file.write(str(index))


def read_index(filename):
    """Return the integer stored in ``filename``, or -1 if the file is absent.

    Surrounding whitespace in the file is ignored before conversion.
    """
    if not os.path.exists(filename):
        # No checkpoint has been written yet.
        return -1
    with open(filename, "r") as index_file:
        stored = index_file.read().strip()
    return int(stored)

def read_pickle(filename):
    """Load ``./<filename>.pickle`` from the current working directory.

    Returns:
        The unpickled object, or ``None`` when the file does not exist.
    """
    filepath = f"./{filename}.pickle"
    if not os.path.exists(filepath):
        return None
    with open(filepath, "rb") as pickle_file:
        return pickle.load(pickle_file)



def scrape_urls(urls, scrape_fn, type):
    """Scrape ``urls`` one by one, checkpointing after every URL.

    Progress is kept in two files in the working directory:
    ``./<type>.pickle`` (list of results so far) and ``./<type>_index.txt``
    (position of the last URL processed). On a later call with the same
    ``type``, URLs at or before the stored position are skipped, so ``urls``
    must be passed in the same order for the resume to be meaningful.

    Args:
        urls: Sequence of URLs to process.
        scrape_fn: Callable taking one URL and returning its scraped result.
        type: Checkpoint name. May contain ``/`` to place the checkpoint
            files in a subdirectory, which is created if missing.

    Returns:
        The list of results, including any loaded from an earlier run.
    """
    index_filename = f"./{type}_index.txt"
    pickle_filename = f"./{type}.pickle"

    # A name such as "plants/some-collection" points into a subdirectory;
    # without creating it the first checkpoint write raises FileNotFoundError.
    os.makedirs(os.path.dirname(pickle_filename), exist_ok=True)

    last_index = read_index(index_filename)
    if os.path.exists(pickle_filename):
        # NOTE: pickle.load can execute arbitrary code; only load checkpoint
        # files this notebook wrote itself.
        with open(pickle_filename, "rb") as f:
            output_data = pickle.load(f)
    else:
        # An index without its data file is useless: start from scratch.
        last_index = -1
        output_data = []

    for index, url in enumerate(tqdm(urls)):
        if index <= last_index:
            continue
        res = scrape_fn(url)
        output_data.append(res)
        write_index(index_filename, index)
        with open(pickle_filename, "wb") as f:
            pickle.dump(output_data, f)
        # Random 0-2 s pause between requests.
        time.sleep(random.randint(0, 2))

    return output_data


In [None]:
# Notebook-level result containers, all starting out empty.
categories, all_items = [], []
all_items_details = dict()

In [None]:
# Classification pages to crawl; uncomment an entry to include it in the run.
classifications = [
    # "https://dimensions.com/classifications/humans",
    # "https://dimensions.com/classifications/animals",
    "https://dimensions.com/classifications/plants",
    # "https://dimensions.com/classifications/objects",
    # "https://dimensions.com/classifications/furniture",
    # "https://dimensions.com/classifications/fixtures",
    # "https://dimensions.com/classifications/layouts",
    # "https://dimensions.com/classifications/buildings",
    # "https://dimensions.com/classifications/transport",
    # "https://dimensions.com/classifications/sports",
    # "https://dimensions.com/classifications/digital",
    # "https://dimensions.com/classifications/pop-culture"
]

# Three-level crawl: classification page -> collection pages -> item pages.
# Each level checkpoints through scrape_urls under its own name:
#   "<classification>_categories"   links found on the classification page
#   "<classification>"              links found on each collection page
#   "<classification>/<collection>" details of every item in a collection
# The first level previously shared the "<classification>" name with the
# second, so the second call loaded the first call's pickle and index and
# skipped its own first URL.
for classification_url in classifications:
    categories = scrape_urls(
        [classification_url],
        scrape_categories,
        f"{classification_url.split('/')[-1]}_categories",
    )
    for category in categories:
        for page_url in category:
            classification_name = page_url.split('/')[-1]
            # sorted() gives a stable order across interpreter runs; plain
            # set iteration order can change, which would make the
            # index-based resume in scrape_urls skip the wrong URLs.
            unique_urls = sorted(set(category[page_url]['details']))
            classification_items = scrape_urls(unique_urls, scrape_individual_items, classification_name)
            for item in tqdm(classification_items):
                for collection in item:
                    collection_name = collection.split('/')[-1]
                    unique_urls = sorted(set(item[collection]['details']))
                    scrape_urls(unique_urls, scrape_details, f"{classification_name}/{collection_name}")


In [None]:
# For every page URL recorded in `categories`, reload the checkpoint pickle
# named after the URL's last path segment.
for category in categories:
    for classification in category:
        classification_name = classification.rsplit('/', 1)[-1]
        classification_items = read_pickle(classification_name)
        