In [1]:
import re
import requests
import os
from datetime import datetime

from bs4 import BeautifulSoup, Comment
from bs4.element import Tag

import pandas as pd
from tqdm import tqdm

In [2]:
def get_description(breed_soup):
    """Return the breed's "about" paragraph as one cleaned-up line of text.

    Args:
        breed_soup: Parsed breed page. Only its ``find`` method is used.

    Returns:
        str: The text of the ``<p>`` element with class
        ``breed-page__about__read-more__text``, with surrounding whitespace
        stripped, newlines and zero-width spaces removed, and non-breaking
        spaces replaced by regular spaces. ``''`` when no such element exists.
    """
    paragraph = breed_soup.find(
        'p', class_='breed-page__about__read-more__text'
    )
    # Test for the missing element explicitly. The previous bare ``except:``
    # also hid KeyboardInterrupt and genuine programming errors.
    if paragraph is None:
        return ''

    description = paragraph.get_text().strip()

    # Removing weird characters
    # Probably not exhaustive
    return description.replace(
        '\n', '').replace('\u200b', '').replace('\xa0', ' ')

In [3]:
def get_temperament(breed_soup):
    """Return the breed's temperament words as a comma-separated string.

    Args:
        breed_soup: Parsed breed page. Only its ``find`` method is used.

    Returns:
        str: Text of the temperament ``<p>`` element with every ``' / '``
        separator replaced by ``','``. ``''`` when the element is absent.
    """
    # The selector is runtime data and is kept exactly as written, including
    # the "temperment" spelling (presumably it mirrors the site's markup).
    temperment_class = 'breed-page__intro__temperment'
    element = breed_soup.find('p', class_=temperment_class)
    # Explicit check instead of a bare ``except:`` so unrelated errors
    # (and KeyboardInterrupt) are no longer silently swallowed.
    if element is None:
        return ''
    return element.get_text().replace(' / ', ',')

In [4]:
def general_regex(text, var, mul=1):
    """Extract the numeric range mentioned in ``text``.

    Every integer or decimal number in ``text`` is converted to ``float`` and
    multiplied by ``mul``; the smallest and largest are returned.

    Args:
        text: String to search for numbers.
        var: Suffix used to build the result keys (``min_<var>``/``max_<var>``).
        mul: Multiplier applied to every number found. Defaults to 1.

    Returns:
        dict: ``{'min_<var>': ..., 'max_<var>': ...}``. With exactly one number
        both entries hold it; with no numbers both entries are ``0``.
    """
    # Raw string: in a plain literal '\d' and '\.' are invalid escape
    # sequences, which newer Python versions warn about. Pattern is unchanged.
    reg = re.compile(r'(\d+\.?\d*)')
    numbers = [float(value) * mul for value in reg.findall(text)]
    if len(numbers) == 1:
        # A single figure is treated as both the minimum and the maximum.
        numbers = numbers * 2
    elif len(numbers) == 0:
        numbers = [0, 0]
    return {
        'min_{}'.format(var): min(numbers),
        'max_{}'.format(var): max(numbers)
    }

In [5]:
# Define scraping functions for each attribute
def get_popularity(attribute):
    """Extract the popularity rank from an attribute string.

    The text after the first ``':'`` is taken (the whole string when there is
    no colon), the word ``'Ranks'`` is removed, and everything from the first
    ``'of'`` onward is dropped.

    Args:
        attribute: Attribute text, presumably shaped like
            ``'<label>: Ranks 148 of 200'`` (verify against the page source).

    Returns:
        str: The rank with surrounding whitespace removed (e.g. ``'148'``),
        or ``''`` when nothing follows the label.
    """
    label, colon, remainder = attribute.partition(':')
    if not colon:
        # No label separator: treat the whole text as the value instead of
        # failing with IndexError.
        remainder = label
    # Strip last, so the rank no longer comes back with leading spaces
    # (previously e.g. '  148').
    return remainder.replace('Ranks', '').split('of')[0].strip()

def get_height(attribute):
    """Parse a height attribute into ``min_height`` / ``max_height`` values."""
    height_range = general_regex(attribute, 'height')
    return height_range

def get_weight(attribute):
    """Parse a weight attribute into ``min_weight`` / ``max_weight`` values."""
    weight_range = general_regex(attribute, 'weight')
    return weight_range

def get_expectancy(attribute):
    """Parse a life-expectancy attribute into ``min_expectancy`` / ``max_expectancy``."""
    expectancy_range = general_regex(attribute, 'expectancy')
    return expectancy_range

def get_group(attribute):
    """Return the text between the first and second ``':'``, whitespace-trimmed.

    Raises:
        IndexError: If ``attribute`` contains no ``':'``.
    """
    fields = attribute.split(':')
    group_name = fields[1]
    return group_name.strip()

# Dispatch table: attribute name as it appears in the page's metadata ->
# function that parses that attribute's text.
attr_function = dict(
    akc_breed_popularity=get_popularity,
    height=get_height,
    weight=get_weight,
    life_expectancy=get_expectancy,
    group=get_group,
)

# Function to extract attributes from a comment
def get_attributes_from_comment(comment):
    """Collect the recognised breed attributes embedded in an HTML comment.

    The comment text is parsed as HTML; every ``attribute`` element inside a
    ``dataobject`` element with ``type="document"`` is inspected, and those
    whose ``name`` is a key of ``attr_function`` are parsed with the matching
    function.

    Args:
        comment: Comment text containing the markup to parse.

    Returns:
        dict: Parsed values. ``height``, ``weight`` and ``life_expectancy``
        contribute the keys of the dict their parser returns; every other
        attribute is stored under its own name.
    """
    # Attributes whose parser returns a dict of several columns.
    multi_value_names = ['height', 'weight', 'life_expectancy']
    collected = {}

    parsed = BeautifulSoup(comment, 'html.parser')

    for document in parsed.find_all('dataobject', type='document'):
        for node in document.find_all('attribute'):
            name = node.get('name')
            value = node.get_text(strip=True)

            # Ignore attributes that have no registered parser.
            if name not in attr_function:
                continue

            parsed_value = attr_function[name](value)
            if name in multi_value_names:
                collected.update(parsed_value)
            else:
                collected[name] = parsed_value

    return collected

In [6]:
# NOTE: this module-level name is never read by get_main_attributes (the
# function assigns its own local of the same name). It is kept only in case
# other cells refer to it.
breed_attributes = []

def get_main_attributes(breed_soup):
    """Extract the main breed attributes from the page's ``pagemap`` comment.

    Every HTML comment in the page is parsed; for each one that contains a
    ``pagemap`` element, the attributes are read with
    ``get_attributes_from_comment``. If several comments qualify, the last
    one wins.

    Args:
        breed_soup: Parsed breed page. Only its ``find_all`` method is used.

    Returns:
        dict: Attributes from the matching comment, or an empty dict when no
        comment contains a ``pagemap`` element.
    """
    # Start from an empty dict. Without this, a page that has no pagemap
    # comment left the local name unassigned and the print/return below
    # raised UnboundLocalError.
    breed_attributes = {}

    # Find all comments in the HTML
    comments = breed_soup.find_all(string=lambda text: isinstance(text, Comment))

    # Iterate through comments
    for comment in comments:
        comment_content = BeautifulSoup(comment, 'html.parser')

        if comment_content.find('pagemap'):
            breed_attributes = get_attributes_from_comment(comment)

    print('breed_attributes', breed_attributes)
    return breed_attributes

In [7]:
def get_care_info(breed_soup):
    """Read the page's bar-graph widgets into a flat dict.

    Titles (``h4.bar-graph__title``), bars (``div.bar-graph__section``) and
    labels (``div.bar-graph__text``) are matched up by position.

    Args:
        breed_soup: Parsed breed page. Only its ``find_all`` method is used.

    Returns:
        dict: For each graph, ``<title>_value`` (the percentage taken from the
        second whitespace-separated token of the bar's ``style`` attribute,
        divided by 100) and ``<title>_category`` (the label text). ``<title>``
        is the lower-cased title with spaces turned into underscores, keeping
        only what follows the first ``'/'`` if there is one.
    """
    def find_by_class(tag_name, css_class):
        return breed_soup.find_all(tag_name, class_=css_class)

    headings = find_by_class('h4', 'bar-graph__title')
    bars = find_by_class('div', 'bar-graph__section')
    labels = find_by_class('div', 'bar-graph__text')

    care_dict = {}

    for heading, bar, label in zip(headings, bars, labels):
        key = heading.get_text().lower().replace(' ', '_')
        # find() returns -1 when there is no '/', so the slice then starts at
        # 0 and keeps the whole key.
        slash_at = key.find('/')
        key = key[slash_at + 1:]

        percent_text = bar['style'].split()[1].partition('%')[0]
        care_dict[key + '_value'] = float(percent_text) / 100
        care_dict[key + '_category'] = label.get_text()

    print('care_dict', care_dict)

    return care_dict

In [8]:
# Working
def get_breed_characteristics(breed_soup):
    """Collect the trait scores shown in the page's tabbed traits section.

    Each ``li.tabs__single-tab`` whose text does not contain ``'Traits'``
    names a section; the section is looked up by the id
    ``breed-page__traits__<tab text lower-cased, spaces -> hyphens>``.

    Args:
        breed_soup: Parsed breed page (``find`` / ``find_all`` are used).

    Returns:
        dict: Trait name -> value. ``'Coat Type'`` and ``'Coat Length'`` map
        to a comma-joined string of the selected choices; every other trait
        maps to the number of filled score units (an int).
    """
    breed_characteristics = {}
    tab_labels = [
        tab_element.getText()
        for tab_element in breed_soup.find_all('li', class_='tabs__single-tab')
    ]
    for tab_label in tab_labels:
        if 'Traits' in tab_label:
            continue

        section_id = f'breed-page__traits__{tab_label.lower().replace(" ", "-")}'
        print(section_id)
        section = breed_soup.find('div', id=section_id)
        if section is None:
            # A tab without a matching section used to raise AttributeError
            # on None and abort the scrape; skip it instead.
            continue

        for breed_trait in section.find_all('div', class_='breed-trait-group__trait'):
            header = breed_trait.find('h4', class_='accordion__header__text')
            if header is None:
                # No title means there is no key to store the value under.
                continue
            trait = header.getText()
            print('trait - jashp', trait)
            if trait in ['Coat Type', 'Coat Length']:
                # Multiple-choice traits: join the text of the selected options.
                selected = breed_trait.find_all(class_='breed-trait-score__choice--selected')
                trait_info = ",".join(choice.find('span').text for choice in selected)
            else:
                # Scored traits: the score is the count of filled units.
                trait_info = len(breed_trait.find_all('div', class_='breed-trait-score__score-unit breed-trait-score__score-unit--filled'))
            breed_characteristics[trait] = trait_info
    print(breed_characteristics)
    return breed_characteristics

In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

class Breed:
    """Scrape a single breed page and hold the extracted fields.

    Constructing an instance loads ``breed_url`` in a headless Chrome browser,
    parses the resulting HTML and fills ``breed_info``.
    """

    def __init__(self, breed_url, breed_name):
        """Load the breed page and scrape it.

        Args:
            breed_url: Address of the breed page to load.
            breed_name: Name stored under the ``'breed'`` key of ``breed_info``.
        """
        self.url = breed_url
        self.name = breed_name

        # The browser is closed as soon as the HTML has been captured; the
        # parsing below works on the captured string only.
        page_source = self._fetch_page_source(breed_url)

        # Create a BeautifulSoup object from the page source
        breed_soup = BeautifulSoup(page_source, 'html.parser')

        # Scrape content as needed
        self.breed_info = {}
        self.breed_info['breed'] = self.name
        self.breed_info['description'] = get_description(breed_soup)
        self.breed_info['temperament'] = get_temperament(breed_soup)
        self.breed_info.update(get_main_attributes(breed_soup))
        self.breed_info.update(get_care_info(breed_soup))
        self.breed_info.update(get_breed_characteristics(breed_soup))

    @staticmethod
    def _fetch_page_source(url):
        """Return the HTML of ``url`` as rendered by a headless Chrome browser."""
        # Set up a headless Chrome browser
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # Run in headless mode (no GUI)
        driver = webdriver.Chrome(options=options)
        try:
            # Load the web page. No explicit wait is performed, so the source
            # is whatever the browser has once ``get`` returns.
            driver.get(url)
            return driver.page_source
        finally:
            # Always close the browser. Previously an exception while loading
            # or scraping skipped quit() and left a Chrome process running.
            driver.quit()

    def get_breed_info(self):
        """Return the dict of scraped fields built by ``__init__``."""
        return self.breed_info


In [10]:
import requests
from bs4 import BeautifulSoup
from bs4 import Tag
from tqdm import tqdm  # Import tqdm for the progress bar

def get_data(start=150, stop=200, timeout=30):
    """Scrape a slice of the breeds listed on the AKC breed index page.

    The breed list is read from the ``<select id="breed-search">`` element:
    every child tag with a non-empty ``value`` attribute is a breed, its text
    being the name and its ``value`` the page URL.

    Args:
        start: Index of the first breed to scrape (slice semantics; ``None``
            starts at the beginning). Defaults to 150, the previously
            hard-coded value.
        stop: Index one past the last breed to scrape (``None`` runs to the
            end). Defaults to 200, the previously hard-coded value.
        timeout: Seconds to wait for the index page before giving up.

    Returns:
        dict: Breed name -> ``breed_info`` dict produced by ``Breed``.

    Raises:
        requests.RequestException: If the index page cannot be fetched or
            answers with an HTTP error status.
        RuntimeError: If the breed ``<select>`` element is not in the page.
    """
    # Without a timeout a stalled connection would hang the notebook forever.
    page = requests.get('https://www.akc.org/dog-breeds/', timeout=timeout)
    # Fail here on an error response rather than later while parsing it.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    breed_select = soup.find('select', id='breed-search')
    if breed_select is None:
        raise RuntimeError(
            "Could not find <select id='breed-search'> on the breed index page"
        )

    breeds = [
        tag for tag in breed_select.children
        if isinstance(tag, Tag) and 'value' in tag.attrs and tag['value']
    ]

    print(len(breeds))

    breed_dict = {}

    # Use tqdm to create a progress bar
    for breed in tqdm(breeds[start:stop], desc='scraping Breeds'):
        breed_name = breed.get_text()
        breed_url = breed['value']
        breed_info = Breed(breed_url, breed_name).get_breed_info()
        breed_dict[breed_name] = breed_info

    return breed_dict

# NOTE: get_data() relies on the Breed class defined in the previous cell; run that cell first.

# Call get_data() to start the scraping process

In [None]:
# Run the scrape: the result maps each breed name to its dict of scraped fields.
# NOTE(review): the saved output below shows a 100-item progress bar, while
# get_data() slices breeds[150:200] (50 items) by default, so this output
# appears to come from a different version of the function. Re-run to refresh.
breed_dict = get_data()

288


scraping Breeds:   0%|                                                                         | 0/100 [00:00<?, ?it/s]

breed_attributes {'akc_breed_popularity': '  148', 'min_height': 9.0, 'max_height': 11.5, 'min_weight': 7.0, 'max_weight': 10.0, 'min_expectancy': 12.0, 'max_expectancy': 15.0, 'group': 'Toy Group'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 3, 'Good With Young Children': 3, 'Good With Other Dogs': 3, 'Shedding Level': 3, 'Coat Grooming Frequency': 3,

scraping Breeds:   1%|▋                                                                | 1/100 [00:11<18:34, 11.26s/it]

breed_attributes {'akc_breed_popularity': '  117', 'min_height': 25.0, 'max_height': 27.0, 'min_weight': 50.0, 'max_weight': 60.0, 'min_expectancy': 12.0, 'max_expectancy': 18.0, 'group': 'Hound Group'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 3, 'Good With Young Children': 3, 'Good With Other Dogs': 3, 'Shedding Level': 1, 'Coat Grooming Frequency'

scraping Breeds:   2%|█▎                                                               | 2/100 [00:22<18:18, 11.21s/it]

breed_attributes {'akc_breed_popularity': '  65', 'min_height': 23.0, 'max_height': 23.0, 'min_weight': 50.0, 'max_weight': 70.0, 'min_expectancy': 11.0, 'max_expectancy': 14.0, 'group': 'Terrier Group'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 3, 'Good With Young Children': 3, 'Good With Other Dogs': 3, 'Shedding Level': 1, 'Coat Grooming Frequency

scraping Breeds:   3%|█▉                                                               | 3/100 [00:34<18:42, 11.57s/it]

breed_attributes {'akc_breed_popularity': '  55', 'min_height': 24.0, 'max_height': 28.0, 'min_weight': 70.0, 'max_weight': 130.0, 'min_expectancy': 10.0, 'max_expectancy': 14.0, 'group': 'Working Group'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 3, 'Good With Young Children': 3, 'Good With Other Dogs': 1, 'Shedding Level': 3, 'Coat Grooming Frequenc

scraping Breeds:   4%|██▌                                                              | 4/100 [00:47<19:40, 12.29s/it]

breed_attributes {'akc_breed_popularity': '', 'min_height': 12.0, 'max_height': 17.0, 'min_weight': 6.0, 'max_weight': 25.0, 'min_expectancy': 13.0, 'max_expectancy': 16.0, 'group': 'Foundation Stock Service'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 3, 'Good With Young Children': 3, 'Good With Other Dogs': 5, 'Shedding Level': 3, 'Coat Grooming Fre

scraping Breeds:   5%|███▎                                                             | 5/100 [01:00<19:51, 12.54s/it]

breed_attributes {'akc_breed_popularity': '  67', 'min_height': 23.0, 'max_height': 25.0, 'min_weight': 75.0, 'max_weight': 85.0, 'min_expectancy': 10.0, 'max_expectancy': 14.0, 'group': 'Working Group'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 3, 'Good With Young Children': 3, 'Good With Other Dogs': 3, 'Shedding Level': 3, 'Coat Grooming Frequency

scraping Breeds:   6%|███▉                                                             | 6/100 [01:12<19:06, 12.20s/it]

breed_attributes {'akc_breed_popularity': '', 'min_height': 20.0, 'max_height': 25.0, 'min_weight': 60.0, 'max_weight': 100.0, 'min_expectancy': 10.0, 'max_expectancy': 12.0, 'group': 'Foundation Stock Service'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 0, 'Good With Young Children': 3, 'Good With Other Dogs': 3, 'Shedding Level': 2, 'Coat Grooming F

scraping Breeds:   7%|████▌                                                            | 7/100 [01:23<18:25, 11.89s/it]

breed_attributes {'akc_breed_popularity': '  182', 'min_height': 23.0, 'max_height': 26.0, 'min_weight': 45.0, 'max_weight': 65.0, 'min_expectancy': 11.0, 'max_expectancy': 12.0, 'group': 'Hound Group'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 3, 'Good With Young Children': 3, 'Good With Other Dogs': 5, 'Shedding Level': 2, 'Coat Grooming Frequency'

scraping Breeds:   8%|█████▏                                                           | 8/100 [01:35<18:06, 11.81s/it]

breed_attributes {'akc_breed_popularity': '  123', 'min_height': 9.0, 'max_height': 19.0, 'min_weight': 6.0, 'max_weight': 35.0, 'min_expectancy': 13.0, 'max_expectancy': 15.0, 'group': 'Non-Sporting Group'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 5, 'Good With Young Children': 5, 'Good With Other Dogs': 3, 'Shedding Level': 3, 'Coat Grooming Frequ

scraping Breeds:   9%|█████▊                                                           | 9/100 [01:46<17:38, 11.63s/it]

breed_attributes {'akc_breed_popularity': '  196', 'min_height': 21.0, 'max_height': 25.0, 'min_weight': 60.0, 'max_weight': 70.0, 'min_expectancy': 11.0, 'max_expectancy': 13.0, 'group': 'Hound Group'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 3, 'Good With Young Children': 5, 'Good With Other Dogs': 5, 'Shedding Level': 3, 'Coat Grooming Frequency'

scraping Breeds:  10%|██████▍                                                         | 10/100 [02:08<22:18, 14.87s/it]

breed_attributes {'akc_breed_popularity': '  120', 'min_height': 12.0, 'max_height': 16.0, 'min_weight': 12.0, 'max_weight': 28.0, 'min_expectancy': 14.0, 'max_expectancy': 16.0, 'group': 'Terrier Group'}
care_dict {}
breed-page__traits__family-life
trait - jashp Affectionate With Family
trait - jashp Good With Young Children
trait - jashp Good With Other Dogs
breed-page__traits__physical
trait - jashp Shedding Level
trait - jashp Coat Grooming Frequency
trait - jashp Drooling Level
trait - jashp Coat Type
trait - jashp Coat Length
breed-page__traits__social
trait - jashp Openness To Strangers
trait - jashp Playfulness Level
trait - jashp Watchdog/Protective Nature
trait - jashp Adaptability Level
breed-page__traits__personality
trait - jashp Trainability Level
trait - jashp Energy Level
trait - jashp Barking Level
trait - jashp Mental Stimulation Needs
{'Affectionate With Family': 5, 'Good With Young Children': 5, 'Good With Other Dogs': 3, 'Shedding Level': 1, 'Coat Grooming Frequenc

In [None]:
# One row per breed: the outer keys (breed names) become the index and each
# inner breed_info dict supplies that row's columns.
breed_df = pd.DataFrame.from_dict(breed_dict, orient='index')

In [None]:
# Display the assembled table as this cell's output.
breed_df

In [None]:
# Output workbook path (relative, so it resolves against the working directory).
excel_file_path = 'to_excel_3.xlsx'

In [None]:
# index=False leaves the breed-name index out of the sheet; the name is still
# written through the 'breed' column that Breed.__init__ stores in breed_info.
breed_df.to_excel(excel_file_path, index=False)