In [19]:
import re
import requests
import os
from datetime import datetime

from bs4 import BeautifulSoup
from bs4.element import Tag

import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [20]:
def get_description(breed_soup):
    try:
        first_part = breed_soup.find(
                'div', class_='breed-info__content-wrap'
        ).get_text().strip()
    except:
        first_part = ''
    
    try:
        second_part = breed_soup.find(
                'div', class_='breed-hero__footer'
        ).get_text().strip()
    except:
        second_part = ''
    
    description = ' '.join([first_part, second_part])
    
    # Removing weird characters
    # Probably not exhaustive
    description = description.replace(
        '\n', '').replace('\u200b', '').replace('\xa0', ' ')
    return description

In [21]:
def get_temperament(breed_soup):
    first_part = 'attribute-list__description attribute-list__text '
    second_part = 'attribute-list__text--lg mb4 bpm-mb5 pb0 d-block'
    class_ = first_part + second_part
    try:
        return breed_soup.find(
            'span', class_=class_
        ).get_text()
    except:
        return ''

In [22]:
def get_popularity(popularity_span):
    pop_text = popularity_span.get_text()
    return {'popularity': pop_text.split()[1]}

In [23]:
def general_regex(text, var, mul=1):
    reg = re.compile('(\d+\.?\d*)')
    results = reg.findall(text)
    numbers = [float(value) * mul for value in results]
    if len(numbers) == 1:
        numbers = numbers * 2
    elif len(numbers) == 0:
        numbers = [0, 0]
    return {
        'min_{}'.format(var): min(numbers),
        'max_{}'.format(var): max(numbers)
    }

In [24]:
def get_height(height_span):
    ht_text = height_span.get_text()
    
     # one inch corresponds to 2.54 cm
    return general_regex(ht_text, 'height', 2.54)

In [25]:
def get_weight(weight_span):
    wt_text = weight_span.get_text()
    
     # one pound corresponds to 0.45359237 kg
    return general_regex(wt_text, 'weight', 0.45359237) 

In [26]:
def get_expectancy(expectancy_span):
    exp_text = expectancy_span.get_text()
    return general_regex(exp_text, 'expectancy') 

In [27]:
def get_group(group_span):
    return {'group': group_span.get_text()}

In [28]:
attr_function = {
    'AKC Breed Popularity': get_popularity,
    'Height': get_height,
    'Weight': get_weight,
    'Life Expectancy': get_expectancy,
    'Group': get_group
}

def get_main_attributes(breed_soup):
        
    breed_attr_terms = breed_soup.find_all(
        'span', class_='attribute-list__term attribute-list__text'
    )
    # When pressent, the first span is the temperament
    if 'Temperament' in breed_attr_terms[0].get_text():
        breed_attr_terms = breed_attr_terms[1:]
    
    breed_attr_values = breed_soup.find_all(
        'span', class_='attribute-list__description attribute-list__text'
    )
    
    attributes = {}
    
    for term_span, value_span in zip (breed_attr_terms, breed_attr_values):
        term = term_span.get_text().replace(':', '')
        attributes.update(attr_function[term](value_span))
    
    return attributes

In [29]:
def get_care_info(breed_soup):
    titles = breed_soup.find_all(
        'h4', class_='bar-graph__title'
    )
    
    values = breed_soup.find_all(
        'div', class_='bar-graph__section'
    )
    
    categories = breed_soup.find_all(
        'div', class_='bar-graph__text'
    )
    
    care_dict = {}
    
    for (title, value, category) in zip (titles, values, categories):
        t = title.get_text().lower().replace(' ', '_')
        t = t[t.find('/') + 1:]
        care_dict[t + '_value'] = float(
            value['style'].split()[1].split('%')[0]
        ) / 100
        care_dict[t + '_category'] = category.get_text()
    
    return care_dict

In [30]:
class Breed:
    def __init__(self, url):
        self.url = url
        breed_page = requests.get(url)
        breed_soup = BeautifulSoup(breed_page.content, 'html.parser')

        self.breed_info = {}
        self.breed_info['description'] = get_description(breed_soup)    
        self.breed_info['temperament'] = get_temperament(breed_soup)
        self.breed_info.update(get_main_attributes(breed_soup))
        self.breed_info.update(get_care_info(breed_soup))
        
    def get_info(self):  
        return self.breed_info

In [37]:
def get_data():
    page = requests.get('https://www.akc.org/dog-breeds/')
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # An HTML select tag with all the breeds and their urls
    breed_select = soup.find('select', id='breed-search')
    
    # Keeping only children from breed_select which are actually breeds
    breeds = [
        tag for tag in breed_select.children if type(tag) is Tag and tag['value']
    ]
    
    breed_dict = {
        breed.get_text(): Breed(breed['value']).get_info(
        ) for breed in tqdm(breeds)
    }
    
    return breed_dict

In [38]:
breed_dict = get_data()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  ) for breed in tqdm(breeds)


  0%|          | 0/283 [00:00<?, ?it/s]

IndexError: list index out of range

In [33]:
breed_df = pd.DataFrame.from_dict(
    breed_dict, orient='index'
)

NameError: name 'breed_dict' is not defined