In [1]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import requests
import re
import os

#### Gathering the data

Using various lists online, together with my own observations of birds, I estimated that the nineteen most common birds in London are:
- Wren
- Robin
- House sparrow
- Wood pigeon
- Rock dove/feral pigeon
- Collared dove
- Blackbird
- Blue tit
- Great tit
- Coal tit
- Chaffinch
- Greenfinch
- Goldfinch
- Starling
- Herring gull
- Black-headed gull
- Carrion crow
- Jackdaw
- Magpie

I'll get data on each of these birds by scraping its species page on the RSPB website.

In [1]:
## RSPB species pages to scrape, one per bird.
## The order is approximately most common to least common.
bird_urls = [
    f'https://www.rspb.org.uk/birds-and-wildlife/wildlife-guides/bird-a-z/{slug}/'
    for slug in (
        'wren',
        'robin',
        'house-sparrow',
        'woodpigeon',
        'blackbird',
        'chaffinch',
        'blue-tit',
        'great-tit',
        'starling',
        'goldfinch',
        'collared-dove',
        'coal-tit',
        'rock-dove',
        'herring-gull',
        'jackdaw',
        'carrion-crow',
        'black-headed-gull',
        'greenfinch',
        'magpie',
    )
]

In [2]:
len(bird_urls) ## Should be 19 - one URL for each species in the list above

19

In [191]:
## Testing that I can connect to a single page
site = bird_urls[0] ## Wren RSPB page
## Only the URL is passed here: the second positional argument of requests.get is `params`
## (the query string), not a parser name. The HTML parser is chosen later by BeautifulSoup.
resp = requests.get(site, timeout=30) ## The timeout stops the cell hanging forever if the server never answers
resp ## Response code of 200 indicates that the page was fetched, so scraping will work

<Response [200]>

In [11]:
bird_data = [] ## One row (a plain list) per species variation; filled in by the loop below
colours = ['Black','Brown','Grey','Yellow','Orange','Red','Pink','Blue','Green', 'White'] ## List of various colours - used to split the beak tags into colour vs shape


def _tag_texts(tags_block):
    """Return the text of every tag <span> found inside one 'filter-block__tags-block' element."""
    return [tag.text for tag in tags_block.find_all('span', class_='filter-block__tags-block__tag')]


for link in bird_urls:
    ## Only the URL is passed: the second positional argument of requests.get is `params`
    ## (the query string), not a parser name. The parser is selected by BeautifulSoup below.
    resp = requests.get(link, timeout=30) ## The timeout stops the loop hanging forever on an unresponsive server
    resp.raise_for_status() ## Stop with a clear HTTP error rather than failing later while parsing an error page
    soup = BeautifulSoup(resp.text, 'html.parser') ## Parsing the page's HTML with BeautifulSoup

    species = soup.find('h1', class_='species-hero__page-title').text ## Species of bird
    sci_name = soup.find('li', class_='species-hero__stats-item').strong.text ## Scientific name
    diet = soup.find('div', class_='key-information__section').p.text ## Dietary information

    ## Various physical measurements; the entries are read as [0] length, [1] wingspan, [2] weight
    measurements = soup.find('div', class_='species-measurements-population__measurements').find_all('dl', class_='species-measurements-population__details')
    length_cm = measurements[0].find('dd', class_='species-measurements-population__details-content').text  ## Length of the bird in centimetres
    length_cm = re.search(r'([\d\.?]*-*[\d\.?]+\s?cm)',length_cm).group(1) ## Keep only the 'N cm' / 'N-M cm' part
    wingspan_cm = measurements[1].find('dd', class_='species-measurements-population__details-content').text ## Wingspan of the bird in centimetres
    wingspan_cm = re.search(r'([\d\.?]*-*[\d\.?]+\s?cm)',wingspan_cm).group(1)
    weight_g = measurements[2].find('dd', class_='species-measurements-population__details-content').text ## Weight of the bird in grams
    weight_g = re.search(r'([\d\.?]*-*[\d\.?]+\s?g)',weight_g).group(1)

    population = soup.find('div', class_='species-measurements-population__population').find_all('dd', class_='species-measurements-population__details-content') ## Population measurements
    breeding_pop = population[0].text   ## Breeding population: the number of pairs breeding annually
    breeding_pop = re.search(r'([\d+\.?]+[\d+,?]+\s\w+)',breeding_pop).group(1) ## Keep the count plus the word after it (e.g. 'pairs', 'territories')
    try:
        wintering_pop = population[1].text ## Wintering population: the number of individuals present from October to March
        wintering_pop = re.search(r'([\d+\.?]+[\d+,?]+\s\w+)',wintering_pop).group(1)
    except (IndexError, AttributeError):
        ## IndexError: there is no second population entry (resident birds may not have a wintering population)
        ## AttributeError: the entry exists but re.search found no count in it, so it returned None
        wintering_pop = np.nan  ## np.nan rather than np.NAN: the upper-case alias no longer exists in NumPy 2.0
    ## NOTE: For blue tit and black-headed gull, population[2] would be Europe. It could feasibly be Africa or Asia for other birds on the RSPB website.
    ## NOTE: This code only works as is because I'm looking at common UK species - they all have a breeding population in the UK; Birds such as the aquatic warbler have 'UK Passage' as population[0]

    physical_feat = soup.find_all('div', class_='key-information__species-variation__features') ## Various physical characteristics

    num_species_variations = len(physical_feat) ## The number of significant intra-species variations arising from age, sexual dimorphism, season, etc
    variation_names = soup.find_all('h4', class_='key-information__species-variation__heading') ## The name for each variation
    variation_names = [x.text for x in variation_names]

    for i in range(num_species_variations):
        variation = variation_names[i] ## The specific variation
        ## The tag blocks are read in a fixed order: [0] feathers, [1] legs, [2] beak, [3] habitats
        identifiers = physical_feat[i].find_all('div', class_='filter-block__tags-block')
        feather_col = _tag_texts(identifiers[0]) ## Feather colours
        leg_col = _tag_texts(identifiers[1]) ## Leg colours
        beak_char = _tag_texts(identifiers[2]) ## All the beak characteristics
        beak_col = [word for word in beak_char if word in colours] ## Beak colours
        beak_shape = [word for word in beak_char if word not in beak_col] ## Beak shape
        habitats = _tag_texts(identifiers[3]) ## Natural habitats

        ## All the information for this variation of this bird, held in an ordinary variable
        ## (no dynamically named entries written into the namespace through locals())
        bird_info = [species,sci_name,diet,length_cm,wingspan_cm,weight_g,breeding_pop,wintering_pop,variation,feather_col,leg_col,beak_col,beak_shape,habitats]

        bird_data.append(bird_info) ## Adding our list to the big bird data

In [195]:
## NOTE(review): the saved output below looks like it comes from an earlier run - its rows contain no diet entry,
## whereas the scraping loop stores one in every row. Re-run this cell to refresh it.
bird_data ## Checking that everything has worked

[['Wren',
  'Troglodytes troglodytes',
  '9-10cm',
  '13-17cm',
  '7-12g',
  '11,000,000 territories',
  nan,
  'Wren',
  ['Brown', 'Cream/buff', 'White'],
  ['Brown', 'Pink'],
  ['Black'],
  ['Short', 'Curved', 'Thin'],
  ['Woodland', 'Farmland', 'Heathland', 'Urban and suburban']],
 ['Robin',
  'Erithacus rubecula',
  '14cm',
  '20-22cm',
  '14-21g',
  '7,350,000 territories',
  nan,
  'Robin (adult)',
  ['Brown', 'Cream/buff', 'Grey', 'Orange', 'Red', 'White', 'Yellow'],
  ['Brown', 'Pink'],
  ['Black'],
  ['Short', 'Thin'],
  ['Woodland', 'Farmland', 'Urban and suburban']],
 ['Robin',
  'Erithacus rubecula',
  '14cm',
  '20-22cm',
  '14-21g',
  '7,350,000 territories',
  nan,
  'Robin (juvenile)',
  ['Brown', 'Cream/buff', 'Orange', 'White', 'Yellow'],
  ['Brown', 'Pink'],
  ['Black'],
  ['Short', 'Thin'],
  ['Woodland', 'Farmland', 'Urban and suburban']],
 ['House sparrow',
  'Passer domesticus',
  '14-15cm',
  '5cm',
  '24-38g',
  '5,300,000 pairs',
  nan,
  'House sparrow (adult

In [12]:
## Column labels, in the same order as the fields stored in each row of bird_data
bird_columns = [
    'species',
    'scientific_name',
    'diet',
    'length_(cm)',
    'wingspan_(cm)',
    'weight_(g)',
    'population_breeding',
    'population_wintering',
    'variation',
    'feather_colour',
    'leg_colour',
    'beak_colour',
    'beak_shape',
    'natural_habitats',
]

## Build the pandas dataframe: one row per species variation
df = pd.DataFrame(bird_data, columns=bird_columns)
df ## Display the dataframe

Unnamed: 0,species,scientific_name,diet,length_(cm),wingspan_(cm),weight_(g),population_breeding,population_wintering,variation,feather_colour,leg_colour,beak_colour,beak_shape,natural_habitats
0,Wren,Troglodytes troglodytes,Insects and spiders.,9-10cm,13-17cm,7-12g,"11,000,000 territories",,Wren,"[Brown, Cream/buff, White]","[Brown, Pink]",[Black],"[Short, Curved, Thin]","[Woodland, Farmland, Heathland, Urban and subu..."
1,Robin,Erithacus rubecula,"Worms, seeds, fruits, insects and other invert...",14cm,20-22cm,14-21g,"7,350,000 territories",,Robin (adult),"[Brown, Cream/buff, Grey, Orange, Red, White, ...","[Brown, Pink]",[Black],"[Short, Thin]","[Woodland, Farmland, Urban and suburban]"
2,Robin,Erithacus rubecula,"Worms, seeds, fruits, insects and other invert...",14cm,20-22cm,14-21g,"7,350,000 territories",,Robin (juvenile),"[Brown, Cream/buff, Orange, White, Yellow]","[Brown, Pink]",[Black],"[Short, Thin]","[Woodland, Farmland, Urban and suburban]"
3,House sparrow,Passer domesticus,Seeds and scraps.,14-15cm,21-25.5cm,24-38g,"5,300,000 pairs",,House sparrow (adult male),"[Black, Brown, Cream/buff, Grey, White]","[Brown, Pink]","[Black, Brown]","[Short, Chunky]","[Farmland, Urban and suburban]"
4,House sparrow,Passer domesticus,Seeds and scraps.,14-15cm,21-25.5cm,24-38g,"5,300,000 pairs",,House sparrow (adult female),"[Brown, Cream/buff, Grey]","[Brown, Pink]","[Black, Brown]","[Short, Chunky]","[Farmland, Urban and suburban]"
5,Woodpigeon,Columba palumbus,"Crops like cabbages, sprouts, peas and grain. ...",40-42cm,75-80cm,480-550g,"5,150,000 pairs",,Woodpigeon (adult),"[Black, Blue, Brown, Grey, Pink/purple, White]","[Brown, Pink, Red]","[Black, Brown, Orange]","[Medium length, Thin]","[Woodland, Farmland, Grassland, Urban and subu..."
6,Woodpigeon,Columba palumbus,"Crops like cabbages, sprouts, peas and grain. ...",40-42cm,75-80cm,480-550g,"5,150,000 pairs",,Woodpigeon (juvenile),"[Black, Blue, Brown, Grey, Pink/purple, White]","[Brown, Pink, Red]","[Black, Brown, Orange]","[Medium length, Thin]","[Woodland, Farmland, Grassland, Urban and subu..."
7,Blackbird,Turdus merula,Blackbird food consists of a variety of insect...,24-25cm,34-38.5cm,80-100g,"5,050,000 pairs",15 million,Blackbird (adult male),"[Black, Brown, White]",[Brown],"[Black, Brown, Orange, Yellow]","[Medium length, Medium thickness]","[Woodland, Farmland, Grassland, Urban and subu..."
8,Blackbird,Turdus merula,Blackbird food consists of a variety of insect...,24-25cm,34-38.5cm,80-100g,"5,050,000 pairs",15 million,Blackbird (juvenile),"[Black, Brown, Cream/buff, Orange, White]",[Brown],"[Black, Brown, Yellow]","[Medium length, Medium thickness]","[Woodland, Farmland, Grassland, Urban and subu..."
9,Chaffinch,Fringilla coelebs,Insects and seeds.,14.5cm,24.5-28.5cm,18-29g,5.05 million,,Chaffinch (male),"[Black, Blue, Brown, Cream/buff, Green, Grey, ...","[Brown, Pink]","[Black, Blue, Brown]","[Short, Chunky]","[Woodland, Farmland, Grassland, Heathland, Urb..."


In [13]:
## Write the dataframe out as a csv file; index=False leaves the row index out of the file
csv_name = 'birds_19_common.csv'
df.to_csv(csv_name, index=False)