# Crawling preparation
Exploratory analysis of sitemap and robots.txt  
Main objective is to estimate number of recipes on each website

### Common

In [1]:
# imports
import urllib.robotparser
import urllib.request

import time
from bs4 import BeautifulSoup

import zlib
import json

In [2]:
pages = [
    'https://www.allrecipes.com',
    'https://www.yummly.com',
    'http://recipes-plus.com',
    'https://www.food.com',
]

In [3]:
def crawlerInfo(url, useragent="*", verbose=False):
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(url+"/robots.txt")
    rp.read()
    rrate = rp.request_rate(useragent)
    if verbose:
        print("url:\t"+url)
        print("useragent:\t"+useragent)
        print("crawl delay:\t"+str(rp.crawl_delay(useragent)))
        print("request rate:\t" + str(rrate))
    return (url, useragent, rp.crawl_delay(useragent), rrate)

In [4]:
crawlerInfo(pages[0])

('https://www.allrecipes.com', '*', 1, None)

In [5]:
def retriveMappedLinks(url):
    req = urllib.request.Request(
            url, 
            data=None, 
            headers={
                'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
            }
        )
    # unzip if required
    if url.find('.gz') >= 0:
        sitemap = urllib.request.urlopen(req).read()
        sitemap = zlib.decompress(sitemap, 16+zlib.MAX_WBITS)
    else:
        sitemap = urllib.request.urlopen(req).read().decode('utf-8')
    # parse xml
    soup = BeautifulSoup(sitemap, "xml") 
    links = list(map(lambda x: x.text, soup.find_all("loc")))
    return links

In [6]:
def parseSitemap(url):
    # get robots and filter sitemap url out of it
    robots = urllib.request.urlopen(url+"/robots.txt").read().decode('utf-8').lower().split('\n')
    robots = list(filter(lambda x: x.find('sitemap')>=0, robots))
    robots = list(map(lambda x: x.split(': '), robots))
    sitemap_url = robots[0][1]
    return retriveMappedLinks(sitemap_url) #return list of list links in sitemap



## allrecipes

In [7]:
inner_sitemaps = parseSitemap(pages[0])
inner_sitemaps

['https://www.allrecipes.com/recipedetail.xml.gz',
 'https://www.allrecipes.com/recipedetail1.xml.gz',
 'https://www.allrecipes.com/recipedetail2.xml.gz',
 'https://www.allrecipes.com/recipedetail3.xml.gz',
 'https://www.allrecipes.com/recipehubs.xml.gz',
 'https://www.allrecipes.com/videodetailext.xml.gz']

In [8]:
result = []
for s in inner_sitemaps[0:4]: # get only recipes, not other pages
    print(s)
    tmp = list(filter(lambda x: x.find('/recipe/') >= 0, retriveMappedLinks(s))) # getall links from sitemap section and filter out all no - recipes
    print(len(tmp))
    result.extend(tmp)
    
print(len(result))

https://www.allrecipes.com/recipedetail.xml.gz
20000
https://www.allrecipes.com/recipedetail1.xml.gz
20000
https://www.allrecipes.com/recipedetail2.xml.gz
20000
https://www.allrecipes.com/recipedetail3.xml.gz
5258
65258


In [9]:
print(len(set(result)))

65258


In [10]:
# list(filter(lambda x: x.find('/recipe/') >= 0, retriveMappedLinks(inner_sitemaps[4])))
hubs = retriveMappedLinks(inner_sitemaps[4])
len(hubs)

2783

In [11]:
hubs[:10]

['https://www.allrecipes.com/recipes/',
 'https://www.allrecipes.com/recipes/81/side-dish/',
 'https://www.allrecipes.com/recipes/78/breakfast-and-brunch/',
 'https://www.allrecipes.com/recipes/145/breakfast-and-brunch/cereals/',
 'https://www.allrecipes.com/recipes/76/appetizers-and-snacks/',
 'https://www.allrecipes.com/recipes/77/drinks/',
 'https://www.allrecipes.com/recipes/79/desserts/',
 'https://www.allrecipes.com/recipes/80/main-dish/',
 'https://www.allrecipes.com/recipes/82/trusted-brands-recipes-and-tips/',
 'https://www.allrecipes.com/recipes/83/everyday-cooking/convenience-cooking/']

## yummly

In [14]:
# https://www.yummly.com/recipes/beets-with-vinegar-and-sugar
urllib.request.urlopen('https://www.yummly.com/recipes/beets-with-vinegar-and-sugar').read().decode('utf-8').count('single-recipe')

74

In [15]:
inner_sitemaps = parseSitemap(pages[1])
inner_sitemaps

['https://www.yummly.com/yummly-pages-recipe-0.xml',
 'https://www.yummly.com/yummly-pages-recipe-1.xml',
 'https://www.yummly.com/yummly-pages-static-0.xml',
 'https://www.yummly.com/yummly-pages-browse-0.xml',
 'https://www.yummly.com/yummly-pages-recipes-0.xml',
 'https://www.yummly.com/yummly-pages-recipes-1.xml',
 'https://www.yummly.com/yummly-pages-recipes-2.xml']

In [16]:
tmp = list(filter(lambda x: x.find('/recipe/') >= 0, retriveMappedLinks(inner_sitemaps[0]))) 
print(len(tmp))

25000


In [17]:
tmp = list(filter(lambda x: x.find('yummly') >= 0, retriveMappedLinks(inner_sitemaps[0]))) 
print(len(tmp))

25000


In [18]:
hubs = []
for i in inner_sitemaps[4:]:
    print(i)
    tmp = retriveMappedLinks(i)
    print(len(tmp))
    hubs.extend(tmp)

print(len(hubs))
hubs[:10]

https://www.yummly.com/yummly-pages-recipes-0.xml
25000
https://www.yummly.com/yummly-pages-recipes-1.xml
25000
https://www.yummly.com/yummly-pages-recipes-2.xml
19148
69148


['https://www.yummly.com/recipes/cold-water-pastry',
 'https://www.yummly.com/recipes/spinach-mozzarella-pasta',
 'https://www.yummly.com/recipes/corn-flour-eggs',
 'https://www.yummly.com/recipes/shredded-potato-balls',
 'https://www.yummly.com/recipes/protein-pancakes-without-eggs',
 'https://www.yummly.com/recipes/hamburger-biscuits',
 'https://www.yummly.com/recipes/mexican-vegetable-side-dishes',
 'https://www.yummly.com/recipes/cheerio-bars-without-peanut-butter',
 'https://www.yummly.com/recipes/hamburger-mushroom-casserole',
 'https://www.yummly.com/recipes/beets-with-vinegar-and-sugar']

In [19]:
folder = "scraping/yummly"
with open(folder + "/" + "hubs.txt", 'w+') as f:
    for h in hubs:
        f.write(h+'\n')

## recipes plus

In [20]:
inner_sitemaps = parseSitemap(pages[2])
inner_sitemaps

['http://recipes-plus.com/collection-sitemap.xml',
 'http://recipes-plus.com/recipes-sitemap.xml',
 'http://recipes-plus.com/other-sitemap.xml',
 'http://recipes-plus.com/tag-sitemap.xml']

In [21]:
tmp = list(filter(lambda x: x.find('http://recipes-plus.com/recipe/') >= 0, retriveMappedLinks(inner_sitemaps[1]))) 
print(len(tmp))

20568


In [22]:
tmp[:10]

['http://recipes-plus.com/recipe/pink-popcorn-17916',
 'http://recipes-plus.com/recipe/tomato-soup-savory-pesto-crepes-17388',
 'http://recipes-plus.com/recipe/osso-bucco-25212',
 'http://recipes-plus.com/recipe/plum-crumbles-31398',
 'http://recipes-plus.com/recipe/pasta-blue-cheese-prosciutto-25315',
 'http://recipes-plus.com/recipe/chicken-curry-19331',
 'http://recipes-plus.com/recipe/greek-salad-lamb-flatbread-22512',
 'http://recipes-plus.com/recipe/confit-duck-pappardelle-22033',
 'http://recipes-plus.com/recipe/snapper-fillets-herb-caper-butter-30203',
 'http://recipes-plus.com/recipe/crispy-green-beans-17818']

# food.com

In [29]:
inner_sitemaps = retriveMappedLinks('https://www.food.com/sitemap.xml')
inner_sitemaps

['https://www.food.com/sitemap-1.xml.gz',
 'https://www.food.com/sitemap-2.xml.gz',
 'https://www.food.com/sitemap-3.xml.gz',
 'https://www.food.com/sitemap-4.xml.gz',
 'https://www.food.com/sitemap-5.xml.gz',
 'https://www.food.com/sitemap-6.xml.gz',
 'https://www.food.com/sitemap-7.xml.gz',
 'https://www.food.com/sitemap-8.xml.gz',
 'https://www.food.com/sitemap-9.xml.gz',
 'https://www.food.com/sitemap-10.xml.gz',
 'https://www.food.com/sitemap-11.xml.gz',
 'https://www.food.com/sitemap-12.xml.gz',
 'https://www.food.com/sitemap-13.xml.gz',
 'https://www.food.com/sitemap-14.xml.gz',
 'https://www.food.com/sitemap-15.xml.gz',
 'https://www.food.com/sitemap-16.xml.gz',
 'https://www.food.com/sitemap-17.xml.gz',
 'https://www.food.com/sitemap-18.xml.gz',
 'https://www.food.com/sitemap-19.xml.gz',
 'https://www.food.com/sitemap-20.xml.gz',
 'https://www.food.com/sitemap-21.xml.gz',
 'https://www.food.com/sitemap-22.xml.gz',
 'https://www.food.com/sitemap-23.xml.gz',
 'https://www.food.c

In [32]:
result = []
for s in inner_sitemaps: # get only recipes, not other pages
    print(s)
    tmp = list(filter(lambda x: x.find('www.food.com/recipe/') >= 0, retriveMappedLinks(s))) # getall links from sitemap section and filter out all no - recipes
    print(len(tmp))
    print(tmp[:5])
    result.extend(tmp)
    
print(len(result))

https://www.food.com/sitemap-1.xml.gz
15000
['https://www.food.com/recipe/best-banana-bread-2886', 'https://www.food.com/recipe/bourbon-chicken-45809', 'https://www.food.com/recipe/creamy-cajun-chicken-pasta-39087', 'https://www.food.com/recipe/to-die-for-crock-pot-roast-27208', 'https://www.food.com/recipe/jo-mamas-world-famous-spaghetti-22782']
https://www.food.com/sitemap-2.xml.gz
15000
['https://www.food.com/recipe/vegetable-pancakes-146972', 'https://www.food.com/recipe/egyptian-spinach-soup-141888', 'https://www.food.com/recipe/avocado-with-groundnut-dressing-141780', 'https://www.food.com/recipe/steamed-lime-and-pepper-chicken-with-glazed-asparagus-134815', 'https://www.food.com/recipe/slammin-salmon-crunchy-cakes-with-gayla-sauce-130785']
https://www.food.com/sitemap-3.xml.gz
15000
['https://www.food.com/recipe/lite-fruit-salad-with-honey-poppy-seed-dressing-by-paula-deen-213485', 'https://www.food.com/recipe/mushroom-stuffed-brie-en-croute-213270', 'https://www.food.com/recipe

In [34]:
with open('scraping/foodcom/links.txt','w+') as f:
    for r in result:
        f.write(r+'\n')

# myrecipes

In [7]:
inner_sitemaps = retriveMappedLinks('https://www.myrecipes.com/sitemap-index.xml')
inner_sitemaps

['https://www.myrecipes.com/recipe-sitemap.xml?page=1',
 'https://www.myrecipes.com/recipe-sitemap.xml?page=2',
 'https://www.myrecipes.com/recipe-sitemap.xml?page=3',
 'https://www.myrecipes.com/menu-sitemap.xml',
 'https://www.myrecipes.com/quick-easy.xml',
 'https://www.myrecipes.com/healthy-diet-sitemap.xml',
 'https://www.myrecipes.com/how-to-sitemap.xml',
 'https://www.myrecipes.com/holidays-sitemap.xml',
 'https://www.myrecipes.com/grilling-sitemap.xml',
 'https://www.myrecipes.com/kids-sitemap.xml',
 'https://www.myrecipes.com/special-diet-sitemap.xml',
 'https://www.myrecipes.com/course-sitemap.xml',
 'https://www.myrecipes.com/ingredients-sitemap.xml',
 'https://www.myrecipes.com/convenience-sitemap.xml',
 'https://www.myrecipes.com/world-cuisine.xml',
 'https://www.myrecipes.com/cooking-method-sitemap.xml',
 'https://www.myrecipes.com/image-sitemap.xml?page=1',
 'https://www.myrecipes.com/image-sitemap.xml?page=2',
 'https://www.myrecipes.com/image-sitemap.xml?page=3',
 'htt

In [8]:
retriveMappedLinks('https://www.myrecipes.com/recipe-sitemap.xml?page=1')[:10]

['https://www.myrecipes.com/recipe/banbury-tarts',
 'https://www.myrecipes.com/recipe/butternut-squash-pecan-crostini',
 'https://www.myrecipes.com/recipe/double-berry-almond-galette',
 'https://www.myrecipes.com/recipe/farmers-lemonade',
 'https://www.myrecipes.com/recipe/smoked-turkey',
 'https://www.myrecipes.com/recipe/orange-roughy-with-gremolada-bread-crumbs',
 'https://www.myrecipes.com/recipe/decadent-chocolate-lemon-ganache-cake',
 'https://www.myrecipes.com/recipe/arugula-couscous',
 'https://www.myrecipes.com/recipe/chocolate-scotcheroos',
 'https://www.myrecipes.com/recipe/raspberry-frozen-yogurt']

In [9]:
result = []
for s in inner_sitemaps: # get only recipes, not other pages
    print(s)
    tmp = list(filter(lambda x: x.find('www.myrecipes.com/recipe/') >= 0, retriveMappedLinks(s))) # getall links from sitemap section and filter out all no - recipes
    print(len(tmp))
    print(tmp[:5])
    result.extend(tmp)
    
print(len(result))

https://www.myrecipes.com/recipe-sitemap.xml?page=1
50000
['https://www.myrecipes.com/recipe/banbury-tarts', 'https://www.myrecipes.com/recipe/butternut-squash-pecan-crostini', 'https://www.myrecipes.com/recipe/double-berry-almond-galette', 'https://www.myrecipes.com/recipe/farmers-lemonade', 'https://www.myrecipes.com/recipe/smoked-turkey']
https://www.myrecipes.com/recipe-sitemap.xml?page=2
20358
['https://www.myrecipes.com/recipe/chocolate-chip-coffee-cake-1', 'https://www.myrecipes.com/recipe/superspeedy-broccoli-slaw', 'https://www.myrecipes.com/recipe/sausage-gravy-0', 'https://www.myrecipes.com/recipe/portobello-mushroom-red-pepper-pizza', 'https://www.myrecipes.com/recipe/peaches-n-cream-tapioca']
https://www.myrecipes.com/recipe-sitemap.xml?page=3
3
['https://www.myrecipes.com/recipe/chicken-parmesan-spaghetti-squash', 'https://www.myrecipes.com/recipe/instant-pot-chana-dal', 'https://www.myrecipes.com/recipe/vegetarian-pozole']
https://www.myrecipes.com/menu-sitemap.xml
0
[]


In [10]:
len(set(result))

70358

In [11]:
with open('scraping/myrecipes/links.txt','w+') as f:
    for r in list(set(result)):
        f.write(r+'\n')

# food 52

In [15]:
inner_sitemaps = retriveMappedLinks('https://food52.com/sitemap-index.xml')
inner_sitemaps

['https://food52.com/sitemap-misc.xml',
 'https://food52.com/sitemap-recipes-1.xml',
 'https://food52.com/sitemap-recipes-2.xml',
 'https://food52.com/sitemap-recipes-3.xml',
 'https://food52.com/sitemap-recipes-4.xml',
 'https://food52.com/sitemap-recipes-5.xml',
 'https://food52.com/sitemap-recipes-6.xml',
 'https://food52.com/sitemap-recipes-7.xml',
 'https://food52.com/sitemap-recipes-8.xml',
 'https://food52.com/sitemap-recipes-9.xml',
 'https://food52.com/sitemap-recipes-10.xml',
 'https://food52.com/sitemap-recipes-11.xml',
 'https://food52.com/sitemap-recipes-12.xml',
 'https://food52.com/sitemap-recipes-13.xml',
 'https://food52.com/sitemap-recipes-14.xml',
 'https://food52.com/sitemap-recipes-15.xml',
 'https://food52.com/sitemap-recipes-16.xml',
 'https://food52.com/sitemap-collections-1.xml',
 'https://food52.com/sitemap-collections-2.xml',
 'https://food52.com/sitemap-users-1.xml',
 'https://food52.com/sitemap-articles-1.xml',
 'https://food52.com/sitemap-articles-2.xml',


In [16]:
retriveMappedLinks('https://food52.com/sitemap-recipes-1.xml')[:5]

['https://food52.com/recipes/13967-100-calorie-chocolate-chip-cookies',
 'https://food52.com/recipes/66792-100-sourdough-croissants-commercial-yeast-free',
 'https://food52.com/recipes/66652-100-whole-wheat-challah',
 'https://food52.com/recipes/3722-10-12-12-12-sprouted-lentil-soup',
 'https://food52.com/recipes/36078-10-hour-brisket']

In [17]:
result = []
for s in inner_sitemaps: # get only recipes, not other pages
    print(s)
    tmp = list(filter(lambda x: x.find('food52.com/recipes/') >= 0, retriveMappedLinks(s))) # getall links from sitemap section and filter out all no - recipes
    print(len(tmp))
    print(tmp[:5])
    result.extend(tmp)
    
print(len(result))

https://food52.com/sitemap-misc.xml
874
['https://food52.com/recipes/new-now', 'https://food52.com/recipes/what-to-cook', 'https://food52.com/recipes/home-design', 'https://food52.com/recipes/how-to-cook', 'https://food52.com/recipes/food52-in-5']
https://food52.com/sitemap-recipes-1.xml
5000
['https://food52.com/recipes/13967-100-calorie-chocolate-chip-cookies', 'https://food52.com/recipes/66792-100-sourdough-croissants-commercial-yeast-free', 'https://food52.com/recipes/66652-100-whole-wheat-challah', 'https://food52.com/recipes/3722-10-12-12-12-sprouted-lentil-soup', 'https://food52.com/recipes/36078-10-hour-brisket']
https://food52.com/sitemap-recipes-2.xml
5000
['https://food52.com/recipes/41883-blueberry-granola-bread', 'https://food52.com/recipes/13384-blueberry-grappa-sauce', 'https://food52.com/recipes/78482-blueberry-grown-up-grilled-cheese', 'https://food52.com/recipes/34104-blueberry-hand-pies-with-vanilla-custard', 'https://food52.com/recipes/68457-blueberry-hemp-burgers']

In [19]:
result = list(set(result[874:]))

In [20]:
with open('scraping/food52/links.txt','w+') as f:
    for r in list(set(result)):
        f.write(r+'\n')

# Epicurious
verify if that makes sense

In [5]:
inner_sitemaps = retriveMappedLinks('https://www.epicurious.com/sitemap.xml')
inner_sitemaps

['https://www.epicurious.com/sitemap.xml/taxonomies',
 'https://www.epicurious.com/sitemap.xml/static',
 'https://www.epicurious.com/sitemap.xml/other',
 'https://www.epicurious.com/sitemap.xml/member-recipes',
 'https://www.epicurious.com/sitemap.xml/editorial-recipes',
 'https://www.epicurious.com/sitemap.xml/editorial-menus',
 'https://www.epicurious.com/sitemap.xml/branded',
 'https://www.epicurious.com/sitemap.xml/archive']

In [7]:
retriveMappedLinks(inner_sitemaps[4])[:5]

['https://www.epicurious.com/sitemap.xml/editorial-recipes?year=2019&month=9&week=4',
 'https://www.epicurious.com/sitemap.xml/editorial-recipes?year=2019&month=9&week=3',
 'https://www.epicurious.com/sitemap.xml/editorial-recipes?year=2019&month=9&week=2',
 'https://www.epicurious.com/sitemap.xml/editorial-recipes?year=2019&month=9&week=1',
 'https://www.epicurious.com/sitemap.xml/editorial-recipes?year=2019&month=8&week=5']

In [14]:
result = []
for s in inner_sitemaps[3:5]: # get only recipes, not other pages
    inner_links = retriveMappedLinks(s)
    print(len(inner_links))
    for i in inner_links:
        # print(i)
        result.extend(retriveMappedLinks(i))
    
print(len(result))

53
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=9&week=4
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=9&week=3
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=9&week=2
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=9&week=1
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=8&week=5
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=8&week=4
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=8&week=3
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=8&week=2
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=8&week=1
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=7&week=4
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=7&week=3
https://www.epicurious.com/sitemap.xml/member-recipes?year=2019&month=7&week=2
https://www.epicurious.com/sitemap.xml/member-rec

In [16]:
result[:10]

['https://www.epicurious.com/recipes/member/views/apricot-banana-smoothie-5d893cb366e1bb0008b494d1',
 'https://www.epicurious.com/recipes/member/views/nonnos-stuffing-5d8d0cfe8400210008ea0e5b',
 'https://www.epicurious.com/recipes/member/views/bucks-peach-cake-5d7e59c28252f50008cfd8f5',
 'https://www.epicurious.com/recipes/member/views/suohern-collard-greens-5d7e8dcfb16d920009592e11',
 'https://www.epicurious.com/recipes/member/views/caramelized-butternut-squash-5d7e8ff51b065c0009bdd88a',
 'https://www.epicurious.com/recipes/member/views/dressed-up-bacon-mac-and-cheese-5d7e93c2e642300008495811',
 'https://www.epicurious.com/recipes/member/views/grape-jam-5d7e94610accde000987da78',
 'https://www.epicurious.com/recipes/member/views/peach-blueberry-crisp-5d7fbac584f76e00083170c4',
 'https://www.epicurious.com/recipes/member/views/buffalo-deviled-eggs-5d803543b16d920009592e16',
 'https://www.epicurious.com/recipes/member/views/healthy-sprouts-bhel-recipe-fun4fitness-5d8372e524dbf1000895af8

In [17]:
print(len(result))

106028


In [18]:
with open('scraping/epicurious/links.txt','w+') as f:
    for r in list(set(result)):
        f.write(r+'\n')

# Taste of home

In [7]:
inner_sitemaps = retriveMappedLinks('https://www.tasteofhome.com/sitemap_index.xml')
inner_sitemaps

['https://www.tasteofhome.com/post-sitemap1.xml',
 'https://www.tasteofhome.com/post-sitemap2.xml',
 'https://www.tasteofhome.com/page-sitemap.xml',
 'https://www.tasteofhome.com/collection-sitemap1.xml',
 'https://www.tasteofhome.com/collection-sitemap2.xml',
 'https://www.tasteofhome.com/collection-sitemap3.xml',
 'https://www.tasteofhome.com/recipe-sitemap1.xml',
 'https://www.tasteofhome.com/recipe-sitemap2.xml',
 'https://www.tasteofhome.com/recipe-sitemap3.xml',
 'https://www.tasteofhome.com/recipe-sitemap4.xml',
 'https://www.tasteofhome.com/recipe-sitemap5.xml',
 'https://www.tasteofhome.com/recipe-sitemap6.xml',
 'https://www.tasteofhome.com/recipe-sitemap7.xml',
 'https://www.tasteofhome.com/recipe-sitemap8.xml',
 'https://www.tasteofhome.com/recipe-sitemap9.xml',
 'https://www.tasteofhome.com/recipe-sitemap10.xml',
 'https://www.tasteofhome.com/recipe-sitemap11.xml',
 'https://www.tasteofhome.com/recipe-sitemap12.xml',
 'https://www.tasteofhome.com/recipe-sitemap13.xml',
 'h

In [9]:
result = []
for s in inner_sitemaps[6:28]: # get only recipes, not other pages
    print(s)
    tmp = list(filter(lambda x: x.find('tasteofhome.com/recipes/') >= 0, retriveMappedLinks(s))) # getall links from sitemap section and filter out all no - recipes
    print(len(tmp))
    print(tmp[:5])
    result.extend(tmp)
    
print(len(result))

https://www.tasteofhome.com/recipe-sitemap1.xml
2501
['https://www.tasteofhome.com/recipes/', 'https://www.tasteofhome.com/recipes/slow-cooker-polenta/', 'https://www.tasteofhome.com/recipes/poutine-cake/', 'https://www.tasteofhome.com/recipes/slow-cooker-chicken-vegetable-soup/', 'https://www.tasteofhome.com/recipes/apple-cinnamon-jelly/']
https://www.tasteofhome.com/recipe-sitemap2.xml
2500
['https://www.tasteofhome.com/recipes/red-velvet-cookies/', 'https://www.tasteofhome.com/recipes/peppermint-meringue-clouds/', 'https://www.tasteofhome.com/recipes/creamed-turkey-and-mashed-potatoes/', 'https://www.tasteofhome.com/recipes/country-style-pork-loin-with-gravy/', 'https://www.tasteofhome.com/recipes/lighter-lasagna-corn-carne/']
https://www.tasteofhome.com/recipe-sitemap3.xml
2500
['https://www.tasteofhome.com/recipes/spanish-rice-with-chicken/', 'https://www.tasteofhome.com/recipes/lemon-chiffon-pie/', 'https://www.tasteofhome.com/recipes/wild-rice-crab-salad/', 'https://www.tasteofh

In [10]:
with open('scraping/tasteofhome/links.txt','w+') as f:
    for r in list(set(result)):
        f.write(r+'\n')

In [11]:
len(result)

54468

# Serious eats

In [12]:
inner_sitemaps = retriveMappedLinks('https://www.seriouseats.com/sitemap.xml')
inner_sitemaps

['https://www.seriouseats.com/sitemap-categories.xml',
 'https://www.seriouseats.com/sitemap-0000.xml',
 'https://www.seriouseats.com/sitemap-2019.xml',
 'https://www.seriouseats.com/sitemap-2018.xml',
 'https://www.seriouseats.com/sitemap-2017.xml',
 'https://www.seriouseats.com/sitemap-2016.xml',
 'https://www.seriouseats.com/sitemap-2015.xml',
 'https://www.seriouseats.com/sitemap-2014.xml',
 'https://www.seriouseats.com/sitemap-2013.xml',
 'https://www.seriouseats.com/sitemap-2012.xml',
 'https://www.seriouseats.com/sitemap-2011.xml',
 'https://www.seriouseats.com/sitemap-2010.xml',
 'https://www.seriouseats.com/sitemap-2009.xml',
 'https://www.seriouseats.com/sitemap-2008.xml',
 'https://www.seriouseats.com/sitemap-2007.xml',
 'https://www.seriouseats.com/sitemap-2006.xml']

In [17]:
retriveMappedLinks(inner_sitemaps[5])[:10]

['https://www.seriouseats.com/2016/12/most-popular-posts-of-2016.html',
 'https://www.seriouseats.com/2016/12/our-favorite-features-longform-stories-2016.html',
 'https://www.seriouseats.com/2016/12/what-the-bravetart-ate-2016-favorite-recipes.html',
 'https://www.seriouseats.com/2016/12/best-new-spirits-booze-recommendations-2016.html',
 'https://www.seriouseats.com/2016/12/crispy-crackling-roast-pork-loin.html',
 'https://www.seriouseats.com/2016/12/daniels-favorite-recipes-of-2016-best-dishes-of-year.html',
 'https://www.seriouseats.com/2016/12/favorite-images-of-2016-vicky-wasik.html',
 'https://www.seriouseats.com/2016/12/kenjis-favorite-recipes-2016.html',
 'https://www.seriouseats.com/2016/12/expats-scrambled-eggs-indian-christmas.html',
 'https://www.seriouseats.com/2016/12/guide-to-making-braiding-jewish-challah-bread.html']

In [18]:
result = []
for s in inner_sitemaps: # get only recipes, not other pages
    print(s)
    tmp = list(filter(lambda x: x.find('seriouseats.com/recipes/') >= 0, retriveMappedLinks(s))) # getall links from sitemap section and filter out all no - recipes
    print(len(tmp))
    print(tmp[:5])
    result.extend(tmp)
    
print(len(result))

https://www.seriouseats.com/sitemap-categories.xml
0
[]
https://www.seriouseats.com/sitemap-0000.xml
0
[]
https://www.seriouseats.com/sitemap-2019.xml
0
[]
https://www.seriouseats.com/sitemap-2018.xml
0
[]
https://www.seriouseats.com/sitemap-2017.xml
0
[]
https://www.seriouseats.com/sitemap-2016.xml
0
[]
https://www.seriouseats.com/sitemap-2015.xml
0
[]
https://www.seriouseats.com/sitemap-2014.xml
0
[]
https://www.seriouseats.com/sitemap-2013.xml
0
[]
https://www.seriouseats.com/sitemap-2012.xml
0
[]
https://www.seriouseats.com/sitemap-2011.xml
0
[]
https://www.seriouseats.com/sitemap-2010.xml
0
[]
https://www.seriouseats.com/sitemap-2009.xml
0
[]
https://www.seriouseats.com/sitemap-2008.xml
0
[]
https://www.seriouseats.com/sitemap-2007.xml
0
[]
https://www.seriouseats.com/sitemap-2006.xml
0
[]
0
