In [1]:
import json
import random
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from lxml import html

from util.http_utility import get_http_headers
from util.spider import Spider
from util.user_agent import get_random_ua

# Food4Thought: Russia

url: https://www.russianfood.com/recipes/bytype/?fid=103#rcp_list

The website hosts recipes from many different countries; `fid=103` selects the Russian ones.
Russianfood.com blocks IP addresses, so the spider has to sleep between requests (`time.sleep`) and multithreading must be disabled.

In [2]:
class RussianSpider(Spider): #inherit Spider Class
    """Spider that fetches and parses single recipe pages.

    Only item scraping is defined here; ``check_normalize_space`` and the
    attributes ``main_page``, ``attrs`` and ``header`` used below come from
    the ``Spider`` base class, which is not part of this notebook.
    """

    # seconds to pause before every item request
    _REQUEST_DELAY = 2
    # seconds to wait for the server, so a stalled connection cannot hang
    # the whole scrape
    _REQUEST_TIMEOUT = 30

    def __init__(self, main_page, seeds, listing, attrs, header=None):
        """Forward all settings unchanged to ``Spider.__init__``.

        Args:
            main_page (str): home page url, used to resolve relative item urls
            seeds: passed through to ``Spider``
            listing: passed through to ``Spider``
            attrs (dict): field name -> XPath expression for that field
            header (dict, optional): HTTP headers for the request session
        """
        super().__init__(main_page,seeds,listing,attrs,header)

    def _extract_scalar(self, doc, field):
        """Evaluate the XPath configured for ``field`` and return one value.

        Args:
            doc: parsed lxml document of the item page
            field (str): key into ``self.attrs``

        Returns:
            The XPath result, or '' when no expression is configured for the
            field or the expression matches nothing.
        """
        expression = self.attrs.get(field)
        if not expression:
            return ''

        result = doc.xpath(expression)
        # check_normalize_space is inherited from Spider (not visible here);
        # presumably it reports whether the expression is a
        # normalize-space(...) call, for which lxml returns a plain string
        # rather than a list of nodes
        if self.check_normalize_space(expression):
            return result
        return result[0] if result else ''

    @staticmethod
    def _clean_ingredients(raw_texts):
        """Remove tab/CR/LF characters from each text and drop empty texts."""
        cleaned = (
            text.replace('\t','').replace('\r','').replace('\n','')
            for text in raw_texts
        )
        return [text for text in cleaned if text]

    def scrape_one_item(self,url):
        """To scrape one item from a given url

        Args:
            url (str): url of the item, absolute or relative to the home page

        Returns:
            dict: dict of the item with the keys name, total_time,
                ingredients, instructions, servings, category, prep_time and
                cook_time. ``ingredients`` is a list of cleaned, non-empty
                strings, ``instructions`` is the raw XPath result, and the
                other fields are '' when not configured or not found.

        Raises:
            Exception: if the response status code is not 200, or if parsing
                the page fails (the original error is chained as the cause).
        """
        time.sleep(self._REQUEST_DELAY)

        # urljoin keeps absolute urls as they are and resolves relative ones
        # against the home page without producing a double slash
        item_url = urljoin(self.main_page, url)

        # the context manager closes the session's connections once the
        # response body has been downloaded
        with requests.Session() as session:
            if self.header:
                session.headers.update(self.header)
            res = session.get(item_url, timeout=self._REQUEST_TIMEOUT)

        if res.status_code != 200:
            raise Exception('Cannot open one menu url, status code = {}'.format(res.status_code))

        try:
            doc = html.document_fromstring(res.text)
            # key order is kept as-is because it determines the column order
            # of a DataFrame built from the scraped items
            return {
                'name': self._extract_scalar(doc, 'name'),
                'total_time': self._extract_scalar(doc, 'total_time'),
                'ingredients': self._clean_ingredients(doc.xpath(self.attrs['ingredients'])),
                'instructions': doc.xpath(self.attrs['instructions']),
                'servings': self._extract_scalar(doc, 'servings'),
                'category': self._extract_scalar(doc, 'category'),
                'prep_time': self._extract_scalar(doc, 'prep_time'),
                'cook_time': self._extract_scalar(doc, 'cook_time'),
            }
        except Exception as e:
            raise Exception('something is wrong for item: {}'.format(url)) from e


In [3]:
# XPath expression for each recipe field; category, prep_time and cook_time
# are left empty and are therefore skipped by scrape_one_item
attrs = dict(
    name='normalize-space(//title)',
    ingredients='//tr[contains(@class,"ingr_tr_")]//text()',
    total_time='normalize-space(//div[@class="sub_info"]/div[@class="el"][2])',
    instructions='//div[@class="step_n"]//p//text()',
    servings='normalize-space(//div[@class="sub_info"]/div[@class="el"][1])',
    category='',
    prep_time='',
    cook_time='',
)

# 'items' selects the recipe links (href attributes) on a listing page;
# 'next' is interpreted by Spider (not visible here) — presumably it describes
# how the url of the following listing page is built
listing = {
    'items': '//div[contains(@class,"recipe_list_new")]//div[contains(@class,"title")]/a/@href',
    'next': {
        'next_page_str': '&page={}',
        'type': 'url',
    },
}

# listing page the scrape starts from (fid=103)
seeds = [
    'https://www.russianfood.com/recipes/bytype/?fid=103',
]

In [4]:
#setup variables
user_agent = get_random_ua()

# custom request headers handed to the spider
# NOTE(review): the previous comment said "romania requires certain headers"
# (copied from another notebook) — confirm russianfood.com needs them as well
custom_header = {
    'referer': 'https://www.google.com/',
    'Accept-Language': '*',
    'Accept-Encoding': '*',
    'Accept': '*',
    'user-agent': user_agent,
}

ru_spider = RussianSpider(
    main_page='https://www.russianfood.com/',
    seeds=seeds,
    listing=listing,
    attrs=attrs,
    header=custom_header,
)

# try out scraping one known recipe before starting the full crawl
ru_spider.scrape_one_item('https://www.russianfood.com/recipes/recipe.php?rid=164402')

{'name': 'Рецепт: Картофельные драники с чесноком (без яиц и муки) на RussianFood.com',
 'total_time': '\xa030 мин (ваши 30 мин)',
 'ingredients': ['Картофель - 800 г',
  'Лук репчатый - 150 г',
  'Сметана - 1 ст. ложка',
  'Соль - 0,5 ч. ложки',
  'Перец молотый (смесь перцев) - 1 щепотка',
  'Масло растительное - 4 ст. ложки'],
 'instructions': ['Подготовьте все для драников - картофель, лук, чеснок, сметану, специи и растительное масло для жарки.',
  'Картофель очистите от кожуры. Репчатый лук и чеснок очистите от шелухи.',
  'Репчатый лук и чеснок натрите на мелкой терке в глубокую миску.',
  'Туда же натрите на мелкой тёрке картофель, периодически перемешивая его с луком. Добавьте соль, перемешайте.',
  'Картофельно-луковую массу откиньте на дуршлаг или мелкое сито, сливая жидкость в миску. Оставьте на 2-3 минуты, слегка придавливая массу лопаткой. Сильно выжимать не нужно, чтобы драники получились сочными.',
  'У меня получилось примерно 120-130 мл жидкости. Аккуратно слейте верх

In [5]:
# max page is only 65; multithreading has to stay disabled because of blocking
result_list = ru_spider.start_scrape(
    max_pages=65,
    multithread=False,
)

spider is scraping page: 1


KeyboardInterrupt: 

In [None]:
# preview only the first few scraped items instead of dumping the whole
# result into the notebook output (assumes start_scrape returns a list — verify)
result_list[:5]

In [None]:
# tabulate the scraped items; presumably each entry of result_list is a dict
# returned by scrape_one_item, giving one row per recipe — verify against Spider.start_scrape
result_df = pd.DataFrame(result_list)

In [None]:
# show only the first rows; the full frame is written to disk in the next cell
result_df.head()

In [None]:
output_path = Path('data/russia/russian_food_com.csv')
# to_csv does not create missing directories, so make sure the target folder exists
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(output_path)

I got blocked from accessing this website while I was scraping the 7th page. Changing my IP address with a VPN did not help; the same problem occurred.