### Requests

In [None]:
!pip install requests

In [9]:
import requests

response = requests.get('https://ru.wikipedia.org/wiki/%D0%A4%D1%80%D0%B0%D0%BA%D1%82%D0%B0%D0%BB')

if response.status_code == 200:
    print('Success!')
    html_content = response.text
    with open('fractal_info.txt', 'w', encoding='utf-8') as file:
        file.writelines(html_content)
else:
    print('Error:', response.status_code)

Success!


### Упражнения Beautiful Soup

In [None]:
!pip install beautifulsoup4

In [38]:
# извлечение заголовков новостей и ссылок на статьи

from bs4 import BeautifulSoup as bs

response = requests.get('https://www.nytimes.com/international/')
if response.status_code == 200:
    html_content = response.text
    soup = bs(html_content, 'html.parser')
    links = soup.find_all('a')
    for link in links: 
        print(link.text, link['href'])

Skip to content #site-content
Skip to site index #site-index
SKIP ADVERTISEMENT #after-dfp-ad-top
Skip to content #site-content
Skip to site index #site-index
 /
U.S. /
International /international/
Canada /ca/
Español https://www.nytimes.com/es/
中文 https://cn.nytimes.com
Today’s Paper https://www.nytimes.com/section/todayspaper
 /
U.S. https://www.nytimes.com/international/section/us
U.S. https://www.nytimes.com/international/section/us
Politics https://www.nytimes.com/international/section/politics
New York https://www.nytimes.com/international/section/nyregion
California https://www.nytimes.com/spotlight/california-news
Education https://www.nytimes.com/international/section/education
Health https://www.nytimes.com/international/section/health
Obituaries https://www.nytimes.com/international/section/obituaries
Science https://www.nytimes.com/international/section/science
Climate https://www.nytimes.com/international/section/climate
Weather https://www.nytimes.com/international/secti

In [46]:
# все товары определенной категории и вывод их названий и цен

from bs4 import BeautifulSoup as bs

url = 'https://ozon.by/category/fentezi-33046/'

response = requests.get(url)

if response.status_code == 200:
    html_content = response.text
    soup = bs(html_content, 'html.parser')
    
    for product_div in soup.find_all('div', class_='js0_23'):
        price_span = product_div.find('span', class_='c3019-a1')
        title_span = product_div.find('span', class_='tsBody500Medium')
        print(title_span.text.strip(), price.text.strip())

Игры королей. Король Ардена (#3) | Анри Софи 75,81 BYN
Миры Волкодава. Братья. Комплект из 5 книг | Семёнова Мария Васильевна 75,81 BYN
Избушка на костях | Власова Ксения Игоревна 75,81 BYN
Нейромант | Гибсон Уильям 75,81 BYN
Лекс Раут. Императорский ловец | Суржевская Марина 75,81 BYN
Принцесса ледяного королевства Фокина А. Интересные книги для подростков 16+ | Фокина Анжела Валентиновна 75,81 BYN
Злодейский путь!.. Том 1 и 2 75,81 BYN
Война потерянных сердец. Книга 2. Дети павших богов | Карисса Бродбент 75,81 BYN


### Обработка и очистка данных

In [86]:
import requests
from bs4 import BeautifulSoup as bs
import csv
import random

url = 'https://ru.euronews.com/news'

response = requests.get(url)

if response.status_code == 200:
    html_content = response.text
    soup = bs(html_content, 'html.parser')

    with open('parsing.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Заголовок', 'Ссылка']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        articles = soup.find_all('h3')
        print(f'Найдено статей: {len(articles)}')
        for article in articles:
            link_tag = article.find('a')
            if link_tag is not None:
                link = link_tag['href']
                title = link_tag['aria-label']
            
                writer.writerow({'Заголовок': title, 'Ссылка': link})
    
        print('Данные успешно сохранены в parsing.csv')

        time.sleep(random.uniform(1,3))
else:
    print('Ошибка доступа к сайту')

Найдено статей: 50
Данные успешно сохранены в parsing.csv


### Этические аспекты парсинга

In [81]:
import time 
from urllib.robotparser import RobotFileParser

time.sleep(1)

rp = RobotFileParser()
rp.set_url('https://ru.wikipedia.org/wiki/robots.txt')
rp.read()
if rp.can_fetch('*', 'https://ru.wikipedia.org/wiki/Docker'):
    print('Парсинг разрешен')
else:
    print('Парсинг не разрешен')

Парсинг разрешен
