In [1]:
# selenium and chromium must be pre-installed
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import traceback
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from fake_useragent import UserAgent
from time import sleep

### RBC.ru

In [2]:
urls = ["https://www.rbc.ru/finances/?utm_source=topline", "https://www.rbc.ru/economics/?utm_source=topline"]

In [3]:
driver = webdriver.Chrome()

In [4]:
class Scrapper:
    """
    Collecting pages urls and texts from rbc.ru
    """
    def __init__(self, base_url, max_volume, save_path):
        self.url = base_url
        self.max_vol = max_volume
        self.path = save_path
        self.dtf = pd.DataFrame(columns=[
            "url"
        ])
    
    def get_urls(self):
        num = self.max_vol
        
        driver = webdriver.Chrome()
        driver.get(self.url)
        
        try:
            popup = driver.find_element(By.CLASS_NAME, "live-tv-popup__close")
            popup.click()
        except:
            pass
        
        urls = []
        
        while num:
            try:
                objects = driver.find_elements(By.CLASS_NAME, 'item__link')
                url = []
                for obj in objects:
                    urls.append(obj.get_attribute("href"))
                
                num -= 1
            except Exception as er:
                print(er)
                driver.quit()
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print(f"INFO collected {len(urls)} urls")
            sleep(9)
        
        self.dtf = pd.concat([self.dtf, pd.DataFrame(urls, columns=["url"])], ignore_index=True)
        
        driver.quit()
        self.save()
        
    def extract(self):
        urls = pd.read_csv(self.path)
        
        driver = webdriver.Chrome()
        texts = []
        urls['text'] = None
            
        for i in range(urls.shape[0]):
            driver.get(urls.loc[i]["url"])
        
            try:
                popup = driver.find_element(By.CLASS_NAME, "live-tv-popup__close")
                popup.click()
            except:
                pass
            
            try:
                text = ''
                objects = driver.find_elements(By.TAG_NAME, "p")
                for obj in objects:
                    text += obj.text
                urls["text"][i] = text
                
            except:
                urls.to_csv(self.path)
                driver.quit()
            sleep(2)
            
            if i % 50 == 0:
                print(f"Collected {i} news")
        
        urls.to_csv(self.path)
        driver.quit()
    
    def save(self):
        self.dtf.to_csv(self.path)
    

In [5]:
sc = Scrapper("https://www.rbc.ru/finances/?utm_source=topline", max_volume=2, save_path="test.csv")
sc.get_urls()
sc.extract()

INFO collected 20 urls
INFO collected 52 urls


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  urls["text"][i] = text


Collected 0 news
Collected 50 news


In [None]:
fin = Scrapper(urls[0], max_volume=60, save_path="finance.scv")
fin.get_urls()
fin.extract()

INFO collected 20 urls
INFO collected 52 urls
INFO collected 96 urls
INFO collected 152 urls
INFO collected 220 urls
INFO collected 300 urls
INFO collected 380 urls
INFO collected 460 urls
INFO collected 540 urls
INFO collected 620 urls
INFO collected 700 urls
INFO collected 780 urls
INFO collected 860 urls
INFO collected 940 urls
INFO collected 1020 urls
INFO collected 1100 urls
INFO collected 1180 urls
INFO collected 1260 urls
INFO collected 1340 urls
INFO collected 1420 urls
INFO collected 1500 urls
INFO collected 1580 urls
INFO collected 1660 urls
INFO collected 1740 urls
INFO collected 1820 urls
INFO collected 1900 urls
INFO collected 1980 urls
INFO collected 2060 urls
INFO collected 2140 urls
INFO collected 2220 urls
INFO collected 2300 urls
INFO collected 2380 urls
INFO collected 2460 urls
INFO collected 2540 urls
INFO collected 2620 urls
INFO collected 2700 urls
INFO collected 2780 urls
INFO collected 2860 urls
INFO collected 2940 urls
INFO collected 3020 urls
INFO collected 31

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  urls["text"][i] = text


Collected 0 news


In [None]:
bis = Scrapper(urls[1], max_volume=60, save_path="business.csv")
bis.get_urls()
bis.extract()

### CyberLeninka

In [8]:
class CyberScrapper:
    """
    Collecting pages urls and texts from cyberleninka.ru
    """
    def __init__(self, base_url, max_volume, save_path, num_page=None):
        self.url = base_url
        self.max_vol = max_volume
        self.path = save_path
        self.num_page = num_page
        self.columns = ["url", "author", "title", "text", "year", "labels", "views", 
                                          "downloads", "likes", "dislikes", "journal"]
        self.data = pd.DataFrame(columns=self.columns)
        
    def get(self):
        
        ua = UserAgent()
        user_agent = ua.random
        
        driver = webdriver.Chrome()
        
        driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent})
        if self.num_page:
            driver.get(self.url + f"/{self.num_page}")
        else:
            driver.get(self.url)
        num = self.max_vol
        
        # num of li elements on the page
        last_paper_on_page = -7
        if self.num_page:
            page_num = self.num_page
        else:
            page_num = 2
        
        try:
            while num:
                
                print(f"Papers {self.data.shape[0]} saved")
                elements = driver.find_elements(By.TAG_NAME, "li")
                articles = elements[:last_paper_on_page]
                next_page = self.url + f"/{page_num}"
                
                for article in articles:
                    
                    num -= 1
                    href = article.find_element(By.TAG_NAME, "a").get_attribute("href") 
                    driver.get(href)
                    
                    objects = driver.find_elements(By.TAG_NAME, "p")
                    
                    # get text of paper
                    text = ''
                    for obj in objects:
                        text += obj.text
                    
                    # author
                    try:
                        author = driver.find_element(By.CLASS_NAME, "hl").text
                    except:
                        author = None
                    try:
                        views = driver.find_element(By.CLASS_NAME, "statitem.views").text
                    except:
                        views = None
                    try:
                        down = driver.find_element(By.CLASS_NAME, "statitem.downloads").text
                    except:
                        down = None
                    try:
                        likes = driver.find_element(By.CLASS_NAME, "likes").text.split("\n")
                    except:
                        likes = [None, None]
                    try:
                        year = driver.find_element(By.CLASS_NAME, "label.year").find_element(By.TAG_NAME, "time").text
                    except:
                        year = None
                    try:    
                        journal = driver.find_element(By.CLASS_NAME, "half").find_elements(By.TAG_NAME, "a")[-1].text
                    except:
                        journal = None
                    try:
                        words = [i.text for i in driver.find_element(By.CLASS_NAME, "full.keywords").find_elements(By.CLASS_NAME, "hl.to-search")]
                    except:
                        words = None
                    try:
                        title = driver.find_element(By.TAG_NAME, "i").text
                    except:
                        title = None
                    
                    lst = [(href, 
                            author, 
                            title,
                            text, 
                            year, 
                            words, 
                            views, 
                            down, 
                            likes[0], 
                            likes[1], 
                            journal)]
                    to_add = pd.DataFrame(lst, columns=self.columns)
                    self.data = pd.concat([self.data, to_add], ignore_index=True)
                    sleep(10)
                    driver.back()
                
                # Change UA 
                driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": ua.random})
                driver.get(next_page)
                page_num += 1
                
        except Exception as ex:
            print("Last page:", page_num)
            traceback.print_exc()
            driver.quit()
            
        print("Last page:", page_num)
        return self.data
    
    def save(self):
        self.data.to_csv(self.path)
    

In [9]:
url = "https://cyberleninka.ru/article/c/economics-and-business"
cyber = CyberScrapper(url, max_volume=2000, save_path="papers_6.csv", num_page=6643)
cyber.get()

Papers 0 saved
Papers 20 saved
Papers 40 saved
Papers 60 saved
Papers 80 saved
Papers 100 saved
Papers 120 saved
Papers 140 saved
Papers 160 saved
Papers 180 saved
Papers 200 saved
Papers 220 saved
Papers 240 saved
Papers 260 saved
Papers 280 saved
Papers 300 saved
Papers 320 saved
Papers 340 saved
Papers 360 saved
Papers 380 saved
Papers 400 saved
Papers 420 saved
Papers 440 saved
Papers 460 saved
Last page: 6666
Last page: 6666


Traceback (most recent call last):
  File "/tmp/ipykernel_6009/3340503310.py", line 46, in get
    href = article.find_element(By.TAG_NAME, "a").get_attribute("href")
  File "/home/igor/projects/data_wsd/data_wsd/lib/python3.10/site-packages/selenium/webdriver/remote/webelement.py", line 416, in find_element
    return self._execute(Command.FIND_CHILD_ELEMENT, {"using": by, "value": value})["value"]
  File "/home/igor/projects/data_wsd/data_wsd/lib/python3.10/site-packages/selenium/webdriver/remote/webelement.py", line 394, in _execute
    return self._parent.execute(command, params)
  File "/home/igor/projects/data_wsd/data_wsd/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 344, in execute
    self.error_handler.check_response(response)
  File "/home/igor/projects/data_wsd/data_wsd/lib/python3.10/site-packages/selenium/webdriver/remote/errorhandler.py", line 229, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.except

Unnamed: 0,url,author,title,text,year,labels,views,downloads,likes,dislikes,journal
0,https://cyberleninka.ru/article/n/analiz-effek...,Гришин А.В.,Анализ эффективности применения поощрений для ...,Представлены анализ состояния организации немо...,2010,"[мотивация, поощрения, немонетарное стимулиров...",1410,217,0,0,Бюллетень сибирской медицины
1,https://cyberleninka.ru/article/n/metodicheski...,Петров А.Г.,Методические подходы к прогнозированию позитив...,Представлены методические подходы к прогнозиро...,2010,"[прогнозирование риска, имидж, фармацевтическа...",268,64,0,0,Бюллетень сибирской медицины
2,https://cyberleninka.ru/article/n/puti-optimiz...,Баранкина Т.А.,Пути оптимизации лекарственного обеспечения в ...,С целью оптимизации лекарственного обеспечения...,2010,"[оптимизация, лекарственное обеспечение, эффек...",71,19,0,0,Бюллетень сибирской медицины
3,https://cyberleninka.ru/article/n/perspektivy-...,Воробьёв В.М.,Перспективы адсорбирующей повязки на основе на...,Проведено маркетинговое исследование по перспе...,2010,"[перевязочные средства, адсорбирующая повязка,...",435,77,0,0,Бюллетень сибирской медицины
4,https://cyberleninka.ru/article/n/setevye-medi...,Бобровский Андрей Вениаминович,Сетевые медицинские организации: стратегия раз...,Осознание преимуществ сетевых решений в здраво...,2010,"[сетевые медицинские организации, менеджмент, ...",515,61,0,0,Бюллетень сибирской медицины
...,...,...,...,...,...,...,...,...,...,...,...
456,https://cyberleninka.ru/article/n/odin-iz-aspe...,Ерзин Олег Александрович,Один из аспектов оценки эффективности технолог...,В работе предложен энергетический подход к оце...,2014,"[технологическая система, эффективность исполь...",173,303,0,0,Известия Тульского государственного университе...
457,https://cyberleninka.ru/article/n/perspektivy-...,Белянская Елена Сергеевна,Перспективы использования электронно-библиотеч...,Проанализированы перспективы использования эле...,2014,"[электронно-библиотечная система, электронные ...",233,129,0,0,Известия Тульского государственного университе...
458,https://cyberleninka.ru/article/n/povyshenie-e...,Апина Анна Михайловна,Повышение эффективности труда персонала на осн...,В статье приведена характеристика проблем повы...,2014,"[повышение эффективности труда персонала, инве...",1354,160,0,0,"Промышленность: экономика, управление, технологии"
459,https://cyberleninka.ru/article/n/rasshirenie-...,Козельский Алексей Викторович,Расширение сферы инфраструктурных услуг в проц...,"В статье обосновывается, что расширение сферы ...",2014,"[инфраструктура национальной экономики, инфрас...",471,59,0,0,"Промышленность: экономика, управление, технологии"


In [10]:
cyber.save()