In [1]:
# selenium and chromium must be pre-installed
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import traceback
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from fake_useragent import UserAgent
from time import sleep

### RBC.ru

In [2]:
urls = ["https://www.rbc.ru/finances/?utm_source=topline", "https://www.rbc.ru/economics/?utm_source=topline"]

In [15]:
class Scrapper:
    """
    Collecting pages urls and texts from rbc.ru
    """
    def __init__(self, base_url, max_volume, save_path):
        self.url = base_url
        self.max_vol = max_volume
        self.path = save_path
        self.dtf = pd.DataFrame(columns=[
            "url"
        ])
    
    def get_urls(self):
        num = self.max_vol
        
        driver = webdriver.Chrome()
        driver.get(self.url)
        
        try:
            popup = driver.find_element(By.CLASS_NAME, "live-tv-popup__close")
            popup.click()
        except:
            pass
        
        urls = []
        
        while num:
            try:
                objects = driver.find_elements(By.CLASS_NAME, 'item__link')
                url = []
                for obj in objects:
                    urls.append(obj.get_attribute("href"))
                
                num -= 1
            except Exception as er:
                print(er)
                driver.quit()
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print(f"INFO collected {len(urls)} urls")
            sleep(9)
        
        self.dtf = pd.concat([self.dtf, pd.DataFrame(urls, columns=["url"])], ignore_index=True)
        
        driver.quit()
        self.save()
        
    def extract(self):
        urls = pd.read_csv(self.path)
        
        driver = webdriver.Chrome()
        texts = []
        urls['text'] = None
            
        for i in range(urls.shape[0]):
            driver.get(urls.loc[i]["url"])
        
            try:
                popup = driver.find_element(By.CLASS_NAME, "live-tv-popup__close")
                popup.click()
            except:
                pass
            
            try:
                text = ''
                objects = driver.find_elements(By.TAG_NAME, "p")
                for obj in objects:
                    text += obj.text
                urls["text"][i] = text
                
            except:
                urls.to_csv(self.path)
                driver.quit()
            sleep(2)
            
            if i % 50 == 0:
                print(f"Collected {i} news")
        
        urls.to_csv(self.path)
        driver.quit()
    
    def save(self):
        self.dtf.to_csv(self.path)
    

In [14]:
sc = Scrapper("https://www.rbc.ru/finances/?utm_source=topline", max_volume=2, save_path="test.csv")
sc.get_urls()
sc.extract()

INFO collected 20 urls
INFO collected 52 urls


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  urls["text"][i] = text


Collected 0 news
Collected 50 news


In [16]:
fin = Scrapper(urls[0], max_volume=1000, save_path="finance.scv")
bis = Scrapper(urls[1], max_volume=1000, save_path="business.csv")

In [17]:
fin.get_urls()
fin.extract()

INFO collected 20 urls
INFO collected 52 urls
INFO collected 96 urls
INFO collected 152 urls
INFO collected 220 urls
INFO collected 300 urls
INFO collected 392 urls
INFO collected 496 urls
INFO collected 612 urls
INFO collected 740 urls
INFO collected 880 urls
INFO collected 1032 urls
INFO collected 1196 urls
INFO collected 1372 urls
INFO collected 1560 urls
INFO collected 1760 urls
INFO collected 1960 urls
INFO collected 2160 urls
INFO collected 2360 urls
INFO collected 2560 urls
INFO collected 2760 urls
INFO collected 2960 urls
INFO collected 3160 urls
INFO collected 3360 urls
INFO collected 3560 urls
INFO collected 3760 urls
INFO collected 3960 urls
INFO collected 4160 urls
INFO collected 4360 urls
INFO collected 4560 urls
INFO collected 4760 urls
INFO collected 4960 urls
INFO collected 5160 urls
INFO collected 5360 urls
INFO collected 5560 urls
INFO collected 5760 urls


KeyboardInterrupt: 

In [None]:
bis.get_urls()
bis.extract()

### CyberLeninka

In [2]:
class CyberScrapper:
    """
    Collecting pages urls and texts from cyberleninka.ru
    """
    def __init__(self, base_url, max_volume, save_path, num_page=None):
        self.url = base_url
        self.max_vol = max_volume
        self.path = save_path
        self.num_page = num_page
        self.columns = ["url", "author", "title", "text", "year", "labels", "views", 
                                          "downloads", "likes", "dislikes", "journal"]
        self.data = pd.DataFrame(columns=self.columns)
        
    def get(self):
        
        ua = UserAgent()
        user_agent = ua.random
        
        driver = webdriver.Chrome()
        
        driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent})
        if self.num_page:
            driver.get(self.url + f"/{self.num_page}")
        else:
            driver.get(self.url)
        num = self.max_vol
        
        # num of li elements on the page
        last_paper_on_page = -7
        if self.num_page:
            page_num = self.num_page
        else:
            page_num = 2
        
        try:
            while num:
                
                print(f"Papers {self.data.shape[0]} saved")
                elements = driver.find_elements(By.TAG_NAME, "li")
                articles = elements[:last_paper_on_page]
                next_page = self.url + f"/{page_num}"
                
                for article in articles:
                    
                    num -= 1
                    href = article.find_element(By.TAG_NAME, "a").get_attribute("href") 
                    driver.get(href)
                    
                    objects = driver.find_elements(By.TAG_NAME, "p")
                    
                    # get text of paper
                    text = ''
                    for obj in objects:
                        text += obj.text
                    
                    # author
                    try:
                        author = driver.find_element(By.CLASS_NAME, "hl").text
                    except:
                        author = None
                    try:
                        views = driver.find_element(By.CLASS_NAME, "statitem.views").text
                    except:
                        views = None
                    try:
                        down = driver.find_element(By.CLASS_NAME, "statitem.downloads").text
                    except:
                        down = None
                    try:
                        likes = driver.find_element(By.CLASS_NAME, "likes").text.split("\n")
                    except:
                        likes = [None, None]
                    try:
                        year = driver.find_element(By.CLASS_NAME, "label.year").find_element(By.TAG_NAME, "time").text
                    except:
                        year = None
                    try:    
                        journal = driver.find_element(By.CLASS_NAME, "half").find_elements(By.TAG_NAME, "a")[-1].text
                    except:
                        journal = None
                    try:
                        words = [i.text for i in driver.find_element(By.CLASS_NAME, "full.keywords").find_elements(By.CLASS_NAME, "hl.to-search")]
                    except:
                        words = None
                    try:
                        title = driver.find_element(By.TAG_NAME, "i").text
                    except:
                        title = None
                    
                    lst = [(href, 
                            author, 
                            title,
                            text, 
                            year, 
                            words, 
                            views, 
                            down, 
                            likes[0], 
                            likes[1], 
                            journal)]
                    to_add = pd.DataFrame(lst, columns=self.columns)
                    self.data = pd.concat([self.data, to_add], ignore_index=True)
                    sleep(5)
                    driver.back()
                
                # Change UA 
                driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": ua.random})
                driver.get(next_page)
                page_num += 1
                
        except Exception as ex:
            print("Last page:", page_num)
            traceback.print_exc()
            driver.quit()
            
        print("Last page:", page_num)
        return self.data
    
    def save(self):
        self.data.to_csv(self.path)
    

In [3]:
url = "https://cyberleninka.ru/article/c/economics-and-business"
cyber = CyberScrapper(url, max_volume=2000, save_path="papers_5.csv", num_page=6642)
cyber.get()

In [4]:
cyber.save()