# Vesti Web Scraper: About

In [None]:
# Based on keyword and date parameters, scrapes article text from Vesti.ru
# The script takes a keyword in English or Russian and
# two dates in the form dd.mm.yyyy
# Edit the parameters in the last code block
# The scraper writes the data to a .txt file
# When importing into Excel, select UTF-8 and * as the delimiter

# Functions

### getLinks

In [None]:
from urllib.request import urlopen
from urllib.parse import quote
from bs4 import BeautifulSoup
import pandas as pd
import re
import math
from selenium import webdriver
import time

# getLinks----------------------------------------------------------------
def getLinks(query, startDate, endDate,
             driverPath="/Users/MLupion/Python/chromedriver"):
    """Collect the URLs of Vesti.ru articles matching a search.

    Opens the Vesti.ru search page in headless Chrome, clicks the
    "load-page" (see more) control until none is displayed any longer, and
    harvests every ``/doc.html?id=<digits>`` link from the resulting page.

    Args:
        query: Search keyword; it is URL-quoted before being sent.
        startDate: Start of the date range, inserted verbatim into the
            search URL.
        endDate: End of the date range, inserted verbatim into the
            search URL.
        driverPath: Path to the chromedriver executable.

    Returns:
        A list of absolute article URLs without duplicates, in the order
        in which they appear on the page.

    Raises:
        SystemExit: If the search URL cannot be fetched.
    """
    # Imported here because the file's import cell does not provide it.
    # URLError is the base class of HTTPError, so this also covers
    # connection-level failures.
    from urllib.error import URLError

    # Query construction
    quoted_query = quote(query)
    searchURL = (
        f"https://www.vesti.ru/search/index/?q={quoted_query}"
        f"&news=0&video=0&date_start={startDate}&date_end={endDate}"
    )
    startURL = "https://www.vesti.ru"

    # Check the search URL before paying for a browser launch.
    try:
        with urlopen(searchURL):
            pass
    except URLError as e:
        print(e)
        raise SystemExit(1) from e
    print("URL found")

    # Browser opening
    # NOTE(review): the positional driver path and
    # find_elements_by_class_name are the Selenium spellings this file
    # already used; confirm they match the installed Selenium version.
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.add_argument('--headless')
    driver = webdriver.Chrome(driverPath, options=options)

    try:
        driver.get(searchURL)
        print("Driver launched!")

        # Click the see more button until it is no longer displayed.
        # The buttons are looked up afresh on every pass so that a stale
        # or shortened element list is never indexed, and a page that has
        # no button at all ends the loop immediately.
        print("Commence fast and furious scrolling!")
        while True:
            visible_buttons = [
                button
                for button in driver.find_elements_by_class_name("load-page")
                if button.is_displayed()
            ]
            if not visible_buttons:
                break
            driver.execute_script("arguments[0].click();", visible_buttons[0])
            print("Clicked a see more button!")
            # Give the page a moment to append the next batch of results.
            time.sleep(1)

        print("Done itering through the see more buttons.")
        bsObj = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Always release the browser process, even on failure.
        driver.quit()

    # Collect all the relevant links and make them full URLs
    article_href = re.compile(r"^/doc\.html\?id=[0-9]+$")
    links = [
        startURL + anchor["href"]
        for anchor in bsObj.find_all("a", href=article_href)
    ]

    # Remove duplicates while keeping page order
    links = list(dict.fromkeys(links))

    numberOfHits = len(links)
    print("There are " + str(numberOfHits) + " article that meet the date and keyword parameters.")
    return links

### getText

In [None]:
# getText------------------------------------------------------------------------
def getText(linkURL):
    """Return the cleaned body text of the article at ``linkURL``.

    The page is searched for the first element carrying one of the known
    article-body classes (``b-material-body``, ``js-mediator-article``,
    ``article__text``, tried in that order).

    Args:
        linkURL: Absolute URL of the article page.

    Returns:
        The article text with line breaks, tabs and anything that looks
        like an HTML tag removed, or the string
        ``"Text under a different tag"`` when none of the known classes is
        present on the page.
    """
    # The context manager closes the HTTP response once it has been read.
    with urlopen(linkURL) as response:
        # The parser is named explicitly so the result does not depend on
        # which parsers happen to be installed; the file already uses lxml.
        bsObj = BeautifulSoup(response.read(), "lxml")

    # Get the article text from the first matching container
    for css_class in ("b-material-body", "js-mediator-article", "article__text"):
        body = bsObj.find(class_=css_class)
        if body is not None:
            return _cleanArticleText(body.get_text())

    return "Text under a different tag"


# Matches anything shaped like an HTML tag that survived get_text().
_TAG_RE = re.compile(r'<[^>]+>')


def _cleanArticleText(rawText):
    """Strip line breaks, tag-like fragments and tabs from ``rawText``."""
    text = re.sub(r'[\n\r]+', '', rawText)
    text = _TAG_RE.sub('', text)
    return re.sub(r'[\t]+', '', text)

### getMetaData

In [None]:
# getMetaData----------------------------------------------------------------------
def getMetaData(linkURL, defaultYear="2019"):
    """Scrape the title, date and tags of the article at ``linkURL``.

    Args:
        linkURL: Absolute URL of the article page.
        defaultYear: Year appended to a ``b-material-date`` value that is
            shorter than ten characters.

    Returns:
        A tuple ``(title, date, tags)``:

        * title - text of the first ``<h1>`` without newlines and
          surrounding whitespace, or ``"No title"``.
        * date - a ``YYYY-MM-DD`` string, or ``"No date listed"`` when no
          known date element is found.
        * tags - a string when the ``b-material-after-body`` element
          exists, a set of strings when ``tags-list__item`` entries exist,
          otherwise ``"No tags listed"``.
    """
    # The context manager closes the HTTP response once it has been read.
    with urlopen(linkURL) as response:
        # The parser is named explicitly so the result does not depend on
        # which parsers happen to be installed; the file already uses lxml.
        bsObj = BeautifulSoup(response.read(), "lxml")

    # Get the article title
    title = bsObj.h1
    if title is None:
        title_return = "No title"
    else:
        title_return = re.sub(r'[\n]+', '', title.get_text()).strip()

    date_return = _extractDate(bsObj, defaultYear)
    tag_return = _extractTags(bsObj)

    return title_return, date_return, tag_return


# Russian genitive month names as they appear in word-form dates.
_RU_MONTHS = {
    "января": "01",
    "февраля": "02",
    "марта": "03",
    "апреля": "04",
    "мая": "05",
    "июня": "06",
    "июля": "07",
    "августа": "08",
    "сентября": "09",
    "октября": "10",
    "ноября": "11",
    "декабря": "12",
}

# "<day> <month word> <year>", with a one- or two-digit day.
_WORD_DATE_RE = re.compile(r'(\d{1,2})\s+([^\W\d_]+)\s+(\d{4})')


def _extractDate(bsObj, defaultYear):
    """Return the article date as YYYY-MM-DD from the first known date element."""
    # Date given in numerical form
    date = bsObj.find("div", {"class": "b-material-date"})
    if date is not None:
        # Drops the last 17 characters of the element text.
        # NOTE(review): presumably a trailing time/label - confirm against
        # the live markup.
        raw = date.get_text()[:-17]
        if len(raw) < 10:
            raw = raw + "." + str(defaultYear)
        return raw[6:10] + "-" + raw[3:5] + "-" + raw[0:2]

    # Date given in word form
    date = bsObj.find("div", {"class": "article__date"})
    if date is not None:
        return _parseWordDate(re.sub(r'[\n]+', '', date.get_text()))

    # Date given as dd.mm.yyyy at the start of the element text
    date = bsObj.find("span", {"class": "article__date"})
    if date is not None:
        raw = re.sub(r'[\n]+', '', date.get_text()).strip()[0:10]
        return raw[6:10] + "-" + raw[3:5] + "-" + raw[0:2]

    return "No date listed"


def _parseWordDate(rawText):
    """Convert a "<day> <month word> <year>" text into YYYY-MM-DD."""
    match = _WORD_DATE_RE.search(rawText)
    if match:
        day, month_word, year = match.groups()
        # An unknown month word is passed through unchanged.
        month = _RU_MONTHS.get(month_word.lower(), month_word)
        return year + "-" + month + "-" + day.zfill(2)

    # Fall back to fixed-position slicing (which assumes a one-digit day)
    # when the text does not have the expected shape.
    trimmed = rawText[:-5].lstrip()
    day = "0" + trimmed[0:1]
    year = trimmed[-4:]
    month = trimmed[2:-5]
    month = _RU_MONTHS.get(month, month)
    return year + "-" + month + "-" + day


def _extractTags(bsObj):
    """Return the article tags as a string, a set of strings, or a placeholder."""
    block = bsObj.find(class_="b-material-after-body")
    if block is not None:
        tag_text = re.sub(r'[\n]+', '', block.get_text())
        return re.sub(r'Метки: ', '', tag_text)

    items = bsObj.find_all("li", {"class": "tags-list__item"})
    if not items:
        return "No tags listed"

    return {re.sub(r'[\n]+', '', item.get_text()) for item in items}

### scrapeVesti

In [None]:
# scrapeVesti----------------------------------------------------------------
def scrapeVesti(query, startDate, endDate,
                outputPath="Vesti_data_CC.txt", badLinks=None):
    """Scrape every article matching the search and export it to a text file.

    Calls ``getLinks`` for the article URLs, then ``getMetaData`` and
    ``getText`` for each one, printing progress along the way. The results
    are written with ``*`` as the field delimiter.

    Args:
        query: Search keyword, passed to ``getLinks``.
        startDate: Start of the date range, passed to ``getLinks``.
        endDate: End of the date range, passed to ``getLinks``.
        outputPath: Path of the file to write.
        badLinks: Optional collection of countdown positions (the
            "N links to go" numbers printed during a run) to skip.
    """
    skipPositions = set(badLinks) if badLinks else set()
    pullList = getLinks(query, startDate, endDate)

    # Lists to hold scraped information
    URLsLst = []
    metadataLst = []
    textLst = []
    dateLst = []

    print(" ")
    print(" ")
    print("Scraping article text.")

    totalLinks = len(pullList)

    for position, link in enumerate(pullList):
        # Counts down from the total to 1, as shown in the progress output.
        linksToGo = totalLinks - position

        if linksToGo in skipPositions:
            print("Bad link - skipping")
            continue

        print(str(linksToGo) + " links to go.")
        print(link)
        URLsLst.append(link)
        print(" ")

        metadata = getMetaData(link)
        print(metadata)
        metadataLst.append(metadata)
        # The date is the second element of the metadata tuple.
        dateLst.append(metadata[1])
        print(" ")

        articleText = getText(link)
        print(articleText)
        textLst.append(articleText)
        print(" ")

    # Create an empty data frame
    print(" ")
    print(" ")
    print("Building data frame...")
    df = pd.DataFrame()

    # Add lists to data frame
    df["URLS"] = URLsLst
    df["metadata"] = metadataLst
    df["content"] = textLst
    df["date"] = dateLst

    print(df)
    print(" ")
    print(" ")
    print("Exporting data...")

    # Export data frame as a text file.
    # When importing into Excel, select UTF 8 and * as delim.
    df.to_csv(outputPath, sep='*', index=False, encoding="utf-8")
    

# Edit Parameters and Run Scraper

In [None]:
# Change these
query = "sample query"
startDate = "01.01.2016"
endDate = "31.08.2019"

# Guarded so the scrape only starts when this is run directly (a notebook
# cell or a script), not when the definitions are imported.
if __name__ == "__main__":
    scrapeVesti(query, startDate, endDate)