# RIA.ru Web Scraper: About

In [None]:
# Based on keyword and date parameters, scrapes article text from RIA.ru 
# Script takes a keyword in English or Russian, 
# two four-digit years, and two one- or two-digit months
# Edit the parameters in the last code block
# The scraper saves the data as a .txt file
# When importing into Excel, select UTF 8 and * as delim

# Functions

In [None]:
import math
import re
from urllib.error import HTTPError
from urllib.parse import quote
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

# getLinks----------------------------------------------------------------
# Function takes a keyword, two four-digit years, and two one- or two-digit months

def _fetchSoup(url):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree."""
    # The context manager closes the HTTP response once it has been parsed.
    with urlopen(url) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        return BeautifulSoup(response, "html.parser")


def _filterLinksByDate(links, yearStart, monthStart, yearEnd, monthEnd):
    """Return the links whose YYYYMM stamp lies in the inclusive month range."""
    # Left-pad one-digit months with "0" so year + month forms a YYYYMM number.
    startParameter = int(str(yearStart) + str(monthStart).zfill(2))
    endParameter = int(str(yearEnd) + str(monthEnd).zfill(2))

    linksFiltered = []
    for link in links:
        # The six characters at index 15-20 of a link are read as its YYYYMM
        # stamp. Links that carry anything else there are skipped instead of
        # crashing the integer conversion.
        stamp = link[15:21]
        if not re.fullmatch(r"[0-9]{6}", stamp):
            continue
        if startParameter <= int(stamp) <= endParameter:
            linksFiltered.append(link)
    return linksFiltered


def getLinks(query, yearStart, monthStart, yearEnd, monthEnd):
    """Collect RIA.ru search-result links for a keyword within a month range.

    Args:
        query: Search keyword, in English or Russian.
        yearStart: Four-digit year of the first month to keep.
        monthStart: One- or two-digit number of the first month to keep.
        yearEnd: Four-digit year of the last month to keep.
        monthEnd: One- or two-digit number of the last month to keep.

    Returns:
        A list of the ``data-url`` values found on the result pages whose
        YYYYMM stamp falls within the requested range (both ends inclusive).

    Raises:
        SystemExit: If the first results page answers with an HTTP error, has
            no ``span`` tag, or shows no readable hit count.
        urllib.error.URLError: If a request fails for any other reason, or a
            later results page answers with an HTTP error.
    """
    # Number of results the hit count is divided by, and the offset step.
    pageSize = 20

    # Percent-encode the query so non-ASCII keywords are valid inside a URL.
    searchURL = f"https://ria.ru/search/?query={quote(query)}"

    # Pull the first results page while anticipating an HTTP error.
    try:
        bsObj = _fetchSoup(searchURL)
    except HTTPError as e:
        # Without the first page there is nothing to collect, so stop the
        # same way the other failure checks below do.
        print(e)
        raise SystemExit

    print("URL found.")

    if bsObj.span is None:
        print("Tag was not found. Exiting program.")
        raise SystemExit
    print("Tag found.")

    # Find and print the number of hits. Everything except digits is stripped
    # from the counter text before converting it.
    hitsTag = bsObj.find("div", {"class": "rubric-count m-active"})
    hitsDigits = "" if hitsTag is None else re.sub("[^0-9]", "", hitsTag.get_text())
    if not hitsDigits:
        print("Hit count was not found. Exiting program.")
        raise SystemExit
    totalHits = int(hitsDigits)
    print("There are " + str(totalHits) + " hits for that key word combination.")
    print("Begin collecting relevant article links.")

    # Find the total number of result pages to go through.
    cycles = math.ceil(totalHits / pageSize)
    print("This will take " + str(cycles) + " cycles.")

    links = []
    for page in range(cycles):
        print(str(cycles - page) + " cycles left.")

        # The first page is already downloaded. Every later page is requested
        # with an offset appended to the unmodified search URL, so offsets
        # never pile up in the query string and no page beyond the last one
        # is fetched.
        if page > 0:
            bsObj = _fetchSoup(searchURL + "&offset=" + str(page * pageSize))

        # Collect page links based on span tag and data-url attribute.
        for span in bsObj.find_all("span"):
            if 'data-url' in span.attrs:
                links.append(span.attrs['data-url'])

    print("There are " + str(len(links)) + " relevant articles before filtering by date.")
    print(" ")
    print(" ")
    print(" ")
    print("Now we will filter by date...")

    # A single month is simply a range whose start and end are equal.
    linksFiltered = _filterLinksByDate(links, yearStart, monthStart, yearEnd, monthEnd)

    print(" ")
    print(" ")
    print("There are " + str(len(linksFiltered)) + " relevant articles after filtering by date.")
    return linksFiltered


# getMetaData----------------------------------------------------------------------
def getMetaData(linkURL):
    """Return the title, modification date and tags of one article page.

    Args:
        linkURL: URL of the article to download.

    Returns:
        A ``(title, date, tags)`` tuple of strings. Each item is taken from
        the last element on the page that matches its selector; when nothing
        matches, the item is "No title", "No date listed" or "No tags listed"
        respectively. The date is cut to its first ten characters.

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
    """
    # Close the HTTP response as soon as the page body has been read, and name
    # the parser so the result does not depend on what else is installed.
    with urlopen(linkURL) as response:
        bsObj = BeautifulSoup(response.read(), "html.parser")

    def lastMatchText(attrs):
        # Text of the last element carrying these attributes, tags stripped;
        # None when the page has no such element.
        matches = bsObj.find_all(attrs=attrs)
        return matches[-1].get_text() if matches else None

    # Get the article title
    title_return = lastMatchText({"class": "article__title"})
    if title_return is None:
        title_return = "No title"

    # Get the article date; only the first ten characters are kept
    dateText = lastMatchText({"itemprop": "dateModified"})
    date_return = "No date listed" if dateText is None else dateText[0:10]

    # Get article tags
    tag_return = lastMatchText({"class": "article__tags"})
    if tag_return is None:
        tag_return = "No tags listed"

    # Return items
    return title_return, date_return, tag_return


# getText------------------------------------------------------------------------

def getText(linkURL):
    """Return the body text of one article page as a single line.

    Args:
        linkURL: URL of the article to download.

    Returns:
        A space followed by the text of every element with the
        ``article__text`` class, joined in page order, with all line breaks
        removed. When the page has no such element, the string
        "Text under a different tag" is returned instead.

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
    """
    # Close the HTTP response as soon as the page body has been read, and name
    # the parser so the result does not depend on what else is installed.
    with urlopen(linkURL) as response:
        bsObj = BeautifulSoup(response.read(), "html.parser")

    # Get the article text
    textList = bsObj.find_all(attrs={"class": "article__text"})

    if not textList:
        text_return = "Text under a different tag"
    else:
        # get_text strips all markup; the leading space matches the prefix
        # the scraped text has always been written with.
        text_return = " " + "".join(text.get_text() for text in textList)

    # Drop line breaks so each article stays on one row of the export.
    return re.sub(r'[\n\r]+', '', text_return)
 
# scrapeRIA----------------------------------------------------------------
def scrapeRIA(query, yearStart, monthStart, yearEnd, monthEnd):
    """Scrape every matching RIA.ru article and export the results.

    Finds the article links with getLinks, downloads the metadata and text of
    each one with getMetaData and getText, prints progress along the way, and
    writes the collected rows to "RIA_data.txt" with "*" as the delimiter.

    Args:
        query: Search keyword, in English or Russian.
        yearStart: Four-digit year of the first month to keep.
        monthStart: One- or two-digit number of the first month to keep.
        yearEnd: Four-digit year of the last month to keep.
        monthEnd: One- or two-digit number of the last month to keep.

    Returns:
        None. The result is the exported file.
    """
    # Use getLinks to retrieve all the links and save to list pullList
    pullList = getLinks(query, yearStart, monthStart, yearEnd, monthEnd)

    # Create empty lists to hold scraped content
    metadataLst = []
    textLst = []
    dateLst = []

    print(" ")
    print(" ")
    print("Scraping article text.")

    for link in pullList:
        print(link)
        print(" ")

        metadata = getMetaData(link)
        print(metadata)
        metadataLst.append(metadata)
        print(" ")

        text2 = getText(link)
        print(text2)
        textLst.append(text2)
        print(" ")

        # Characters 15-22 of the link are read as YYYYMMDD and reformatted
        # as YYYY-MM-DD.
        dateLst.append(link[15:19] + "-" + link[19:21] + "-" + link[21:23])

    print(" ")
    print(" ")
    print("Building data frame...")

    # One row per article; the metadata column holds the
    # (title, date, tags) tuple returned by getMetaData.
    df = pd.DataFrame({
        "URLS": list(pullList),
        "metadata": metadataLst,
        "content": textLst,
        "date": dateLst,
    })

    print(df)
    print(" ")
    print(" ")
    print("Exporting data...")

    # Export data frame as a text file.
    # When importing into Excel, select UTF 8 and * as delim.
    df.to_csv("RIA_data.txt", sep='*', index=False, encoding="utf-8")

  

# Edit Parameters and Run Scraper

In [None]:
# Change these
# Years are four-digit numbers; months are one- or two-digit numbers (1-12).
# The example below covers January 2020 through April 2020.

query = "sample query"
yearStart = 2020
monthStart = 1
yearEnd = 2020
monthEnd = 4

# Function takes a keyword, two four-digit years, and two one- or two-digit months

scrapeRIA(query, yearStart, monthStart, yearEnd, monthEnd) 


