#### Import all the necessary libs:

In [1]:
import requests
from lxml import html
from datetime import datetime
import pymongo
import pprint

#### Function that downloads the Yandex News page and returns it as a DOM object:

In [2]:
def get_dom_from_url(session, timeout=10):
    """Download the Yandex News front page and parse it into an lxml DOM.

    Args:
        session: a ``requests.Session`` (or any object with a compatible
            ``get`` method) used to perform the HTTP request.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The parsed document root, or ``None`` when the page could not be
        downloaded or parsed (the error is printed, not raised).
    """
    url = "https://yandex.ru/news/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
    }
    try:
        response = session.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
        return html.fromstring(response.text)
    except Exception as e:
        # Deliberately broad: both network errors and parser errors are
        # reported the same way and turned into a None result.
        print(f"An error occurred while connecting the URL {url} - {e}")
        return None

#### Function that opens a mongodb connection and returns a collection:

In [3]:
def connect_to_mongodb():
    """Open the local MongoDB instance and return the news collection.

    Uses the server at 127.0.0.1:27017, database ``YandexNews``, collection
    ``news``, and makes sure a unique index named ``unique_index`` exists on
    the ``title`` field.

    Returns:
        The ``news`` collection, or ``None`` when a
        ``pymongo.errors.ConnectionFailure`` is raised (the error is printed).
    """
    try:
        mongo_client = pymongo.MongoClient('127.0.0.1', 27017)
        news_collection = mongo_client['YandexNews']['news']
        # Uniqueness is enforced on the title; the link did not work reliably
        # as a key. A compound index (time + title) might be an alternative.
        news_collection.create_index(
            [('title', pymongo.ASCENDING)],
            name="unique_index",
            unique=True,
        )
    except pymongo.errors.ConnectionFailure as e:
        print(f"Mongodb connection has failed. {e}")
        return None
    return news_collection

#### Function that inserts items into mongodb

In [4]:
def insert_new_items(collection, document):
    """Insert one document and count it in the module-level ``inserts`` counter.

    Args:
        collection: object exposing ``insert_one`` (a pymongo collection).
        document: the dict to store.

    Returns:
        ``True`` when the document was inserted, ``False`` when it was
        rejected as a duplicate (``pymongo.errors.DuplicateKeyError``).
    """
    global inserts
    try:
        collection.insert_one(document)
    except pymongo.errors.DuplicateKeyError:
        # Already stored on a previous run - not an error, just not counted.
        return False
    # Start from 0 if nobody initialised the counter yet, so that a
    # successful insert is never followed by a NameError.
    inserts = globals().get('inserts', 0) + 1
    return True

#### Function that returns a full datetime using received hours and minutes from yandex:

In [5]:
def to_full_datetime(received_time, now=None):
    """Turn an ``HH:MM`` string into a full ``datetime``.

    The date is taken from ``now``. If the resulting moment would lie in the
    future (e.g. ``"23:50"`` seen at 00:05) the article is assumed to be from
    the previous day and the date is moved back by one day.

    Args:
        received_time: time of day as ``"HH:MM"``; surrounding whitespace is
            ignored.
        now: reference moment; defaults to ``datetime.now()``.

    Returns:
        A ``datetime`` with seconds and microseconds set to zero.

    Raises:
        ValueError: if ``received_time`` is not in ``HH:MM`` format.
    """
    from datetime import timedelta  # local import: the file only imports `datetime`

    if now is None:
        now = datetime.now()
    time_of_day = datetime.strptime(received_time.strip(), "%H:%M").time()
    full_datetime = datetime.combine(now.date(), time_of_day)
    # A few minutes of slack so that small clock differences between this
    # machine and the site do not push a fresh article to yesterday.
    if full_datetime - now.replace(tzinfo=None) > timedelta(minutes=5):
        full_datetime -= timedelta(days=1)
    return full_datetime

#### Function that scrapes news from the DOM using XPath:

In [6]:
def scrap_news(session, collection):
    """Scrape article cards from the news page and store new ones.

    Resets the module-level ``inserts`` counter, downloads the page via
    ``get_dom_from_url``, extracts title, source, link and publication time
    from every matching card and passes each record to ``insert_new_items``.
    Cards that lack one of the expected elements, or whose time cannot be
    parsed, are skipped instead of aborting the whole run.

    Args:
        session: HTTP session forwarded to ``get_dom_from_url``.
        collection: target collection forwarded to ``insert_new_items``.
    """
    global inserts
    inserts = 0
    dom = get_dom_from_url(session)
    if dom is None:
        # The download helper has already reported the reason.
        print("No page to parse - nothing was scraped.")
        return
    news = dom.xpath("//div[contains(@class,'mg-card_flexible') and (contains(@class,'fixed') or (contains(@class,'stretching')))]")
    skipped = 0
    for article in news:
        # First anchor text is the title, second one is the source name.
        anchor_texts = article.xpath(".//a/text()")
        links = article.xpath(".//a/@href")
        times = article.xpath(".//span[@class='mg-card-source__time']/text()")
        if len(anchor_texts) < 2 or not links or not times:
            skipped += 1
            continue
        try:
            time_published = to_full_datetime(times[0])
        except ValueError:
            # Time text is not in the HH:MM form the converter understands.
            skipped += 1
            continue
        article_data = {
            'title': anchor_texts[0].replace('\xa0', ' '),
            'source': anchor_texts[1],
            'link': links[0],
            'time_published': time_published,
        }
        insert_new_items(collection, article_data)
    print(f"Found {len(news)} articles. {inserts} new articles added to mongodb")
    if skipped:
        print(f"Skipped {skipped} cards with missing or unparseable fields")


#### Main function:



In [7]:
def main():
    """Connect to MongoDB, scrape the news page once and store new articles.

    Does nothing further when the database connection could not be
    established. The HTTP session and the MongoDB client are closed when the
    run is over.
    """
    collection = connect_to_mongodb()  # connect to mongodb and open collection
    # Explicit `is None`: pymongo collections do not support truth-value testing.
    if collection is None:
        return
    try:
        with requests.Session() as session:  # session is closed on exit
            scrap_news(session, collection)  # scrape news
    finally:
        collection.database.client.close()

In [10]:
# Run the scraper once: connect to MongoDB, open an HTTP session and scrape the news.
main()

Found 0 articles. 0 new articles added to mongodb
