# Scraping article content

In [1]:
import numpy as np
import pandas as pd
import requests
import re
import time
import openai
import redis
import json
import sqlite3
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import bs4

In [2]:
class TertamListCrawler(scrapy.Spider):
    """Scrape tert.am news *list* pages whose paths are queued in Redis.

    Page paths are popped from the right end of the Redis list
    ``to_fetch_list_tertam`` and appended to ``self.domain`` to form the
    request URL. Every article entry found on a fetched page is pushed, as a
    JSON object with the keys ``id``, ``headline``, ``date`` and
    ``categories``, onto the left end of ``to_store_list_tertam``.
    """

    name = "MovsisyansNewsCrawler"

    custom_settings = dict(
        # set a download delay to avoid DOSing
        DOWNLOAD_DELAY=0.08,
    )

    # Path prefix removed from article hrefs to build the stored ``id``.
    _LINK_PREFIX = "/en/news/"
    # Date format as rendered on the list page, e.g. "16:12 • 03.03.10".
    _SOURCE_DATE_FORMAT = "%H:%M • %d.%m.%y"
    # Date format written to Redis.
    _STORED_DATE_FORMAT = "%Y/%m/%d %H:%M:%S"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.domain = r"https://www.tert.am/en/news/"
        self.r = redis.Redis("localhost", 5211, decode_responses=True)

    def start_requests(self):
        """Yield one request per path popped from ``to_fetch_list_tertam``."""
        while (to_fetch := self.r.rpop("to_fetch_list_tertam")) is not None:
            yield scrapy.Request(url=self.domain + to_fetch, callback=self.parse_article)

    def parse_article(self, response):
        """Extract every article entry of a list page and queue it for storage.

        Entries that cannot be parsed are logged and skipped, so one malformed
        ``<li>`` does not discard the rest of a page whose path has already
        been removed from the fetch queue.
        """
        # Collapse all whitespace runs so extracted strings are single-spaced.
        resp_text = " ".join(response.text.split())

        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed and emits a warning. lxml is a Scrapy
        # dependency, so it is always available here.
        soup = bs4.BeautifulSoup(resp_text, "lxml")

        for article in soup.find_all("li", {"class": "list__item"}):
            try:
                record = self._parse_list_item(article)
            except ValueError as err:
                self.logger.warning("Skipping list item on %s: %s", response.url, err)
                continue
            self.r.lpush("to_store_list_tertam", json.dumps(record))

    def _parse_list_item(self, article):
        """Return the storage record for one ``<li class="list__item">`` tag.

        Raises:
            ValueError: if the title, date or link element is missing, or the
                date text does not match the expected format.
        """
        title_tag = article.find("span", {"class": "list__title db fb fs15"})
        date_tag = article.find("span", {"class": "list__date ver-top-box fb fs12"})
        link_tag = article.find("a", {"class": "list__link db"})
        if title_tag is None or date_tag is None or link_tag is None:
            raise ValueError("missing title, date or link element")

        href = link_tag.attrs.get("href")
        date_text = date_tag.find(string=True)
        if href is None or date_text is None:
            raise ValueError("missing href attribute or date text")

        # strptime raises ValueError itself when the text has another format.
        published = datetime.strptime(date_text, self._SOURCE_DATE_FORMAT)

        category_tags = article.find_all(
            "a", {"class": "list__category fb fs12 ver-top-box tu"}
        )
        category_names = ("".join(tag.find_all(string=True)) for tag in category_tags)

        return {
            # removeprefix, not lstrip: lstrip treats its argument as a *set*
            # of characters and would also eat the start of a slug beginning
            # with "/", "e", "n", "w" or "s".
            "id": href.removeprefix(self._LINK_PREFIX),
            "headline": title_tag.find(string=True),
            "date": published.strftime(self._STORED_DATE_FORMAT),
            "categories": ", ".join(name for name in category_names if name),
        }
        

In [3]:
# Run the list-page spider in-process: create a crawler process, register
# TertamListCrawler with it, then start the crawl.
proc = CrawlerProcess()
proc.crawl(TertamListCrawler)
proc.start()

2023-11-25 14:19:35 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scrapybot)
2023-11-25 14:19:35 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.11.3 (tags/v3.11.3:f3909b8, Apr  4 2023, 23:49:59) [MSC v.1934 64 bit (AMD64)], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.5, Platform Windows-10-10.0.22621-SP0
2023-11-25 14:19:35 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-11-25 14:19:35 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-11-25 14:19:35 [scrapy.extensions.telnet] INFO: Telnet Password: 213c944f4ffd87b9
2023-11-25 14:19:36 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.lo

In [29]:
# Prototype of the list-page parsing, run against a single day's archive page.
# A timeout keeps the cell from hanging forever if the server stops responding.
response = requests.get("https://www.tert.am/en/news/2010/03/04/", timeout=30)
# Collapse all whitespace runs so extracted strings are single-spaced.
resp_text = " ".join(response.text.split())
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed (and bs4 does not warn about guessing one).
soup = bs4.BeautifulSoup(resp_text, "lxml")

articles = soup.find_all("li", {"class": "list__item"})

titles = [
    article.find("span", {"class": "list__title db fb fs15"}).find(string=True)
    for article in articles
]
# Dates appear as e.g. "16:12 • 03.03.10"; normalise to "YYYY/MM/DD HH:MM:SS".
dates = [
    datetime.strptime(
        article.find("span", {"class": "list__date ver-top-box fb fs12"}).find(string=True),
        "%H:%M • %d.%m.%y",
    ).strftime("%Y/%m/%d %H:%M:%S")
    for article in articles
]
# Join the non-empty category labels of each article into one string.
categories = [
    ", ".join(
        label
        for label in (
            "".join(tag.find_all(string=True))
            for tag in article.find_all("a", {"class": "list__category fb fs12 ver-top-box tu"})
        )
        if label
    )
    for article in articles
]
# removeprefix, not lstrip: lstrip treats its argument as a *set* of characters
# and would also eat the start of a slug beginning with "/", "e", "n", "w" or "s".
links = [
    article.find("a", {"class": "list__link db"}).attrs["href"].removeprefix("/en/news/")
    for article in articles
]

links, categories, dates, titles

(['2010/03/04/water/114537',
  '2010/03/04/disabled/114558',
  '2010/03/04/aliyev/114641',
  '2010/03/04/100steps/114488',
  '2010/03/04/energy/114607',
  '2010/03/04/armanmusinyan/114534',
  '2010/03/04/petrol/114565',
  '2010/03/04/pashinyan/114458',
  '2010/03/04/imam/114524',
  '2010/03/04/voting/114512',
  '2010/03/04/evarivas/114454',
  '2010/03/04/ancrally/114445',
  '2010/03/04/shiff/114440',
  '2010/03/04/olympics/114430',
  '2010/03/04/lgbtwashington/1258869',
  '2010/03/04/oneline/114413',
  '2010/03/04/e-govt/1258864',
  '2010/03/04/compilation/1258857',
  '2010/03/04/rubenhayrapetyan/114420',
  '2010/03/04/statement/1258848',
  '2010/03/04/belarus/114396',
  '2010/03/04/dalailama/1258839',
  '2010/03/04/genocide/114380',
  '2010/03/04/obamagul/114335',
  '2010/03/04/protocols/1258829',
  '2010/03/04/delegation/1258815',
  '2010/03/04/obamahealthcare/1258813',
  '2010/03/04/auction/114337'],
 ['Economy',
  '',
  '',
  '',
  '',
  '',
  'Economy',
  'Event',
  'Event',
  'Ev

In [15]:
# Show the raw date text of the first list item, i.e. the string that the
# strptime format has to match.
first_date_tag = articles[0].find("span", {"class": "list__date ver-top-box fb fs12"})
first_date_tag.find(string=True)

'16:12 • 03.03.10'

In [None]:
# Scratch selector for the article body (this cell has no execution count).
# NOTE(review): `.last-child` is a CSS *class* selector; if the `:last-child`
# pseudo-class was intended, the selector needs a colon instead - confirm.
# NOTE(review): `response` is assigned from requests.get() in an earlier cell,
# and that object has no `.css` method; presumably this is meant for a Scrapy
# response inside a spider callback - confirm.
response.css('span[itemprop="articleBody"].last-child')