# Telling scraper to scrape articles

In [16]:
import numpy as np
import pandas as pd
import requests
import re
import sqlite3
import time
import openai
import redis
import datetime
import json

In [2]:
# Shared Redis client used by every queue operation in this notebook.
# decode_responses=True makes the client return str instead of bytes.
r = redis.Redis(host="localhost", port=5211, decode_responses=True)

In [12]:
# Open the SQLite file "scraping.db" (created if it does not exist yet).
# Later cells share this single connection through the name `db`.
db = sqlite3.connect("scraping.db")
def create_article_table(db):
    """Reset the ``article`` table on the given SQLite connection.

    Any existing ``article`` table is dropped together with its rows, then an
    empty one is created with the columns id (integer primary key), headline,
    time, content, feedback, region and theme. A commit follows each statement.

    Args:
        db: an open ``sqlite3`` connection.
    """
    drop_sql = "drop table if exists article"
    create_sql = """
    create table if not exists article (id integer primary key, headline varchar(255), time timestamp, content text, feedback text, region varchar(255), theme varchar(128));
    """

    # Drop first, create second; commit after each step.
    for statement in (drop_sql, create_sql):
        db.execute(statement)
        db.commit()

In [None]:
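# Deliberately disabled: create_article_table drops the existing `article` table
# (and every stored row) before recreating it. Uncomment only to start from scratch.
# create_article_table(db)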

In [7]:
# Push one "year/month/day/" entry per calendar day onto the Redis list
# "to_fetch_list_newsam", starting at 2009-06-25 and stopping once the date
# passes the current time. Presumably the external scraper consumes this
# list -- verify against the scraper.
initial_date = datetime.datetime(2009, 6, 25)
on_date = initial_date

while True:
    if on_date > datetime.datetime.now():
        break
    # Progress indicator: the year being enqueued, overwritten in place.
    print(on_date.year, end="\r")
    day_path = f"{on_date.year}/{on_date.month}/{on_date.day}/"
    r.lpush("to_fetch_list_newsam", day_path)
    on_date = on_date + datetime.timedelta(days=1)

2023

In [47]:
# Continue execution of the notebook only after all article lists have been
# scraped, i.e. once "to_fetch_list_newsam" is empty. Polls once per second.
while True:
    if r.llen("to_fetch_list_newsam") == 0:
        break
    time.sleep(1)

In [10]:
# Current lengths of the (to-fetch, to-store) list queues, displayed as a tuple.
tuple(r.llen(queue) for queue in ("to_fetch_list_newsam", "to_store_list_newsam"))

(0, 219417)

## Storing scraped list of articles to scrape

In [30]:
# datetime.datetime.now().strftime("%z")
# Sanity-check the date parsing on one queued record.
# rpoplpush with the same source and destination moves the tail element of
# "to_store_list_newsam" to its head, so the record stays queued but the list
# order is rotated by one on every run of this cell.
datetext = json.loads(r.rpoplpush("to_store_list_newsam", "to_store_list_newsam"))["date"]
# Characters 0-9 hold the date and everything from index 13 on holds the time;
# the three characters in between are skipped (e.g. the "+05" in
# "2009-06-25+0513:16:00" -- presumably a UTC offset, TODO confirm).
date_part, time_part = datetext[:10], datetext[13:]
datetime.datetime.strptime(date_part + " " + time_part, "%Y-%m-%d %H:%M:%S")

datetime.datetime(2009, 6, 30, 7, 46)

In [44]:
# Drain "to_store_list_newsam" into the `article` table (id, headline, time).
#
# Records are removed from Redis by rpop *before* they are written to SQLite,
# so any insert that is not committed is lost for good. The loop therefore
# commits every 1000 records and, via `finally`, once more when it ends for
# any reason (queue empty, KeyboardInterrupt, unexpected exception).
#
# NOTE(review): `n` is a hard-coded starting value for the progress counter --
# presumably where an earlier, interrupted run stopped. Use 0 for a fresh run.
n = 104000
try:
    while True:
        if (data := r.rpop("to_store_list_newsam")) is None:
            print(f"{n} Nothing to store, breaking", end="\r")
            break

        if n % 1000 == 0:
            print(datetime.datetime.now(), " -- ", n)
            # Persist the batch written so far; bounds the loss if the kernel dies.
            db.commit()

        data = json.loads(data)
        # print(data, end="\r")

        # `not` covers both an empty string and a null headline.
        if not data["headline"]:
            print("ERROR, NO HEADLINE")
            continue
        # A null id would pass the length check as the string "None" and then
        # crash in int(); reject it here together with the empty case.
        if data["id"] is None or len(str(data["id"])) == 0:
            print("ERROR, NO ID")
            continue

        n += 1  # counts every valid record, including ones that turn out to be duplicates

        # The date string carries the date in chars 0-9 and the time from index
        # 13 on; the three characters in between are dropped (e.g. the "+05" in
        # "2009-06-25+0513:16:00" -- presumably a UTC offset, TODO confirm).
        stamp = datetime.datetime.strptime(
            data["date"][:10] + " " + data["date"][13:], "%Y-%m-%d %H:%M:%S"
        )
        try:
            # isoformat(" ") yields the same "YYYY-MM-DD HH:MM:SS" text that
            # sqlite3's default datetime adapter produced, without relying on
            # that adapter (deprecated since Python 3.12).
            db.execute(
                "insert into article (id, headline, time) values (?, ?, ?)",
                (int(data["id"]), data["headline"], stamp.isoformat(" ")),
            )
        except sqlite3.IntegrityError:
            # Primary-key clash: this id is already stored. Keep what has been
            # inserted so far and move on to the next record.
            db.commit()
            print(data["id"], " not unique", end="\r")
finally:
    # Always persist the rows inserted since the last commit.
    db.commit()

2023-11-23 20:13:23.615981  --  104000
2023-11-23 20:13:24.188400  --  105000
2023-11-23 20:13:24.741999  --  106000
2023-11-23 20:13:25.329674  --  107000
2023-11-23 20:13:25.924828  --  108000
2023-11-23 20:13:26.460388  --  109000
2023-11-23 20:13:27.147735  --  110000
2023-11-23 20:13:27.884745  --  111000
2023-11-23 20:13:28.607982  --  112000
2023-11-23 20:13:29.342927  --  113000
2023-11-23 20:13:30.074355  --  114000
2023-11-23 20:13:30.867951  --  115000
2023-11-23 20:13:31.655613  --  116000
2023-11-23 20:13:32.404609  --  117000
2023-11-23 20:13:33.238939  --  118000
2023-11-23 20:13:34.267479  --  119000
2023-11-23 20:13:35.705320  --  120000
2023-11-23 20:13:36.978071  --  121000
2023-11-23 20:13:37.818651  --  122000
2023-11-23 20:13:38.567000  --  123000
2023-11-23 20:13:39.357232  --  124000
2023-11-23 20:13:40.178906  --  125000
2023-11-23 20:13:41.086391  --  126000
2023-11-23 20:13:41.915410  --  127000
2023-11-23 20:13:43.063907  --  128000
2023-11-23 20:13:44.11688

KeyboardInterrupt: 

## Telling scraper to scrape content now

In [50]:
# Keep the Redis list "to_fetch_newsam" topped up with ids of articles whose
# content is still NULL, until no such article remains.
while True:
    # Enough work queued already: idle for a second and report the backlog.
    if r.llen("to_fetch_newsam") >= 2048:
        time.sleep(1)
        print(f"Waiting for it to fetch ({r.llen('to_fetch_newsam')}), idling...", end="\r")
        continue

    # Random sample of up to 1024 articles that have no content yet.
    cur = db.execute("select id, content from article where content is null order by RANDOM() limit 1024")
    db.commit()
    cur = cur.fetchall()

    if not cur:
        print("No more articles to fetch")
        break

    for article_id, _content in cur:
        r.lpush("to_fetch_newsam", int(article_id))

Waiting for it to fetch (2749), idling...

In [45]:
# Total number of rows currently in the article table, shown as the cell output.
cur = db.execute("select count(*) from article")
db.commit()

l = cur.fetchall()
l

[(219401,)]

In [None]:
# db.execute("update article set headline = 'None'")
# db.commit()

In [6]:
# r.lpop("to_fetch_list_newsam", r.llen("to_fetch_list_newsam"))
# r.lpop("to_store_list_newsam", r.llen("to_store_list_newsam"))

['[{"id": 169, "headline": "War not over for Azerbaijan", "date": "2009-06-25+0513:16:00"}, {"id": 166, "headline": "Armenians no more move abroad for an employment, but do not rush back either", "date": "2009-06-25+0512:37:00"}]']

In [46]:
# cur = db.execute("select * from article limit 100")
# db.commit()

# (l := list(cur))

[(166,
  'Armenians no more move abroad for an employment, but do not rush back either',
  '2009-06-25 12:37:00',
  None,
  None,
  None,
  None),
 (169,
  'War not over for Azerbaijan',
  '2009-06-25 13:16:00',
  None,
  None,
  None,
  None),
 (178,
  'Armenian authorities forgot about World Refugee Day',
  '2009-06-26 07:02:00',
  None,
  None,
  None,
  None),
 (187,
  'Mad scramble rather than a brawl',
  '2009-06-26 10:02:00',
  None,
  None,
  None,
  None),
 (193,
  'Expert: Armenian Government makes crisis worse',
  '2009-06-26 10:50:00',
  None,
  None,
  None,
  None),
 (198,
  'Spending Russian money',
  '2009-06-26 11:52:00',
  None,
  None,
  None,
  None),
 (213,
  'Marie Yovanovitch: Genocide denial White House’s policy',
  '2009-06-27 06:52:00',
  None,
  None,
  None,
  None),
 (219,
  'Relatives of sentenced for life addressed the President of Armenia',
  '2009-06-27 10:05:00',
  None,
  None,
  None,
  None),
 (226,
  'Young athletes from Armenia and Nagorno-Karabak