# Telling scraper to scrape articles

In [None]:
import numpy as np
import pandas as pd
import requests
import re
import sqlite3
import time
import openai
import redis
import datetime
import json

In [None]:
# Redis client on localhost:5211. The notebook pushes work onto Redis lists and
# reads results back from them (presumably a separate scraper process sits on
# the other end -- confirm). decode_responses=True makes replies come back as
# str rather than bytes.
r = redis.Redis("localhost", 5211, decode_responses=True)

In [None]:
# SQLite connection to the local file "scraping.db", used by the cells below
# to store and query rows of the `article` table.
db = sqlite3.connect("scraping.db")
def create_article_table(db):
    """Drop any existing ``article`` table and create a fresh, empty one.

    WARNING: destructive -- every row already stored in ``article`` is lost.

    Args:
        db: An open ``sqlite3`` connection. Each statement is committed on it
            right after it runs.

    Returns:
        None.
    """
    drop_sql = "drop table if exists article"
    create_sql = """
    create table if not exists article (id integer primary key, headline varchar(255), time timestamp, content text, feedback text, region varchar(255), theme varchar(128));
    """

    # Run the two DDL statements in order, committing after each one.
    for statement in (drop_sql, create_sql):
        db.execute(statement)
        db.commit()

In [None]:
# create_article_table(db)

In [None]:
# Queue one "year/month/day/" key (no zero padding) on the Redis list
# "to_fetch_list_newsam" for every calendar day from 2009-06-25 up to now.
initial_date = datetime.datetime(2009, 6, 25)
one_day = datetime.timedelta(days=1)
on_date = initial_date

while on_date <= datetime.datetime.now():
    # Overwrite the same console line with the year being queued.
    print(on_date.year, end="\r")
    day_key = f"{on_date.year}/{on_date.month}/{on_date.day}/"
    r.lpush("to_fetch_list_newsam", day_key)
    on_date = on_date + one_day

In [None]:
# Block until the "to_fetch_list_newsam" queue is empty, polling once per
# second, so the rest of the notebook only runs after all article lists have
# been scraped.
while r.llen("to_fetch_list_newsam") != 0:
    time.sleep(1)

In [None]:
# Display the current lengths of the list-fetch queue and the store queue.
r.llen("to_fetch_list_newsam"), r.llen("to_store_list_newsam")

## Storing the scraped lists of articles that still need to be scraped

In [None]:
# Inspect one entry of the store queue without consuming it: RPOPLPUSH with
# the same list as source and destination moves the tail element to the head
# and returns it.
# datetime.datetime.now().strftime("%z")
sample_entry = json.loads(r.rpoplpush("to_store_list_newsam", "to_store_list_newsam"))
datetext = sample_entry["date"]
# Keep the first 10 characters (the date) and everything from index 13 on (the
# time), dropping the 3 characters in between, then parse the result.
datetime.datetime.strptime(" ".join((datetext[:10], datetext[13:])), "%Y-%m-%d %H:%M:%S")

In [None]:
# Drain the Redis list "to_store_list_newsam" into the `article` table.
# Every entry is a JSON object; its "id", "headline" and "date" fields are
# read. Entries with an empty headline or id are reported and skipped, and ids
# that already exist in the table are reported and skipped as well.
#
# `n` is the progress counter; it starts at 104000 rather than 0
# (NOTE(review): presumably resuming an earlier partial run -- confirm).
n = 104000
try:
    while True:
        if (data := r.rpop("to_store_list_newsam")) is None:
            print(f"{n} Nothing to store, breaking", end="\r")
            break

        if n % 1000 == 0:
            print(datetime.datetime.now(), " -- ", n)
        data = json.loads(data)
        # print(data, end="\r")
        if not data["headline"]:
            print("ERROR, NO HEADLINE")
            continue
        if not str(data["id"]):
            # An empty id cannot serve as the primary key of the row.
            print("ERROR, NO ID")
            continue
        n += 1

        # The date string has a 10-character date, a 3-character separator
        # and then the time; rebuild it as "<date> <time>" before parsing.
        published = datetime.datetime.strptime(
            data["date"][:10] + " " + data["date"][13:], "%Y-%m-%d %H:%M:%S"
        )
        try:
            db.execute(
                "insert into article (id, headline, time) values (?, ?, ?)",
                (int(data["id"]), data["headline"], published),
            )
        except sqlite3.IntegrityError:
            # Primary-key clash: this id is already stored.
            db.commit()
            print(data["id"], " not unique", end="\r")
finally:
    # Entries popped from Redis are gone for good, so always persist the rows
    # inserted so far -- also when the loop is interrupted or raises.
    db.commit()
        

## Telling scraper to scrape content now

In [None]:
# Keep the Redis list "to_fetch_newsam" topped up with article ids whose
# content has not been stored yet. Whenever the queue holds fewer than 2048
# ids, up to 1024 randomly chosen ids with NULL content are pushed; otherwise
# the loop idles for a second. It ends once no article with NULL content is
# left.
while True:
    if r.llen("to_fetch_newsam") < 2048:
        cur = db.execute("select id, content from article where content is null order by RANDOM() limit 1024")
        # Flushes any transaction still open on this connection.
        db.commit()

        rows = cur.fetchall()
        if not rows:
            print("No more articles to fetch")
            break

        # One LPUSH with all ids leaves the list in the same order as pushing
        # them one by one, but needs a single round trip.
        r.lpush("to_fetch_newsam", *(int(row[0]) for row in rows))
    else:
        time.sleep(1)
        print(f"Waiting for it to fetch ({r.llen('to_fetch_newsam')}), idling...", end="\r")

In [29]:
# Count the articles that already have feedback and whose headline contains
# 'Russia' or whose content contains 'United States'.
feedback_count_sql = "select count(*) from article where (headline like '%Russia%' or content like '%United States%') and feedback is not null"
cur = db.execute(feedback_count_sql)
db.commit()

# Materialise the result rows and show them as the cell output.
l = cur.fetchall()
l

[(5353,)]

In [None]:
# db.execute("update article set headline = 'None'")
# db.commit()

In [None]:
# r.lpop("to_fetch_list_newsam", r.llen("to_fetch_list_newsam"))
# r.lpop("to_store_list_newsam", r.llen("to_store_list_newsam"))

In [23]:
# Draw up to 16000 random articles whose headline contains 'Russia' or whose
# content contains 'United States', returning (id, headline, content) rows.
sample_sql = "select id, headline, content from article where headline like '%Russia%' or content like '%United States%' order by random() limit 16000"
cur = db.execute(sample_sql)
db.commit()

# Materialise the result rows and show them as the cell output.
l = cur.fetchall()
l

[(248532,),
 (455430,),
 (562954,),
 (62019,),
 (106955,),
 (738854,),
 (420780,),
 (707437,),
 (706486,),
 (739145,),
 (720830,),
 (596836,),
 (743765,),
 (708606,),
 (435839,),
 (555861,),
 (29029,),
 (643463,),
 (565095,),
 (452709,),
 (733020,),
 (340542,),
 (641272,),
 (9447,),
 (336059,),
 (703460,),
 (88976,),
 (644156,),
 (86671,),
 (689021,),
 (503098,),
 (785186,),
 (225852,),
 (685851,),
 (744555,),
 (467107,),
 (306522,),
 (613341,),
 (376046,),
 (757195,),
 (448815,),
 (642364,),
 (55634,),
 (515754,),
 (264079,),
 (480205,),
 (66007,),
 (400326,),
 (614112,),
 (750580,),
 (772378,),
 (725232,),
 (73922,),
 (288280,),
 (93712,),
 (661958,),
 (318807,),
 (45560,),
 (425370,),
 (771048,),
 (471198,),
 (624093,),
 (569884,),
 (734363,),
 (723200,),
 (376699,),
 (741905,),
 (676948,),
 (504169,),
 (727530,),
 (147626,),
 (746770,),
 (693535,),
 (462279,),
 (429624,),
 (729556,),
 (533863,),
 (742608,),
 (550036,),
 (541871,),
 (633565,),
 (730209,),
 (750617,),
 (672022,),
 (5