# Telling scraper to scrape articles

In [8]:
import numpy as np
import pandas as pd
import requests
import re
import sqlite3
import time
import openai
import redis
import datetime
import json

In [9]:
# Redis client used below as the work queue between this notebook and the
# scraper. decode_responses=True makes replies come back as str, not bytes.
r = redis.Redis(host="localhost", port=5211, decode_responses=True)

In [15]:
# SQLite database file holding the `article` table (path is relative to the
# kernel's working directory).
DB_PATH = "scraping.db"
db = sqlite3.connect(DB_PATH)
def create_article_table(db):
    """Recreate the ``article`` table from scratch.

    WARNING: destructive. An existing ``article`` table is dropped together
    with all of its rows before the empty table is created again.

    Args:
        db: connection object exposing ``execute`` and ``commit`` (the
            notebook passes its ``sqlite3`` connection).

    The table has the columns ``id`` (primary key), ``headline``, ``time``,
    ``content``, ``feedback`` and ``categories``. Each of the two statements
    is committed right after it runs.
    """
    create_sql = """
    create table if not exists article (id varchar(48) primary key, headline varchar(255), time timestamp, content text, feedback text, categories varchar(255));
    """
    for statement in ("drop table if exists article", create_sql):
        db.execute(statement)
        db.commit()

In [16]:
# create_article_table(db)

In [19]:
# Push one "YYYY/MM/DD" string onto the Redis list "to_fetch_list_tertam" for
# every calendar day from 2010-03-02 up to the current moment.
initial_date = datetime.datetime(2010, 3, 2)
one_day = datetime.timedelta(days=1)

on_date = initial_date
while datetime.datetime.now() >= on_date:
    # "\r" keeps the progress (current year) on a single output line.
    print(on_date.year, end="\r")
    r.lpush("to_fetch_list_tertam", f"{on_date:%Y/%m/%d}")
    on_date = on_date + one_day

2023

In [20]:
# Hold the notebook here until the "to_fetch_list_tertam" list is empty, i.e.
# every article list has been scraped; the length is polled once per second.
while r.llen("to_fetch_list_tertam") > 0:
    time.sleep(1)

In [21]:
# Current lengths of the two Redis lists: (dates still to fetch, scraped
# entries waiting to be stored).
tuple(r.llen(name) for name in ("to_fetch_list_tertam", "to_store_list_tertam"))

(0, 84475)

## Storing the scraped list of articles to scrape

In [22]:
# Drain the Redis list "to_store_list_tertam" into the `article` table.
#
# Every entry is parsed as a JSON object; the keys read are "id", "headline",
# "date" and "categories". Entries with a missing or empty headline/id are
# reported and skipped. Ids that violate the table's constraints are reported
# as "not unique" and skipped as well.
#
# rpop removes an entry from Redis for good, so rows must not sit in an open
# SQLite transaction indefinitely: inserts are committed in batches and once
# more when the loop ends, including on KeyboardInterrupt or an unexpected
# error (the `finally` clause).
COMMIT_EVERY = 1000  # rows per transaction; also the progress-report interval

n = 0  # entries that passed validation so far (duplicates included)
try:
    while True:
        raw = r.rpop("to_store_list_tertam")
        if raw is None:
            print(f"{n} Nothing to store, breaking", end="\r")
            break

        data = json.loads(raw)

        # .get() so that an entry lacking the key (or carrying null) is
        # reported like an empty value instead of raising.
        if not data.get("headline"):
            print("ERROR, NO HEADLINE")
            continue
        if not data.get("id"):
            print("ERROR, NO ID")
            continue

        # Report/commit only for validated entries, so skipped entries do not
        # repeat the same progress line.
        if n % COMMIT_EVERY == 0:
            db.commit()
            print(datetime.datetime.now(), " -- ", n)
        n += 1

        try:
            db.execute(
                "insert into article (id, headline, time, categories) values (?, ?, ?, ?)",
                (data["id"], data["headline"], data["date"], data["categories"]),
            )
        except sqlite3.IntegrityError:
            # Only the failed statement is discarded; earlier inserts of the
            # open transaction are still pending and get committed later.
            print(data["id"], " not unique", end="\r")
finally:
    # Persist whatever was inserted since the last batch commit.
    db.commit()

2023-11-25 14:30:10.760052  --  0
2023-11-25 14:30:11.678161  --  1000
2023-11-25 14:30:13.058592  --  2000
2023-11-25 14:30:14.042541  --  3000
2023-11-25 14:30:15.192013  --  4000
2023-11-25 14:30:16.657639  --  5000
2023-11-25 14:30:18.949861  --  6000
2023-11-25 14:30:20.968119  --  7000
2023-11-25 14:30:23.250148  --  8000
2023-11-25 14:30:25.323891  --  9000  not unique
2023-11-25 14:30:27.335830  --  10000quenique
2023-11-25 14:30:29.251456  --  11000
2023-11-25 14:30:31.664413  --  12000unique
2023-11-25 14:30:33.909060  --  13000
2023-11-25 14:30:35.900904  --  14000
2023-11-25 14:30:37.308659  --  15000unique
2023-11-25 14:30:38.470476  --  16000
2023-11-25 14:30:39.613865  --  17000nique
2023-11-25 14:30:40.758572  --  18000queueuniqueue
2023-11-25 14:30:41.748362  --  19000ot uniqueunique
2023-11-25 14:30:42.900155  --  2000094  not uniquee
2023-11-25 14:30:43.920037  --  210005910  not unique
2023-11-25 14:30:45.537773  --  22000uniqueique
2023-11-25 14:30:46.781273  --  2

## Telling the scraper to scrape article content now

In [31]:
# Keep the Redis list "to_fetch_tertam" topped up with ids of articles whose
# content is still NULL: whenever fewer than 2048 ids are queued, up to 1024
# randomly chosen ids are pushed; otherwise wait a second and report the
# queue length. The loop ends once no article with NULL content is left.
#
# NOTE(review): an article whose content never gets filled in keeps matching
# the query, so it is pushed again on every refill and the loop cannot reach
# its break — it then has to be interrupted by hand.
while True:
    if r.llen("to_fetch_tertam") >= 2048:
        time.sleep(1)
        print(f"Waiting for it to fetch ({r.llen('to_fetch_tertam')}), idling...", end="\r")
        continue

    cur = db.execute("select id, content from article where content is null order by RANDOM() limit 1024")
    db.commit()

    missing_rows = cur.fetchall()
    if not missing_rows:
        print("No more articles to fetch")
        break

    for article_id, _content in missing_rows:
        r.lpush("to_fetch_tertam", article_id)

Waiting for it to fetch (2123), idling...

KeyboardInterrupt: 

In [35]:
# Number of articles that still have no content stored, shown as the cell
# output in the form [(count,)].
cur = db.execute("select count(*) from article where content is null")
db.commit()

cur.fetchall()

[(193,)]

In [None]:
# r.lpop("to_fetch_list_tertam", r.llen("to_fetch_list_tertam"))
# r.lpop("to_store_list_tertam", r.llen("to_store_list_tertam"))