In [1]:
from utils import utils
from crawler.crawler import Crawler
import random
import os

In [2]:
import os.path
import csv

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [3]:
def create_and_write_to_sheet(service, spreadsheet_id, sheet_name, csv_file):
    """Add a new sheet to a spreadsheet and fill it with the rows of a CSV file.

    Args:
        service: Sheets API client; must expose
            ``spreadsheets().batchUpdate(...)`` and
            ``spreadsheets().values().update(...)``.
        spreadsheet_id: ID of the spreadsheet that receives the new sheet.
        sheet_name: Title of the sheet to create; also used as the target
            range of the write.
        csv_file: Path to a UTF-8 encoded CSV file whose rows are uploaded.

    Returns:
        None.

    Raises:
        Whatever the two ``execute()`` calls raise (the notebook's caller
        catches ``HttpError``), and ``OSError`` if the CSV cannot be opened.
    """
    # Create the sheet first; the values update below targets it by title.
    add_sheet_body = {
        "requests": [{"addSheet": {"properties": {"title": sheet_name}}}]
    }
    service.spreadsheets().batchUpdate(
        spreadsheetId=spreadsheet_id, body=add_sheet_body
    ).execute()

    # Read the whole CSV up front and close the handle deterministically.
    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(csv_file, "r", encoding="utf-8", newline="") as f:
        values = list(csv.reader(f))

    # USER_ENTERED makes Sheets parse the cells as if they were typed in the UI.
    service.spreadsheets().values().update(
        spreadsheetId=spreadsheet_id,
        range=sheet_name,
        valueInputOption="USER_ENTERED",
        body={"values": values},
    ).execute()

In [4]:
# OAuth scope requested for the Sheets API (read/write access to spreadsheets).
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]

# Spreadsheet that receives one new sheet per crawled page.
SPREADSHEET_ID = "1ohhSCKTjbn2aZofQgrd2tlXmpW-CfRNUparEa28ZZ2Y"

# Reuse the cached user token from a previous run when there is one.
creds = None
if os.path.exists("token.json"):
    creds = Credentials.from_authorized_user_file("token.json", SCOPES)
# If there are no (valid) credentials available, refresh them or let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        # No token, or one that cannot be refreshed: run the interactive flow.
        flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the (refreshed or newly issued) credentials for the next run
    with open("token.json", "w") as token:
        token.write(creds.to_json())

try:
    service = build("sheets", "v4", credentials=creds)
except HttpError as err:
    # NOTE: `service` stays undefined when this branch is taken, so cells
    # that use it will fail with NameError.
    print(err)

In [5]:
# Collect the href of every answer page linked from the main listing page.
main_page_url = (
    "https://vietjack.me/lop-10-canh-dieu-tn/trac-nghiem-toan-lop-10-co-dap-an-canh-dieu"
)
hrefs = utils.export_all_href(main_page_url)

In [6]:
# Groq models to pick from; one is chosen at random for each page.
# Defined once here because the list does not change between iterations.
list_model = [
    "llama-3.1-70b-versatile",
    "llama3-8b-8192",
    "llama-3.1-8b-instant",
    "llama3-70b-8192",
]

for href in hrefs:
    model_name = random.choice(list_model)

    # Slug = everything after ".me/" minus the last five characters
    # (presumably the ".html" suffix -- confirm against the crawled hrefs).
    # It names both the local CSV and the sheet created in the spreadsheet.
    sheet_name = href.split(".me/")[1][:-5]
    path = f"crawl_result/cd/{sheet_name}.csv"

    # An existing CSV means the page was already crawled; skip it entirely
    # (no re-crawl and no upload).
    if os.path.exists(path):
        print(f"Existed! Skipping {path}.....")
        continue

    print(f"Using model {model_name} to crawl {path}")
    crawler = Crawler(url_list=[href], model_name=model_name, site="vietjack_me")
    crawler.crawl()
    crawler.save(path)

    # Upload is best-effort: an API error is printed and the loop continues.
    try:
        create_and_write_to_sheet(service, SPREADSHEET_ID, sheet_name, path)
    except HttpError as err:
        print(err)

Existed! Skipping crawl_result/cd/top-15-cau-trac-nghiem-menh-de-toan-hoc-canh-dieu-2023-co-dap-an-toan-123671.csv.....
Existed! Skipping crawl_result/cd/top-15-cau-trac-nghiem-tap-hop-cac-phep-toan-tren-tap-hop-canh-dieu-20-123672.csv.....
Existed! Skipping crawl_result/cd/top-15-cau-trac-nghiem-on-tap-chuong-1-canh-dieu-2023-co-dap-an-toan-1-123735.csv.....
Existed! Skipping crawl_result/cd/top-15-cau-trac-nghiem-bat-phuong-trinh-bac-nhat-hai-an-canh-dieu-2023-123736.csv.....
Existed! Skipping crawl_result/cd/top-15-cau-trac-nghiem-he-bat-phuong-trinh-bac-nhat-hai-an-canh-dieu-2-123737.csv.....
Existed! Skipping crawl_result/cd/top-30-cau-trac-nghiem-on-tap-chuong-2-canh-dieu-2023-co-dap-an-toan-1-123738.csv.....
Existed! Skipping crawl_result/cd/top-15-cau-trac-nghiem-ham-so-va-do-thi-canh-dieu-2023-co-dap-an-toan-123739.csv.....
Existed! Skipping crawl_result/cd/top-15-cau-trac-nghiem-ham-so-bac-hai-do-thi-ham-so-bac-hai-va-ung-dun-123740.csv.....
Existed! Skipping crawl_result/cd/