In [1]:
from utils import utils
from crawler.crawler import Crawler
import random
import os

In [2]:
import os.path
import csv

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [3]:
def create_and_write_to_sheet(service, spreadsheet_id, sheet_name, csv_file):
    """Add a new sheet to a spreadsheet and fill it with the rows of a CSV file.

    Args:
        service: Sheets API client exposing ``spreadsheets()`` (in this
            notebook it is the object returned by
            ``build("sheets", "v4", ...)``).
        spreadsheet_id: ID of the target spreadsheet.
        sheet_name: Title of the sheet to create; also used as the range the
            values are written to.
        csv_file: Path of a UTF-8 encoded CSV file whose rows are uploaded.

    Returns:
        None.

    Raises:
        Whatever the client's ``execute()`` raises is propagated unchanged
        (the caller in this notebook catches ``HttpError``). ``OSError`` is
        raised if ``csv_file`` cannot be opened.
    """
    # Step 1: create the sheet. The request fails if the API rejects it, in
    # which case nothing is read or written below.
    add_sheet_body = {
        "requests": [{"addSheet": {"properties": {"title": sheet_name}}}]
    }
    service.spreadsheets().batchUpdate(
        spreadsheetId=spreadsheet_id, body=add_sheet_body
    ).execute()

    # Step 2: load every CSV row. The context manager guarantees the handle is
    # closed, and newline="" lets the csv module handle line endings inside
    # quoted fields itself, as its documentation requires.
    with open(csv_file, "r", encoding="utf-8", newline="") as f:
        values = list(csv.reader(f))

    # Step 3: write all rows in one call, using the sheet title as the range.
    service.spreadsheets().values().update(
        spreadsheetId=spreadsheet_id,
        range=sheet_name,
        valueInputOption="USER_ENTERED",
        body={"values": values},
    ).execute()

In [4]:
# OAuth scope requested for the Sheets API calls made by this notebook.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]

# Spreadsheet that receives one new sheet per crawled page.
SPREADSHEET_ID = "1xnwcV6J6NSokWHDzxw1MEFAHS39j-v-U_4uKa6OixgY"

# Reuse cached user credentials from a previous run when they exist.
creds = None
if os.path.exists("token.json"):
    creds = Credentials.from_authorized_user_file("token.json", SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        # Expired but refreshable: renew without user interaction.
        creds.refresh(Request())
    else:
        # The else belongs to the refresh check, not to the validity check:
        # the interactive flow must run only when there is nothing usable to
        # refresh, and never when the cached credentials are already valid.
        flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the (refreshed or newly obtained) credentials for the next run
    with open("token.json", "w") as token:
        token.write(creds.to_json())

try:
    service = build("sheets", "v4", credentials=creds)
except HttpError as err:
    # NOTE(review): if this branch is taken, `service` stays undefined and the
    # cells that use it will fail with NameError.
    print(err)

In [5]:
# Collect the href of every answer page linked from the chapter index page.
index_page_url = "https://www.vietjack.com/bai-tap-trac-nghiem-giai-tich-12/"
index_site_key = "vietjack_com"
hrefs = utils.export_all_href(index_page_url, index_site_key)

In [6]:
# Groq's models to use. Defined once, outside the loop, because the list never
# changes between iterations.
list_model = [
    "llama3-8b-8192",
    "llama-3.1-8b-instant",
    "llama3-70b-8192",
]

for href in hrefs:
    # A model is drawn at random for every href, including ones skipped below.
    model_name = random.choice(list_model)

    # Page name = last URL segment without its 4-character extension (".jsp"
    # in the URLs this notebook logs). It names both the CSV file and the
    # sheet, so compute it a single time.
    page_name = href.split("/")[-1][:-4]
    path = f"crawl_result/giaitich12/{page_name}.csv"

    # An existing CSV means this page was already crawled: skip both the crawl
    # and the upload.
    if os.path.exists(path):
        print(f"Existed! Skipping {path}.....")
        continue

    print(f"Using model {model_name} to crawl {path}")
    crawler = Crawler(url_list=[href], model_name=model_name, site="vietjack_com")
    crawler.crawl()
    crawler.save(path)

    # Upload is best-effort: an API error is printed and the loop moves on to
    # the next page.
    try:
        create_and_write_to_sheet(service, SPREADSHEET_ID, page_name, path)
    except HttpError as err:
        print(err)

Existed! Skipping crawl_result/giaitich12/bai-tap-trac-nghiem-su-dong-bien-nghich-bien-cua-ham-so-phan-1.csv.....
Using model llama3-70b-8192 to crawl crawl_result/giaitich12/bai-tap-trac-nghiem-su-dong-bien-nghich-bien-cua-ham-so-phan-2.csv
Getting all questions from the given urls...
Getting questions from https://vietjack.com/../bai-tap-trac-nghiem-giai-tich-12/bai-tap-trac-nghiem-su-dong-bien-nghich-bien-cua-ham-so-phan-2.jsp...
Error fetching question: list index out of range
--------------ERROR-------------------
Question tag: <p style="color:green;"><b>Câu 12:</b> Hỏi hàm số </p>
Choices tags: []
---------------------------------


Error fetching question: list index out of range
--------------ERROR-------------------
Question tag: <p style="color:green;"><b>Câu 18:</b> Cho hàm số y = x<sup>3</sup> - x<sup>2</sup> + (m-1)x + m. Tìm điều kiện của tham số m để hàm số đồng biến trên R</p>
Choices tags: [<p><img alt="Bài tập trắc nghiệm Giải tích 12 | Câu hỏi trắc nghiệm Giải tích 1