## Import libraries

In [1]:
import requests
import json
import time
import csv

## Crawl data from the website
After analyzing the website, we identified the underlying API that serves the high school graduation exam scores. The cell below queries that API for every candidate registration number and saves the responses.

In [None]:
def generate_registration_numbers(area_codes=range(1, 65), candidates_per_area=10000):
    """Yield candidate registration numbers to probe, one string at a time.

    Each number is 8 characters long: a two-digit, zero-padded area code
    followed by a six-digit, zero-padded candidate index (for indices below
    100000 this is the 'XX0YYYYY' shape: area code, a '0', five digits).

    Args:
        area_codes: Iterable of integer area codes. Defaults to 1..64.
        candidates_per_area: How many candidate indices to generate per area
            (indices 0 .. candidates_per_area - 1). The default of 10,000
            reflects the author's observation that a province has fewer than
            about 10,000 candidates; raise it if that does not hold.

    Yields:
        str: The next registration number, e.g. '01000000'.
    """
    for area_code in area_codes:
        for candidate_index in range(candidates_per_area):
            # A single 6-digit field keeps the number 8 characters long even
            # when candidate_index needs six digits.
            yield f'{area_code:02}{candidate_index:06}'

def fetch_data(code, year=2024, timeout=10):
    """Look up the exam-score record for one registration number.

    Sends a GET request to the score-search endpoint with the registration
    number as `code` and the exam year as `nam`.

    Args:
        code: Registration number to look up.
        year: Exam year, sent as the `nam` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        None on any other status code, on a network error or timeout, or
        when the body is not valid JSON.
    """
    url = 'https://hoctap.coccoc.com/composer/proxyapi2/graduation_grades/score_search'
    try:
        # Let requests build and escape the query string.
        response = requests.get(url, params={'code': code, 'nam': year}, timeout=timeout)
    except requests.RequestException as error:
        # One failed request should not abort a crawl of many thousands.
        print(f'Request for code {code} failed: {error}')
        return None

    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated as "no data".
        return None


# Crawl every generated registration number and collect the responses.
results = []
count = 0

try:
    for code in generate_registration_numbers():
        data = fetch_data(code)
        if data:
            results.append(data)
            print(f'Fetched data for code {code}: {data}')
        count += 1
        if count % 100 == 0:  # Progress report every 100 requests
            print(f'{count} requests sent.')
        # Pause briefly to avoid sending too many requests in a short period of time
        time.sleep(0.1)
finally:
    # Save whatever has been collected, even if the crawl is interrupted or
    # fails part-way, so hours of fetched data are not lost.
    with open('graduation_grades_data.json', 'w', encoding='utf-8') as f:
        json.dump(results, f)

print("Finished fetching data.")

## Convert JSON to CSV format

In [None]:
# Load the raw API responses saved by the crawl step
with open("graduation_grades_data.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Flatten the responses: each one holds its student records under "proxyapi2".
# A response without that key (or with an empty/null value) contributes no rows
# instead of raising KeyError.
rows = []
for item in json_data:
    rows.extend(item.get("proxyapi2") or [])

# Collect column names from ALL rows, in first-seen order. Using only the first
# row would make DictWriter raise ValueError on any later row that has an
# extra key.
fieldnames = list(dict.fromkeys(key for row in rows for key in row))

# Write to CSV; keys missing from a row are written as empty cells
with open("students_scores.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print("Data has been written to students_scores.csv")

## Restructure the data to keep only the necessary columns

In [2]:
# Columns kept in the final dataset, in output order
fieldnames = [
    "CityName", "Code", "DiaLi", "GDCD", "HoaHoc", "LichSu",
    "NgoaiNgu", "NguVan", "SinhHoc", "Toan", "VatLi"
]

# Read the full CSV and project every record onto the selected columns;
# a column absent from the input falls back to an empty string
with open("students_scores.csv", "r", encoding="utf-8") as infile:
    formatted_rows = [
        {column: record.get(column, "") for column in fieldnames}
        for record in csv.DictReader(infile)
    ]

# Write the trimmed records, header first, to the output CSV
with open("formatted_students_scores.csv", mode="w", newline="", encoding="utf-8") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(formatted_rows)

print("Data has been written to formatted_students_scores.csv")


Data has been written to formatted_students_scores.csv
