In [1]:
import requests
import csv

In [2]:

API_KEY = "AIzaSyDX_kJ-HB9cs0Z8KYDvzhENTDWw3oKTTdQ"
API_URL = "https://pagespeedonline.googleapis.com/pagespeedonline/v5/runPagespeed"
INPUT_FILE = "../../Warehouse/Links/links.csv"
OUTPUT_FILE = "website_metrics.csv"

In [3]:
def read_urls_from_csv(file):
    with open(file, "r", newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        urls = set()
        for row in reader:
            urls.add(row[0])
        return list(urls)

def evaluate_website(url):
    params = {"url": url, "key": API_KEY}
    response = requests.get(API_URL, params=params)
    response_json = response.json()

    if "lighthouseResult" not in response_json:
        return None

    overall_score = response_json["lighthouseResult"]["categories"]["performance"]["score"]

    metrics = {
        "url": url,
        "overall_score": format(overall_score, ".8f"),
        "first_contentful_paint": response_json["lighthouseResult"]["audits"]["first-contentful-paint"]["numericValue"],
        "speed_index": response_json["lighthouseResult"]["audits"]["speed-index"]["numericValue"],
        "largest_contentful_paint": response_json["lighthouseResult"]["audits"]["largest-contentful-paint"]["numericValue"],
        "interactive": response_json["lighthouseResult"]["audits"]["interactive"]["numericValue"],
    }
    return metrics

def save_results_to_csv(result, file):
    with open(file, "a", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["url", "overall_score", "first_contentful_paint", "speed_index", "largest_contentful_paint", "interactive"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if csvfile.tell() == 0:
            writer.writeheader()

        writer.writerow(result)

def save_failed_urls_to_csv(failed_urls, file):
    with open(file, "a", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["url", "error"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for failed_url in failed_urls:
            writer.writerow(failed_url)

In [4]:
urls = read_urls_from_csv(INPUT_FILE)
total_urls = len(urls)
failed_urls = []

for index, url in enumerate(urls):
    try:
        result = evaluate_website(url)
        if result:
            save_results_to_csv(result, OUTPUT_FILE)
            print(f"Progress: {index + 1}/{total_urls} - URL: {url} - Status: Success")
        else:
            error_message = "'lighthouseResult' not found"
            failed_urls.append({"url": url, "error": error_message})
            print(f"Progress: {index + 1}/{total_urls} - URL: {url} - Status: Failed - Error: {error_message}")
    except Exception as e:
        error_message = str(e)
        failed_urls.append({"url": url, "error": error_message})
        print(f"Progress: {index + 1}/{total_urls} - URL: {url} - Status: Failed - Error: {error_message}")
        save_failed_urls_to_csv(failed_urls, "failed_urls.csv")

Progress: 1/411 - URL: https://fampay.in/blog/7-cool-insta-dads-of-bollywood/ - Status: Success
Progress: 2/411 - URL: https://fampay.in/blog/pocket-money-for-teenagers-why-it-matters/ - Status: Failed - Error: unsupported format string passed to NoneType.__format__
Progress: 3/411 - URL: https://wa.me/?text=GenZs%20commit%20to%20fight%20Covid-19%20%F0%9F%8E%AF%20https://fampay.in/blog/genz-commits-to-fight-covid-19/ - Status: Success
Progress: 4/411 - URL: https://fampay.in/blog/tag/teens/page/4/ - Status: Success
Progress: 5/411 - URL: https://fampay.in/blog/from-a-self-taught-programmer-since-age-9-to-becoming-a-ceo-at-15/ - Status: Success
Progress: 6/411 - URL: https://fampay.in/blog/tag/parents/ - Status: Success
Progress: 7/411 - URL: https://fampay.in/blog/what-are-the-perks-of-using-fampay/ - Status: Failed - Error: 'lighthouseResult' not found
Progress: 8/411 - URL: https://twitter.com/intent/tweet?text=Quick%20Guide%20to%20be%20a%20Teenfluencer%20%F0%9F%91%B8%F0%9F%A4%B4&url