<h2 style="color:blue">1. Configurations</h2>

In [None]:
# Path to the Excel workbook that lists the page URLs to check.
# The script reads the "URL", "OG Image URL", "Accessibility" and "Remarks"
# columns and writes its results into columns B-D of the active sheet.
input_path = "input_urls.xlsx"

# Folder where the downloaded OG images are stored; it is created if it
# does not already exist.
folder_name = "og images"

<h2 style="color:blue">2. Import the libraries required by the script (only needs to be run once)</h2>

In [None]:
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from tqdm import tqdm
from urllib.parse import urlparse

<h2 style="color:blue">3. Run the script below to:</h2>
    <ol>
    <li>Extract the URL of OG image for each URL.</li>
    <li>Check whether each OG image is accessible.</li></ol>

In [None]:
def check_url_accessibility(url, timeout=10):
    """Check whether `url` answers an HTTP GET request with status 200.

    Args:
        url: Address to request. A missing value (None, NaN or a blank
            string) is reported as broken without sending a request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A `(status, remark)` tuple: `("Accessible", None)` for a 200
        response, otherwise `("Broken", reason)` where `reason` is the
        HTTP status code or the request error message.
    """
    # Nothing to request: report it plainly instead of letting requests
    # fail with an "Invalid URL" message.
    if pd.isna(url) or not str(url).strip():
        return "Broken", "No URL provided"

    try:
        # stream=True: only the status code is needed, so the body is not
        # downloaded; the `with` block releases the connection afterwards.
        with requests.get(url, timeout=timeout, stream=True) as response:
            status_code = response.status_code
    except requests.RequestException as e:
        return "Broken", str(e)

    if status_code == 200:
        return "Accessible", None
    return "Broken", f"HTTP Status Code: {status_code}"

def extract_og_image_url(url, timeout=10):
    """Fetch the page at `url` and return the URL of its OG image.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A `(og_image_url, error)` tuple. On success `error` is None. When
        the page cannot be fetched, or it has no `og:image` meta tag with a
        non-blank `content`, `og_image_url` is None and `error` is the reason.
    """
    # Function-scope import: the file itself only imports `urlparse`.
    from urllib.parse import urljoin

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        return None, str(e)

    soup = BeautifulSoup(response.content, 'html.parser')
    og_image = soup.find('meta', property='og:image')
    content = og_image.get('content') if og_image else None
    content = content.strip() if content else ""
    if not content:
        return None, "No OG image meta tag found"

    # Pages sometimes declare the image as "/img/x.png" or "//cdn.host/x.png".
    # Resolve those against the final page URL (after redirects) so the
    # result can be requested directly. Absolute URLs are left untouched.
    if not urlparse(content).scheme:
        content = urljoin(response.url, content)
    return content, None

def update_excel_file(file_path, data):
    """Write the result columns of `data` into the workbook at `file_path`.

    Each row of `data` is written to the active sheet at Excel row
    `index label + 2` (row 1 is the header and the index starts at 0):
    'OG Image URL' goes to column B, 'Accessibility' to column C and
    'Remarks' to column D. The workbook is saved in place.

    Args:
        file_path: Path of the workbook to load and overwrite.
        data: DataFrame slice with the three result columns; its index
            labels must be the 0-based row positions of the sheet's data.
    """
    if data.empty:
        # Nothing to write; also avoids indexing an empty index below.
        return

    book = load_workbook(file_path)
    sheet = book.active

    result_columns = ('OG Image URL', 'Accessibility', 'Remarks')
    for idx, row in data.iterrows():
        excel_row = idx + 2
        for column, field in enumerate(result_columns, start=2):
            value = row[field]
            # Missing values are passed as None, which leaves the cell as it
            # is instead of storing a float NaN in the sheet.
            sheet.cell(row=excel_row, column=column,
                       value=None if pd.isna(value) else value)

    book.save(file_path)
    print(f"Excel file updated for rows {data.index[0]+2} to {data.index[-1]+2}")

def main(file_path):
    """Fill in OG image URL, accessibility and remarks for every listed URL.

    Reads the workbook at `file_path`, processes the rows in batches of 10
    and saves the workbook after each batch so that finished work is kept
    if the run is interrupted. Rows whose three result columns are already
    filled are skipped, which lets an interrupted run be resumed.

    Args:
        file_path: Path of the Excel workbook with the 'URL',
            'OG Image URL', 'Accessibility' and 'Remarks' columns.
    """
    try:
        df = pd.read_excel(file_path, engine='openpyxl')
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return

    # Empty result columns are read as float64 (all NaN). Cast them to
    # object so that storing text in them does not rely on pandas silently
    # changing the column dtype.
    result_columns = ['OG Image URL', 'Accessibility', 'Remarks']
    df = df.astype({col: object for col in result_columns if col in df.columns})

    total_urls = len(df)
    batch_size = 10

    for i in range(0, total_urls, batch_size):
        batch = df.iloc[i:i+batch_size]

        for j, row in batch.iterrows():
            if pd.notna(row['OG Image URL']) and pd.notna(row['Accessibility']) and pd.notna(row['Remarks']):
                continue

            # Collect the remarks of both steps so that the accessibility
            # check does not overwrite the reason the extraction failed.
            remarks = []

            og_image_url = row['OG Image URL']
            if pd.isna(og_image_url):
                found_url, error = extract_og_image_url(row['URL'])
                og_image_url = found_url if found_url else ""
                df.at[j, 'OG Image URL'] = og_image_url
                if error:
                    remarks.append(error)

            if pd.isna(row['Accessibility']):
                if og_image_url:
                    accessibility, error = check_url_accessibility(og_image_url)
                    if error:
                        remarks.append(error)
                else:
                    # No image URL was found, so there is nothing to request;
                    # the extraction remark already explains why.
                    accessibility = "Broken"
                df.at[j, 'Accessibility'] = accessibility

            df.at[j, 'Remarks'] = "; ".join(remarks)

        update_excel_file(file_path, df.iloc[i:i+batch_size])

        # The last batch may be shorter than batch_size.
        processed = min(i + batch_size, total_urls)
        print(f"Progress: {processed}/{total_urls} URLs processed")

    print(f"Script completed. Data saved to {file_path}")

if __name__ == "__main__":
    main(input_path)

<h2 style="color:blue">4. Run the script below to:</h2>
    <ol>
    <li>Download all the valid OG images for each URL in the specified folder</li>
    <li>Name each image file after the last path segment of its OG image URL.</li></ol>

In [None]:
def download_image(url, folder_name, timeout=10):
    """Download the image at `url` into `folder_name`.

    The file is named after the last path segment of the URL, so
    "https://host/a/b/pic.png" is stored as "pic.png". An existing file
    with the same name is overwritten.

    Args:
        url: Address of the image to download.
        folder_name: Existing folder in which the file is created.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the image was saved, False if the server answered with a
        status code other than 200.

    Raises:
        ValueError: If the URL path has no final segment to use as the
            file name (for example when it ends with "/").
        requests.RequestException: If the request itself fails.
        OSError: If the file cannot be written.
    """
    # Work out the file name first: without one there is nowhere to save
    # the image, so there is no point in requesting it.
    image_name = os.path.basename(urlparse(url).path)
    if not image_name:
        raise ValueError(f"Cannot derive a file name from URL: {url}")
    image_path = os.path.join(folder_name, image_name)

    # The `with` block releases the streamed connection on every path,
    # including the non-200 early return.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            return False
        with open(image_path, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
    return True

def main(file_path):
    """Download every accessible OG image listed in the workbook.

    Reads the workbook at `file_path`, keeps the rows whose 'Accessibility'
    is 'Accessible' and whose 'OG Image URL' is filled, and saves each image
    into the folder named by the module-level `folder_name` (created when
    missing). A failed download is reported and does not stop the run.

    Args:
        file_path: Path of the Excel workbook produced by the checking step.
    """
    try:
        df = pd.read_excel(file_path, engine='openpyxl')
    except Exception as err:
        print(f"Error reading Excel file: {err}")
        return

    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # Select the rows that have an image worth downloading.
    is_accessible = df['Accessibility'] == 'Accessible'
    has_image_url = pd.notna(df['OG Image URL'])
    accessible_df = df[is_accessible & has_image_url]

    total_images = len(accessible_df)
    saved = 0

    progress = tqdm(accessible_df.iterrows(), total=total_images, desc="Downloading images")
    for _, record in progress:
        image_url = record['OG Image URL']

        try:
            succeeded = download_image(image_url, folder_name)
        except Exception as err:
            # Best effort: report the failure and carry on with the next image.
            print(f"Error downloading image from {image_url}: {err}")
            continue

        if succeeded:
            saved += 1

    print(f"\nScript completed successfully. {saved}/{total_images} images downloaded.")
    print(f"Images are saved in the folder: {os.path.abspath(folder_name)}")

if __name__ == "__main__":
    main(input_path)