In [1]:
import pandas as pd
import requests
import os
from tqdm.notebook import tqdm
import time

# Read the CSV that lists bibliography entries with missing PDFs
df = pd.read_csv("missing_pdfs.csv")

# Keep rows whose URL cell is non-null, then preview the first ten of them
has_url = df["URL"].notna()
valid_urls_df = df.loc[has_url].iloc[:10]
valid_urls_df

Unnamed: 0,Bib File,Bib Key,Title,DOI,Publisher,URL
0,conf-adcs-2002.bib,conf-adcs-2002-quigley,Liquid Mir{\`{o}}: An Application Framework fo...,,,http://www.cie.ict.csiro.au/adcs2002/papers/qu...
1,conf-adcs-2002.bib,conf-adcs-2002-upstill,Buying bestsellers online: {A} case study in S...,,,http://www.cie.ict.csiro.au/adcs2002/papers/up...
2,conf-adcs-2002.bib,conf-adcs-2002-davis,Workflow Based Just-in-time Training,,,http://www.cie.ict.csiro.au/adcs2002/papers/da...
3,conf-adcs-2002.bib,conf-adcs-2002-eklund,Visual Displays for Browsing {RDF} Documents,,,http://www.cie.ict.csiro.au/adcs2002/papers/ek...
4,conf-adcs-2002.bib,conf-adcs-2002-schwitter,How to Write a Document in Controlled Natural ...,,,http://www.cie.ict.csiro.au/adcs2002/papers/sc...
5,conf-adcs-2002.bib,conf-adcs-2002-boparai,Supporting user task based conversations via e...,,,http://www.cie.ict.csiro.au/adcs2002/papers/bo...
6,conf-adcs-2002.bib,conf-adcs-2002-heyer,Tibianna: {A} Learning-Based Search Engine wit...,,,http://www.cie.ict.csiro.au/adcs2002/papers/he...
7,conf-adcs-2002.bib,conf-adcs-2002-calvo,Automatic Categorization of Announcements on t...,,,http://www.cie.ict.csiro.au/adcs2002/papers/ca...
8,conf-adcs-2002.bib,conf-adcs-2002-heyer-2,MyNewsWave: User-centered Web search and news ...,,,http://www.cie.ict.csiro.au/adcs2002/papers/he...
9,conf-adcs-2002.bib,conf-adcs-2002-barta,Managing Literature References with Topic Maps,,,http://www.cie.ict.csiro.au/adcs2002/papers/ba...


### Combine the processed batches into a single CSV

In [6]:
import glob

import re


def _batch_order(path):
    """Sort key: numeric batch index first, then the path itself as tie-breaker.

    Paths without a ``batch_<number>`` part sort after all numbered ones.
    """
    found = re.search(r"batch_(\d+)", path)
    return (0, int(found.group(1)), path) if found else (1, 0, path)


# Combine all enriched CSVs.
# Order by batch number rather than lexicographically, so that batch_2 comes
# before batch_10 when the numbers are not zero-padded.
files = sorted(glob.glob("Processed-files/batch_*.csv"), key=_batch_order)

# pd.concat raises an opaque "No objects to concatenate" for an empty list;
# fail early with a message that names the missing inputs instead.
if not files:
    raise FileNotFoundError("No batch files match 'Processed-files/batch_*.csv'")

dfs = [pd.read_csv(f) for f in files]
final_df = pd.concat(dfs, ignore_index=True)

# Save the final enriched dataset
final_df.to_csv("combined_updated_missing_pdfs.csv", index=False)
print("✅ All batches merged into 'combined_updated_missing_pdfs.csv'")


✅ All batches merged into 'combined_updated_missing_pdfs.csv'


## Download the PDFs

In [None]:
import requests
import os
import time
import random
import pandas as pd


csv_path = "missing_pdfs.csv"
df = pd.read_csv(csv_path)

# Keep only rows whose URL starts with "http"; limit to 5 rows for a trial run.
subset_df = df[df["URL"].str.startswith("http", na=False)].head(5)

output_folder = "downloaded_pdfs_subset"
os.makedirs(output_folder, exist_ok=True)

# Setup User-Agent rotation: one entry is picked at random per request.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)"
]

# === Download the PDFs ===
# A file is written only when the server answers 200 with a PDF content type.
for _, row in subset_df.iterrows():
    url = row["URL"]
    bib_key = row["Bib Key"]
    filename = os.path.join(output_folder, f"{bib_key}.pdf")

    try:
        print(f"Attempting: {url}")
        headers = {"User-Agent": random.choice(user_agents)}
        response = requests.get(url, headers=headers, timeout=15)

        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and "application/pdf" in content_type:
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"✅ Downloaded: {filename}")
        else:
            print(f"❌ Skipped (not a PDF or bad response): {url}")

    except Exception as e:
        # Best effort: report the problem and move on to the next entry.
        print(f"⚠️ Error downloading {url}: {e}")

    # Pause after every attempt, including ones that raised, so that a run of
    # errors (timeouts, refused connections) does not turn into rapid-fire requests.
    time.sleep(random.uniform(3, 7))  # Human-like wait


In [1]:
import pandas as pd

# Read the table of entries that lack PDFs
df = pd.read_csv("missing_pdfs.csv")

# Publisher strings that select the rows of interest (exact matches only)
target_publishers = ["{ACM}", "{ACM} / {IW3C2}", "{ACM} Press"]

# Boolean mask -> rows whose Publisher equals one of the targets
is_target_publisher = df["Publisher"].isin(target_publishers)
filtered_df = df.loc[is_target_publisher]

# Write the selection to its own CSV, without the index column
output_path = "acm_filtered_pdfs.csv"
filtered_df.to_csv(output_path, index=False)

print(f"Filtered data saved to: {output_path}")

Filtered data saved to: acm_filtered_pdfs.csv


## ACM PDF download experiments

In [None]:
import os
import time
import random
import pandas as pd
import requests
import browser_cookie3
from urllib.parse import quote
from tqdm.notebook import tqdm

# --- Settings: input spreadsheet and destination directory ---
CSV_PATH = "missing_pdfs.csv"
OUTPUT_FOLDER = "acm_pdf_test_subset"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# --- Browser cookies for dl.acm.org (you must be logged in to ACM in that browser) ---
cookies = browser_cookie3.load(domain_name='dl.acm.org')

# --- Read the spreadsheet and keep rows whose URL contains "10." ---
df = pd.read_csv(CSV_PATH)
url_has_doi_prefix = df["URL"].str.contains("10\\.", na=False)
doi_df = df.loc[url_has_doi_prefix].copy()
subset_df = doi_df.iloc[:5]  # trial run: first five rows only

# === Convert DOI URL to direct PDF URL ===
def doi_to_pdf_url(url):
    """Build the ACM ``showPdf`` URL for the DOI embedded in ``url``.

    Args:
        url: Text expected to contain a DOI of the form
            ``10.<4-9 digits>/<suffix>``, e.g.
            ``https://doi.org/10.1145/2537734.2537741``.

    Returns:
        ``https://dl.acm.org/action/showPdf?doi=<DOI>`` with the DOI
        percent-encoded (``/`` is kept), or ``None`` when ``url`` is not a
        string or contains no DOI.
    """
    import re
    from urllib.parse import unquote

    # Non-string values (None, or NaN from an empty cell) carry no DOI;
    # return None instead of letting re.search raise TypeError.
    if not isinstance(url, str):
        return None

    # Decode existing percent-escapes first so that quote() below does not
    # double-encode them (e.g. "%3C" becoming "%253C").
    match = re.search(r"(10\.\d{4,9}/[^\s]+)", unquote(url))
    if match is None:
        return None
    doi = match.group(1)
    return f"https://dl.acm.org/action/showPdf?doi={quote(doi)}"

# === User-Agent list ===
# One entry is chosen at random for every request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)"
]

# === Download loop ===
# Each failure is recorded as (bib key, URL tried, reason).
failed = []

for _, row in tqdm(subset_df.iterrows(), total=len(subset_df), desc="Testing ACM PDF Download"):
    bib_key = row["Bib Key"]
    original_url = row["URL"]
    pdf_url = doi_to_pdf_url(original_url)

    if not pdf_url:
        # No request is made, so no delay is needed before the next row.
        print(f"❌ Skipping invalid DOI: {original_url}")
        failed.append((bib_key, original_url, "Invalid DOI"))
        continue

    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        response = requests.get(pdf_url, headers=headers, cookies=cookies, timeout=20)

        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and "application/pdf" in content_type:
            filename = os.path.join(OUTPUT_FOLDER, f"{bib_key}.pdf")
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"✅ Downloaded: {filename}")
        else:
            print(f"⚠️ Failed to download: {pdf_url} → {content_type}")
            failed.append((bib_key, pdf_url, f"{response.status_code}, {content_type}"))

    except Exception as e:
        # Best effort: log, record, and continue with the next entry.
        print(f"❌ Error for {pdf_url}: {e}")
        failed.append((bib_key, pdf_url, str(e)))

    # Pause after every attempted request, including ones that raised, so a
    # run of errors does not turn into rapid-fire requests.
    time.sleep(random.uniform(3, 6))  # Human-like delay

# === Save failures (optional) ===
if failed:
    pd.DataFrame(failed, columns=["Bib Key", "URL", "Reason"]).to_csv("acm_failed_test_subset.csv", index=False)
    print("\n⚠️ Some downloads failed. Check acm_failed_test_subset.csv")
else:
    print("\n🎉 All test downloads successful!")


In [None]:
import os
import time
import random
import pandas as pd
import requests
import browser_cookie3
from urllib.parse import quote
from tqdm.notebook import tqdm

# --- Settings: ACM-only spreadsheet in, PDFs out ---
CSV_PATH = "acm_filtered_pdfs.csv"
OUTPUT_FOLDER = "acm_pdf_test_subset"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# --- Browser cookies for dl.acm.org (you must be logged in to ACM in that browser) ---
cookies = browser_cookie3.load(domain_name='dl.acm.org')

# --- Read the spreadsheet and keep rows whose URL contains "10." ---
df = pd.read_csv(CSV_PATH)
looks_like_doi_url = df["URL"].str.contains("10\\.", na=False)
doi_df = df.loc[looks_like_doi_url].copy()
subset_df = doi_df.iloc[:5]  # trial run: first five rows only

# === Convert DOI URL to direct PDF URL ===
def doi_to_pdf_url(url):
    """Build the ACM ``/doi/pdf/`` URL for the DOI embedded in ``url``.

    Args:
        url: Text expected to contain a DOI of the form
            ``10.<4-9 digits>/<suffix>``, e.g.
            ``https://doi.org/10.1145/2537734.2537741``.

    Returns:
        ``https://dl.acm.org/doi/pdf/<DOI>`` with the DOI percent-encoded
        (``/`` is kept), or ``None`` when ``url`` is not a string or contains
        no DOI.
    """
    import re
    from urllib.parse import unquote

    # Non-string values (None, or NaN from an empty cell) carry no DOI;
    # return None instead of letting re.search raise TypeError.
    if not isinstance(url, str):
        return None

    # Decode existing percent-escapes first so that quote() below does not
    # double-encode them (e.g. "%3C" becoming "%253C").
    match = re.search(r"(10\.\d{4,9}/[^\s]+)", unquote(url))
    if match is None:
        return None
    doi = match.group(1)
    return f"https://dl.acm.org/doi/pdf/{quote(doi)}"

# === User-Agent list ===
# One entry is chosen at random for every request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)"
]

# === Download loop ===
# Each failure is recorded as (bib key, URL tried, reason).
failed = []

for _, row in tqdm(subset_df.iterrows(), total=len(subset_df), desc="Testing ACM PDF Download"):
    bib_key = row["Bib Key"]
    original_url = row["URL"]
    pdf_url = doi_to_pdf_url(original_url)

    if not pdf_url:
        # No request is made, so no delay is needed before the next row.
        print(f"❌ Skipping invalid DOI: {original_url}")
        failed.append((bib_key, original_url, "Invalid DOI"))
        continue

    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        response = requests.get(pdf_url, headers=headers, cookies=cookies, timeout=20)

        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and "application/pdf" in content_type:
            filename = os.path.join(OUTPUT_FOLDER, f"{bib_key}.pdf")
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"✅ Downloaded: {filename}")
        else:
            print(f"⚠️ Failed to download: {pdf_url} → {content_type}")
            failed.append((bib_key, pdf_url, f"{response.status_code}, {content_type}"))

    except Exception as e:
        # Best effort: log, record, and continue with the next entry.
        print(f"❌ Error for {pdf_url}: {e}")
        failed.append((bib_key, pdf_url, str(e)))

    # Pause after every attempted request, including ones that raised, so a
    # run of errors does not turn into rapid-fire requests.
    time.sleep(random.uniform(3, 6))  # Human-like delay

# === Save failures (optional) ===
if failed:
    pd.DataFrame(failed, columns=["Bib Key", "URL", "Reason"]).to_csv("acm_failed_test_subset.csv", index=False)
    print("\n⚠️ Some downloads failed. Check acm_failed_test_subset.csv")
else:
    print("\n🎉 All test downloads successful!")


In [None]:
import os
import time
import random
import pandas as pd
import requests
from urllib.parse import quote
from tqdm import tqdm

# --- Settings: input spreadsheet and destination directory ---
CSV_PATH = "missing_pdfs.csv"
OUTPUT_FOLDER = "acm_pdf_test_subset"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# --- Read the spreadsheet and keep rows whose URL contains "10." ---
df = pd.read_csv(CSV_PATH)
url_mentions_doi = df["URL"].str.contains("10\\.", na=False)
doi_df = df.loc[url_mentions_doi].copy()
subset_df = doi_df.iloc[:2]  # trial run: first two rows only

def doi_to_pdf_url(url):
    """Build the ACM ``showPdf`` URL for the DOI embedded in ``url``.

    Args:
        url: Text expected to contain a DOI of the form
            ``10.<4-9 digits>/<suffix>``, e.g.
            ``https://doi.org/10.1145/2537734.2537741``.

    Returns:
        ``https://dl.acm.org/action/showPdf?doi=<DOI>`` with the DOI
        percent-encoded (``/`` is kept), or ``None`` when ``url`` is not a
        string or contains no DOI.
    """
    import re
    from urllib.parse import unquote

    # Non-string values (None, or NaN from an empty cell) carry no DOI;
    # return None instead of letting re.search raise TypeError.
    if not isinstance(url, str):
        return None

    # Decode existing percent-escapes first so that quote() below does not
    # double-encode them (e.g. "%3C" becoming "%253C").
    match = re.search(r"(10\.\d{4,9}/[^\s]+)", unquote(url))
    if match is None:
        return None
    doi = match.group(1)
    return f"https://dl.acm.org/action/showPdf?doi={quote(doi)}"

# === Headers to mimic browser ===
# One User-Agent is chosen at random for every request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)"
]

# === Download loop ===
# Each failure is recorded as (bib key, URL tried, reason).
failed = []

for _, row in tqdm(subset_df.iterrows(), total=len(subset_df), desc="Downloading PDFs"):
    bib_key = row["Bib Key"]
    original_url = row["URL"]
    pdf_url = doi_to_pdf_url(original_url)

    if not pdf_url:
        # No request is made, so no delay is needed before the next row.
        print(f"❌ Invalid DOI in URL: {original_url}")
        failed.append((bib_key, original_url, "Invalid DOI"))
        continue

    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        response = requests.get(pdf_url, headers=headers, timeout=20)

        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and "application/pdf" in content_type:
            filename = os.path.join(OUTPUT_FOLDER, f"{bib_key}.pdf")
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"✅ Downloaded: {filename}")
        else:
            print(f"⚠️ Not a PDF or no access: {pdf_url}")
            failed.append((bib_key, pdf_url, f"{response.status_code}, {content_type}"))

    except Exception as e:
        # Best effort: log, record, and continue with the next entry.
        print(f"❌ Error: {e}")
        failed.append((bib_key, pdf_url, str(e)))

    # Pause after every attempted request, including ones that raised, so a
    # run of errors does not turn into rapid-fire requests.
    time.sleep(random.uniform(3, 6))

# === Save failed attempts ===
if failed:
    pd.DataFrame(failed, columns=["Bib Key", "URL", "Reason"]).to_csv("acm_failed_test_subset.csv", index=False)
    print("\n⚠️ Some downloads failed. Check 'acm_failed_test_subset.csv'")
else:
    print("\n🎉 All test downloads successful!")


In [16]:
import os
import time
import random
import pandas as pd
import requests
from urllib.parse import quote
from tqdm import tqdm

# --- Settings: input spreadsheet and destination directory ---
CSV_PATH = "missing_pdfs.csv"
OUTPUT_FOLDER = "acm_pdf_doi_fallback_test"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# --- Read the spreadsheet and keep rows whose URL contains "10." ---
df = pd.read_csv(CSV_PATH)
contains_doi_marker = df["URL"].str.contains("10\\.", na=False)
doi_df = df.loc[contains_doi_marker].copy()
subset_df = doi_df.iloc[:5]  # trial run: first five rows only

# === Convert to /doi/pdf/<DOI> format ===
def doi_to_pdf_url(url):
    """Build the ACM ``/doi/pdf/`` URL for the DOI embedded in ``url``.

    Args:
        url: Text expected to contain a DOI of the form
            ``10.<4-9 digits>/<suffix>``, e.g.
            ``https://doi.org/10.1145/2537734.2537741``.

    Returns:
        ``https://dl.acm.org/doi/pdf/<DOI>`` with the DOI percent-encoded
        (``/`` is kept), or ``None`` when ``url`` is not a string or contains
        no DOI.
    """
    import re
    from urllib.parse import unquote

    # Non-string values (None, or NaN from an empty cell) carry no DOI;
    # return None instead of letting re.search raise TypeError.
    if not isinstance(url, str):
        return None

    # Decode existing percent-escapes first so that quote() below does not
    # double-encode them (e.g. "%3C" becoming "%253C").
    match = re.search(r"(10\.\d{4,9}/[^\s]+)", unquote(url))
    if match is None:
        return None
    doi = match.group(1)
    return f"https://dl.acm.org/doi/pdf/{quote(doi)}"

# === Headers to mimic real browser ===
# The same fixed header set is sent with every request in this cell.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "application/pdf",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://dl.acm.org/",
}

# === Download loop ===
# Each failure is recorded as (bib key, URL tried, reason); this cell prints
# nothing per failure and only reports via the CSV written below.
failed = []

for _, row in tqdm(subset_df.iterrows(), total=len(subset_df), desc="Downloading via /doi/pdf"):
    bib_key = row["Bib Key"]
    original_url = row["URL"]
    pdf_url = doi_to_pdf_url(original_url)

    if not pdf_url:
        # No request is made, so no delay is needed before the next row.
        failed.append((bib_key, original_url, "Invalid DOI"))
        continue

    try:
        response = requests.get(pdf_url, headers=HEADERS, timeout=20, allow_redirects=True)

        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and "application/pdf" in content_type:
            filename = os.path.join(OUTPUT_FOLDER, f"{bib_key}.pdf")
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"✅ Downloaded: {filename}")
        else:
            failed.append((bib_key, pdf_url, f"{response.status_code}, {content_type}"))

    except Exception as e:
        # Best effort: record the error and continue with the next entry.
        failed.append((bib_key, pdf_url, str(e)))

    # Pause after every attempted request, including ones that raised, so a
    # run of errors does not turn into rapid-fire requests.
    time.sleep(random.uniform(3, 6))  # Human-like delay

# === Save failures (optional) ===
if failed:
    pd.DataFrame(failed, columns=["Bib Key", "URL", "Reason"]).to_csv("acm_doi_pdf_fails.csv", index=False)
    print("\n⚠️ Some downloads failed. Check 'acm_doi_pdf_fails.csv'")
else:
    print("\n🎉 All test downloads successful!")


Downloading via /doi/pdf: 100%|██████████| 5/5 [00:20<00:00,  4.19s/it]


⚠️ Some downloads failed. Check 'acm_doi_pdf_fails.csv'





In [None]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import time
import os
import random

# Create download directory
download_dir = os.path.join(os.getcwd(), "acm_pdfs")
os.makedirs(download_dir, exist_ok=True)

# Configure undetected Chrome: save PDFs straight into download_dir without a
# prompt, and hand PDFs to the downloader instead of the built-in viewer.
options = uc.ChromeOptions()
prefs = {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "plugins.always_open_pdf_externally": True,
}
options.add_experimental_option("prefs", prefs)

# Launch Chrome (undetected) with a visible window so a CAPTCHA can be solved by hand
driver = uc.Chrome(options=options, headless=False)

# List of DOIs
doi_list = [
    "10.1145/2537734.2537741",
]

# try/finally guarantees the browser is shut down even if navigation raises
# or the cell is interrupted part-way through the list.
try:
    for doi in doi_list:
        url = f"https://dl.acm.org/doi/{doi}"
        print(f"Opening: {url}")
        driver.get(url)

        # Give time for manual CAPTCHA solving or auto-bypass
        print("🕵️‍♂️ Solve CAPTCHA if asked, or wait for bypass...")
        time.sleep(20)

        try:
            # Try to click on the PDF button (first link whose text contains "PDF")
            pdf_button = driver.find_element(By.XPATH, "//a[contains(text(), 'PDF')]")
            pdf_button.click()
            print("✅ PDF download started.")
        except Exception as e:
            print(f"❌ Couldn't find the PDF button: {e}")

        # Wait before the next DOI; this also leaves time for the download to proceed
        time.sleep(random.uniform(10, 20))
finally:
    driver.quit()


In [None]:
import os
import time
import random
import pandas as pd
import requests
from urllib.parse import quote
from tqdm import tqdm

# --- Settings: ACM-only spreadsheet in, PDFs out ---
CSV_PATH = "acm_filtered_pdfs.csv"
OUTPUT_FOLDER = "acm_pdf_test_subset"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# --- Read the spreadsheet and keep rows whose URL contains "10." ---
df = pd.read_csv(CSV_PATH)
row_has_doi_url = df["URL"].str.contains("10\\.", na=False)
doi_df = df.loc[row_has_doi_url].copy()
subset_df = doi_df.iloc[:5]  # trial run: first five rows only

# === Convert to /doi/pdf/<DOI> URL ===
def doi_to_pdf_url(url):
    """Build the ACM ``/doi/pdf/`` URL for the DOI embedded in ``url``.

    Args:
        url: Text expected to contain a DOI of the form
            ``10.<4-9 digits>/<suffix>``, e.g.
            ``https://doi.org/10.1145/2537734.2537741``.

    Returns:
        ``https://dl.acm.org/doi/pdf/<DOI>`` with the DOI percent-encoded
        (``/`` is kept), or ``None`` when ``url`` is not a string or contains
        no DOI.
    """
    import re
    from urllib.parse import unquote

    # Non-string values (None, or NaN from an empty cell) carry no DOI;
    # return None instead of letting re.search raise TypeError.
    if not isinstance(url, str):
        return None

    # Decode existing percent-escapes first so that quote() below does not
    # double-encode them (e.g. "%3C" becoming "%253C").
    match = re.search(r"(10\.\d{4,9}/[^\s]+)", unquote(url))
    if match is None:
        return None
    doi = match.group(1)
    # Alternative endpoint tried earlier:
    # return f"https://dl.acm.org/action/showPdf?doi={quote(doi)}"
    return f"https://dl.acm.org/doi/pdf/{quote(doi)}"

# === Headers to mimic browser ===
# One User-Agent is chosen at random for every request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)"
]

# === Download loop ===
# Each failure is recorded as (bib key, URL tried, reason).
failed = []

for _, row in tqdm(subset_df.iterrows(), total=len(subset_df), desc="Downloading PDFs"):
    bib_key = row["Bib Key"]
    original_url = row["URL"]
    pdf_url = doi_to_pdf_url(original_url)

    if not pdf_url:
        # No request is made, so no delay is needed before the next row.
        print(f"❌ Invalid DOI in URL: {original_url}")
        failed.append((bib_key, original_url, "Invalid DOI"))
        continue

    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        response = requests.get(pdf_url, headers=headers, timeout=20)

        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and "application/pdf" in content_type:
            filename = os.path.join(OUTPUT_FOLDER, f"{bib_key}.pdf")
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"✅ Downloaded: {filename}")
        else:
            print(f"⚠️ Not a PDF or no access: {pdf_url}")
            failed.append((bib_key, pdf_url, f"{response.status_code}, {content_type}"))

    except Exception as e:
        # Best effort: log, record, and continue with the next entry.
        print(f"❌ Error: {e}")
        failed.append((bib_key, pdf_url, str(e)))

    # Pause after every attempted request, including ones that raised, so a
    # run of errors does not turn into rapid-fire requests.
    time.sleep(random.uniform(3, 6))

# === Save failed attempts ===
if failed:
    pd.DataFrame(failed, columns=["Bib Key", "URL", "Reason"]).to_csv("acm_failed_test_subset.csv", index=False)
    print("\n⚠️ Some downloads failed. Check 'acm_failed_test_subset.csv'")
else:
    print("\n🎉 All test downloads successful!")
