In [1]:
#importing necessary libraries
import os
import pandas as pd
import aiohttp
import asyncio
import nest_asyncio
from tqdm import tqdm

In [3]:
df = pd.read_csv("Phishing_dataset_with_status_codes")
df.head(5)

Unnamed: 0.1,Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,...,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label,status_code
0,12,515489.txt,https://www.socialpolicy.org,27,www.socialpolicy.org,20,0,org,100.0,1.0,...,0,1,16,9,13,92,1,18,1,200.0
1,33,53440.txt,https://www.bikeseoul.com,24,www.bikeseoul.com,17,0,com,100.0,1.0,...,0,1,15,29,17,49,0,13,1,200.0
2,70,91992.txt,https://www.radware.com,22,www.radware.com,15,0,com,100.0,1.0,...,0,1,39,1,9,347,2,43,1,200.0
3,119,571426.txt,https://www.gunsmokenet.com,26,www.gunsmokenet.com,19,0,com,100.0,1.0,...,0,0,10,1,0,5,5,8,1,200.0
4,130,178925.txt,https://www.mro.nmt.edu,22,www.mro.nmt.edu,15,0,edu,100.0,0.571429,...,0,1,30,14,14,106,0,112,1,200.0


In [2]:
df = pd.read_csv("Phishing_dataset_with_status_codes_bad.csv")
df.head(5)

Unnamed: 0.3,Unnamed: 0.2,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,...,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label,Unnamed: 0.1,Unnamed: 0,status_code
0,1983,mw16317.txt,http://www.r1.panjo.club,23,www.r1.panjo.club,17,0,club,51.428571,0.875,...,0,0,0,0,0,0,0,3007.0,3007.0,200.0
1,1984,mw16317.txt,http://www.r1.panjo.club,23,www.r1.panjo.club,17,0,club,51.428571,0.875,...,0,0,0,0,0,0,0,4507.0,,200.0
2,1985,mw16317.txt,http://www.r1.panjo.club,23,www.r1.panjo.club,17,0,club,51.428571,0.875,...,0,0,0,0,0,0,0,6007.0,,200.0
3,1986,mw16317.txt,http://www.r1.panjo.club,23,www.r1.panjo.club,17,0,club,51.428571,0.875,...,0,0,0,0,0,0,0,7507.0,,200.0
4,3237,mw158850.txt,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,...,0,0,0,0,0,0,0,846.0,846.0,200.0


In [3]:
nest_asyncio.apply()

# === CONFIG ===
BATCH_SIZE = 500
CONCURRENCY = 20
TIMEOUT = 15
DATASET = "Phishing_dataset_with_status_codes_bad.csv" 
OUTPUT_DIR = "downloaded_pages"
FAILED_LOG = "failed_downloads_async.csv"

# Load and filter
df = pd.read_csv(DATASET)
df = df[df['status_code'] == 200].reset_index(drop=True)

# Output folder
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Global failed list
failed_files = []

# Async fetcher
async def fetch(session, row):
    url = row['URL']
    filename = row['FILENAME'].replace('.txt', '.html')
    path = os.path.join(OUTPUT_DIR, filename)

    for attempt in range(3):
        try:
            async with session.get(url, timeout=TIMEOUT) as response:
                html = await response.text()
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(html)
                return
        except Exception as e:
            if attempt == 2:
                failed_files.append({'FILENAME': row['FILENAME'], 'URL': url, 'error': str(e)})
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(f"<html><body><h1>Download failed</h1><p>{str(e)}</p></body></html>")
            await asyncio.sleep(1)

# Run batch
async def run_batch(batch_df):
    timeout = aiohttp.ClientTimeout(total=TIMEOUT + 5)
    connector = aiohttp.TCPConnector(limit_per_host=CONCURRENCY)
    async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
        tasks = [fetch(session, row) for _, row in batch_df.iterrows()]
        await asyncio.gather(*tasks)

# Main loop
async def main():
    total = len(df)
    for start in tqdm(range(0, total, BATCH_SIZE), desc="Processing batches"):
        end = min(start + BATCH_SIZE, total)
        batch_df = df.iloc[start:end]
        await run_batch(batch_df)

    # Save failed
    if failed_files:
        pd.DataFrame(failed_files).to_csv(FAILED_LOG, index=False)
        print(f"\n🚫 {len(failed_files)} failed — saved to {FAILED_LOG}")
    else:
        print("\n✅ All pages downloaded successfully!")

# Run it
await main()


Processing batches:   0%|          | 0/3 [00:00<?, ?it/s]

Processing batches: 100%|██████████| 3/3 [01:57<00:00, 39.30s/it]


🚫 46 failed — saved to failed_downloads_async.csv





In [4]:
#Excluding from the dataset not downloaded
failed_files = pd.read_csv("failed_downloads_async.csv")
filtered_df = df[~df["FILENAME"].isin(failed_files["FILENAME"])]

In [5]:
filtered_df.to_csv("Dataset_with_downloaded_bad.csv", index=False)