In [38]:
import os

# Every downloaded or generated file in this notebook is stored here.
# The path is relative to the notebook folder.
data_folder = "../data"
os.makedirs(name=data_folder, exist_ok=True)  # no error if it already exists

In [39]:
import requests

# PhishTank "online-valid" feed, served as a gzipped CSV.
url = "http://data.phishtank.com/data/online-valid.csv.gz"
save_path = os.path.join(data_folder, "online-valid.csv.gz")
# Download into a sibling temp file so an interrupted transfer never leaves a
# truncated archive at save_path for the decompression step to fail on.
partial_path = save_path + ".part"

# stream=True keeps the archive out of memory: the body is written to disk
# chunk by chunk instead of being buffered whole in response.content.
with requests.get(url, timeout=20, stream=True) as response:
    response.raise_for_status()  # fail on 4xx/5xx before anything is written
    with open(partial_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

# Promote the finished download in one step; os.replace overwrites an old copy.
os.replace(partial_path, save_path)

print(f"Downloaded file saved to {save_path}")

Downloaded file saved to ../data/online-valid.csv.gz


In [40]:
import gzip
import shutil

# The decompressed CSV is written next to the downloaded archive.
csv_path = os.path.join(data_folder, "online-valid.csv")

# Copy the decompressed byte stream straight into the target file.
with gzip.open(save_path, 'rb') as compressed, open(csv_path, 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

print(f"Unzipped CSV saved to {csv_path}")

Unzipped CSV saved to ../data/online-valid.csv


In [41]:
import pandas as pd

# Load the PhishTank feed and keep only the rows flagged as both
# still online and verified.
data_phishtank = (
    pd.read_csv(csv_path)
    .query("online == 'yes' and verified == 'yes'")
)

In [42]:
import kagglehub

# Fetch the latest version of the Kaggle dataset; `path` is the local
# directory that contains its files.
dataset_handle = "harisudhan411/phishing-and-legitimate-urls"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /Users/ougi/.cache/kagglehub/datasets/harisudhan411/phishing-and-legitimate-urls/versions/1


In [43]:
# Read the URL table shipped inside the downloaded Kaggle dataset directory.
kaggle_csv_path = os.path.join(path, 'new_data_urls.csv')
data_kaggle = pd.read_csv(kaggle_csv_path)

The `status` column holds the class label: `1` means legitimate and `0` means phishing.


In [44]:
# status 1 marks a legitimate URL; only those rows are kept from this dataset.
legitimate_only = "status == 1"
data_kaggle = data_kaggle.query(legitimate_only)

In [45]:
# Preview a single row of the filtered Kaggle data to check its columns.
data_kaggle.head(n=1)

Unnamed: 0,url,status
178574,http://www.crestonwood.com/router.php,1


In [46]:
# Preview a single row of the filtered PhishTank data to check its columns.
data_phishtank.head(n=1)

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,9296634,https://aidasdir.github.io/MyEtherWallet/,http://www.phishtank.com/phish_detail.php?phis...,2025-12-22T08:07:19+00:00,yes,2025-12-22T08:11:58+00:00,yes,Other


In [47]:
# Keep only the URL column from PhishTank and tag every row as phishing (0).
data_phishing = data_phishtank[['url']].assign(label=0)

In [48]:
# Confirm the phishing frame now has just the url and label columns.
data_phishing.head(n=1)

Unnamed: 0,url,label
0,https://aidasdir.github.io/MyEtherWallet/,0


In [49]:
# Take url + status from the legitimate Kaggle rows and expose status under
# the shared column name `label`, so both sources have the same schema.
data_safe = data_kaggle[['url', 'status']].rename(columns={'status': 'label'})

In [50]:
# Confirm the safe frame has the url and label columns.
data_safe.head(n=1)

Unnamed: 0,url,label
178574,http://www.crestonwood.com/router.php,1


In [51]:
# Stack phishing rows on top of safe rows with a fresh 0..n-1 index.
merged = pd.concat([data_phishing, data_safe], ignore_index=True)

# Save the combined url/label table; the index is not written to the file.
merged_csv_path = os.path.join(data_folder, 'url.csv')
merged.to_csv(merged_csv_path, index=False)

In [52]:
# Preview the first row of the combined dataset.
merged.head(n=1)

Unnamed: 0,url,label
0,https://aidasdir.github.io/MyEtherWallet/,0


Label encoding in the merged dataset:

- `label = 0`: phishing
- `label = 1`: safe