In [39]:
import pandas as pd
import gdown
import zipfile
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [40]:
# File URL and extraction
file_url = "https://drive.google.com/uc?id=12MfYQpBl_79ak9K7VHx4jDEaeFUEhHEO"
output_zip = "spam.zip"
extract_dir = "spam_files"
csv_path = 'spam_files/url_spam_classification.csv'

# Only hit the network when the extracted CSV is missing, so that re-running
# this cell (or Restart & Run All) does not download the archive every time.
if not os.path.exists(csv_path):
    downloaded = gdown.download(file_url, output_zip, quiet=False)
    # NOTE(review): some gdown versions signal failure by returning None
    # instead of raising -- fail here with a clear message rather than letting
    # zipfile trip over a missing archive.
    if downloaded is None:
        raise RuntimeError(f"Could not download {file_url}")

    try:
        with zipfile.ZipFile(output_zip, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
    finally:
        # The archive is only a temporary artefact; remove it even when
        # extraction fails so a corrupt zip is not left behind.
        os.remove(output_zip)

# Load dataset
df = pd.read_csv(csv_path)

Downloading...
From: https://drive.google.com/uc?id=12MfYQpBl_79ak9K7VHx4jDEaeFUEhHEO
To: /content/spam.zip
100%|██████████| 3.58M/3.58M [00:00<00:00, 12.4MB/s]


In [41]:
# Convert is_spam to binary.
# The first value returned by unique() (i.e. the first label that appears in
# the column) is encoded as 1 and everything else as 0. The label is looked up
# once up front; calling unique() inside a per-row lambda would rescan the
# whole column for every row.
positive_label = df['is_spam'].unique()[0]
df['is_spam'] = (df['is_spam'] == positive_label).astype(int)

# Feature extraction (vectorised string operations on the raw URL text).
urls = df['url']

# Total number of characters in the URL.
df['len_url'] = urls.str.len()
# 1 when the literal substring "subscribe" occurs anywhere (also matches "unsubscribe").
df['contains_subscribe'] = urls.str.contains("subscribe", regex=False).astype(int)
# 1 when the URL contains a "#" character.
df['contains_hash'] = urls.str.contains("#", regex=False).astype(int)
# Number of digit characters, using str.isdigit() semantics.
df['num_digits'] = urls.apply(lambda url: sum(ch.isdigit() for ch in url))
# 1 when the substring "https" does not occur anywhere in the URL.
# NOTE(review): this is a substring test, not a scheme test -- a plain-http URL
# with "https" in its path or query is still scored 0. Confirm this is intended.
df['non_https'] = (~urls.str.contains("https", regex=False)).astype(int)
# Number of "/"-separated segments (empty segments included), i.e. slashes + 1.
df['num_words'] = urls.str.count("/") + 1
# 1 when the substring "http" occurs anywhere in the URL.
# NOTE(review): every "https" URL also contains "http", so this may be nearly
# constant on this dataset -- verify it carries any signal.
df['contains_http'] = urls.str.contains("http", regex=False).astype(int)
# 1 when any of the characters $ % & ! @ occurs in the URL.
df['contains_special_chars'] = urls.str.contains(r"[$%&!@]").astype(int)
# Number of "." characters minus one (same value as len(url.split(".")) - 2).
df['subdomain_count'] = urls.str.count(r"\.") - 1

display(df)

# Select features and target
features = df.drop(columns=['url', 'is_spam'])
target = df['is_spam']

# Train-test split (fixed random_state keeps the 80/20 split reproducible)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


Unnamed: 0,url,is_spam,len_url,contains_subscribe,contains_hash,num_digits,non_https,num_words,contains_http,contains_special_chars,subdomain_count
0,https://briefingday.us8.list-manage.com/unsubs...,1,51,1,0,1,0,4,1,0,2
1,https://www.hvper.com/,1,22,0,0,0,0,4,1,0,1
2,https://briefingday.com/m/v4n3i4f3,1,34,0,0,4,0,5,1,0,0
3,https://briefingday.com/n/20200618/m#commentform,0,48,0,1,8,0,6,1,0,0
4,https://briefingday.com/fan,1,27,0,0,0,0,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
148298,"https://cdn.substack.com/image/fetch/f_auto,q_...",1,223,0,0,52,0,7,1,1,5
148299,https://numlock.substack.com/subscribe,1,38,1,0,0,0,4,1,0,1
148300,https://docs.google.com/forms/d/e/1FAIpQLSfDLO...,1,99,0,0,11,0,8,1,0,1
148301,https://bookclub.substack.com/,1,30,0,0,0,0,4,1,0,1
