In [1]:
import pandas as pd
import numpy as np
import torch
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from tqdm import tqdm


In [2]:
# Load URL Data
http_df = pd.read_csv("./processed_http.csv", usecols=["url"])  # Load only necessary columns

# Preprocess URLs (Tokenization)
http_df["tokenized_url"] = http_df["url"].astype(str).apply(lambda x: word_tokenize(x.replace(".", " ").replace("/", " ")))

# Save Tokenized URLs as List
tokenized_urls = http_df["tokenized_url"].tolist()


In [3]:
# Train Word2Vec Model on URLs
w2v_model = Word2Vec(sentences=tokenized_urls, vector_size=100, window=5, min_count=1, workers=4)

# Optimize Word2Vec for Faster Inference
w2v_model.init_sims(replace=True)  # Reduces memory usage, speeds up lookup



  w2v_model.init_sims(replace=True)  # Reduces memory usage, speeds up lookup


In [None]:

# Move Word2Vec Model to CPU (Word2Vec is CPU-bound)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Batch Processing with Direct Disk Writing
batch_size = 5000
output_file = "url_embeddings.npy"

for batch_index, i in enumerate(tqdm(range(0, len(tokenized_urls), batch_size), desc="Processing URLs in Batches", unit="batch")):
    batch_tokens = tokenized_urls[i : i + batch_size]
    
    batch_embeddings = []
    for tokens in batch_tokens:
        vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
        embedding = np.mean(vectors, axis=0) if vectors else np.zeros(100)
        batch_embeddings.append(embedding)
    
    # Convert to PyTorch Tensor & Move to GPU
    batch_embeddings = torch.tensor(np.array(batch_embeddings), dtype=torch.float32).to(device)

    # Move Back to CPU & Save Batch to Disk
    np.save(f"url_embeddings_batch_{batch_index}.npy", batch_embeddings.cpu().numpy())

print("✅ URL embeddings successfully computed & saved in batches!")


Processing URLs in Batches: 100%|██████████| 5687/5687 [26:05<00:00,  3.63batch/s]

✅ URL embeddings successfully computed & saved in batches!



