# load the model

In [None]:
# main loop:
# get three URLs for each domain
# classify the URLs
# label the domain based on the URL classifications
# save

In [36]:
import dask.dataframe as dd

# The dataset lives on the Hugging Face Hub: log in first
# (e.g. `huggingface-cli login`) to access it.
# Multilingual alternative:
#   hf://datasets/nhagar/c4_urls_multilingual/batch*/train-*.parquet
C4_EN_NOBLOCKLIST_PARQUET = "hf://datasets/nhagar/c4_urls_en.noblocklist/batch*/train-*.parquet"

ddf = dd.read_parquet(C4_EN_NOBLOCKLIST_PARQUET)
print("Sampling...")
# Quick preview: take the leading 1000 rows instead of a random sample.
# For a random draw use: df = ddf.sample(frac=0.00000274).compute()
df = ddf.head(n=1000)

Sampling...


In [31]:
# Show the loaded URL table (columns: url, domain).
df

Unnamed: 0,url,domain
527,https://www.gritnglory.com/collections/sale/pr...,gritnglory.com
816,https://gnowfglins.com/health-and-nutrition/10...,gnowfglins.com
490,http://nam-guild.com/tag/achievement/,nam-guild.com
579,https://bakingandbakingscience.com/car-wall-cl...,bakingandbakingscience.com
392,https://actslaw.com/civil-rights/police-brutal...,actslaw.com
...,...,...
711,http://drrossmckenzie.ca/ross-blog/?category=O...,drrossmckenzie.ca
211,http://bdrtyjdt.000webhostapp.com/page-371.html,000webhostapp.com
59,https://www.qabookco.com/book/9781616952419,qabookco.com
986,http://forums.openlabs.com/discussion/338/is-i...,openlabs.com


In [34]:
import joblib
import random
import time
import pandas as pd

# Ground-truth domain labels. Loaded here but not referenced by the rest of
# this cell.
ground_truth_labels = pd.read_csv('../data/combined_domain_labels_16k_splits.csv')

# Load saved vectorizer and models
print("Loading vectorizer and model...")
vectorizer = joblib.load("saved_vectorizers_3/Full Path (Word).joblib")
logreg_model = joblib.load("saved_models_3/Full Path (Word)_LogReg.joblib")
# NOTE(review): this KNN is loaded from saved_models_2 / "Path Only (Word)",
# while the vectorizer above is saved_vectorizers_3 / "Full Path (Word)".
# If the KNN was fitted on a different feature space, the neighbours returned
# below are not meaningful -- confirm this pairing is intended.
knn_model = joblib.load("saved_models_2/Path Only (Word)_KNN_(k=1).joblib")

# URLs used to translate KNN neighbour indices back into readable URLs.
# presumably row i of this CSV is row i of the KNN training matrix -- verify.
knn_urls_df = pd.read_csv("saved_vectorizers_3/the_urls.csv")
knn_urls = knn_urls_df['url'].tolist()

# Sampling configuration: at most this many URLs are classified per domain,
# drawn with a seeded local RNG so that re-running the cell gives the same
# sample without touching the global `random` state.
URLS_PER_DOMAIN = 3
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

# Sample and classify
start = time.time()
print("Classifying...")

# One pass over the frame: groupby(sort=False) yields domains in order of
# first appearance and drops rows whose domain is missing, so no empty batch
# ever reaches the models.
sampled_domains = []
sampled_urls = []
for domain, domain_rows in df.groupby('domain', sort=False):
    urls = domain_rows['url'].tolist()
    if len(urls) > URLS_PER_DOMAIN:
        urls = rng.sample(urls, URLS_PER_DOMAIN)
    sampled_domains.extend([domain] * len(urls))
    sampled_urls.extend(urls)

# Vectorize and score every sampled URL in a single batch; predictions and
# nearest neighbours are computed per row, so batching does not change them.
results = []
if sampled_urls:
    X_sampled = vectorizer.transform(sampled_urls)
    logreg_preds = logreg_model.predict(X_sampled)
    distances, indices = knn_model.kneighbors(X_sampled, n_neighbors=1)

    for domain, url, pred, idx in zip(sampled_domains, sampled_urls,
                                      logreg_preds, indices.flatten()):
        results.append({
            'domain': domain,
            'url': url,
            'prediction': pred,
            'nearest_neighbor_url': knn_urls[idx]
        })

print(f"Classification completed in {time.time() - start:.2f} seconds")

# Results DataFrame (explicit columns keep the schema stable even when empty)
result_df = pd.DataFrame(
    results,
    columns=['domain', 'url', 'prediction', 'nearest_neighbor_url'],
)
result_df

Loading vectorizer and model...
Classifying...
Classification completed in 0.56 seconds


Unnamed: 0,domain,url,prediction,nearest_neighbor_url
0,gritnglory.com,https://www.gritnglory.com/collections/sale/pr...,0.0,https://www.artic.edu/collection?filters=tag_s...
1,gnowfglins.com,https://gnowfglins.com/health-and-nutrition/10...,1.0,http://alpharetta.11alive.com/news/politics?pa...
2,nam-guild.com,http://nam-guild.com/tag/achievement/,1.0,https://www.bartleby.com/essay/Answers-to-End-...
3,bakingandbakingscience.com,https://bakingandbakingscience.com/car-wall-cl...,0.0,https://www.ashworthcollege.edu/career-diploma...
4,actslaw.com,https://actslaw.com/civil-rights/police-brutal...,1.0,http://fc-legion.pro/astuces-pour-toca-race-72/
...,...,...,...,...
95,drrossmckenzie.ca,http://drrossmckenzie.ca/ross-blog/?category=O...,0.0,https://bmv.shop.com/web1/actions/searchHandle...
96,000webhostapp.com,http://bdrtyjdt.000webhostapp.com/page-371.html,1.0,https://www.intouchweekly.com/posts/my-600-lb-...
97,qabookco.com,https://www.qabookco.com/book/9781616952419,0.0,http://alpharetta.11alive.com/news/politics?pa...
98,openlabs.com,http://forums.openlabs.com/discussion/338/is-i...,1.0,http://alpharetta.11alive.com/news/politics?pa...


In [35]:
# Persist the per-URL predictions and nearest neighbours. index=False is not
# passed, so the DataFrame index is written as the first (unnamed) CSV column.
result_df.to_csv('neighbors.csv')

# load the domain list

In [None]:
import requests
import json
import random
from collections import defaultdict

# Number of Common Crawl indexes (highest ids first) queried per domain.
NUM_INDEXES = 5

# Domains to look up in the Common Crawl index.
domains = [
    'urbandictionary.com',
    'getpocket.com',
    'youtube.com',
    'bloomberg.com',
]

def get_random_urls(domain, limit=3, timeout=30):
    """Return up to `limit` distinct URLs for `domain` from recent Common Crawl indexes.

    The NUM_INDEXES indexes with the highest ids are queried through their
    CDX API for captures with status 200 (at most 1000 records per index).
    The URLs collected across all indexes are de-duplicated, shuffled, and
    the first `limit` are returned.

    Parameters
    ----------
    domain : str
        Domain to query; the request uses the pattern ``*.{domain}``.
    limit : int, default 3
        Maximum number of URLs to return.
    timeout : float, default 30
        Seconds to wait on each HTTP request. Without a timeout, `requests`
        waits indefinitely on an unresponsive server.

    Returns
    -------
    list of str
        Up to `limit` URLs; empty when no index returned any capture.

    Raises
    ------
    requests.RequestException
        If the list of indexes cannot be fetched. Failures of individual
        per-index queries are printed and skipped instead of raised.
    """
    # Get recent indexes
    index_response = requests.get(
        "https://index.commoncrawl.org/collinfo.json", timeout=timeout
    )
    index_response.raise_for_status()
    index_list = index_response.json()
    recent_indexes = sorted(index_list, key=lambda x: x['id'], reverse=True)[:NUM_INDEXES]

    results = []
    seen_urls = set()

    for index in recent_indexes:
        cdx_url = index['cdx-api']
        query_url = f"{cdx_url}?url=*.{domain}&output=json&filter=status:200&limit=1000"

        try:
            response = requests.get(query_url, timeout=timeout)
            lines = response.text.strip().splitlines()

            # If it's an error message like 'No Captures found', skip
            if lines and lines[0].startswith('{"message":'):
                continue

            # Parse one JSON record per line so URLs read before a malformed
            # line are kept; blank lines are ignored.
            for line in lines:
                if not line.strip():
                    continue
                url = json.loads(line).get('url')
                if url and url not in seen_urls:
                    results.append(url)
                    seen_urls.add(url)
        except Exception as e:
            # Deliberate best-effort: one failing index must not abort the
            # whole lookup, so report it and move on to the next index.
            print(f"Error querying {domain} from {cdx_url}: {e}")

    # Shuffle and return N random URLs
    random.shuffle(results)
    return results[:limit]

In [None]:
# Map every configured domain to the URLs fetched for it.
domain_urls = {}
for domain in domains:
    domain_urls[domain] = get_random_urls(domain)

# Print each domain followed by its URLs, indented by two spaces.
for domain, urls in domain_urls.items():
    print(f"{domain}:")
    for url in urls:
        print(f"  {url}")


In [19]:
# Scratch cell: print the API help for the loaded KNN model.
# Requires knn_model (loaded in the classification cell) to exist in the kernel.
help(knn_model)

Help on KNeighborsClassifier in module sklearn.neighbors._classification object:

class KNeighborsClassifier(sklearn.neighbors._base.KNeighborsMixin, sklearn.base.ClassifierMixin, sklearn.neighbors._base.NeighborsBase)
 |  KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
 |
 |  Classifier implementing the k-nearest neighbors vote.
 |
 |  Read more in the :ref:`User Guide <classification>`.
 |
 |  Parameters
 |  ----------
 |  n_neighbors : int, default=5
 |      Number of neighbors to use by default for :meth:`kneighbors` queries.
 |
 |  weights : {'uniform', 'distance'}, callable or None, default='uniform'
 |      Weight function used in prediction.  Possible values:
 |
 |      - 'uniform' : uniform weights.  All points in each neighborhood
 |        are weighted equally.
 |      - 'distance' : weight points by the inverse of their distance.
 |        in this case, closer neighbors of a 

In [21]:
# NOTE(review): X_knn_train is not assigned in any cell visible here; on a
# fresh kernel this raises NameError unless it is defined elsewhere -- confirm.
X_knn_train

<66314x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 238358 stored elements in Compressed Sparse Row format>