In [1]:
import sys
print(sys.executable)

/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11


In [2]:
!/usr/local/opt/python@3.11/bin/python3.11 -m pip install scikit-learn

zsh:1: no such file or directory: /usr/local/opt/python@3.11/bin/python3.11


In [8]:
import pandas as pd
import numpy as np
import sklearn
from pathlib import Path

# Read Data

In [5]:
dataframes_dict = {}
for folder in Path('../consolidated_data_files').iterdir():
    if folder.is_dir():
        parquet_path = folder / 'consolidated.parquet'
        if parquet_path.exists():
            print(f"Reading {folder.name}...")
            df = pd.read_parquet(parquet_path)
            # Store in dictionary with name as key
            dataframes_dict[folder.name] = df
            print(f"Read {folder.name}!")
print("Done!")

Reading uonlp_CulturaX__...
Read uonlp_CulturaX__!
Reading allenai_c4_en...
Read allenai_c4_en!
Reading dolma...
Read dolma!
Reading HuggingFaceFW_fineweb-edu_data__...
Read HuggingFaceFW_fineweb-edu_data__!
Reading tiiuae_falcon-refinedweb_data...
Read tiiuae_falcon-refinedweb_data!
Done!


# Descriptive Stats

In [6]:
for name, df in dataframes_dict.items():
    print(f"{name}:")
    print(f"  Rows: {df.shape[0]}")
    print(f"  Columns: {df.shape[1]}")
    print(f"  Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB\n")

uonlp_CulturaX__:
  Rows: 109879658
  Columns: 2
  Memory usage: 1676.63 MB

allenai_c4_en:
  Rows: 15668873
  Columns: 2
  Memory usage: 239.09 MB

dolma:
  Rows: 331096
  Columns: 2
  Memory usage: 5.05 MB

HuggingFaceFW_fineweb-edu_data__:
  Rows: 12019813
  Columns: 2
  Memory usage: 183.41 MB

tiiuae_falcon-refinedweb_data:
  Rows: 33271341
  Columns: 2
  Memory usage: 507.68 MB



# Histogram

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Get the URL counts per domain
#url_counts = dataframes_dict['uonlp_CulturaX__']['count'].values
url_counts = dataframes_dict['dolma']['count'].values

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 6))

# Create histogram with log-scale bins
bins = np.logspace(np.log10(1), np.log10(url_counts.max()), 50)
ax.hist(url_counts, bins=bins, alpha=0.7, color='#2563eb')

# Set log scale for x-axis
ax.set_xscale('log')
ax.set_yscale('log')  # Also using log scale for y-axis due to long tail

# Customize labels and title
ax.set_xlabel('URLs per Domain (log scale)', fontsize=12)
ax.set_ylabel('Number of Domains (log scale)', fontsize=12)
ax.set_title('Distribution of URLs per Domain', fontsize=14, pad=20)

# Customize grid and spines
ax.grid(True, which='major', linestyle='-', alpha=0.2)
ax.grid(True, which='minor', linestyle=':', alpha=0.2)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Add some statistics annotations
stats_text = (
    f'Total Domains: {len(url_counts):,}\n'
    f'Mean URLs/Domain: {url_counts.mean():.1f}\n'
    f'Median URLs/Domain: {np.median(url_counts):.1f}\n'
    f'Max URLs/Domain: {url_counts.max():,}'
)
ax.text(0.95, 0.95, stats_text,
        transform=ax.transAxes,
        verticalalignment='top',
        horizontalalignment='right',
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

plt.tight_layout()
plt.show()

# Lorenz Curves

In [None]:
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))

# Plot Lorenz curve for each dataset with different colors
colors = ['#2563eb', '#dc2626', '#16a34a', '#9333ea', '#ea580c', '#0891b2']
for (name, df), color in zip(dataframes_dict.items(), colors):
    # Sort domains by number of URLs
    sorted_counts = np.sort(df['count'].values)
    
    # Calculate cumulative % of URLs
    cumulative_urls = np.cumsum(sorted_counts) / sorted_counts.sum() * 100
    
    # Calculate cumulative % of domains
    cumulative_domains = np.arange(1, len(sorted_counts) + 1) / len(sorted_counts) * 100
    
    # Plot the distribution
    ax.plot(cumulative_domains, cumulative_urls, color=color, linewidth=2, label=name)
    
    # Print statistics for each dataset
    print(f"\n{name}:")
    print(f"  Top 1% domains hold {100 - cumulative_urls[int(0.99 * len(cumulative_urls))]:.1f}% of URLs")
    print(f"  Top 10% domains hold {100 - cumulative_urls[int(0.90 * len(cumulative_urls))]:.1f}% of URLs")
    print(f"  Median URLs per domain: {sorted_counts[len(sorted_counts)//2]:.1f}")

# Add diagonal line representing perfect equality
ax.plot([0, 100], [0, 100], '--', color='gray', alpha=0.5, label='Perfect equality')

# Customize the plot
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)
ax.set_xlabel('Cumulative % of Domains', fontsize=12)
ax.set_ylabel('Cumulative % of URLs', fontsize=12)
ax.set_title('URL Distribution Across Domains by Dataset', fontsize=14, pad=20)

# Customize grid and spines
ax.grid(True, which='major', linestyle='-', alpha=0.2)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Add legend with smaller font and transparency
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10, 
         framealpha=0.8, borderaxespad=0.)

plt.tight_layout()
plt.show()

# Distribution

In [None]:
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('seaborn-v0_8-whitegrid')

# Create the figure
fig, ax = plt.subplots(figsize=(12, 8))

# Define custom buckets and their labels
bucket_edges = [1, 2, 10, 100, 1000, 10000, 100000, 1000000]
bucket_labels = ['1', '2-9', '10-99', '100-999', '1K-9.9K', '10K-99K', '100K+']

# Calculate bar positions
n_datasets = len(dataframes_dict)
n_buckets = len(bucket_labels)
bar_width = 0.15  # Width of each bar
group_positions = np.arange(n_buckets)  # Center positions for each group

colors = ['#2563eb', '#dc2626', '#16a34a', '#9333ea', '#ea580c', '#0891b2']
for i, ((name, df), color) in enumerate(zip(dataframes_dict.items(), colors)):
    bucket_counts = []
    for j in range(len(bucket_edges)-1):
        if j == len(bucket_edges)-2:  # Last bucket includes all higher values
            count = ((df['count'] >= bucket_edges[j])).sum()
        else:
            count = ((df['count'] >= bucket_edges[j]) & 
                    (df['count'] < bucket_edges[j+1])).sum()
        bucket_counts.append(count / len(df) * 100)  # Convert to percentage
    
    # Calculate bar positions
    bar_positions = group_positions + (i - n_datasets/2 + 0.5) * bar_width
    
    # Plot bars
    ax.bar(bar_positions, 
           bucket_counts,
           width=bar_width,
           color=color,
           label=name)
    
    # Print statistics
    print(f"\n{name}:")
    print(f"  Mean URLs per domain: {df['count'].mean():.1f}")
    print(f"  Median URLs per domain: {df['count'].median():.1f}")

# Customize the plot
ax.set_xticks(group_positions)
ax.set_xticklabels(bucket_labels, rotation=45)
ax.set_ylim(0, None)

ax.set_xlabel('URLs per Domain', fontsize=12)
ax.set_ylabel('% of Domains', fontsize=12)
ax.set_title('Distribution of URLs per Domain by Dataset', fontsize=14, pad=20)

# Customize grid and spines
ax.grid(True, which='major', linestyle='-', alpha=0.2)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Add legend with smaller font
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10, 
         borderaxespad=0.)

plt.tight_layout()
plt.show()

# Cluster

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

dolma_sample = dataframes_dict['dolma'].sample(10000)

X = dolma_sample[['count']]

# Dictionary to store silhouette scores
silhouette_scores = {}

# Test n_clusters from 3 to 10
for n in range(3, 11):
    clustering = AgglomerativeClustering(n_clusters=n)
    labels = clustering.fit_predict(X)
    dolma_sample[f"cluster_{n}"] = labels
    
    # Calculate silhouette score
    if n > 1:  # Silhouette score requires at least 2 clusters
        score = silhouette_score(X, labels)
        silhouette_scores[n] = score

# Print the DataFrame and silhouette scores
print(dolma_sample.sample(10))
print("Silhouette Scores:", silhouette_scores)

In [None]:
from sklearn.cluster import AgglomerativeClustering

dolma_sample_20k = dataframes_dict['dolma'].sample(20000)

X = dolma_sample_20k[['count']]

# Choose the optimal number of clusters (replace `optimal_n` with your value)
optimal_n = 7

# Run the clustering
clustering = AgglomerativeClustering(n_clusters=optimal_n)
dolma_sample_20k['cluster'] = clustering.fit_predict(X)

# Calculate the range of values within each cluster
cluster_ranges = (
    dolma_sample_20k
    .groupby('cluster')['count']
    .agg(['min', 'max'])
    .rename(columns={'min': 'min_count', 'max': 'max_count'})
)

# Show the results
print("\nCluster Ranges:")
print(cluster_ranges)


# Common Crawl Sample

In [24]:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html

N_BUCKETS = 5
N_SAMPLES_PER_BUCKET = 100

# synthetic sample from parent data
df = dataframes_dict['allenai_c4_en']

# divide domains into buckets
bins = [0, 1, 10, 100, 1000, df['count'].max()+1]#float('inf')]
labels = ['one','small', 'medium', 'large', 'very_large']

#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html
df['bucket'] = pd.cut(df['count'], bins=bins, labels=labels)

print("Distribution:")
print(df.bucket.value_counts())

# print 5 sample domains from each bucket
for bucket in df['bucket'].unique():
    print(bucket)
    bucket_domains = df[df['bucket'] == bucket].sample(5)
    print(bucket_domains)
    print()

Distribution:
bucket
small         6416779
one           5063140
medium        3640376
large          515031
very_large      33547
Name: count, dtype: int64
small
                                   domain  count bucket
9076935           www.2dstageelectric.com    4.0  small
13859404              www.revealuxion.com    6.0  small
14564379                www.superwash.com    8.0  small
315816    alarminstallationcapetown.co.za    5.0  small
4116116                 irkarbeljaars.com    9.0  small

one
                             domain  count bucket
12244980              www.kisfm.org    1.0    one
2187248               dcampedup.com    1.0    one
7608488                   sstong.kr    1.0    one
5697117   nawbo.smallbusinesspr.com    1.0    one
14789555  www.theleverageagency.com    1.0    one

medium
                           domain  count  bucket
14514502  www.stripeystork.org.uk   32.0  medium
12573864     www.lwvprinceton.org   34.0  medium
12244920            www.kisco.com   50.0 

In [11]:
df[df.domain=='pizza.dominos.com']

Unnamed: 0,domain,count,bucket
6354079,pizza.dominos.com,11890.0,very_large
