In [7]:
import gc
import os

import matplotlib.pyplot as plt
import pandas as pd

from collections import Counter
from itertools import islice
from tqdm import tqdm

In [11]:
def get_stats(df):
    """Extract the four metadata columns of one CSV chunk as plain lists.

    Missing values are replaced by the empty string. Unlike the previous
    version this works on filled copies of the individual columns, so the
    caller's DataFrame is left untouched.

    Args:
        df: DataFrame with the columns 'Domain', 'Content_Language',
            'HTML_Language' and 'HTML_Dir'.

    Returns:
        A tuple ``(domains, content_languages, html_langs, html_dirs)``.
        ``domains`` and ``html_dirs`` hold one entry per row. The two language
        lists hold one entry per comma-separated tag, so they can be longer
        than the number of rows.
    """
    domains = df['Domain'].fillna('').tolist()
    content_languages = _split_language_tags(df['Content_Language'].fillna('').tolist())
    html_langs = _split_language_tags(df['HTML_Language'].fillna('').tolist())
    html_dirs = df['HTML_Dir'].fillna('').tolist()

    return domains, content_languages, html_langs, html_dirs


def _split_language_tags(values):
    """Normalise raw language strings and flatten them into individual tags.

    Each value is lower-cased, has all spaces removed and '_' replaced by '-',
    and is then split on commas: 'en_US, fr' becomes ['en-us', 'fr'].
    An empty string yields a single '' tag.
    """
    tags = []
    for value in values:
        normalised = value.lower().strip().replace(' ', '').replace('_', '-')
        tags.extend(normalised.split(','))
    return tags

In [15]:
folder_path = 'dest'
# os.listdir() returns entries in arbitrary order. Sorting makes the processing
# order reproducible, and with it the order of equally frequent values in the
# ranked dicts built below.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))


def _rank_by_count(values):
    """Return a ``{value: count}`` dict ordered from most to least frequent.

    Values with equal counts keep the order in which they were first seen.
    """
    return dict(Counter(values).most_common())


# The raw per-row lists are kept (rather than counting per file) because later
# cells read len(total_domains) and delete these names explicitly.
total_domains = []
total_content_languages = []
total_html_langs = []
total_html_dirs = []

for csv_file in tqdm(csv_files, desc='Processing CSV files', unit='file'):
    file_path = os.path.join(folder_path, csv_file)
    df = pd.read_csv(file_path)
    domains, content_languages, html_langs, html_dirs = get_stats(df)
    total_domains.extend(domains)
    total_content_languages.extend(content_languages)
    total_html_langs.extend(html_langs)
    total_html_dirs.extend(html_dirs)

dict_domains = _rank_by_count(total_domains)
dict_content_languages = _rank_by_count(total_content_languages)
dict_html_langs = _rank_by_count(total_html_langs)
dict_html_dirs = _rank_by_count(total_html_dirs)

Processing CSV files: 100%|██████████████████████████████████████████████████████| 3013/3013 [08:43<00:00,  5.75file/s]


In [52]:
# Number of rows read across all CSV files (total_domains holds one entry per
# row). The show_metadata* helpers read this global as the percentage
# denominator, so it must be computed before the total_* lists are deleted.
length = len(total_domains)

In [53]:
# The ranked dict_* mappings and `length` are all that later cells use; the raw
# per-row lists are dropped here, presumably to release memory.
del total_domains, total_content_languages, total_html_langs, total_html_dirs

## Helper functions to show metadata, either with exclusions or restricted to specified values

In [59]:
def show_metadata(data, exclude=None, show=None, above=0, total=None):
    """Print ``value<TAB>count<TAB>(share%)`` rows for a value -> count mapping.

    Args:
        data: Mapping of value -> count. Rows are printed in the mapping's own
            iteration order.
        exclude: A single key, an iterable of keys, or None. Excluded keys are
            summarised on a leading ``Exclude:`` line (with their share of
            ``total``), left out of the listing, and their counts are
            subtracted from the denominator used for the remaining rows.
            Keys that are not present in ``data`` count as 0.
        show: Maximum number of non-excluded rows to consider; None means no
            limit. The limit is applied after exclusions, so excluded keys do
            not use it up.
        above: Minimum share, as a fraction (``.01`` means 1%), that a row
            needs in order to be printed.
        total: Denominator for the shares. When None, the notebook-level
            global ``length`` is used.
    """
    if total is None:
        total = length  # notebook-level global defined in an earlier cell

    if exclude is None:
        excluded = []
    elif isinstance(exclude, str):
        excluded = [exclude]
    else:
        # Drop duplicates (keeping order) so that a repeated key is neither
        # reported nor subtracted twice.
        excluded = list(dict.fromkeys(exclude))

    def share(count, denominator):
        # A zero denominator would otherwise raise ZeroDivisionError.
        return count / denominator if denominator else 0.0

    if excluded:
        summary = ', '.join(
            f'"{key}" ({data.get(key, 0)}, {(share(data.get(key, 0), total)*100):.2f}%)'
            for key in excluded
        )
        print(f'Exclude: {summary}\n')

    excluded_keys = set(excluded)
    remaining = total - sum(data.get(key, 0) for key in excluded)

    rows = ((key, count) for key, count in data.items() if key not in excluded_keys)
    if show is not None:
        rows = islice(rows, show)

    for key, count in rows:
        fraction = share(count, remaining)
        if fraction < above:
            continue
        print(f'{key}\t{count}\t({(fraction*100):.2f}%)')

In [72]:
def show_metadata_only(data, *args, total=None):
    """Print ``value<TAB>count<TAB>(share%)`` only for the requested keys.

    Rows appear in the iteration order of ``data``, not in the order in which
    the keys were passed. Requested keys that are absent from ``data`` are
    silently skipped.

    Args:
        data: Mapping of value -> count.
        *args: The keys to show.
        total: Keyword-only denominator for the shares. When None, the
            notebook-level global ``length`` is used.
    """
    if total is None:
        total = length  # notebook-level global defined in an earlier cell

    for key, count in data.items():
        if key not in args:
            continue
        # Guard against a zero denominator instead of raising ZeroDivisionError.
        percent = count / total * 100 if total else 0.0
        print(f'{key}\t{count}\t({percent:.2f}%)')

## Domains

In [73]:
# List only the Domain values whose share of all rows is at least 1%.
show_metadata(dict_domains, above=.01)

com	56219138	(47.70%)
org	7258788	(6.16%)
de	5158771	(4.38%)
ru	5066486	(4.30%)
net	4314619	(3.66%)
fr	2255271	(1.91%)
it	2249346	(1.91%)
nl	2059938	(1.75%)
pl	1933734	(1.64%)
jp	1766756	(1.50%)
edu	1679683	(1.43%)
cz	1278170	(1.08%)


## Domains excluding 'com'

In [74]:
# Same 1% threshold, but 'com' is removed from the listing and its count is
# subtracted from the denominator, so shares are relative to the non-'com' rows.
show_metadata(dict_domains, exclude='com', above=.01)

Exclude: "com" (56219138, 47.70%)

org	7258788	(11.78%)
de	5158771	(8.37%)
ru	5066486	(8.22%)
net	4314619	(7.00%)
fr	2255271	(3.66%)
it	2249346	(3.65%)
nl	2059938	(3.34%)
pl	1933734	(3.14%)
jp	1766756	(2.87%)
edu	1679683	(2.73%)
cz	1278170	(2.07%)
es	1159578	(1.88%)
ca	1072655	(1.74%)
cn	872561	(1.42%)
info	856256	(1.39%)
eu	850337	(1.38%)
ch	750286	(1.22%)
se	731373	(1.19%)
be	693294	(1.12%)
hu	655533	(1.06%)
ro	650853	(1.06%)
in	630180	(1.02%)
gr	624943	(1.01%)


## Content Languages

In [75]:
# Content-Language tags with a share of at least 1%. The '' key stands for rows
# where the value was missing (get_stats fills missing values with '').
show_metadata(dict_content_languages, above=.01)

	105570646	(89.58%)
en	5011794	(4.25%)
en-us	3036416	(2.58%)


## Content Languages without a threshold ('' is not excluded here)

In [76]:
# No `exclude` and no `above` threshold are passed, so the empty value '' is
# part of the listing and of the denominator.
show_metadata(dict_content_languages)

	105570646	(89.58%)
en	5011794	(4.25%)
en-us	3036416	(2.58%)
de	703220	(0.60%)
fr	504827	(0.43%)
es	312686	(0.27%)
ru	231258	(0.20%)
it	188411	(0.16%)
nl	166804	(0.14%)
en-gb	149038	(0.13%)
zh-cn	127865	(0.11%)
ja	117963	(0.10%)
cs	73201	(0.06%)
pl	72879	(0.06%)
de-de	65394	(0.06%)
pt-br	53442	(0.05%)
da	50067	(0.04%)
ar	45560	(0.04%)
sv	45134	(0.04%)
hu	42543	(0.04%)
fi	42318	(0.04%)
el	42019	(0.04%)
ko	40503	(0.03%)
fr-fr	40488	(0.03%)
zh-tw	37834	(0.03%)
fa	30576	(0.03%)
vi	29933	(0.03%)
ca	29677	(0.03%)
es-es	28803	(0.02%)
pt	27465	(0.02%)
sk	26766	(0.02%)
uk	24599	(0.02%)
ro	23098	(0.02%)
nb	22594	(0.02%)
pt-pt	22438	(0.02%)
tr	22230	(0.02%)
it-it	21903	(0.02%)
ja-jp	18532	(0.02%)
en-ca	17067	(0.01%)
sl	16912	(0.01%)
zh	16312	(0.01%)
nl-nl	16176	(0.01%)
hr	14689	(0.01%)
th	14260	(0.01%)
da-dk	14233	(0.01%)
bg	14104	(0.01%)
he	13819	(0.01%)
id	13744	(0.01%)
lt	12884	(0.01%)
et	12747	(0.01%)
no	12359	(0.01%)
sr	12250	(0.01%)
de-ch	11911	(0.01%)
sv-se	11877	(0.01%)
und	11131	(0.01%)


## Content Languages excluding '', 'en', 'en-us'

In [77]:
# Drop the missing value '' and the two dominant English tags; the remaining
# tags are shown when they reach 1% of what is left after the exclusion.
show_metadata(dict_content_languages, exclude=['', 'en', 'en-us'], above=.01)

Exclude: "" (105570646, 89.58%), "en" (5011794, 4.25%), "en-us" (3036416, 2.58%)

de	703220	(16.62%)
fr	504827	(11.93%)
es	312686	(7.39%)
ru	231258	(5.47%)
it	188411	(4.45%)
nl	166804	(3.94%)
en-gb	149038	(3.52%)
zh-cn	127865	(3.02%)
ja	117963	(2.79%)
cs	73201	(1.73%)
pl	72879	(1.72%)
de-de	65394	(1.55%)
pt-br	53442	(1.26%)
da	50067	(1.18%)
ar	45560	(1.08%)
sv	45134	(1.07%)
hu	42543	(1.01%)
fi	42318	(1.00%)


## HTML Languages

In [87]:
# HTML lang tags with a share of at least 0.05% (above=.0005 is a fraction).
show_metadata(dict_html_langs, above=.0005)

	25092011	(21.29%)
en	24578987	(20.86%)
en-us	18288050	(15.52%)
ja	4481436	(3.80%)
es	3564853	(3.02%)
ru	3206506	(2.72%)
de-de	3165381	(2.69%)
de	2957113	(2.51%)
fr	2423999	(2.06%)
en-gb	2364529	(2.01%)
fr-fr	2352993	(2.00%)
ru-ru	1983318	(1.68%)
nl	1876985	(1.59%)
it-it	1752889	(1.49%)
zh-cn	1221536	(1.04%)
cs	930701	(0.79%)
it	911580	(0.77%)
pl-pl	869587	(0.74%)
pl	776726	(0.66%)
vi	765190	(0.65%)
tr	734899	(0.62%)
es-es	607663	(0.52%)
ko	552580	(0.47%)
pt-br	541880	(0.46%)
ar	530938	(0.45%)
hu	499287	(0.42%)
fa-ir	470210	(0.40%)
el	436831	(0.37%)
sv-se	379123	(0.32%)
fi	355725	(0.30%)
sv	320037	(0.27%)
zh-tw	287437	(0.24%)
id-id	265768	(0.23%)
nl-nl	261403	(0.22%)
pt	260267	(0.22%)
zh	253403	(0.22%)
ro	244242	(0.21%)
en-ca	236491	(0.20%)
th	235147	(0.20%)
ro-ro	234282	(0.20%)
da-dk	227634	(0.19%)
uk	220008	(0.19%)
pt-pt	217474	(0.18%)
sk	215240	(0.18%)
da	191062	(0.16%)
ca	182544	(0.15%)
en-au	181780	(0.15%)
fa	161883	(0.14%)
cs-cz	151425	(0.13%)
nb-no	148788	(0.13%)
sk-sk	148233	(0

In [88]:
# Number of distinct normalised HTML lang tags (including '' for missing).
len(dict_html_langs)

6116

## HTML Languages excluding '', 'en', 'en-us'

In [89]:
# Same 0.05% threshold, with '', 'en' and 'en-us' removed from the listing and
# from the denominator.
show_metadata(dict_html_langs, exclude=['', 'en', 'en-us'], above=.0005)

Exclude: "" (25092011, 21.29%), "en" (24578987, 20.86%), "en-us" (18288050, 15.52%)

ja	4481436	(8.98%)
es	3564853	(7.15%)
ru	3206506	(6.43%)
de-de	3165381	(6.34%)
de	2957113	(5.93%)
fr	2423999	(4.86%)
en-gb	2364529	(4.74%)
fr-fr	2352993	(4.72%)
ru-ru	1983318	(3.98%)
nl	1876985	(3.76%)
it-it	1752889	(3.51%)
zh-cn	1221536	(2.45%)
cs	930701	(1.87%)
it	911580	(1.83%)
pl-pl	869587	(1.74%)
pl	776726	(1.56%)
vi	765190	(1.53%)
tr	734899	(1.47%)
es-es	607663	(1.22%)
ko	552580	(1.11%)
pt-br	541880	(1.09%)
ar	530938	(1.06%)
hu	499287	(1.00%)
fa-ir	470210	(0.94%)
el	436831	(0.88%)
sv-se	379123	(0.76%)
fi	355725	(0.71%)
sv	320037	(0.64%)
zh-tw	287437	(0.58%)
id-id	265768	(0.53%)
nl-nl	261403	(0.52%)
pt	260267	(0.52%)
zh	253403	(0.51%)
ro	244242	(0.49%)
en-ca	236491	(0.47%)
th	235147	(0.47%)
ro-ro	234282	(0.47%)
da-dk	227634	(0.46%)
uk	220008	(0.44%)
pt-pt	217474	(0.44%)
sk	215240	(0.43%)
da	191062	(0.38%)
ca	182544	(0.37%)
en-au	181780	(0.36%)
fa	161883	(0.32%)
cs-cz	151425	(0.30%)
nb-no	148788	(0

## Spanish Dialects in Content Language and HTML Language

In [81]:
# Regional Spanish language tags (es-<country code>), written in lower case with
# a hyphen so they match the tags as normalised by get_stats.
spanish_dialects = [
    'es-cl', # Chile
    'es-ar', # Argentina
    'es-es', # Spain
    'es-mx', # Mexico
    'es-co', # Colombia
    'es-pe', # Peru
    'es-ve', # Venezuela
    'es-ec', # Ecuador
    'es-gt', # Guatemala
    'es-cu', # Cuba
    'es-do', # Dominican Republic
    'es-bo', # Bolivia
    'es-py', # Paraguay
    'es-uy', # Uruguay
    'es-pa', # Panama
    'es-cr', # Costa Rica
    'es-ni', # Nicaragua
    'es-hn', # Honduras
    'es-sv', # El Salvador
    'es-pr', # Puerto Rico
    'es-us', # United States
]

In [82]:
# Counts of the regional Spanish tags among the HTML lang values; percentages
# are relative to the global `length`.
show_metadata_only(dict_html_langs, *spanish_dialects)

es-es	607663	(0.52%)
es-mx	130185	(0.11%)
es-ar	81313	(0.07%)
es-cl	44751	(0.04%)
es-co	39110	(0.03%)
es-pe	16953	(0.01%)
es-us	14508	(0.01%)
es-cr	5824	(0.00%)
es-ec	3777	(0.00%)
es-uy	3529	(0.00%)
es-ve	3163	(0.00%)
es-do	2619	(0.00%)
es-gt	2375	(0.00%)
es-hn	1862	(0.00%)
es-py	1262	(0.00%)
es-bo	1083	(0.00%)
es-pa	929	(0.00%)
es-pr	695	(0.00%)
es-sv	587	(0.00%)
es-ni	318	(0.00%)
es-cu	34	(0.00%)


In [83]:
# Counts of the regional Spanish tags among the Content-Language values;
# percentages are relative to the global `length`.
show_metadata_only(dict_content_languages, *spanish_dialects)

es-es	28803	(0.02%)
es-mx	4522	(0.00%)
es-us	4185	(0.00%)
es-co	1993	(0.00%)
es-cl	1928	(0.00%)
es-pe	1845	(0.00%)
es-ar	1637	(0.00%)
es-uy	1205	(0.00%)
es-ec	590	(0.00%)
es-ve	573	(0.00%)
es-py	504	(0.00%)
es-bo	463	(0.00%)
es-pa	449	(0.00%)
es-gt	433	(0.00%)
es-cr	407	(0.00%)
es-do	385	(0.00%)
es-hn	381	(0.00%)
es-ni	360	(0.00%)
es-sv	354	(0.00%)
es-pr	344	(0.00%)
es-cu	4	(0.00%)


## HTML Dirs

In [84]:
# All HTML dir values, with no threshold; '' stands for rows without a value.
show_metadata(dict_html_dirs)

	103822107	(88.10%)
ltr	12824642	(10.88%)
rtl	1141922	(0.97%)
auto	61769	(0.05%)


## HTML Dirs excluding ''

In [85]:
# Shares among rows that have a dir value: '' is excluded from the listing and
# from the denominator.
show_metadata(dict_html_dirs, exclude='')

Exclude: "" (103822107, 88.10%)

ltr	12824642	(91.42%)
rtl	1141922	(8.14%)
auto	61769	(0.44%)
