In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import gc
import os

from collections import Counter

# Reading the whole dataset and loading it into memory

In [2]:
folder_path = 'dataset-2gb'
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of `df` (and therefore tie ordering in later counts) reproducible.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Read every chunk first and concatenate once. Calling pd.concat inside the
# loop copies all previously loaded rows again on each iteration.
chunk_frames = [pd.read_csv(os.path.join(folder_path, csv_file)) for csv_file in csv_files]

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when the folder holds no CSV files.
df = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame()
del chunk_frames  # release the per-file frames; only the combined frame is needed

len(df)

11666891

In [3]:
# Replace missing values with '' so every cell value is present before the
# per-value string handling applied to the language columns later on.
df = df.fillna('')

## Setting up variables

In [4]:
# Denominator of the progress line printed at the end of this cell.
# NOTE(review): presumably the number of chunks in the complete dataset - confirm.
TOTAL_CHUNKS = 80000


def split_language_tags(values):
    """Normalize raw language attribute strings and flatten them into one list of tags.

    Each value is lower-cased, stripped, has spaces removed and '_' replaced
    by '-', and is then split on ','. An empty value yields a single '' tag.

    Args:
        values: Iterable of strings (one per row).

    Returns:
        A flat list with one entry per tag occurrence.
    """
    tags = []
    for value in values:
        tags.extend(value.lower().strip().replace(' ', '').replace('_', '-').split(','))
    return tags


# Underscore-prefixed lists keep one entry per occurrence (used for counting);
# the unprefixed lists hold the distinct labels.
_domains = df['Domain'].tolist()
domains = list(set(_domains))

_content_languages = split_language_tags(df['Content_Language'].tolist())
content_languages = list(set(_content_languages))

_html_langs = split_language_tags(df['HTML_Language'].tolist())
html_langs = list(set(_html_langs))

_html_dirs = df['HTML_Dir'].tolist()
html_dirs = list(set(_html_dirs))

print(f'Domains: {len(domains)}')
print(f'Content Languages: {len(content_languages)}')
print(f'HTML Languages: {len(html_langs)}')
print(f'HTML Dirs: {len(html_dirs)}')
print(f'\nTotal size of labels: {len(domains) + len(content_languages) + len(html_langs) + len(html_dirs)}')
# Two decimals keep the share readable regardless of float representation noise.
print(f'Records analized: {len(df)} ({len(csv_files)}/{TOTAL_CHUNKS} chunks, {len(csv_files)/TOTAL_CHUNKS*100:.2f}%)')

Domains: 1913
Content Languages: 789
HTML Languages: 1567
HTML Dirs: 4

Total size of labels: 4273
Records analized: 11666891 (296/80000 chunks, 0.37%)


In [5]:
# Frequency tables ordered from the most to the least frequent label.
# most_common() sorts by count in descending order and keeps first-seen order
# for equal counts; dict() preserves that ordering.
dict_domains = dict(Counter(_domains).most_common())
dict_content_languages = dict(Counter(_content_languages).most_common())
dict_html_langs = dict(Counter(_html_langs).most_common())
dict_html_dirs = dict(Counter(_html_dirs).most_common())

## Helper function to show metadata with exclusion

In [6]:
def show_metadata(data, exclude=None, total=None):
    """Print a tab-separated frequency table with each label's percentage share.

    Args:
        data: Mapping of label -> count; rows are printed in iteration order.
        exclude: A single label (str) or an iterable of labels to leave out of
            the table. Excluded labels are summarized on a leading "Exclude:"
            line and their counts are subtracted from the denominator.
            ``None`` or an empty iterable excludes nothing. Labels that are
            not present in ``data`` are treated as having a count of 0.
        total: Denominator used for the percentages before exclusion.
            Defaults to ``len(df)``, the notebook's global DataFrame.

    Returns:
        None. The table is written to standard output.
    """
    # Normalize `exclude` to a list; a None default avoids a shared mutable
    # default argument and makes an explicit exclude=None safe.
    if exclude is None:
        exclude = []
    elif isinstance(exclude, str):
        exclude = [exclude]
    else:
        exclude = list(exclude)

    if total is None:
        total = len(df)

    def percent(count, whole):
        # A zero denominator (empty dataset or everything excluded) reports 0%.
        return count / whole * 100 if whole else 0.0

    if exclude:
        exclusion = ', '.join(
            f'"{ex}" ({data.get(ex, 0)}, {percent(data.get(ex, 0), total):.2f}%)'
            for ex in exclude
        )
        print(f'Exclude: {exclusion}\n')

    # Shares of the remaining labels are relative to what is left after
    # removing the excluded counts.
    remaining = total - sum(data.get(ex, 0) for ex in exclude)
    for label, count in data.items():
        if label in exclude:
            continue
        print(f'{label}\t{count}\t({percent(count, remaining):.2f}%)')

## Domains

In [7]:
# Each row contributes exactly one Domain value, so these shares are fractions of all rows.
show_metadata(dict_domains)

com	5556023	(47.62%)
org	721456	(6.18%)
ru	508301	(4.36%)
de	507854	(4.35%)
net	420029	(3.60%)
fr	225891	(1.94%)
it	224770	(1.93%)
nl	199599	(1.71%)
pl	187134	(1.60%)
jp	176847	(1.52%)
edu	160917	(1.38%)
cz	124781	(1.07%)
es	115281	(0.99%)
ca	110402	(0.95%)
cn	86986	(0.75%)
info	84147	(0.72%)
eu	83389	(0.71%)
ch	73207	(0.63%)
se	71783	(0.62%)
be	71745	(0.61%)
hu	66153	(0.57%)
ro	65314	(0.56%)
vn	61548	(0.53%)
in	60718	(0.52%)
gr	57284	(0.49%)
at	55965	(0.48%)
gov	53318	(0.46%)
dk	50828	(0.44%)
co	49349	(0.42%)
ua	45864	(0.39%)
fi	45585	(0.39%)
uk	44269	(0.38%)
sk	42982	(0.37%)
ir	40926	(0.35%)
no	40465	(0.35%)
kr	37726	(0.32%)
br	36376	(0.31%)
ar	35287	(0.30%)
id	30519	(0.26%)
pt	29861	(0.26%)
au	27254	(0.23%)
us	25884	(0.22%)
me	25699	(0.22%)
io	24033	(0.21%)
ie	23081	(0.20%)
cl	22545	(0.19%)
tv	21934	(0.19%)
by	19759	(0.17%)
si	19481	(0.17%)
blog	19130	(0.16%)
xyz	18885	(0.16%)
biz	18769	(0.16%)
lt	18163	(0.16%)
tw	18146	(0.16%)
hr	17223	(0.15%)
ee	17100	(0.15%)
rs	16854	(0.14%)
mx	1

## Domains excluding 'com'

In [8]:
# Leave 'com' out of the table; the remaining shares are relative to the rows that are not 'com'.
show_metadata(dict_domains, exclude='com')

Exclude: "com" (5556023, 47.62%)

org	721456	(11.81%)
ru	508301	(8.32%)
de	507854	(8.31%)
net	420029	(6.87%)
fr	225891	(3.70%)
it	224770	(3.68%)
nl	199599	(3.27%)
pl	187134	(3.06%)
jp	176847	(2.89%)
edu	160917	(2.63%)
cz	124781	(2.04%)
es	115281	(1.89%)
ca	110402	(1.81%)
cn	86986	(1.42%)
info	84147	(1.38%)
eu	83389	(1.36%)
ch	73207	(1.20%)
se	71783	(1.17%)
be	71745	(1.17%)
hu	66153	(1.08%)
ro	65314	(1.07%)
vn	61548	(1.01%)
in	60718	(0.99%)
gr	57284	(0.94%)
at	55965	(0.92%)
gov	53318	(0.87%)
dk	50828	(0.83%)
co	49349	(0.81%)
ua	45864	(0.75%)
fi	45585	(0.75%)
uk	44269	(0.72%)
sk	42982	(0.70%)
ir	40926	(0.67%)
no	40465	(0.66%)
kr	37726	(0.62%)
br	36376	(0.60%)
ar	35287	(0.58%)
id	30519	(0.50%)
pt	29861	(0.49%)
au	27254	(0.45%)
us	25884	(0.42%)
me	25699	(0.42%)
io	24033	(0.39%)
ie	23081	(0.38%)
cl	22545	(0.37%)
tv	21934	(0.36%)
by	19759	(0.32%)
si	19481	(0.32%)
blog	19130	(0.31%)
xyz	18885	(0.31%)
biz	18769	(0.31%)
lt	18163	(0.30%)
tw	18146	(0.30%)
hr	17223	(0.28%)
ee	17100	(0.28%)
rs	1685

## Content Languages

In [9]:
# Counts are per comma-separated tag while the denominator is the row count,
# so a row that lists several languages is counted once for each of its tags.
show_metadata(dict_content_languages)

	10467437	(89.72%)
en	480097	(4.12%)
en-us	303361	(2.60%)
de	68033	(0.58%)
fr	50771	(0.44%)
es	28463	(0.24%)
ru	23626	(0.20%)
it	18309	(0.16%)
nl	15781	(0.14%)
en-gb	14712	(0.13%)
ja	12043	(0.10%)
zh-cn	10960	(0.09%)
cs	7394	(0.06%)
pl	7256	(0.06%)
de-de	7128	(0.06%)
pt-br	5659	(0.05%)
da	5017	(0.04%)
fi	4533	(0.04%)
hu	4444	(0.04%)
sv	4110	(0.04%)
ko	3978	(0.03%)
el	3906	(0.03%)
ar	3839	(0.03%)
zh-tw	3689	(0.03%)
fr-fr	3378	(0.03%)
vi	2918	(0.03%)
ca	2883	(0.02%)
pt	2872	(0.02%)
fa	2804	(0.02%)
ro	2757	(0.02%)
es-es	2710	(0.02%)
sk	2448	(0.02%)
tr	2287	(0.02%)
nb	2274	(0.02%)
nl-nl	2237	(0.02%)
pt-pt	2136	(0.02%)
it-it	2103	(0.02%)
sl	2033	(0.02%)
uk	2014	(0.02%)
zh	1825	(0.02%)
th	1651	(0.01%)
he	1480	(0.01%)
no	1473	(0.01%)
sr	1431	(0.01%)
ja-jp	1427	(0.01%)
bg	1370	(0.01%)
de-ch	1362	(0.01%)
et	1338	(0.01%)
da-dk	1230	(0.01%)
en-ca	1222	(0.01%)
hr	1186	(0.01%)
lt	1148	(0.01%)
lv	1146	(0.01%)
sv-se	1109	(0.01%)
id	1101	(0.01%)
zh-hans	1000	(0.01%)
pl-pl	875	(0.01%)
zh-hant	871	(0.01

## Content Languages excluding ''

In [10]:
# '' collects the rows whose value was missing (NaN was replaced by '' before
# counting) as well as any empty tags produced by splitting on ','.
show_metadata(dict_content_languages, exclude='')

Exclude: "" (10467437, 89.72%)

en	480097	(40.03%)
en-us	303361	(25.29%)
de	68033	(5.67%)
fr	50771	(4.23%)
es	28463	(2.37%)
ru	23626	(1.97%)
it	18309	(1.53%)
nl	15781	(1.32%)
en-gb	14712	(1.23%)
ja	12043	(1.00%)
zh-cn	10960	(0.91%)
cs	7394	(0.62%)
pl	7256	(0.60%)
de-de	7128	(0.59%)
pt-br	5659	(0.47%)
da	5017	(0.42%)
fi	4533	(0.38%)
hu	4444	(0.37%)
sv	4110	(0.34%)
ko	3978	(0.33%)
el	3906	(0.33%)
ar	3839	(0.32%)
zh-tw	3689	(0.31%)
fr-fr	3378	(0.28%)
vi	2918	(0.24%)
ca	2883	(0.24%)
pt	2872	(0.24%)
fa	2804	(0.23%)
ro	2757	(0.23%)
es-es	2710	(0.23%)
sk	2448	(0.20%)
tr	2287	(0.19%)
nb	2274	(0.19%)
nl-nl	2237	(0.19%)
pt-pt	2136	(0.18%)
it-it	2103	(0.18%)
sl	2033	(0.17%)
uk	2014	(0.17%)
zh	1825	(0.15%)
th	1651	(0.14%)
he	1480	(0.12%)
no	1473	(0.12%)
sr	1431	(0.12%)
ja-jp	1427	(0.12%)
bg	1370	(0.11%)
de-ch	1362	(0.11%)
et	1338	(0.11%)
da-dk	1230	(0.10%)
en-ca	1222	(0.10%)
hr	1186	(0.10%)
lt	1148	(0.10%)
lv	1146	(0.10%)
sv-se	1109	(0.09%)
id	1101	(0.09%)
zh-hans	1000	(0.08%)
pl-pl	875	(0.07%)
zh

## Content Languages also excluding 'en' and 'en-us'

In [15]:
# Additionally leave out 'en' and 'en-us'; shares are then relative to what
# remains after subtracting all three excluded counts from the row count.
show_metadata(dict_content_languages, exclude=['', 'en', 'en-us'])

Exclude: "" (10467437, 89.72%), "en" (480097, 4.12%), "en-us" (303361, 2.60%)

de	68033	(16.35%)
fr	50771	(12.20%)
es	28463	(6.84%)
ru	23626	(5.68%)
it	18309	(4.40%)
nl	15781	(3.79%)
en-gb	14712	(3.54%)
ja	12043	(2.89%)
zh-cn	10960	(2.63%)
cs	7394	(1.78%)
pl	7256	(1.74%)
de-de	7128	(1.71%)
pt-br	5659	(1.36%)
da	5017	(1.21%)
fi	4533	(1.09%)
hu	4444	(1.07%)
sv	4110	(0.99%)
ko	3978	(0.96%)
el	3906	(0.94%)
ar	3839	(0.92%)
zh-tw	3689	(0.89%)
fr-fr	3378	(0.81%)
vi	2918	(0.70%)
ca	2883	(0.69%)
pt	2872	(0.69%)
fa	2804	(0.67%)
ro	2757	(0.66%)
es-es	2710	(0.65%)
sk	2448	(0.59%)
tr	2287	(0.55%)
nb	2274	(0.55%)
nl-nl	2237	(0.54%)
pt-pt	2136	(0.51%)
it-it	2103	(0.51%)
sl	2033	(0.49%)
uk	2014	(0.48%)
zh	1825	(0.44%)
th	1651	(0.40%)
he	1480	(0.36%)
no	1473	(0.35%)
sr	1431	(0.34%)
ja-jp	1427	(0.34%)
bg	1370	(0.33%)
de-ch	1362	(0.33%)
et	1338	(0.32%)
da-dk	1230	(0.30%)
en-ca	1222	(0.29%)
hr	1186	(0.29%)
lt	1148	(0.28%)
lv	1146	(0.28%)
sv-se	1109	(0.27%)
id	1101	(0.26%)
zh-hans	1000	(0.24%)
pl-pl	875	(0

## HTML Languages

In [20]:
# Counts are per comma-separated tag while the denominator is the row count,
# so a row that lists several languages is counted once for each of its tags.
show_metadata(dict_html_langs)

en	2431896	(20.84%)
	2341553	(20.07%)
en-us	1867976	(16.01%)
ja	446256	(3.82%)
es	357392	(3.06%)
ru	318844	(2.73%)
de-de	317348	(2.72%)
de	296153	(2.54%)
en-gb	244499	(2.10%)
fr	241745	(2.07%)
fr-fr	236809	(2.03%)
ru-ru	200706	(1.72%)
nl	184533	(1.58%)
it-it	176497	(1.51%)
zh-cn	129874	(1.11%)
it	90485	(0.78%)
pl-pl	87035	(0.75%)
cs	85978	(0.74%)
vi	76816	(0.66%)
tr	75594	(0.65%)
pl	72763	(0.62%)
es-es	61614	(0.53%)
pt-br	54233	(0.46%)
ko	53067	(0.45%)
ar	52626	(0.45%)
hu	51267	(0.44%)
fa-ir	50064	(0.43%)
el	41460	(0.36%)
sv-se	38935	(0.33%)
fi	35946	(0.31%)
sv	31946	(0.27%)
zh-tw	29413	(0.25%)
id-id	27649	(0.24%)
uk	26861	(0.23%)
pt	26664	(0.23%)
da-dk	25523	(0.22%)
th	25246	(0.22%)
ro-ro	25115	(0.22%)
nl-nl	24879	(0.21%)
en-ca	24378	(0.21%)
ro	24172	(0.21%)
zh	24070	(0.21%)
pt-pt	22242	(0.19%)
sk	21849	(0.19%)
en-au	18753	(0.16%)
da	18678	(0.16%)
ca	17540	(0.15%)
fa	17211	(0.15%)
sk-sk	15692	(0.13%)
hr	14661	(0.13%)
nb-no	14588	(0.13%)
cs-cz	14492	(0.12%)
es-mx	14031	(0.12%)
bg-bg	13

## HTML Languages excluding '', 'en', 'en-us'

In [12]:
# Leave out the empty tag and the 'en' / 'en-us' tags; shares are relative to
# what remains after subtracting all three excluded counts from the row count.
show_metadata(dict_html_langs, exclude=['', 'en', 'en-us'])

Exclude: "" (2341553, 20.07%), "en" (2431896, 20.84%), "en-us" (1867976, 16.01%)

ja	446256	(8.88%)
es	357392	(7.11%)
ru	318844	(6.34%)
de-de	317348	(6.31%)
de	296153	(5.89%)
en-gb	244499	(4.87%)
fr	241745	(4.81%)
fr-fr	236809	(4.71%)
ru-ru	200706	(3.99%)
nl	184533	(3.67%)
it-it	176497	(3.51%)
zh-cn	129874	(2.58%)
it	90485	(1.80%)
pl-pl	87035	(1.73%)
cs	85978	(1.71%)
vi	76816	(1.53%)
tr	75594	(1.50%)
pl	72763	(1.45%)
es-es	61614	(1.23%)
pt-br	54233	(1.08%)
ko	53067	(1.06%)
ar	52626	(1.05%)
hu	51267	(1.02%)
fa-ir	50064	(1.00%)
el	41460	(0.82%)
sv-se	38935	(0.77%)
fi	35946	(0.72%)
sv	31946	(0.64%)
zh-tw	29413	(0.59%)
id-id	27649	(0.55%)
uk	26861	(0.53%)
pt	26664	(0.53%)
da-dk	25523	(0.51%)
th	25246	(0.50%)
ro-ro	25115	(0.50%)
nl-nl	24879	(0.50%)
en-ca	24378	(0.49%)
ro	24172	(0.48%)
zh	24070	(0.48%)
pt-pt	22242	(0.44%)
sk	21849	(0.43%)
en-au	18753	(0.37%)
da	18678	(0.37%)
ca	17540	(0.35%)
fa	17211	(0.34%)
sk-sk	15692	(0.31%)
hr	14661	(0.29%)
nb-no	14588	(0.29%)
cs-cz	14492	(0.29%)
es-mx	1

## HTML Dirs

In [13]:
# One HTML_Dir value per row; '' is the value missing entries were filled with.
show_metadata(dict_html_dirs)

	10253755	(87.89%)
ltr	1288194	(11.04%)
rtl	118756	(1.02%)
auto	6186	(0.05%)


## HTML Dirs excluding ''

In [14]:
# Shares among the rows whose HTML_Dir value is not ''.
show_metadata(dict_html_dirs, exclude='')

Exclude: "" (10253755, 87.89%)

ltr	1288194	(91.16%)
rtl	118756	(8.40%)
auto	6186	(0.44%)
