In [33]:
import matplotlib.pyplot as plt
import pandas as pd
import gc
import os

from collections import Counter
from itertools import islice

# Reading the whole dataset and loading it into memory

In [22]:
folder_path = 'dataset-2gb'
# Sort the file names: os.listdir() returns entries in arbitrary order, and a
# stable order keeps the row order (and every tie-break that depends on it)
# the same from one run to the next.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Read each chunk once and concatenate a single time. Calling pd.concat inside
# the loop copies the ever-growing frame on every iteration.
frames = [pd.read_csv(os.path.join(folder_path, csv_file)) for csv_file in csv_files]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder holds no CSV files.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
del frames  # the per-file frames are no longer needed once merged

len(df)

11666891

In [23]:
# Replace missing values with empty strings, in place, so the string methods
# used on the label columns further down do not hit NaN.
df.fillna(value='', inplace=True)

## Setting up variables

In [24]:
def _split_language_tags(raw_values):
    """Flatten comma-separated language values into a list of normalised tags.

    Each value is lower-cased, stripped, has its spaces removed and its
    underscores replaced by hyphens, and is then split on commas. Every
    resulting piece is kept, duplicates and empty strings included.
    """
    tags = []
    for raw_value in raw_values:
        cleaned = raw_value.lower().strip().replace(' ', '').replace('_', '-')
        tags.extend(cleaned.split(','))
    return tags


# Underscore-prefixed names keep every occurrence (used for counting later);
# the plain names hold the distinct labels only.
_domains = df['Domain'].tolist()
domains = list(set(_domains))

_content_languages = _split_language_tags(df['Content_Language'].tolist())
content_languages = list(set(_content_languages))

_html_langs = _split_language_tags(df['HTML_Language'].tolist())
html_langs = list(set(_html_langs))

_html_dirs = df['HTML_Dir'].tolist()
html_dirs = list(set(_html_dirs))

print(f'Domains: {len(domains)}')
print(f'Content Languages: {len(content_languages)}')
print(f'HTML Languages: {len(html_langs)}')
print(f'HTML Dirs: {len(html_dirs)}')
print(f'\nTotal size of labels: {sum(len(labels) for labels in (domains, content_languages, html_langs, html_dirs))}')
print(f'Records analized: {len(df)} ({len(csv_files)}/80000 chunks, {len(csv_files)/80000*100}%)')

Domains: 1913
Content Languages: 789
HTML Languages: 1567
HTML Dirs: 4

Total size of labels: 4273
Records analized: 11666891 (296/80000 chunks, 0.37%)


In [25]:
# Frequency table per label kind, ordered from most to least frequent.
# most_common() with no argument sorts by count, descending, and keeps
# first-seen order between labels that share the same count.
dict_domains = dict(Counter(_domains).most_common())
dict_content_languages = dict(Counter(_content_languages).most_common())
dict_html_langs = dict(Counter(_html_langs).most_common())
dict_html_dirs = dict(Counter(_html_dirs).most_common())

## Helper function to show metadata with exclusion

In [37]:
def show_metadata(data, exclude=None, show=None, above=0, total=None):
    """Print a label frequency table as tab-separated ``label  count  (share%)`` rows.

    Args:
        data: Mapping of label -> count. Rows are printed in the mapping's
            iteration order.
        exclude: A single label (str) or an iterable of labels to leave out.
            Their counts are subtracted from the denominator, so the printed
            shares are relative to the remaining records. ``None`` or an
            empty collection excludes nothing.
        show: If given, only the first ``show`` entries of ``data`` are
            considered. The limit is applied before ``exclude`` and ``above``
            are, so fewer than ``show`` rows may be printed.
        above: Minimum share (a fraction, e.g. ``.01`` for 1%) a label needs
            in order to be printed.
        total: Denominator for the shares before exclusions. Defaults to
            ``len(df)``, the notebook-level DataFrame. Pass it explicitly when
            the counts in ``data`` do not add up to the number of records,
            e.g. ``sum(data.values())``.

    Raises:
        KeyError: If a label in ``exclude`` is not a key of ``data``.
    """
    # Normalise ``exclude`` to a list; None was accepted by the guard in the
    # old version but crashed in the sum further down.
    if exclude is None:
        exclude = []
    elif isinstance(exclude, str):
        exclude = [exclude]
    else:
        exclude = list(exclude)

    if total is None:
        total = len(df)

    def share(part, whole):
        # Avoid ZeroDivisionError when there is nothing to divide by.
        return part / whole if whole else 0.0

    if exclude:
        exclution = ', '.join(
            f'"{ex}" ({data[ex]}, {(share(data[ex], total)*100):.2f}%)' for ex in exclude
        )
        print(f'Exclude: {exclution}\n')

    length = total - sum(data[k] for k in exclude)
    entries = data.items() if show is None else islice(data.items(), show)
    for t, count in entries:
        if t in exclude:
            continue
        val = share(count, length)
        if val < above:
            continue
        print(f'{t}\t{count}\t({(val*100):.2f}%)')

## Domains

In [49]:
# Domains that account for at least 1% of all records.
show_metadata(data=dict_domains, above=0.01)

com	5556023	(47.62%)
org	721456	(6.18%)
ru	508301	(4.36%)
de	507854	(4.35%)
net	420029	(3.60%)
fr	225891	(1.94%)
it	224770	(1.93%)
nl	199599	(1.71%)
pl	187134	(1.60%)
jp	176847	(1.52%)
edu	160917	(1.38%)
cz	124781	(1.07%)


## Domains excluding 'com'

In [52]:
# Same table with 'com' left out; shares are relative to the remaining records.
show_metadata(data=dict_domains, exclude=['com'], above=0.01)

Exclude: "com" (5556023, 47.62%)

org	721456	(11.81%)
ru	508301	(8.32%)
de	507854	(8.31%)
net	420029	(6.87%)
fr	225891	(3.70%)
it	224770	(3.68%)
nl	199599	(3.27%)
pl	187134	(3.06%)
jp	176847	(2.89%)
edu	160917	(2.63%)
cz	124781	(2.04%)
es	115281	(1.89%)
ca	110402	(1.81%)
cn	86986	(1.42%)
info	84147	(1.38%)
eu	83389	(1.36%)
ch	73207	(1.20%)
se	71783	(1.17%)
be	71745	(1.17%)
hu	66153	(1.08%)
ro	65314	(1.07%)
vn	61548	(1.01%)


## Content Languages

In [54]:
# Content-Language tags at or above the 1% threshold ('' means no value).
show_metadata(data=dict_content_languages, above=0.01)

	10467437	(89.72%)
en	480097	(4.12%)
en-us	303361	(2.60%)


## Content Languages excluding ''

In [55]:
# Leave out the empty tag so shares are relative to the remaining records.
show_metadata(data=dict_content_languages, exclude=[''], above=0.01)

Exclude: "" (10467437, 89.72%)

en	480097	(40.03%)
en-us	303361	(25.29%)
de	68033	(5.67%)
fr	50771	(4.23%)
es	28463	(2.37%)
ru	23626	(1.97%)
it	18309	(1.53%)
nl	15781	(1.32%)
en-gb	14712	(1.23%)
ja	12043	(1.00%)


## Content Languages also excluding 'en' and 'en-us'

In [57]:
# Also leave out the two most frequent English tags to expose the long tail.
show_metadata(
    data=dict_content_languages,
    exclude=['', 'en', 'en-us'],
    above=0.01,
)

Exclude: "" (10467437, 89.72%), "en" (480097, 4.12%), "en-us" (303361, 2.60%)

de	68033	(16.35%)
fr	50771	(12.20%)
es	28463	(6.84%)
ru	23626	(5.68%)
it	18309	(4.40%)
nl	15781	(3.79%)
en-gb	14712	(3.54%)
ja	12043	(2.89%)
zh-cn	10960	(2.63%)
cs	7394	(1.78%)
pl	7256	(1.74%)
de-de	7128	(1.71%)
pt-br	5659	(1.36%)
da	5017	(1.21%)
fi	4533	(1.09%)
hu	4444	(1.07%)


## HTML Languages

In [58]:
# HTML lang tags at or above the 1% threshold ('' means no value).
show_metadata(data=dict_html_langs, above=0.01)

en	2431896	(20.84%)
	2341553	(20.07%)
en-us	1867976	(16.01%)
ja	446256	(3.82%)
es	357392	(3.06%)
ru	318844	(2.73%)
de-de	317348	(2.72%)
de	296153	(2.54%)
en-gb	244499	(2.10%)
fr	241745	(2.07%)
fr-fr	236809	(2.03%)
ru-ru	200706	(1.72%)
nl	184533	(1.58%)
it-it	176497	(1.51%)
zh-cn	129874	(1.11%)


## HTML Languages excluding '', 'en', 'en-us'

In [59]:
# Leave out the empty tag and the two most frequent English tags.
show_metadata(
    data=dict_html_langs,
    exclude=['', 'en', 'en-us'],
    above=0.01,
)

Exclude: "" (2341553, 20.07%), "en" (2431896, 20.84%), "en-us" (1867976, 16.01%)

ja	446256	(8.88%)
es	357392	(7.11%)
ru	318844	(6.34%)
de-de	317348	(6.31%)
de	296153	(5.89%)
en-gb	244499	(4.87%)
fr	241745	(4.81%)
fr-fr	236809	(4.71%)
ru-ru	200706	(3.99%)
nl	184533	(3.67%)
it-it	176497	(3.51%)
zh-cn	129874	(2.58%)
it	90485	(1.80%)
pl-pl	87035	(1.73%)
cs	85978	(1.71%)
vi	76816	(1.53%)
tr	75594	(1.50%)
pl	72763	(1.45%)
es-es	61614	(1.23%)
pt-br	54233	(1.08%)
ko	53067	(1.06%)
ar	52626	(1.05%)
hu	51267	(1.02%)


## HTML Dirs

In [60]:
# All HTML dir values, no threshold ('' means no value).
show_metadata(data=dict_html_dirs)

	10253755	(87.89%)
ltr	1288194	(11.04%)
rtl	118756	(1.02%)
auto	6186	(0.05%)


## HTML Dirs excluding ''

In [61]:
# Leave out the empty value so shares are relative to the remaining records.
show_metadata(data=dict_html_dirs, exclude=[''])

Exclude: "" (10253755, 87.89%)

ltr	1288194	(91.16%)
rtl	118756	(8.40%)
auto	6186	(0.44%)
