In [1]:
import sys
sys.path.append('../')

In [2]:
# forcing the notebook to reload the modules
%load_ext autoreload
%autoreload 2

In [3]:
from src.data import datacollector as dc
from src.data import downloader as dl
from src.data import datafilter as df
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import requests
from concurrent.futures import ThreadPoolExecutor
import shutil
import zipfile


In [4]:
# Folder locations used by the download and collection cells (relative to this notebook).
PATH_DB_CONFIGS = '../data/'  # passed to dl.get_font_dbs together with PATH_RAW
PATH_URL_LISTS = '../data/raw/url_lists/'  # passed to dl.update_glyphazzn and dl.get_fonts_urls
PATH_RAW = '../data/raw/'  # download target for the dl.* calls and input folder of dc.collectfonts
PATH_COLLECTION = '../data/collection/'  # output folder handed to dc.collectfonts

In [5]:
# Fetch the font databases; the two switches below choose which ones are requested.
GOOGLEFONTSDB = True
DSRFONTSDB = True

font_db_selection = dict(GoogleFontsDB=GOOGLEFONTSDB, DSRFontsDB=DSRFONTSDB)
dl.get_font_dbs(font_db_selection, PATH_DB_CONFIGS, PATH_RAW)

Accessing GoogleFontsDB.
3 directories for transfer.
Transferring files...
File transfer successful
Accessing DSRFontsDB.
1 directories for transfer.
Transferring files...
File transfer successful


In [None]:
# Download files from list of urls
# Both calls work on the URL-list folder; the second one also receives the raw-data folder.
# NOTE(review): presumably update_glyphazzn refreshes the stored GlyphAzzn URL list before
# get_fonts_urls downloads from it -- confirm in src.data.downloader.
dl.update_glyphazzn(PATH_URL_LISTS)
dl.get_fonts_urls(PATH_URL_LISTS, PATH_RAW)

In [46]:
# Go through all folders of raw database data (PATH_RAW),
# collect all fonts, write description to csv,
# and dump all files into a single folder (PATH_COLLECTION).

# (This will take a moment or two; the call logs one "Copy: src -> dst" line per file.)

dc.collectfonts(PATH_RAW, PATH_COLLECTION)



Copy: ../data/raw/DSRFontsDB\DB1\fonts_otf\24Janvier-Light.otf -> ../data/collection/24Janvier-Light.otf
Copy: ../data/raw/DSRFontsDB\DB1\fonts_otf\24Janvier.otf -> ../data/collection/24Janvier.otf
Copy: ../data/raw/DSRFontsDB\DB1\fonts_otf\Acephimere Bold Italic.otf -> ../data/collection/Acephimere Bold Italic.otf
Copy: ../data/raw/DSRFontsDB\DB1\fonts_otf\Acephimere Bold.otf -> ../data/collection/Acephimere Bold.otf
Copy: ../data/raw/DSRFontsDB\DB1\fonts_otf\Acephimere Italic.otf -> ../data/collection/Acephimere Italic.otf
Copy: ../data/raw/DSRFontsDB\DB1\fonts_otf\Acephimere Thin Italic.otf -> ../data/collection/Acephimere Thin Italic.otf
Copy: ../data/raw/DSRFontsDB\DB1\fonts_otf\Acephimere Thin.otf -> ../data/collection/Acephimere Thin.otf
Copy: ../data/raw/DSRFontsDB\DB1\fonts_otf\Acephimere.otf -> ../data/collection/Acephimere.otf
Copy: ../data/raw/DSRFontsDB\DB1\fonts_otf\AceSans-FREE.otf -> ../data/collection/AceSans-FREE.otf
Copy: ../data/raw/DSRFontsDB\DB1\fonts_otf\acid.otf

This is not enough data to train a model, so it seems we need to download more.

In [None]:
# TODO CBN
#
# 1. Include the glyphazzn_urls
# 2. Tweak the collector to catch zip files in the GoogleFontsDB/raw folder
# 3. Clean up this notebook


In [4]:
def _download_one(url, download_dir, timeout):
    """Stream one URL into ``download_dir``; return True on success, False otherwise."""
    filepath = os.path.join(download_dir, os.path.basename(url))
    file_opened = False
    try:
        # The context manager releases the streamed connection even on failure.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                file_opened = True
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        return True
    except (requests.RequestException, OSError):
        # Best effort: an unreachable URL or an unwritable file must not abort a
        # bulk download. A half-written file is removed so later steps never see
        # a truncated font.
        if file_opened:
            try:
                os.remove(filepath)
            except OSError:
                pass
        return False


def download_fonts(url_list, download_dir=None, timeout=10):
    """Download one font file or a list of font files.

    Parameters
    ----------
    url_list : str or list
        A single URL, or a list of URLs. List entries that are not strings are
        skipped and counted as failures.
    download_dir : str, optional
        Destination folder; defaults to the current working directory. It is
        created if it does not exist yet.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.get``. Previously the list
        branch used 1 second and the single-URL branch had no timeout at all,
        so a stalled server could block a worker indefinitely.

    Returns
    -------
    int
        Number of files that were downloaded completely (0 or 1 for a single URL).
        Any other input type is ignored and yields 0.

    Each file is stored under the last path segment of its URL. Network and
    file-system errors for an individual URL are treated as a failed download
    rather than raised.
    """
    if download_dir is None:
        download_dir = os.getcwd()
    # Without the folder every open() would fail and all downloads would be lost silently.
    os.makedirs(download_dir, exist_ok=True)

    # Single URL
    if isinstance(url_list, str):
        return int(_download_one(url_list, download_dir, timeout))

    # List of URLs
    if isinstance(url_list, list):
        files_downloaded = 0
        for url in tqdm(url_list):
            if isinstance(url, str) and _download_one(url, download_dir, timeout):
                files_downloaded += 1
        print(f"Downloaded {files_downloaded} of {len(url_list)} files.")
        return files_downloaded

    return 0



In [8]:
def download_fonts_in_parallel(urls, folder):
    """Download every URL in ``urls`` into ``folder`` using a pool of 10 threads.

    Parameters
    ----------
    urls : iterable of str
        URLs to fetch; each one is handed to ``download_fonts`` as its own task.
    folder : str
        Destination folder; created (including parents) when missing.

    Returns None. Tasks that raise are reported in a one-line summary instead of
    being dropped silently; they never abort the remaining downloads.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(folder, exist_ok=True)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(download_fonts, url, folder) for url in urls]

    # Leaving the `with` block waits for all tasks, so every future is finished here.
    crashed = sum(1 for future in futures if future.exception() is not None)
    if crashed:
        print(f"{crashed} of {len(futures)} download tasks raised an unexpected error.")

In [6]:
# Build the list of GlyphAzzn font URLs that are worth downloading:
#   1. load the URL index, 2. de-duplicate and keep only ttf/otf files,
#   3. probe every hosting website once, 4. keep URLs whose website answers with 200.
GLYPHAZZN_URLS_CSV = 'https://storage.googleapis.com/magentadata/models/svg_vae/glyphazzn_urls.txt'
FONT_FILE_FORMATS = ['ttf', 'otf']


def _probe_website(host, timeout=2):
    """Return the HTTP status code of ``host`` (https first, then http); 0 if unreachable."""
    for scheme in ('https://', 'http://'):
        try:
            return requests.get(scheme + host, timeout=timeout).status_code
        except requests.RequestException:
            # Connection/SSL/timeout problem: fall through to the next scheme.
            continue
    return 0


glyphazzn_urls_df = pd.read_csv(GLYPHAZZN_URLS_CSV, header=None)
glyphazzn_urls_df.columns = ['id', 'split', 'url']

# URL paths are case-sensitive, so the 'url' column keeps its original spelling for
# downloading. A lower-cased copy is used only for de-duplication and for reading
# the file extension (previously the URLs themselves were lower-cased, which can
# turn a valid mixed-case path into a 404).
url_key = glyphazzn_urls_df['url'].str.lower()
glyphazzn_urls_df = glyphazzn_urls_df.loc[~url_key.duplicated(keep='first')].copy()

glyphazzn_urls_df['file_format'] = glyphazzn_urls_df['url'].str.lower().str.split('.').str[-1]
# .copy() makes the filtered frame independent, so adding columns below is safe.
glyphazzn_urls_df = glyphazzn_urls_df.loc[
    glyphazzn_urls_df['file_format'].isin(FONT_FILE_FORMATS)
].copy()

# Host names are case-insensitive; normalising them avoids probing a site twice.
glyphazzn_urls_df['website'] = glyphazzn_urls_df['url'].str.split('/').str[2].str.lower()

# Probe each website once instead of once per URL.
unique_websites = glyphazzn_urls_df['website'].dropna().unique().tolist()
website_status_codes = {
    website: _probe_website(website) for website in tqdm(unique_websites)
}

# Attach the probe result to every row and keep only URLs on responsive websites.
glyphazzn_urls_df['status_code'] = glyphazzn_urls_df['website'].map(website_status_codes)
glyphazzn_urls_df = glyphazzn_urls_df.loc[glyphazzn_urls_df['status_code'] == 200].copy()

urls_list = glyphazzn_urls_df['url'].tolist()

100%|██████████| 23/23 [00:16<00:00,  1.40it/s]


In [7]:
# Report every probed website whose status code is anything other than 200.
non_ok_websites = {
    site: code for site, code in website_status_codes.items() if code != 200
}
for site, code in non_ok_websites.items():
    print(site, code)

ru.fontsplace.com 0
andor.net 0
www.andor.net 0
grafikk.vktv.net 0
www.911fonts.com 0


This download takes about 40 minutes and fetches about 9,500 files (550 MB).

In [None]:
# Now we can download the fonts in parallel
# NOTE: long-running cell -- the note accompanying this cell reports roughly 40 minutes
# and about 9500 files for the full URL list.
# The target resolves to ../data/processed/all-fonts, the folder the filter step reads from.
download_dir = os.path.join(os.pardir, 'data', 'processed', 'all-fonts')
download_fonts_in_parallel(urls_list, download_dir)

In [10]:
# Filter the downloaded fonts: the first argument is the folder with all downloaded files,
# the second is where usable fonts end up (per the function's own summary output).
# NOTE(review): `df` is the project's datafilter module here, not a pandas DataFrame.
df.filter_fonts("../data/processed/all-fonts", "../data/processed/filtered-fonts")

0it [00:00, ?it/s]'created' timestamp seems very low; regarding as unix timestamp


'modified' timestamp seems very low; regarding as unix timestamp
'created' timestamp seems very low; regarding as unix timestamp
'modified' timestamp seems very low; regarding as unix timestamp
'created' timestamp seems very low; regarding as unix timestamp
'modified' timestamp seems very low; regarding as unix timestamp
'created' timestamp seems very low; regarding as unix timestamp
'modified' timestamp seems very low; regarding as unix timestamp
'created' timestamp seems very low; regarding as unix timestamp
'modified' timestamp seems very low; regarding as unix timestamp
45it [00:00, 449.69it/s]'created' timestamp seems very low; regarding as unix timestamp
'modified' timestamp seems very low; regarding as unix timestamp
too much 'glyf' table data: expected 20820, received 21142 bytes
'created' timestamp seems very low; regarding as unix timestamp
'created' timestamp seems very low; regarding as unix timestamp
'created' timestamp seems very low; regarding as unix timestamp
'modified

Processed 12840 fonts. Found 8363 usable fonts and moved them to ../data/processed/filtered-fonts.





{'num_font_files_processed': 12840,
 'num_usable_fonts': 8363,
 'out_of_bounds': 2486,
 'has_not_all_chars': 1484,
 'has_empty_glyphs': 415,
 'glyf_is_corrupted': 53,
 'no_good_cmap': 17,
 'has_no_glyf': 18,
 'cmap_is_corrupted': 2}

In summary, we have about 8,300 filtered font files. Is this enough data? That remains to be seen.