In [45]:
import http
import os
import re
import shutil
import time
import urllib.request
from random import choice
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [11]:
def get_soup(url):
    """Download `url` and return its HTML parsed with BeautifulSoup (lxml parser).

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    BeautifulSoup
        Parsed document tree of the response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        For other request failures, e.g. exceeding the 10 second timeout.
    """
    response = requests.get(url, timeout=10)
    # Fail loudly on an error status instead of parsing the error page,
    # which would otherwise just look like "no icons found" downstream.
    response.raise_for_status()

    # `encoding` only influences `.text`; the previous code set it and then
    # parsed the raw bytes from `.content`, so the override never took effect.
    response.encoding = 'utf-8'
    html_content = response.text

    soup = BeautifulSoup(html_content, 'lxml')

    return soup

In [38]:
# Root URL of the site; parse_li resolves each icon's `src` attribute against it
# to build the full image URL.
base_url = "https://icons8.com/"

In [126]:
# parser
def parse_li(item, gif_url_id, free = False, icon_name_class = 'icon__name'):
    """
    Extract one icon's metadata from an icon list element.

    A <li> sample:
    
    <li class="icons-item" data-v-44caebde="">
     <div class="icon" data-v-44caebde="" data-v-908c92b2="">
      <div class="icon__body" data-v-908c92b2="">
       <div class="icon-container" data-v-7e4fff6b="" data-v-908c92b2="">
        <img alt="Bell Animated Icon" class="icon" data-v-7e4fff6b="" src="/vue-static/landings/animated-icons-new/icons/color/bell/bell.svg"/>
        <span class="icon-over" data-v-7e4fff6b="">
        </span>
       </div>
       <div class="free" data-v-908c92b2="">
        Free
       </div>
      </div>
      <p class="icon__name" data-v-908c92b2="">
       Bell
      </p>
     </div>
    </li>

    Parameters
    ----------
    item
        Parsed element (e.g. a BeautifulSoup tag) exposing ``find``.
    gif_url_id
        Value inserted into the gif file name: ``<image stem>_<gif_url_id>.gif``.
    free : bool
        When true the icon is reported as free even without a "free" badge.
    icon_name_class : str
        CSS class of the ``<p>`` element that holds the icon name.

    Returns
    -------
    dict
        Keys ``is_free`` ("free" / "non-free"), ``name``, ``image_url``, ``gif_url``.

    Raises
    ------
    ValueError
        If the name element or the image ``src`` is missing from ``item``.
    """
    
    # free
    is_free = item.find("div", attrs={'class': 'free'}) is not None

    # icon name
    name_tag = item.find("p", attrs={'class': icon_name_class})
    if name_tag is None:
        raise ValueError(f"no <p class={icon_name_class!r}> element found in icon item")
    # The markup may pad the name with newlines/indentation; store the bare name.
    icon_name = name_tag.text.strip()

    # svg url
    img_tag = item.find("img", attrs={'class': 'icon'})
    src = img_tag.get("src") if img_tag is not None else None
    if not src:
        raise ValueError("no <img class='icon'> element with a src attribute found in icon item")
    # urljoin avoids the double slash that plain concatenation produced
    # (base_url ends with "/" and src starts with "/") and leaves an
    # already-absolute src untouched.
    img_url = urljoin(base_url, src)

    # gif url
    # Swap only the trailing extension; str.replace would touch every
    # occurrence of it and corrupts the URL when there is no extension at all.
    stem = os.path.splitext(img_url)[0]
    gif_url = f"{stem}_{gif_url_id}.gif"
    
    data = {
        "is_free": "free" if free or is_free else "non-free",
        "name": icon_name,
        "image_url": img_url,
        "gif_url": gif_url,
    }
    
    return data

In [96]:
# gif url pattern
# Each key is an icon group, used as the last path segment of
# https://icons8.com/animated-icons/<group>. Each value is handed to parse_li
# as `gif_url_id` and ends up in the gif file name as `_<id>.gif`.
svg2gif = dict(
    [
        ('color', 192),
        ('ios-glyph', 120),
        ('windows-10', 128),
        ('material-filled', 96),
        ('office-style', 160),
    ]
)


In [97]:
# csv
# Path of the CSV that accumulates the scraped icon rows.
file_name = "icon_data.csv"

# First run only: write a header-only CSV so later cells can always read it.
if not os.path.exists(file_name):
    icon_df = pd.DataFrame(
        columns=[
            "id",
            "name",
            "group",
            "id_in_group",
            "is_free",
            "image_url",
            "gif_url",
        ]
    )
    icon_df.to_csv(file_name, index=None)

In [98]:
# scrape url
# Scrape every group listed in svg2gif and add icons that are not yet in the CSV.
icon_df = pd.read_csv(file_name)

# Rows are collected as plain dicts and the frame is rebuilt from them.
# DataFrame.append no longer exists in pandas >= 2.0; inside the old try block
# that failure was swallowed for every icon, so nothing was ever stored.
csv_columns = list(icon_df.columns)
records = icon_df.to_dict("records")
# (group, id_in_group) pairs already stored: lets a re-run resume without
# duplicating rows and replaces the per-icon boolean-mask scan of the frame.
seen_keys = {(row["group"], row["id_in_group"]) for row in records}

icon_id = -1
icon_groups = list(svg2gif.keys())
for icon_group in icon_groups:
    url = f"https://icons8.com/animated-icons/{icon_group}"
    soup = get_soup(url)
    lis = soup.find_all('li', attrs={'class': 'icons-item'})

    # Wrap the list rather than the enumerate object so tqdm knows the total.
    for id_in_group, li in enumerate(tqdm(lis)):
        # Counted before the skip check so ids follow the position on the pages.
        icon_id += 1
        if (icon_group, id_in_group) in seen_keys:
            continue
        try:
            icon_data = parse_li(li, svg2gif[icon_group])
        except Exception as err:
            # Best effort: report the broken item and move on. Exception (not
            # BaseException) keeps Ctrl-C / kernel interrupt working.
            print(f"Unexpected {err}, {type(err)}")
            continue
        icon_data["group"] = icon_group
        icon_data["id_in_group"] = id_in_group
        icon_data["id"] = icon_id
        records.append(icon_data)

        # Periodic checkpoint so an interrupted run keeps most of its progress.
        if icon_id % 20 == 0:
            icon_df = pd.DataFrame(records, columns=csv_columns)
            icon_df.to_csv(file_name, index=False)

    icon_df = pd.DataFrame(records, columns=csv_columns)
    icon_df.to_csv(file_name, index=False)

# drop duplicates
print(f"{icon_df.shape[0]} rows before dropping duplicates")
# icon_df = icon_df.drop_duplicates(subset=['image_url'])
# print(f"{icon_df.shape[0]} rows after dropping duplicates")
icon_df.to_csv(file_name, index=False)


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

932 rows before dropping duplicates
932 rows after dropping duplicates


In [116]:
# scrape other free icons
# Fetch and parse the free-animated-icons page. This rebinds `url` and `soup`;
# the following cell reads its icon groups from this `soup`.
url = "https://icons8.com/free-animated-icons"
soup = get_soup(url)

In [139]:
# Add the icons of every group on the page held in `soup` to the CSV.
icon_df = pd.read_csv(file_name)

# Rows are collected as plain dicts and the frame is rebuilt from them.
# DataFrame.append no longer exists in pandas >= 2.0; inside the old try block
# that failure was swallowed for every icon, so nothing was ever stored.
csv_columns = list(icon_df.columns)
records = icon_df.to_dict("records")
# (group, id_in_group) pairs already stored: lets a re-run resume without
# duplicating rows and replaces the per-icon boolean-mask scan of the frame.
seen_keys = {(row["group"], row["id_in_group"]) for row in records}

icon_groups = soup.find_all('div', attrs={'class': 'icons-group'})
# Continue numbering after the rows that are already stored.
icon_id = icon_df.shape[0] - 1
for icon_group in icon_groups:
    icon_group_name = icon_group.find('h2', {'class': 'group-title'}).text
    lis = icon_group.find_all('li', attrs={'class': 'icon'})

    # Wrap the list rather than the enumerate object so tqdm knows the total.
    for id_in_group, li in enumerate(tqdm(lis)):
        if (icon_group_name, id_in_group) in seen_keys:
            continue
        # Only icons that are actually new consume an id here.
        icon_id += 1
        try:
            # 200 is passed as gif_url_id, i.e. gif names end in "_200.gif".
            icon_data = parse_li(li, 200, free=True, icon_name_class='icon-name')
        except Exception as err:
            # Best effort: report the broken item and move on. Exception (not
            # BaseException) keeps Ctrl-C / kernel interrupt working.
            print(f"Unexpected {err}, {type(err)}")
            continue
        icon_data["group"] = icon_group_name
        icon_data["id_in_group"] = id_in_group
        icon_data["id"] = icon_id
        records.append(icon_data)

        # Periodic checkpoint so an interrupted run keeps most of its progress.
        if icon_id % 20 == 0:
            icon_df = pd.DataFrame(records, columns=csv_columns)
            icon_df.to_csv(file_name, index=False)

    icon_df = pd.DataFrame(records, columns=csv_columns)
    icon_df.to_csv(file_name, index=False)

# drop duplicates
print(f"{icon_df.shape[0]} rows before dropping duplicates")
# print(icon_df[icon_df.duplicated(subset=['image_url', 'gif_url'])])
# icon_df = icon_df.drop_duplicates(subset=['image_url', 'gif_url'])
# print(f"{icon_df.shape[0]} rows after dropping duplicates")
icon_df.to_csv(file_name, index=False)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

1587 rows before dropping duplicates
