In [77]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from PIL import Image
import os
import re

In [2]:
kids_puzzles_url = 'https://www.trefl.com/puzzle/dla-dzieci?product_list_limit=all'
adults_puzzles_url = 'https://www.trefl.com/puzzle/dla-doroslych?product_list_limit=all'

In [3]:
def get_links(content):
    soup = BeautifulSoup(content, 'html.parser')
    puzzles_list = soup.find('body')
    puzzles_list_items = puzzles_list.find('ol')
    list_items = puzzles_list_items.find_all('li')
    links = []

    for puzzles in list_items:
        a = puzzles.select_one('a')
        if 'href' in a.attrs:
            link_url = a.get('href')
        links.append(link_url)
    return links

In [96]:
def get_list_of_images_and_age_categories(links):
    excluded_keywords = ['panoramiczne', 'drewniane', 'Megabox']
    image_and_category_dict = {}
    invalid_links = []
    if len(image_and_category_dict) / 50 == 0:
                    print(f'Dict have {len(image_and_category_dict)} items.')
    for link in links:
        response = requests.get(link)
        html_content = response.text
        soup = BeautifulSoup(html_content, "html.parser")

        try:
            keywords = soup.find('meta', attrs={'name': 'keywords'})['content']
        except Exception as e:
            print(f'{e} in link: {link}')
        else:
            if any([keyword in keywords for keyword in excluded_keywords]):
                continue
        
            elements = soup.find_all('div', class_ = 'product-content__img')
            category = soup.find_all('div', class_ = 'c-product-attributes')
            years = BeautifulSoup(str(category[0])).div   
            if len(category) > 1:
                years = BeautifulSoup(str(category[1])).div           
            tag = BeautifulSoup(str(elements[0])).div
            try:
                tag_content = tag.img['src']
                years_content =  re.split(r"([0-9]+)", years.img['src'])[-2]
            except Exception as e:
                invalid_links.append(link)
                print(f'{e} in link: {link}')
            else:
                image_and_category_dict[tag_content] = years_content
    return image_and_category_dict, invalid_links


In [72]:
first_batch_of_links = get_links((requests.get(kids_puzzles_url)).text)
second_batch_of_links = get_links((requests.get(adults_puzzles_url)).text)

In [97]:
print(len(first_batch_of_links))
print(len(second_batch_of_links))
all_links = first_batch_of_links + second_batch_of_links
print(len(all_links))

332
351
683


In [98]:
image_and_category_dict, bad_links = get_list_of_images_and_age_categories(all_links)
img_response = requests.get(list(image_and_category_dict.keys())[0], stream=True)
img = Image.open(img_response.raw)
img.show()

Dict have 0 items.
'NoneType' object is not subscriptable in link: https://www.trefl.com/w-swiecie-transformers-23024
'NoneType' object is not subscriptable in link: https://www.trefl.com/urocze-bobaski-31414
'src' in link: https://www.trefl.com/sorter-kolorow-93162
'NoneType' object is not subscriptable in link: https://www.trefl.com/treflikowy-dzien-93166
'NoneType' object is not subscriptable in link: https://www.trefl.com/farma-44000
'NoneType' object is not subscriptable in link: https://www.trefl.com/zabawny-swiat-binga-93165
'NoneType' object is not subscriptable in link: https://www.trefl.com/zabawy-peppy-93164
'src' in link: https://www.trefl.com/rybka-minimini-36125
'NoneType' object is not subscriptable in link: https://www.trefl.com/bohaterski-spider-man-34384
'NoneType' object is not subscriptable in link: https://www.trefl.com/niezwyciezeni-avengersi-17357
'NoneType' object is not subscriptable in link: https://www.trefl.com/minionki-w-akcji-13264
'NoneType' object is not

In [99]:
len(image_and_category_dict)

484

In [100]:
len(bad_links)

38

In [101]:
for key, value in image_and_category_dict.items():
    if not os.path.exists(value):
        os.makedirs(value)
    response = requests.get(key)
    image_name = f"{key.split('/')[-1]}.png"
    image_path = os.path.join(value, image_name)
    image = response.content
    with open (image_path, 'wb') as file:
        file.write(image)