**Wiki Art Web Crawler**

In [None]:
import requests
import os
import re

class WikiArt :
  """Minimal crawler for wikiart.org: lists genres, artists and paintings, and saves images.

  The listing methods call JSON endpoints under ``baseUrl`` and return the
  decoded items unchanged (whatever ``response.json()`` produced).
  ``downloadPainting`` writes image files below ``saveDir`` and appends every
  saved path to ``lookUp.txt`` in the current working directory.
  """

  def __init__(self) :
    """Set the site root, the thumbnail marker and the download directory."""
    self.baseUrl = "https://www.wikiart.org/en/"
    # Marker inserted between the image URL and a repeated extension to
    # request the small rendition (see downloadPainting).
    self.iconArg = "!PinterestSmall"
    self.saveDir = "AbstractPaintings/"

  def _getJson(self, url) :
    """GET ``url`` and return the decoded JSON body; raise on an HTTP error status."""
    response = requests.get(url)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    return response.json()

  def _collectPages(self, mode, countKey, itemsKey) :
    """Fetch page 0 and then pages 1..N of a paginated endpoint; return all items.

    mode     -- URL path + query (relative to ``baseUrl``) containing one ``{}``
                placeholder for the page number.
    countKey -- key of the total item count in the first response.
    itemsKey -- key of the item list in every response.

    A page whose item list is null/missing-valued contributes nothing instead
    of raising, so a trailing empty page no longer aborts the whole listing.
    """
    firstPage = self._getJson(f"{self.baseUrl}{mode.format(0)}")
    items = list(firstPage[itemsKey] or [])

    # Same page arithmetic as before: one more than the number of full pages.
    pages = int(firstPage[countKey] / firstPage["PageSize"]) + 1

    for page in range(1, pages + 1) :
      data = self._getJson(f"{self.baseUrl}{mode.format(page)}")
      items.extend(data[itemsKey] or [])

    return items

  def downloadPainting(self, painting, isSmall = True) :
    """Download one painting image and record its path in ``lookUp.txt``.

    painting -- dict with at least ``image`` (image URL ending in the file
                extension), ``artistName`` and ``title``.
    isSmall  -- when true, request the small rendition
                (``<image><iconArg><ext>``); otherwise request ``image`` itself.

    The file is saved as ``<saveDir><artistName>/<title><ext>`` with ``/`` in
    the title replaced by ``|``. Raises the ``requests`` HTTP error if the
    download fails; in that case nothing is written.
    """
    ext = os.path.splitext(painting["image"])[1]
    # Only the thumbnail form repeats the extension after the marker; the
    # full-size image is the plain URL (previously this produced "x.jpg.jpg").
    url = f"{painting['image']}{self.iconArg}{ext}" if isSmall else painting["image"]
    dirPath = f"{self.saveDir}{painting['artistName']}"
    filePath = "{}/{}{}".format(dirPath, re.sub(r'/', '|', painting['title']), ext)

    # Fetch before touching the disk so a failed request leaves no empty file
    # and no bogus lookUp.txt entry.
    response = requests.get(url)
    response.raise_for_status()

    # Creates saveDir and the artist directory in one call; safe to repeat.
    os.makedirs(dirPath, exist_ok=True)

    with open(filePath, "wb") as image :
      image.write(response.content)

    # Paths contain artist names/titles, which may be non-ASCII.
    with open("lookUp.txt", "a", encoding="utf-8") as lookUp :
      lookUp.write(filePath + "\n")

  def allGenres(self) :
    """Return the ``Seo`` value of every entry in the genre dictionary endpoint."""
    mode = "artists-by-genre?json=2"
    data = self._getJson(f"{self.baseUrl}{mode}")

    return [genre["Seo"] for genre in data["Dictionaries"]]

  def findAllPaintingsByArtist(self, artist) :
    """Return every painting entry listed for ``artist`` (the artist's URL slug)."""
    dArgs = "json=2&layout=new&page={}&resultType=masonry"
    mode = f"{artist}/mode/all-paintings-by-alphabet?{dArgs}"

    return self._collectPages(mode, "AllPaintingsCount", "Paintings")

  def findAllArtistsByGenre(self, genre) :
    """Return every artist entry the search endpoint lists for ``genre``.

    ``genre`` is sent as the ``searchterm`` query value (previously the value
    was hard-coded to "abstract" and the argument was ignored).
    """
    dArgs = f"json=3&searchterm={genre}&layout=new&page={{}}&resultType=masonry"
    mode = f"App/Search/Artists-by-Genre?{dArgs}"

    return self._collectPages(mode, "AllArtistsCount", "Artists")

  def findArtists(self, timeRange = False, movement = False, nation = False, field = False, school = False) :
    """Return artist entries from the advanced-search endpoint.

    timeRange -- optional ``(minYear, maxYear)`` pair. ``timeRange[1]`` is sent
                 as ``maxYear``; ``timeRange[0]`` is sent as ``minYear`` with a
                 leading minus sign. Without it the query uses
                 ``maxYear=2021`` and ``minYear=-50000``.
    movement, nation, field, school -- filters that are not implemented yet.

    Raises NotImplementedError when any of the unimplemented filters is
    requested (previously this surfaced as an IndexError from ``str.format``).
    """
    requested = [
      name for name, value in (
        ("movement", movement), ("nation", nation), ("field", field), ("school", school)
      ) if value
    ]
    if requested :
      raise NotImplementedError(
        "findArtists filters not built yet: " + ", ".join(requested)
      )

    dArgs = "?isAjax=true&layout=new&dictIdsJson=[{}]&layout=new&maxYear={}&minYear=-{}&page={}&resultType=masonry"
    mode = f"app/Search/ArtistAdvancedSearch/{dArgs}"

    # NOTE(review): the template hard-codes the minus sign on minYear, so a
    # caller-supplied timeRange[0] is always sent negated -- confirm intent.
    if timeRange : maxYear, minYearMagnitude = timeRange[1], timeRange[0]
    # The default passes the magnitude only; passing -50000 here produced the
    # malformed "minYear=--50000".
    else : maxYear, minYearMagnitude = 2021, 50000

    # No dictionary filter is implemented, so dictIdsJson is always empty; the
    # trailing "{}" keeps the page placeholder for _collectPages.
    mode = mode.format("", maxYear, minYearMagnitude, "{}")

    #movement
    print("movement is not built")
    #naition
    print("naition is not built")
    #field
    print("field is not built")
    #school
    print("school is not built")

    return self._collectPages(mode, "AllArtistsCount", "Artists")

**Implementation**

In [None]:
# Show every genre identifier the site reports (the "Seo" values from allGenres).
genreSlugs = WikiArt().allGenres()
print(genreSlugs)

['abstract', 'advertisement', 'allegorical-painting', 'animal-painting', 'architecture', 'artists-book', 'battle-painting', 'bijinga', 'bird-and-flower-painting', 'calligraphy', 'capriccio', 'caricature', 'cityscape', 'cloudscape', 'design', 'digital', 'figurative', 'flower-painting', 'genre-painting', 'graffiti', 'history-painting', 'icon', 'illustration', 'installation', 'interior', 'jewelry', 'landscape', 'literary-painting', 'manga', 'marina', 'miniature', 'mosaic', 'mural', 'mythological-painting', 'nude-painting-nu', 'pastorale', 'performance', 'photo', 'pin-up', 'portrait', 'poster', 'quadratura', 'religious-painting', 'sculpture', 'self-portrait', 'shan-shui', 'sketch-and-study', 'still-life', 'symbolic-painting', 'tapestry', 'tessellation', 'trompe-loeil', 'tronie', 'urushi-e', 'utensil', 'vanitas', 'veduta', 'video-art', 'wildlife-painting', 'yakusha-e']


In [None]:
import shutil

wiki = WikiArt()

# Paintings count as "square(ish)" when short side / long side exceeds this.
MIN_SQUARE_RATIO = 0.8

# Alternative: crawl a hand-picked list of artist slugs instead of a genre.
#artists = ["nicholas-roerich", "pablo-picasso", "salvador-dali", "andrei-rublev", "rene-magritte"]
# [4:] drops the first four characters of each artistUrl (presumably the
# "/en/" prefix), leaving the slug findAllPaintingsByArtist expects.
artists = [a["artistUrl"][4:] for a in wiki.findAllArtistsByGenre("abstract")]
print("Artists: \n", artists)

# Start from an empty index; downloadPainting appends one path per saved file.
with open("lookUp.txt", "w") as lookUp :
  lookUp.write("")

# Download the square(ish) paintings of every artist found above.
print("Dowloading...")
for artist in artists :
  print(artist)
  try :
    paintings = wiki.findAllPaintingsByArtist(artist)
  except Exception as err : # not a bare except: Ctrl-C must still stop the crawl
    print(f"--->Did not load: {err!r}")
    continue

  for paint in paintings : #get only the square(ish) paintings
    # Handle failures per painting so one bad entry does not drop the rest
    # of this artist's paintings.
    try :
      w, h = paint["width"], paint["height"]
      if not w or not h : continue # missing/zero size: ratio is undefined
      if (min(w, h) / max(w, h)) > MIN_SQUARE_RATIO :
        wiki.downloadPainting(paint)
    except Exception as err :
      print(f"--->Did not load: {err!r}")

# Archive the paintings for the client to download.
print("Zipping...")
# The directory may not exist if nothing was downloaded.
os.makedirs(wiki.saveDir, exist_ok=True)
# os.replace overwrites an index left by a previous run on every platform.
os.replace("lookUp.txt", os.path.join(wiki.saveDir, "lookUp.txt"))
# make_archive appends ".zip" to base_name; with the trailing "/" of saveDir
# that used to create "AbstractPaintings/.zip" inside the folder being zipped.
archiveBase = os.path.normpath(wiki.saveDir)
shutil.make_archive(archiveBase, "zip", wiki.saveDir)

print("done")

Artists: 
 ['camille-bryen', 'ulfert-wilke', 'ilya-bolotowsky', 'giuseppe-santomaso', 'bruno-munari', 'myron-stout', 'rupprecht-geiger', 'luigi-veronesi', 'maria-helena-vieira-da-silva', 'william-saroyan', 'jorge-oteiza', 'lee-krasner', 'max-bill', 'gustave-singier', 'jack-bush', 'jean-bertholle', 'norman-lewis', 'antonio-corpora', 'jean-le-moal', 'paul-feeley', 'franz-kline', 'hedda-sterne', 'raoul-ubac', 'else-alfelt', 'richard-mortensen', 'aurelie-nemours', 'leon-berkowitz', 'hiroyuki-tajima', 'raul-lozza', 'alfred-manessier', 'olle-baertling', 'lothar-charoux', 'jackson-pollock', 'giulio-turcato', 'afro', 'agnes-martin', 'verena-loewensberg', 'emil-schumacher', 'vasile-dobrian', 'alexander-liberman', 'john-cage', 'morris-louis', 'mario-ballocco', 'jean-michel-atlan', 'william-scott', 'carmelo-arden-quin', 'toko-shinoda', 'conrad-marca-relli', 'philip-guston', 'ad-reinhardt', 'jose-guerrero', 'karl-otto-gotz', 'aurel-cojan', 'peter-busa', 'johannes-jan-schoonhoven', 'nassos-daphnis'