# WEB SCRAPING 

### Import libraries

In [5]:
### Images
! pip install opencv-python
from matplotlib import image
from matplotlib import pyplot
from skimage import io
import cv2
import os

### Web scraping packages
from bs4 import BeautifulSoup
import requests
import itertools
from urllib.request import Request, urlopen
import urllib.request

### Pandas/numpy for data manipulation
import pandas as pd
import numpy as np

### Progress bar
from tqdm.notebook import tqdm
from time import sleep 

### Visualize data
%pylab inline
plt.style.use('seaborn-talk')

Populating the interactive namespace from numpy and matplotlib


### Scraping the species of mushrooms from fungipedia

In [6]:
### function to have 542 species of muchrooms over Spain

def get_all_mushrooms():
    mushroom_dct = {}

    for idx in tqdm(range(0, 541, 20)):
        url = f"https://www.fungipedia.org/hongos.html?start={idx}"
        req = Request(
            url,
            headers={"User-Agent": "Mozilla&5.0"},
        )

        html = urlopen(req)
        soup = BeautifulSoup(html, "html.parser")
        
        mushrooms = soup.find_all(class_="gris")
        for m in mushrooms:
            mushroom_dct[m.get("title")] = m.get("href")
            
    return mushroom_dct

In [7]:
### Functions that gets a dictionary (setas_dct) from a webpage with the url "https://www.fungipedia.org....""
### with all mushrooms with their name and url., after doing this, we decide to use also Guadalajara Mushrooms, but
### this is to scrape the names of more than 500 mushrooms from fungipedia or almost 50 mushrooms exclusively from 
### Guadalajara.This is the find mushrooms cell.

print("Choose between All the regions or only Guadalajara Mushrooms, put All or Guadalajara")

def get_mushrooms_region(Region=input()):
    
    if Region == 'All':
        ### mushrooms_dct is a dictionary with Info about all mushrooms in the webpage (542)
        ### mushrooms_dct_short will be a dictionary with Info about the first 10 mushrooms in the webpage (10)

        mushrooms_dct = get_all_mushrooms()
        print(f"Mushrooms found: {len(mushrooms_dct)}")

        N = 10
        print(f"Info about the first {N} mushrooms")

        mushrooms_dct_short = dict(itertools.islice(mushrooms_dct.items(), 10))
        for name, link in mushrooms_dct_short.items():
            print(f"Mushroom {name} in {link}")
    
        ### Separate between names an urls mushrooms    
        mushrooms_names,mushrooms_urls=list(mushrooms_dct.keys()), list(mushrooms_dct.values())

        mushrooms_names_10,mushrooms_urls_10 = list(mushrooms_dct_short.keys()), list(mushrooms_dct_short.values())
        
        return mushrooms_names
        
    elif Region == 'Guadalajara':
        ### 50 most commons classes of Guadalajara
            
        mushrooms_names = ['Agaricus arvensis','Agaricus bitorquis','Agaricus campestris','Agaricus sylvaticus','Amanita caesarea','Amanita citrina',
                     'Amanita curtipes','Amanita muscaria','Amanita pantherina','Amanita phalloides','Boletus aereus','Boletus aestivalis','Boletus edulis',
                     'Boletus erythropus','Boletus luridus','Boletus pinophilus','Boletus satanas','Cantharellus cibarius','Clitocybe gibba','Clitocybe odora',
                     'Cratarellus cornucopioide','Cratarellus lutescens','Ganoderma lucidum','Lactarius controversus','Lactarius controversus','Lactarius deliciosus',
                     'Lactarius rufus','Lactarius sanguifluus','Lepista nuda','Macrolepiota mastoidea','Macrolepiota procera','Marasmius oreades','Morchella elata',
                     'Morchella esculenta','Morchella vulgaris','Pleurotus eryngii','Pleurotus ostreatus','Russula chloroides','Russula cyanoxantha','Russula vesca',
                     'Sparasis crispa','Suillus luteus','Terfecia leptoderma','Tricholoma columbetta','Tricholoma equestre','Tricholoma portentosum','Tuber aestivum',
                     'Tuber brumale','Tuber melanosporum']
            
        print('You choose Guadalajara Mushrooms')
        
        return mushrooms_names
    
    else:
        
        print('The region must be All or Guadalajara, please, try again:')
        return get_mushrooms_region(Region=input())
        
        
Mushrooms_names = get_mushrooms_region()

Choose between All the regions or only Guadalajara Mushrooms, put All or Guadalajara
Guadalajara
You choose Guadalajara Mushrooms


In [8]:
### Function to download the images from an url and save in a folder,
### you will choose the path you want to download the images, also how many images you want to download
### per mushroom, if you choose "Default" the choice will be your current path (the path you are now).

print("Where do you want to save the photos?, example: /home/dsc/FP_Mushrooms/Mushrooms_Classification_Guadalajara")
print("If you want your current path write 'Default'")

path_output = input()

if path_output == 'Default':
    path_output = os.getcwd()+'/'
else:
    os.chdir(os.path.join(os.getcwd(),path_output))
    path_output = os.getcwd()+'/'

def downloader(url,folder,scroll_until):
    
    try:
    
        os.chdir(os.path.join(os.getcwd(),path_output))

        try:
            os.mkdir(os.path.join(os.getcwd(),folder))
        except:
            pass

        os.chdir(os.path.join(os.getcwd(),path_output+folder))

        num_res = scroll_until
        
        ### Scrape 20 images from 20 images of google images until the num_res we consider necessary

        for start in range(0, num_res, 20):

            response = requests.get(url.format(start))

            soup = BeautifulSoup(response.content,'html.parser')

            images = soup.find_all("img")

            number = 0 + start
            
            for image in images[1::]:

                image_src = image['src']

                urllib.request.urlretrieve(image_src, str(number) + ".jpg")

                number += 1
    except:
        print('The Mushroom '+folder+' has had some errors in the download , to revise')
        
print("How many images per mushroom do you want? minimum 20, give a number multiple of 20 (20,40,60...)")

scroll_until=int(input())

### download the number of images we choose per mushrooms
for mushroom in tqdm(Mushrooms_names):
    
    url = 'https://www.google.com/search?q='+mushroom+'&tbm=isch&hl=es&sa=X&ved=2ahUKEwjw_M7z2NfuAhUMpRoKHbPQAokQgowBegQIARAX&biw=1905&bih=852&start={}'
    
    downloader(url,mushroom,scroll_until)
    
### Come back to our directory
os.chdir(os.path.join(os.getcwd(),path_output))


Where do you want to save the photos?, example: /home/dsc/FP_Mushrooms/Mushrooms_Classification_Guadalajara
If you want your current path write 'Default'
/home/dsc/FP_Mushrooms/Mushrooms_Classification_Guadalajara/Mushrooms_Dataset
How many images per mushroom do you want? minimum 20, give a number multiple of 20 (20,40,60...)
60


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=49.0), HTML(value='')))


