# Notebook to scrape mushroomexpert.com for its mushroom catalog

In [None]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint

# Scrape site to get links to each mushroom catalog post

In [None]:
# Base URL of the site; the page paths collected below are appended to it.
root = 'https://www.mushroomexpert.com/'

In [None]:
# Download the site's home page. The timeout stops a stalled connection from
# hanging the notebook, and raise_for_status() fails here on an HTTP error
# instead of letting an error page be parsed as if it were the catalog.
r = requests.get(root, timeout=30)
r.raise_for_status()

In [None]:
# Parse the downloaded home page with Python's built-in HTML parser.
soup = BeautifulSoup(r.text,'html.parser')

In [None]:
# Collect every <table> whose width attribute is "600"; presumably these are the
# catalog index tables -- verify against the page markup if the site changes.
result = soup.find_all("table",width="600")

In [None]:
# Accumulates one DataFrame per table; filled by the loop in the next cell.
all_dfs = []

In [None]:
# Build one DataFrame per table found on the home page. Every <td> contributes a
# (link, text) pair of columns, and the pairs are laid side by side.
# The accumulator is reset here so re-running this cell does not append
# duplicate frames to a list left over from a previous run.
all_dfs = []
col_names = ["link_key",'key',"link_item","item","link_similar","similar"]
for table in result:
  dfs_to_concat = []
  for cell in table.find_all("td"):
    col_links = [[anchor['href'], anchor.get_text()]
                 for anchor in cell.find_all("a", href=True)]
    dfs_to_concat.append(pd.DataFrame(col_links, columns=['link', 'text']))
  if not dfs_to_concat:
    # Table without any <td>: pd.concat([]) would raise ValueError.
    continue
  joined_df = pd.concat(dfs_to_concat, axis=1)
  if joined_df.empty:
    # Table without any links: the .iloc[0] lookups below would raise IndexError.
    continue
  # Name only as many columns as this table actually produced.
  joined_df.columns = col_names[0:len(joined_df.columns)]
  # Copy the first row's key and key link onto every row of the table
  # (presumably the key cell holds a single link, leaving the rest NaN).
  joined_df['key'] = joined_df['key'].iloc[0]
  joined_df['link_key'] = joined_df['link_key'].iloc[0]
  all_dfs.append(joined_df)

In [None]:
# Stack the per-table frames into one catalog. Row labels still restart at 0 for
# each table until the index is reset in the next cell.
entire_catalog = pd.concat(all_dfs)

In [None]:
# Add placeholder columns for the scraped page text and image sources, then
# renumber the rows 0..n-1 (each concatenated per-table frame started at 0).
# The placeholders are object-typed because the scraping loop stores strings in
# them; writing strings into float64 NaN columns relies on silent upcasting,
# which newer pandas releases deprecate.
entire_catalog = (
  entire_catalog
  .assign(paragraphs=np.nan, images=np.nan)
  .astype({'paragraphs': object, 'images': object})
  .reset_index(drop=True)
)

In [None]:
# Preview the assembled catalog.
entire_catalog

Unnamed: 0,link_key,key,link_item,item,link_similar,similar,paragraphs,images
0,polypores_stemmed_pale.html,Abortiporus,abortiporus_biennis.html,A. biennis,,,,
1,agaricus.html,Agaricus,agaricus_abruptibulbus.html,A. abruptibulbus,,,,
2,agaricus.html,Agaricus,agaricus_amicosus.html,A. amicosus,,,,
3,agaricus.html,Agaricus,agaricus_andrewii.html,A. andrewii,,,,
4,agaricus.html,Agaricus,agaricus_argenteus.html,A. argenteus,,,,
...,...,...,...,...,...,...,...,...
1442,xylaria.html,Xylaria,xylaria_magnoliae.html,X. magnoliae,,,,
1443,xylaria.html,Xylaria,xylaria_polymorpha.html,X. polymorpha,,,,
1444,,,xylobolus_frustulatus.html,X. frustulatus,,,,
1445,yard.html,Yard Mushrooms,,,,,,


# Get images and post text from each path

In [None]:
def get_images_and_text(root, path, timeout=30):
  """Download one page and extract its paragraph text and image sources.

  Args:
    root: Base URL that ``path`` is appended to.
    path: Page path relative to ``root``. A non-string value (for example the
      NaN pandas stores for a missing link) is reported as a failed scrape.
    timeout: Seconds to wait for the server before giving up on the request.

  Returns:
    A ``(paragraphs, images)`` tuple of strings: the text of every ``<p>``
    element, each followed by a newline, and the ``src`` of every ``<img>``
    element that has one, each followed by a comma. Returns ``('', '')`` when
    the page could not be fetched or parsed.
  """
  if not isinstance(path, str):
    # root + NaN would raise TypeError; report it without touching the network.
    print('scrape failed')
    return '',''
  try:
    # The timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(root + path, timeout=timeout)
    # Treat HTTP error responses as failures rather than storing their markup.
    r.raise_for_status()
    soup = BeautifulSoup(r.text,'html.parser')
    paragraphs = ''.join(p.get_text() + '\n' for p in soup.find_all("p"))
    # Only <img> tags that carry a src are kept: a missing src would otherwise
    # raise on concatenation and throw away the text of the whole page.
    images = ''.join(img['src'] + ',' for img in soup.find_all('img', src=True))
  except Exception:
    # Best effort: one bad page must not stop the crawl. Catching Exception
    # (not a bare except) still lets KeyboardInterrupt stop a long run.
    print('scrape failed')
    return '',''
  print(path + ' scraped_successfully')
  return paragraphs, images

In [None]:
# Scrape the item page of every catalog row and store the results on that row.
for index, link_item in entire_catalog['link_item'].items():
  paras, imgs = get_images_and_text(root, link_item)
  entire_catalog.loc[index, 'paragraphs'] = paras
  entire_catalog.loc[index, 'images'] = imgs
  # Pause 2 or 3 seconds between requests (presumably to go easy on the site).
  delay = randint(2, 3)
  sleep(delay)

In [None]:
# Write the scraped catalog to a CSV file in the working directory.
entire_catalog.to_csv('catalog.csv')

In [None]:
# Persist the catalog as a pickle; later cells reload it with pd.read_pickle.
entire_catalog.to_pickle('catalog.pkl')

In [None]:
# Spot-check one scraped page by reloading the saved pickle. Read the same
# relative path that to_pickle() wrote, rather than a hardcoded /content/
# location that only exists when the working directory is /content.
pd.read_pickle('catalog.pkl').loc[50, 'paragraphs']

'\nAlloclavaria purpurea\n[ Basidiomycetes\xa0>\xa0Agaricales\xa0>\xa0Clavariaceae\xa0>\xa0Alloclavaria . . . ]\nby Michael Kuo\n"Alloclavaria" means "the other Clavaria," and this club fungus differs from closely related species in Clavaria, Clavulina, and Clavulinopsis in its prominent cystidia--a feature not found in the other genera. It is also clearly separated from the others molecularly, leading Dentinger & McLaughlin (2006) to create the genus Alloclavaria to accommodate this funky mushroom.\nHowever, the dull purple colors and densely packed, non-branching fruiting bodies of Alloclavaria purpurea are distinctive enough that you will probably not need to use a microscope or a DNA sequencer to identify the mushroom successfully. Clavaria zollingeri and Clavulina amethystinoides are vaguely similar but are at least moderately branched; additionally they appear in hardwood forests, while Alloclavaria purpurea is fond of conifers.\nClavaria purpurea is a former name.\nDescription:\

In [None]:
# Reload the saved catalog from the same relative path that to_pickle() wrote,
# rather than a hardcoded /content/ location that only exists when the working
# directory is /content.
catalog = pd.read_pickle('catalog.pkl')

In [None]:
# Print a plain-text summary of the reloaded catalog.
print(catalog)

                         link_key  ...                                             images
0     polypores_stemmed_pale.html  ...  images/inside_top.gif,images/kuo2/abortiporus_...
1                   agaricus.html  ...  images/inside_top.gif,images/kuo6/agaricus_abr...
2                   agaricus.html  ...  images/inside_top.gif,images/kuo6/agaricus_ami...
3                   agaricus.html  ...  images/inside_top.gif,images/kuo6/agaricus_and...
4                   agaricus.html  ...  images/inside_top.gif,images/kuo6/agaricus_arg...
...                           ...  ...                                                ...
1442                 xylaria.html  ...  images/inside_top.gif,images/kuo3/xylaria_magn...
1443                 xylaria.html  ...  images/inside_top.gif,images/kuo2/xylaria_poly...
1444                          NaN  ...  images/inside_top.gif,images/kuo2/xylobolus_fr...
1445                    yard.html  ...                                                   
1446      

In [19]:
# Report the installed pandas version.
pd.__version__

'1.3.5'