# The Digital Cheese Sommelier

## Scraping the data

In [1]:
import pandas as pd 
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import unicodedata

### I checked (and scraped) *four* different websites for a cheese dataset that I liked enough.  Although none of them were ideal, Cheese.com had an extensive library of 1,827 cheeses with a decent number of features (19 features, 13 of which were of use), as well as a very lenient API...

#### First, scrape the path of each cheese's webpage (to later loop through and scrape the info off of every single cheese's page).

In [14]:
## based off of Riley Dallas' code, provided in a YouTube info session video from May 2018
## https://www.youtube.com/watch?v=5Y3ZE26Ciuk
## ALSO based on Wesley Bosse and Douglas Strodtman's 'CART - Mini-demo using reddit data' DC-Flex lesson 



# make a function that scrapes a url for data.  make sure to use a beautifulsoup for url
# going to scrape all the cheese paths by letter of the alphabet and number of pages for each letter
def get_chz_pages(letter, pages):
    """Scrape one page of cheese.com's alphabetical index for cheese page paths.

    Parameters
    ----------
    letter : str
        Letter of the alphabet to list (sent as the ``i`` query parameter).
    pages : str or int
        Page number within that letter's listing (sent as the ``page``
        query parameter; the site is asked for 100 cheeses per page).

    Returns
    -------
    list of str
        The ``href`` of every cheese link found on the page, e.g.
        ``'/zwitser/'``.  Empty when the response status is not 200 or the
        expected catalog markup is missing.
    """
    # collects the web path of each cheese listed on this index page
    names = []

    # url of the alphabetical cheese list (100 cheeses per page)
    url = f'https://cheese.com/alphabetical/?per_page=100&i={letter}&page={pages}#top'

    # perform a get request on cheese.com
    res = requests.get(url)

    # only parse when the request succeeded (200 is no errors, e.g. 429 is an error)
    if res.status_code == 200:

        # create a beautiful soup instance
        soup = BeautifulSoup(res.content, 'lxml')

        # in the soup, find the container that holds the cheese listing
        div = soup.find('div', {'class': 'catalog internal'})

        if div is None:
            # find() returns None when the container is absent; report it
            # instead of crashing on div.find_all(...)
            print('ERROR')
            print(f'catalog not found: {url}')
        else:
            # every 'cheese-image' div wraps the link to one cheese's page
            for row in div.find_all('div', {'class': 'cheese-image'}):
                link = row.find('a')

                # skip entries without a usable link rather than raising
                if link is not None and link.has_attr('href'):
                    names.append(link['href'])

    # on a failed request, print a notification and fall through to return
    # whatever was collected (an empty list)
    else:
        print('ERROR')
        print(res.status_code)
        print(url)

    # sleep for 1 second between requests so as not to appear to be a DDoS attack
    time.sleep(1)

    # return the filepath names
    return names

In [15]:
# accumulator for every cheese filepath scraped from the alphabetical index
webpage_names = []

# cheese.com lists its cheeses alphabetically, so visit each letter in turn
alph = list('abcdefghijklmnopqrstuvwxyz')

# no letter has more than 3 pages of cheeses; when a letter has fewer than 3,
# the function reads in the cheeses from the last existing page again,
# which results in scraping duplicates
page = [str(num) for num in range(1, 4)]

# visit every page of every letter and collect the paths found there
for let in alph:
    for p in page:
        webpage_names.extend(get_chz_pages(let, p))

# converting to a set keeps only distinct paths (dropping the duplicates) but
# loses the alphabetical order, so sort afterwards.  A plain sort would put
# uppercase before lowercase, hence the case-insensitive .casefold() key; the
# raw string is the tie-breaker for paths that differ only in case
# https://stackoverflow.com/questions/10269701/case-insensitive-list-sorting-without-lowercasing-the-result
webpage_names = sorted(set(webpage_names), key=lambda s: (s.casefold(), s))

In [21]:
# peek at the entry at index 1826; note that we want this list to fit
# our dataframe later on
webpage_names[1826]

'/zwitser/'

In [17]:
# check the number of filepaths scraped (after the duplicates were dropped)
len(webpage_names)

1827

### Use the Filepaths Collected to Scrape the Webpage for Each Cheese on Cheese.com

In [31]:
## based off of Riley Dallas' code, provided in the youtube info session video from May 2018
## https://www.youtube.com/watch?v=5Y3ZE26Ciuk
## ALSO based on Wesley Bosse and Douglas Strodtman's 'CART - Mini-demo using reddit data' DC-Flex lesson 

# make a function that scrapes each url for data.  make sure to use a beautifulsoup for each url
# input will be the filepath name for each cheese's specific url
def get_chz_posts(cheese_name):
    """Scrape a single cheese's page on cheese.com into a dictionary.

    Parameters
    ----------
    cheese_name : str
        Site-relative path of the cheese's page, e.g.
        ``'/brinza---feta-style'``; it is appended to ``https://cheese.com``.

    Returns
    -------
    dict
        ``'Name'`` (page heading), ``'Milk'`` (text after "Made from "), one
        entry per ``"{feature name}:{feature details}"`` summary point, and
        ``'Description'`` (the summary paragraphs concatenated).  Empty when
        the response status is not 200, so a failed page no longer raises at
        ``return`` and the caller's loop keeps going.
    """
    # instantiate the result up front so it exists even when the request fails
    cheese = {}

    # full url of this cheese's page
    url = f'https://cheese.com{cheese_name}'

    # perform a get request on cheese.com
    res = requests.get(url)

    # only parse when the request succeeded (200 is no errors, e.g. 429 is an error)
    if res.status_code == 200:

        # create an instance of beautiful soup
        soup = BeautifulSoup(res.content, 'lxml')

        # find the spots in the soup where the info I want resides;
        # find() returns None for anything the page does not contain
        div = soup.find('div', {'class': 'unit'})
        div2 = soup.find('div', {'class': 'summary'})
        ul = soup.find('ul', {'class': 'summary-points'})

        # get the name of the cheese (accents, hyphens, parentheses etc. included)
        heading = div.find('h1') if div is not None else None
        if heading is not None:
            cheese['Name'] = heading.text.strip()

        # parse the summary points into individual features
        for row in (ul.find_all('p') if ul is not None else []):
            text = row.text

            # strings that contain "Made from" are all 'Milk' types
            if "Made from" in text:
                _, found, milk = text.partition("Made from ")
                if found:
                    cheese['Milk'] = milk

            else:
                # everything else is listed as "{feature name}:{feature details}".
                # Split on the first colon only, so details that themselves
                # contain a colon are kept whole; skip rows with no colon.
                key, sep, value = text.partition(":")
                if sep:
                    cheese[key] = value.strip()

        # the description is the text of every paragraph in the summary,
        # concatenated in page order
        paragraphs = div2.find_all('p') if div2 is not None else []
        cheese['Description'] = ''.join(p.text for p in paragraphs)

    # on a failed request, print a notification and return the empty dictionary
    else:
        print('ERROR')
        print(res.status_code)
        print(url)

    # sleep for 1 second between requests so as not to appear to be a DDoS attack
    time.sleep(1)

    # return a dictionary that has all the site's relevant info on the particular cheese
    return cheese

In [33]:
# check to see that the function works by scraping a single cheese's page
get_chz_posts('/brinza---feta-style')

{'Name': 'Brinza - Feta style ',
 'Milk': "sheep's milk",
 'Country of origin': 'New Zealand',
 'Region': 'Queenstown',
 'Family': 'Feta',
 'Type': 'soft, brined',
 'Texture': 'creamy, crumbly and open',
 'Rind': 'natural',
 'Colour': 'white',
 'Flavour': 'citrusy, salty, sweet, tangy',
 'Vegetarian': 'no',
 'Producers': 'The Gibbston Valley Cheese Company',
 'Synonyms': 'Briza Feta',
 'Description': 'Briza Feta (or simply Briza) is feta style cheese made by the Gibbston Valley Cheese Company. Compared to traditional feta, it spends only a few days in brine resulting in a much less salty taste. Its natural sweetness and a slight citrus flavour make this sweet, delicate sheep milk cheese a great addition on a cheeseboard. The cheese is also diced in salads, crumbled and added to a frittata and layered in a vegetable stack.Make sure to pair it with an appropriate wine such as fresh lime and Riesling or with salads or vegetables and Sauvignon Blanc.Over 500,000 page views per month, Put y

In [34]:
# scrape the webpage of every cheese on cheese.com: each filepath yields one
# dictionary of cheese info, and the dictionaries are collected (in filepath
# order) into the list 'posts'
posts = [get_chz_posts(path) for path in webpage_names]

# put the list of dictionaries of each cheese into a dataframe 'chz_df'
chz_df = pd.DataFrame(posts)

In [35]:
# inspect the dataframe of all of the cheeses scraped from cheese.com
# (one row per scraped cheese, one column per feature found on any page)
chz_df

Unnamed: 0,Name,Milk,Country of origin,Region,Type,Texture,Rind,Colour,Flavour,Aroma,Vegetarian,Producers,Synonyms,Description,Fat content,Family,Alternative spellings,Calcium content,Fat content (in dry matter)
0,Abbaye de Belloc,unpasteurized cow's and sheep's milk,France,Pays Basque,"semi-hard, artisan","creamy, dense and firm",natural,yellow,burnt caramel,lanoline,yes,Abbaye de Notre-Dame de Belloc,Abbaye Notre-Dame de Belloc,Abbaye de Belloc is also known as 'Abbaye Notr...,,,,,
1,Abbaye de Belval,cow's milk,France,,semi-hard,elastic,washed,ivory,,aromatic,no,,,This cheese is also known as Le Trappiste de B...,40-46%,,,,
2,Abbaye de Citeaux,unpasteurized cow's milk,France,Burgundy,"semi-soft, artisan, brined","creamy, dense and smooth",washed,white,"acidic, milky, smooth","barnyardy, earthy",no,Frères Frédéric and Joel,,The Abbaye de Citeaux cheese comes from the Ci...,,,,,
3,Abbaye de Timadeuc,pasteurized cow's milk,France,province of Brittany,semi-hard,soft,washed,pale yellow,,,no,Abbaye Cistercienne NOTRE-DAME DE TIMADEUC,,"Being direct descendant of the Port du Salut, ...",,,,,
4,Abbaye du Mont des Cats,pasteurized cow's milk,France,Nord-Pas-de-Calais,"semi-soft, artisan, brined",smooth and supple,washed,pale yellow,"milky, salty",floral,no,Abbaye du Mont des Cats,,The Abbaye du Mont des Cats cheese is made by ...,50%,,,,
5,Abbot’s Gold,pasteurized cow's milk,"England, Great Britain and United Kingdom",North Yorkshire,semi-hard,"creamy, crumbly, dense and semi firm",natural,pale yellow,"mild, sweet, tangy",aromatic,yes,Wensleydale Creamery,"English Cheddar with Caramelized Onions, Caram...","Abbot's Gold, also known by other names such a...",,Cheddar,,,
6,Abertam,sheep's milk,Czech Republic,Karlovy Vary,"hard, artisan",firm,natural,pale yellow,"acidic, strong, tangy",,no,,,Abertam is a traditional sheep's milk cheese m...,45%,,,,
7,Abondance,unpasteurized cow's milk,France,"Haute-Savoie, Abondance","semi-hard, artisan","creamy, open and supple",natural,pale yellow,"acidic, buttery, fruity, sweet",nutty,no,,,Tomme d'Abondance or Abondance is a medium-siz...,48%,Tomme,Tomme d'Abondance,,
8,Acapella,goat's milk,United States,California,"soft, soft-ripened",,,,buttery,"fresh, herbal",no,Andante Dairy,,Acapella is a soft goat's milk cheese produced...,,,,,
9,Accasciato,Buffalo's and cow's milk,Italy,Campania,semi-hard,firm,natural,pale yellow,sweet,"aromatic, fresh",no,Casa Madaio,,"Accasciato meaning 'collapsed' in Italian, is ...",,,,,


In [36]:
# save our cheese dataframe to the file ./data/big_cheese2.csv (currently commented out)
#chz_df.to_csv('./data/big_cheese2.csv', index=False)