# Feature 1: Data Collection

** IMPORTANT! **
** Web scraping takes a long time, if possible, please don't run the entire thing. I have written some test functions to show that the below code works, please find them here: **
** For the Web API part, I grabbed 500 random drinks, therefore if you run the below code, the outcome may be different than the presentation slides**

## Web Scraping: AllRecipes.com and Epicurious.com

-  Reference: 
    -  https://nycdatascience.com/blog/student-works/recipes-scraping-top-20-recipes-allrecipes/
-  Selenium is used because the webpages utilize AJAX, combined with beautifulsoup the speed of scraping seems to be improved 

### Import packages and define functions

In [2]:
import os
import sys
import bs4
import requests
import time
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

######### see Lab3 ###############
def get_text_from_elements(elements):
    """Uses list comprehension to parse out the cleaned text strings from a list of
    elements returned from a BeautifulSoup selection.

    Arguments:
        elements {list} -- list of elements returned from a BeautifulSoup selection

    Returns:
        list -- list of cleaned text contained within the element list
    """
    return [e.text.strip() for e in elements]


### Choosing cocktails with different base liquors from AllRecipes.com

In [3]:
# get the URL for different categories of drinks, no other way than just to explore the website
rumURL = "https://www.allrecipes.com/recipes/1741/drinks/cocktails/rum-drinks/"
ginURL = "https://www.allrecipes.com/recipes/1739/drinks/cocktails/gin-drinks/"
bourbonURL = "https://www.allrecipes.com/recipes/16957/drinks/cocktails/bourbon-drinks/"
champURL = "https://www.allrecipes.com/recipes/14975/drinks/cocktails/champagne-drinks/"
blendedURL = "https://www.allrecipes.com/recipes/17651/drinks/cocktails/blended-cocktails/"
tequilaURL = "https://www.allrecipes.com/recipes/1742/drinks/cocktails/tequila-drinks/"
vodkaURL = "https://www.allrecipes.com/recipes/1743/drinks/cocktails/vodka-drinks/"
whiskeyURL = "https://www.allrecipes.com/recipes/1744/drinks/cocktails/whiskey-drinks/"

baseURL = (rumURL, ginURL, bourbonURL, champURL, blendedURL, tequilaURL, vodkaURL, whiskeyURL)

#print(baseURL)
### Import packages and define functions

In [4]:
# grab all the recipe IDs from the list of URLs
# this may take a while, if the webpage got stuck on Chrome, please refresh the page and it should proceed
recipeID = list()
browser = webdriver.Chrome()
for url in baseURL:
    browser.get(url)
    #data = r.text
    time.sleep(5)
    soup = bs(browser.page_source, 'html.parser')
    elements = soup.select('article.fixed-recipe-card > a')
    for e in elements:
        drink_id = e.get("data-id")
    #ls = [e.get_attribute('data-id') for e in browser.find_elements_by_class_name('favorite')]
    #print([e.get_attribute('data-id') for e in browser.find_elements_by_class_name('favorite')])
    #print(browser.find_elements_by_css_selector('ar-save-item.favorite'))
        recipeID.append(drink_id)
    
#print(len(recipeID))


### Web Scraping individual webpages for Cocktail recipes

In [5]:
#Disabling JavaScript loading to speed up the process
chrome_options = Options()
chrome_options.add_experimental_option( "prefs",{'profile.managed_default_content_settings.javascript': 2})
browser = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

dict = {}
for drink_id in recipeID:
    #print(drink_id)
    url = "https://www.allrecipes.com/recipe/"+ drink_id
    browser.get(url)
    time.sleep(5)
    soup = bs(browser.page_source, 'html.parser')
    name = get_text_from_elements(soup.select("h1.recipe-summary__h1"))
    if len(name) >= 0:
        #print(str(name))
        #elements = soup.select("section.recipe-summary > h1")
        ingre = soup.select("span.recipe-ingred_txt.added")
        ls=get_text_from_elements(ingre)
        if len(ls) >= 2:
            ls.pop()
            ls.pop()
            #print(ls)
            dict[str(name)]=ls
#print(dict)

###  Save to an intermediary .csv file for cleaning

In [6]:
#print(len(dict))
# Now, put them in a csv file and clean it later
df = pd.DataFrame.from_dict(dict, orient="index")
df.to_csv("allrecipes_raw.csv")


### Repeat the same process for Epicurious.com

In [7]:
# get the URL for different pages, as of now there are 49 pages
url = "https://www.epicurious.com/type/cocktail?page="
pageURL = list()
recipe_names = list()
for i in range(1,50):
    pageURL.append(url+str(i))

#print(pageURL)

browser = webdriver.Chrome()

for url in pageURL:
    browser.get(url)
    #data = r.text
    #time.sleep(5)
    soup = bs(browser.page_source, 'html.parser')
    elements = soup.select('div.list_c_hed > a')
    for e in elements:
        if '/recipes/food/views/' in e.get('href'):
            recipe_names.append(e.get('href'))
            
#print(recipe_names)

In [8]:
chrome_options = Options()
chrome_options.add_experimental_option( "prefs",{'profile.managed_default_content_settings.javascript': 2})
browser = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

dict = {}

for n in recipe_names:
    browser.get("https://www.epicurious.com"+n)
    #data = r.text
    #time.sleep(5)
    soup = bs(browser.page_source, 'html.parser')
    title = soup.select('div.title-source > h1')
    elements = get_text_from_elements(soup.select('ul.ingredients > li'))
    dict[get_text_from_elements(title)[0]]=elements

In [9]:
df = pd.DataFrame.from_dict(dict, orient="index")
df.to_csv("epicurious_raw.csv")

## Web API: CocktailDB.com

### Import Packages

In [11]:
import json
import urllib3
urllib3.disable_warnings()
from itertools import chain
from dask import bag

### Extract JSON objects

In [41]:
#transform 1000 JSON objects into a dataframe containing all the raw information
def get_random_drink():
    url = "https://www.thecocktaildb.com/api/json/v1/1/random.php"
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    soup = bs(response.data,'html.parser')
    newDictionary=json.loads(str(soup))
    return list(chain(*newDictionary.values()))

drink_bag = bag.from_sequence(range(1000))
all_drinks = drink_bag.map(lambda x: get_random_drink()).compute()
drink_df = pd.DataFrame(list(chain(*all_drinks)))

In [42]:
#eliminate the columns that we will not be using
useless_columns = ['dateModified','idDrink', 'strAlcoholic','strCategory','strDrinkThumb','strGlass','strIBA','strInstructions','strVideo']
drink_df.drop(useless_columns, axis=1, inplace=True)

In [90]:
#order the columns
column_names = ['strDrink']
for i in range(1,16):
    column_names.append('strIngredient'+str(i))
    column_names.append('strMeasure'+str(i))

#rearrange drink columns
drink_df = drink_df[column_names]

#print(drink_df)

dict = drink_df.set_index('strDrink').T.to_dict('list')
final_dict = {}

#print(dict)
for k in dict.keys():
    str_list = list(filter(None, dict[k])) # fastest
    #str_list= [item for item in dict[k] if item != ' ']
    str_list = [name for name in str_list if name.strip()]
    #str_list = list(filter(lambda name: name.strip(), dict[k]))
    final_dict[k] = str_list
  
    #dict[k] = str_list
    
print(final_dict)


{'Spanish chocolate': ['Milk', '2 cups ', 'Chocolate', '2 oz sweet ', 'Cinnamon', '1/2 tsp ', 'Egg yolk', '2 beaten '], 'Martini': ['Gin', '1 2/3 oz ', 'Dry Vermouth', '1/3 oz ', 'Olive', '1 '], 'Coke and Drops': ['Coca-Cola', '1 dl ', 'Lemon juice', '7 drops '], 'Pink Lady': ['Gin', '1 1/2 oz ', 'Grenadine', '1 tsp ', 'Light cream', '1 tsp ', 'Egg white', '1 '], 'Mojito': ['Light rum', '2-3 oz ', 'Lime', 'Juice of 1 ', 'Sugar', '2 tsp ', 'Mint', '2-4 ', 'Soda water'], 'Irish Spring': ['Irish whiskey', '1 oz ', 'Peach brandy', '1/2 oz ', 'Orange juice', '1 oz ', 'Sweet and sour', '1 oz ', 'Orange', '1 slice ', 'Cherry', '1 '], 'Chocolate Milk': ['Chocolate liqueur', '1/2 shot ', 'Milk', '1/2 shot ', 'Amaretto', '1 dash '], 'Chocolate Beverage': ['Milk', '6 cups ', 'Chocolate', '3 oz Mexican ', 'Cinnamon', '1 tsp powdered ', 'Egg', '3 '], 'Sazerac': ['Ricard', '1 tsp ', 'Sugar', '1/2 tsp superfine ', 'Peychaud bitters', '2 dashes ', 'Water', '1 tsp ', 'Bourbon', '2 oz ', 'Lemon peel', '

  if sys.path[0] == '':


###  Save to an intermediary .csv file for cleaning

In [91]:
df = pd.DataFrame.from_dict(final_dict, orient="index")
df.to_csv("cocktaildb_raw.csv")