## Web Scraping and MongoDB Database Integration for Wine Reviews and Taste Analysis




**PROJECT OVERVIEW**

The project involves web scraping of wine-related data from a Vivino webpage and storing the extracted data in MongoDB. The project requires four functions.

The first function extracts information such as wine ID, winery, name, grapes, badges, region, wine style, allergens, date of download, average rating, number of ratings, average price, and food pairing. This information is stored in the "wines" collection.

The second function extracts information such as user ID, user name, number of user reviews, vintage, star rating, review text, number of likes, and number of comments. This information is stored in the "reviews" collection.

The third function extracts all associated links of other wine webpages and stores them in the "links" collection along with a collection that uniquely stores all wine IDs encountered on the page.

The fourth function extracts all taste-related information from the webpage such as light-bold scale, smooth-tannic scale, dry-sweet scale, soft-acidic scale, and badges. This information is stored in the "taste" collection.

The project requires downloading webpages for 1,00 random wine IDs between 1 and 999,999 and running the four functions on the saved HTML pages.

In [96]:
from bs4 import BeautifulSoup
import requests
import time
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import codecs
import pymongo
from pymongo import MongoClient
from datetime import datetime
import http.client, urllib.parse

In [97]:
def saveString(html, filename="test.html"):
    try:
        file = open(filename,"w", encoding="utf-8")
        file.write(str(html))
        file.close()
    except Exception as ex:
        print('Error: ' + str(ex))


def loadString(f="test.html"):
    try:
        html = open(f, "r", encoding='utf-8').read()
        return(html)
    except Exception as ex:
        print('Error: ' + str(ex))

**Define a function that reads the scraped webpage file saved  and stores all contained information in MongoDB.  Specifically, stores in a collection called “wines” an extracts the following infromation from the vivino webpage:**

1. wine_id
2. winery
3. name
4. grapes
6. badges   (array containing e.g., “Featured in Vivino's 2018 Wine Style Awards: Napa Valley Cabernet Sauvignon (2013 Vintage)”, “Latest vintage available (2018 Vintage)”, and “Oldest vintage available (1995 Vintage)”)
7. region
8. wine_style
9. allergens (if existing)
10. date_of_download
11. avg_rating
12. num_rating
13. avg_price
14. goes_well_with   (array of food this wine goes well with)


In [110]:

def scrape_wine_data(url):
    page = loadString(url)
    soup = BeautifulSoup(page, 'html.parser')
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    mydb = myclient["Wines"]
    mycol_1 = mydb["Wines_new"]
    #print ('hi func started')
    key_list = []
    value_list = []
    
    #Extracting the Wine_id
    canonical_link = soup.find('link', {'rel': 'canonical'})
    if canonical_link is not None and 'href' in canonical_link.attrs:
        rel_attr = canonical_link['href']
        id_value = rel_attr.rsplit('/', 1)[-1]
        key_list.append('ID')
        value_list.append(id_value)
    else:
        key_list.append('ID')
        number = re.search(r"\d+(?=\.[^.]+$)", url).group()
        value_list.append(number)
    
    #Extracting the winery name
    if soup.find('a', attrs={'class': 'winery'}) is not None:
        element = soup.find('a', attrs={'class': 'winery'})
        text = element.text.replace("\n", "")
        key_list.append('Winery_name')
        value_list.append(text)
    
    else:
        key_list.append('Winery_name')
        value_list.append('NA')
        
    ##Extracting the grapes name   
    grapes = []

    if soup.find_all('a', href=True):
        for v in soup.find_all('a', href=True):
            if '/grapes/' in v['href']:
                grapes.append(v.text)
        key_list.append('Grapes')
        value_list.append(grapes)

    elif soup.find_all('a', attrs={'class': 'anchor_anchor__m8Qi- breadCrumbs__link--1TY6b'}):
        #print('element there')
        a = soup.find_all('a', attrs={'class': 'anchor_anchor__m8Qi- breadCrumbs__link--1TY6b'})
        #print(a)
        for v in a:
            if 'grape' in v['href']:
                grapes.append(v.text)
                #print('grape there')
                #print(v.text)
        key_list.append('Grapes')
        value_list.append(grapes)

    else:
        s = soup.find('h3', attrs={'class': 'wineTasteStyle-desktop__wineName--ML0zS'})
        key_list.append('Grapes_1')
        value_list.append(s.text)

    #Extracting the region and the countries
    countries = []
    if soup.find_all('a', href=True) is not None:
        for a in soup.find_all('a', href=True):
            if '/wine-countries/' in a['href']:
                country = a.text
                if country not in countries:
                    countries.append(a.text)
        key_list.append('Region_countries')
        value_list.append(countries)
    
    else:
        key_list.append('Region_countries')
        value_list.append('NA')

    regions = []
    if soup.find_all('a', href=True) is not None:
        for o in soup.find_all('a', href=True):
            if '/wine-regions/' in o['href']:
                region = o.text
                if region not in regions:
                    regions.append(o.text)
        key_list.append('Region')
        value_list.append(regions)
    
    else:
        key_list.append('Region')
        value_list.append('NA')

    #Extracting the wine styles
    wine_styles = []
    if soup.find_all('a', href=True) is not None:
        for a in soup.find_all('a', href=True):
            if '/wine-styles/' in a['href']:
                wine_styles.append(a.text)
        key_list.append('wine_styles')
        value_list.append(wine_styles)
    
    else: 
        key_list.append('wine_styles')
        value_list.append('NA')

    #Exracting the allergens
    if soup.find_all('tr', {'data-testid':'wineFactRow'}) is not None:
        #wait = WebDriverWait(driver, 10)
        if soup.find_all('tr', {'data-testid':'wineFactRow'}) is not None:
            p = soup.find_all('tr', {'data-testid':'wineFactRow'})
            if p[-1] is not None:
                allergens = p[-1]
                key_list.append('allergens')
                value_list.append(allergens.text)
            else:
                key_list.append('allergens')
                value_list.append('NA')
        else:
            key_list.append('allergens')
            value_list.append('NA')
    else: 
        key_list.append('allergens')
        value_list.append('NA')
        
        
    #Extracting the wine name
    if soup.find('span', {'class':'vintage'}) is not None:
        name = soup.find('span', {'class':'vintage'})
        name = name.text.replace("\n", "")
        key_list.append('Name')
        value_list.append(name)
    else: 
        key_list.append('Name')
        value_list.append('NA')
    
    #Extracting the average rating
    if soup.find('div', {'class':'vivinoRating_averageValue__uDdPM'}) is not None:
        avg_rating = soup.find('div', {'class':'vivinoRating_averageValue__uDdPM'})
        key_list.append('avg_rating')
        value_list.append(avg_rating.text)
    else: 
        key_list.append('avg_rating')
        value_list.append('NA')
    
    #Extracting the number of ratings
    if soup.find('div', {'class':'vivinoRating_caption__xL84P'}) is not None:
        num_rating = soup.find('div', {'class':'vivinoRating_caption__xL84P'})
        key_list.append('num_rating')
        value_list.append(num_rating.text)
    else: 
        key_list.append('num_rating')
        value_list.append('NA')

    #Extracting the average price
    if soup.find('span', {'class':'purchaseAvailabilityPPC__amount--2_4GT'}) is not None:
        avg_price = soup.find('span', {'class':'purchaseAvailabilityPPC__amount--2_4GT'})
        key_list.append('avg_price')
        value_list.append(avg_price.text)
    else:
        key_list.append('avg_price')
        value_list.append('NA')

    #Extracting the goes well with object
    if soup.find_all('a', {'class':'anchor_anchor__m8Qi- foodPairing__imageContainer--2CtYR'}) is not None:
        goes_well_with_obj = soup.find_all('a', {'class':'anchor_anchor__m8Qi- foodPairing__imageContainer--2CtYR'})
        c=[]
        for a in goes_well_with_obj:
            b = a['href']
            c.append(re.sub(r".+pairing/(.+)", r"\1", b))
        key_list.append('goes_well_with_obj')
        value_list.append(c)
    else: 
        key_list.append('goes_well_with_obj')
        value_list.append('NA')

    #Extracting the Badges
    if soup.find('div', attrs={'class': 'highlight'}) is not None:
        badges = soup.find('div', attrs={'class': 'highlight'})
        badges = badges.text.replace("\n", "")
        key_list.append('badges')
        value_list.append(badges)
    else:
        key_list.append('badges')
        value_list.append('NA')
    #print(' func')
    wine_data_dict = dict(zip(key_list, value_list))
    entry = mycol_1.insert_one(wine_data_dict)
    #print('end')
    return wine_data_dict


**Similarly define another function that extracts the review infromation from the save webpage and Stores it in “reviews” collection of the MongoDB:**

1. user_id
2. user_name
3. num_user_reviews
4. vintage   (the year of the wine if displayed)
5. star_rating
6. text
7. num_likes
8. num_comments

In [111]:
def get_wine_reviews(url):
    
    page = loadString(url)
    soup = BeautifulSoup(page, 'html.parser')
    
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    mydb = myclient["Wines"]
    mycol_2 = mydb["reviews_new"]
    
    
    user_names = []
    star_ratings = []
    num_user_reviews = []
    reviews_text = []
    likes = []
    comments = []
    
    key_list_1 = []
    value_list_1 = []
    vivino_user_dict_list = []
    
    #Extracting the WINE id
    canonical_link = soup.find('link', {'rel': 'canonical'})
    if canonical_link is not None and 'href' in canonical_link.attrs:
        rel_attr = canonical_link['href']
        id_value = rel_attr.rsplit('/', 1)[-1]
        key_list_1.append('ID')
        value_list_1.append(id_value)
    else:
        key_list_1.append('ID')
        number = re.search(r"\d+(?=\.[^.]+$)", url).group()
        value_list_1.append(number)
    
    #Looping through all user reviews in the review window
    review_window = soup.find_all('div', attrs={'class': 'communityReviewItem__reviewSection--1OjzI'})
    for i in review_window:
        
        #Extracting the user name and number of user reviews
        a = i.find('a', attrs={'class': 'anchor_anchor__m8Qi- userAlias_userAlias__ztmrT undefined communityReview__userAlias--1wUOM anchor_baseLink__O8bvu'})
        c = re.sub(r"(.+)\(.+", r"\1", a.text)
        d = re.sub(r".+\((.+)\)", r"\1", a.text)
        user_names.append(c)
        num_user_reviews.append(d)
        
        key_list_1.append('num_user_reviews')
        value_list_1.append(d)
        
        key_list_1.append('User Name')
        value_list_1.append(c)
       
        #Extracting the Vintage year in the user review
        if i.find('span', attrs = {'class': 'reviewedVintageYear__vintageText--3TZOW communityReview__vintageText--vW6OI'}) is not None:
            vintage = i.find('span', attrs = {'class': 'reviewedVintageYear__vintageText--3TZOW communityReview__vintageText--vW6OI'})
            key_list_1.append('vintage')
            value_list_1.append(vintage.text)
        else:
            key_list_1.append('vintage')
            value_list_1.append('NA')

        
        #Extracting the star rating 
        a = i.find('span', attrs={'class': 'userRating_ratingNumber__cMtKU'})
        star_ratings.append(a.text)
        key_list_1.append('star_rating')
        value_list_1.append(a.text)
        
        #Extracting the USER_ID 
        a = i.find('a', attrs={'class': 'anchor_anchor__m8Qi- userAlias_userAlias__ztmrT undefined communityReview__userAlias--1wUOM anchor_baseLink__O8bvu'})['href']
        #print(a)
        #user_ID.append(a)
        user_id = re.sub(r"\/users\/(.+)", r"\1", str(a))
        #print(user_id)
        key_list_1.append('USER_ID')
        value_list_1.append(user_id)
        
        #Extracting the review text
        a = i.find('span', attrs={'class': 'communityReview__reviewText--2bfLj'})
        reviews_text.append(a.text)
        key_list_1.append('Review_Text')
        value_list_1.append(a.text)
    
        #Extracting the num of likes in the user review
        a_1 = i.find('div', attrs={'class': 'likeButton__likeCount--1stJS'})
        if a_1 is not None:
            likes.append(a_1.text)
            key_list_1.append('num_like')
            value_list_1.append(a_1.text)
        else:
            likes.append('NA')
            key_list_1.append('num_like')
            value_list_1.append('NA')
        
        #Extracting the number of comments in the user review
        a_2 = i.find('div', attrs={'class': 'commentsButton__commentsCount--3CoCn'})
        if a_2 is not None:
            comments.append(a_2.text)
            key_list_1.append('num_comments')
            value_list_1.append(a_2.text)  
            #print('check 9')
        else:
            likes.append('NA')
            key_list_1.append('num_comments')
            value_list_1.append('NA')
        
        vivino_user_dict = dict(zip(key_list_1, value_list_1))
        vivino_user_dict_list.append(vivino_user_dict)
        
        # insert into MongoDB
        entry = mycol_2.insert_one(vivino_user_dict)

    return vivino_user_dict_list


 **Again define another function to extract all associated links of other Wines webpages and Store in another collection in MongoDB called “links”:**

Collection that uniquely stores all wine IDs encounter on the page (just the IDs, nothing else needed).


In [112]:
def store_links(url):
    page = loadString(url)
    soup = BeautifulSoup(page, 'html.parser')
    
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    mydb = myclient["Wines"]
    mycol_3 = mydb["links_new"]
    #print ('hi func started')
    
    key_list_2 = []
    value_list_2 = []

    # Extract main wine ID
    canonical_link = soup.find('link', {'rel': 'canonical'})
    if canonical_link is not None and 'href' in canonical_link.attrs:
        rel_attr = canonical_link['href']
        id_value = rel_attr.rsplit('/', 1)[-1]
        key_list_2.append('ID')
        value_list_2.append(id_value)
    else:
        key_list_2.append('ID')
        number = re.search(r"\d+(?=\.[^.]+$)", url).group()
        value_list_2.append(number)
        
    
    # Extract great alternatives wine IDs
    great_alteratives_list = []
    if soup.find('div', attrs={'id': 'cross-sell-module'}) is not None:
        a = soup.find('div', attrs={'id': 'cross-sell-module'})
        if soup.find('div', attrs={'id': 'cross-sell-module'}) is not None:
            b = a.find_all('a', attrs={'class': 'anchor_anchor__m8Qi- wineCard__cardLink--3F_uB'})
            d = []
            for c in b:
                linked_wine_url_1 = c['href']
                d.append(re.sub(r".+/w/(.+)\?.+", r"\1", linked_wine_url_1))
                great_alteratives_list.append(d)
                key_list_2.append('great alternatives wine id')
                value_list_2.append(d)
        else:
            key_list_2.append('great alternatives wine id')
            value_list_2.append('NA')
    else:
        key_list_2.append('great alternatives wine id')
        value_list_2.append('NA')

    # Extract other wine IDs
    e = []
    other_wine_id = []
    if soup.find('div', attrs={'class': 'winerySummary__wineBandRow--2IPvj'}) is not None:
        a = soup.find('div', attrs={'class': 'winerySummary__wineBandRow--2IPvj'})
        b = a.find_all('a', attrs={'class': 'anchor_anchor__m8Qi- wineCard__cardLink--3F_uB'})
        for c in b:
            linked_wine_url_2 = c['href']
            linked_wine_url_2
            e.append(re.sub(r".+/w/(\d+).+", r"\1", linked_wine_url_2))
            
        key_list_2.append('other wine id')
        value_list_2.append(e)
        
    else:
        key_list_2.append('other wine id')
        value_list_2.append('NA')
    
    #Combine keys and values into a list
    links_dict = dict(zip(key_list_2, value_list_2))

    # Store the wine IDs in MongoDB collection
    entry = mycol_3.insert_one(links_dict)

    return links_dict


**Define an another collection called taste that extracts all the infroation related to the taste of the wine from the webpage and Stores in a collection called “taste”:**

1. All the information displayed in “What does this wine taste like?”  I.e.,

2. wine_id

3. light_bold_scale   (0-1 value of where it ranks on this scale, can be found in “indicatorBar”)

4. smooth_tannic_scale   (0-1 value of where it ranks on this scale, can be found in “indicatorBar”)

5. dry_sweet_scale   (0-1 value of where it ranks on this scale, can be found in “indicatorBar”)

6. soft_acidic_scale   (0-1 value of where it ranks on this scale, can be found in “indicatorBar”)etc.

7. badges   (array containing e.g., “206 mentions of black fruit notes”, “148 mentions of oaky notes”, and “82 mentions of earthy notes”)


In [113]:
def get_wine_taste_info(url):
    page = loadString(url)
    soup = BeautifulSoup(page, 'html.parser')
    
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    mydb = myclient["Wines"]
    mycol_4 = mydb["taste_new"]
    
    
    key_list_2 = []
    value_list_2 = []
    taste_dict_list =[]
    
    # Extract main wine ID
    canonical_link = soup.find('link', {'rel': 'canonical'})
    if canonical_link is not None and 'href' in canonical_link.attrs:
        rel_attr = canonical_link['href']
        id_value = rel_attr.rsplit('/', 1)[-1]
        key_list_2.append('ID')
        value_list_2.append(id_value)
    else:
        key_list_2.append('ID')
        number = re.search(r"\d+(?=\.[^.]+$)", url).group()
        value_list_2.append(number)
        
    #Extracting the scale of the lever's indicating light bold, smooth tannic, dry sweet and soft acidic
    if soup.find('div', attrs={'class': 'col mobile-column-6 tablet-column-8 desktop-column-8'}) is not None:
        # Find the wine taste div and get the scale spans
        wine_taste = soup.find('div', attrs={'class': 'col mobile-column-6 tablet-column-8 desktop-column-8'})
        scale = wine_taste.find_all('span', attrs={'class': 'indicatorBar__progress--3aXLX'})
    
        # Convert the scale values to a range of 0-1 and add to the key-value lists
        scale_0_1 = []
        numbers = re.findall(r'left: (\d+\.\d+)%;', str(scale))
        #print(numbers)
        for a in numbers:
            scale_0_1.append(a)

        if len(scale_0_1) > 0 and scale_0_1[0] is not None:
            light_bold_scale = scale_0_1[0]
            key_list_2.append('light_bold_scale')
            value_list_2.append(scale_0_1[0])

        if len(scale_0_1) > 1 and scale_0_1[1] is not None:
            smooth_tannic_scale = scale_0_1[1]
            key_list_2.append('smooth_tannic_scale')
            value_list_2.append(scale_0_1[1])

        if len(scale_0_1) > 2 and scale_0_1[2] is not None:
            dry_sweet_scale = scale_0_1[2]
            key_list_2.append('dry_sweet_scale')
            value_list_2.append(scale_0_1[2])

        if len(scale_0_1) > 3 and scale_0_1[3] is not None:
            soft_acidic_scale = scale_0_1[3]
            key_list_2.append('soft_acidic_scale')
            value_list_2.append(scale_0_1[3])

        # Find the badges and add to the key-value lists
        if soup.find_all('div', attrs={'class': 'tasteNote__mentions--1T_d5'}) is not None:
            badges = soup.find_all('div', attrs={'class': 'tasteNote__mentions--1T_d5'})
            data = []
            for a in badges:
                pattern = r"\n"
                data.append(re.sub(pattern, "", a.text))
            key_list_2.append('badges')
            value_list_2.append(data)
        else:
            key_list_2.append('badges')
            value_list_2.append('NA')

        # Combine the key-value lists into a dictionary
        taste_dict = dict(zip(key_list_2, value_list_2))
        #print(taste_dict)
        taste_dict_list.append(taste_dict)

        # Insert the dictionary as a document into the MongoDB collection mycol_4
        entry = mycol_4.insert_one(taste_dict)
    
    return taste_dict_list


**Download all the webpages from the For 1,00 random wine IDs between 1 and 999,999 (uniform random draws).**

In [85]:
#first download all webpages

import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


random.seed(2295)
random_number =[]
for i in range (0, 100):
    a = random.randint(1, 999999)
    random_number.append(a)
    print(a)
#random_number_2 = [397063, 361183, 885552, 756386, 954349, 556940, 756134, 656315, 162271, 779620, 174574, 213770, 642769, 305759, 885338, 768982, 793271, 711454, 773747, 410105, 465952, 200784, 510239, 385676, 599440, 602458, 894976, 805916, 974816, 45351, 227637, 675388, 219805, 11055, 157417, 768940, 522515, 68878, 135477, 674256, 84905, 957265, 379263, 116300, 536135, 547360, 775990, 833125, 696891, 921200, 242017, 177789, 582982, 913299, 93125, 647668, 135439, 769911, 312479, 625719, 715209, 899069, 466824, 899253, 742949, 31560, 988853, 491982, 644115, 978668, 647836, 555481, 700276, 628344, 670813, 670388, 156200, 38299, 2282, 907473, 327373, 183335, 72098, 892524, 262159, 433957, 603053, 767024, 60612, 352818, 715551, 802672, 704023, 948230, 529349, 429854, 587292, 419844, 771161, 454451, 783332, 857392, 588311, 719398, 665988, 704801, 383930, 84469, 272493, 548650, 867992, 598772, 685534, 242171, 739680, 712743, 239073, 972234, 183166, 968743, 711424, 169757, 60343, 590899, 688084, 680319, 240554, 211607, 545222, 338737, 517958, 347032, 915492, 173346, 722323, 7205, 886738, 522741, 667293, 991862, 747896, 467791, 404090, 838620, 922158, 61774, 679011, 628210, 252762, 427321, 83688, 885434, 524648, 735907, 455610, 286233, 366778, 665651, 763224, 242298, 614090, 250226, 404948, 773246, 360698, 727380, 852328, 724226, 700278, 661228, 11271, 101250, 703357, 160693, 722379, 736362, 728630, 37343, 768427, 783654, 434590, 52543, 583938, 167791] 
#random_number_3 = [95154, 9079, 76916, 99664, 86711, 9896, 98173, 10142, 76891, 86711]
#random_number_1 = [  8879, 5708092, 2061] #20557,
#random_number_4 = [8879]
for i in random_number:
    driver = webdriver.Chrome(executable_path="/Users/jyomohan//Users/jyomohan/Downloads/chromedriver_mac_arm64")
    driver.implicitly_wait(0.5)
    #launch URL
    driver.get("https://www.vivino.com/US-CA/en/w/"+str(i))
    time.sleep(5)
    
    current_url = driver.current_url
    print(current_url)
    
    #Check if the URL exists in the first place print 404 error if it doesnt
    try:
        error_element = driver.find_element(By.CSS_SELECTOR,"div[class ='error-page-text-container']")
        print("404 Error")
    
    except NoSuchElementException:
        #If wine page does exist then print no 404 error
        print("no 404 error")
        wait = WebDriverWait(driver, 10)
        
        wait = WebDriverWait(driver, 20) 
        
        #Wait till the wine_facts table is downloaded then scroll down the page
        for q in range(10):
            try:
                table = driver.find_element(By.XPATH, '//table[@class="wineFacts__wineFacts--2Ih8B"]')
                driver.execute_script("arguments[0].scrollIntoView();", table)
                break
            except NoSuchElementException:
                # If the table element is not found, scroll down and try again
                driver.execute_script("window.scrollBy(0, 500);")
                time.sleep(1)
        
        #Set drive wait time
        driver.implicitly_wait(10)  
        
        #Scroll down to the bottom of the page to make sure all elements in the webpage are loaded
        try:
            for t in range(10):
                driver.execute_script("window.scrollBy(0, 1000)")
                time.sleep(1)

            #(e)  [Click] on “Show more reviews”.
            # Click on “Show more reviews”
            show_more_reviews_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//span[text()='Show more reviews']")))
            show_more_reviews_button.click()
            wait = WebDriverWait(driver, 20)
            
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class ='allReviews__gradientMargin--2mB5_']")))
            driver.implicitly_wait(10)
            
    
            
            # Save the page with both the original page and the review window in the foreground and background respectively
            time.sleep(10)
            url = driver.current_url
            with open('vivino_id_4_'+str(i)+'.html', 'w') as f:
                f.write('<!--URL:' + url + '-->\n' + driver.page_source)
                print('saved')
            
        except: 
            #If show more reviews button is not present save the background webpage
            time.sleep(30)
            url = driver.current_url
            with open('vivino_id_4_'+str(i)+'.html', 'w') as f:
                f.write('<!--URL:' + url + '-->\n' + driver.page_source)
                print('saved')
            
            
          


293058
959304
583279
409398
593608
967756
73600
801993
605044
919383
153123
957836
127384
736635
911303
776715
705756
139198
122875
223419
598746
464278
410510
612396
495736
798941
464728
552689
377900
178241
128907
129441
729014
866975
732580
356176
679666
923038
338618
90270
736538
158544
716807
826873
678565
327889
942026
476957
935450
649408
198200
133861
912793
731272
44372
208239
1447
510055
194087
367776
800481
78089
917831
327766
272897
253684
353308
644495
862248
666988
914807
403116
546748
928117
101098
297068
330984
776306
902524
91822
255668
698314
49788
359645
526509
517432
844736
97235
280759
506912
335417
858458
377306
963804
387840
557861
304327
485967
66995
972327
547329
898498
359105
275399
48121
583386
57777
651421
774658
324418
533356
874299
615472
475265
173713
964820
641675
571875
943750
995777
358952
589388
160211
70436
200740
298388
527946
773028
787032
636508
987962
960787
638682
282079
588404
296103
154736
167078
792063
694858
292120
92643
2118
286647
147041
5

  driver = webdriver.Chrome(executable_path="/Users/jyomohan//Users/jyomohan/Downloads/chromedriver_mac_arm64")


https://www.vivino.com/US-CA/en/w/293058
404 Error
https://www.vivino.com/US-CA/en/w/959304
404 Error
https://www.vivino.com/US-CA/en/w/583279
404 Error
https://www.vivino.com/US-CA/en/w/409398
404 Error
https://www.vivino.com/US-CA/en/w/593608
404 Error
https://www.vivino.com/US-CA/en/w/967756
404 Error
https://www.vivino.com/US-CA/en/odfjell-armador-carmenere/w/73600
no 404 error
saved
https://www.vivino.com/US-CA/en/w/801993
404 Error
https://www.vivino.com/US-CA/en/w/605044
404 Error
https://www.vivino.com/US-CA/en/w/919383
404 Error
https://www.vivino.com/US-CA/en/w/153123
404 Error
https://www.vivino.com/US-CA/en/w/957836
404 Error
https://www.vivino.com/US-CA/en/w/127384
404 Error
https://www.vivino.com/US-CA/en/w/736635
404 Error
https://www.vivino.com/US-CA/en/w/911303
404 Error
https://www.vivino.com/US-CA/en/w/776715
404 Error
https://www.vivino.com/US-CA/en/w/705756
404 Error
https://www.vivino.com/US-CA/en/w/139198
404 Error
https://www.vivino.com/US-CA/en/w/122875
404 Err

https://www.vivino.com/US-CA/en/w/588749
404 Error
https://www.vivino.com/US-CA/en/w/873231
404 Error
https://www.vivino.com/US-CA/en/w/611442
404 Error
https://www.vivino.com/US-CA/en/w/586188
404 Error
https://www.vivino.com/US-CA/en/w/444146
404 Error
https://www.vivino.com/US-CA/en/w/845304
404 Error
https://www.vivino.com/US-CA/en/w/584536
404 Error
https://www.vivino.com/US-CA/en/chateau-combebelle-saint-chinian-syrah-grenache/w/1443566
no 404 error
saved
https://www.vivino.com/US-CA/en/w/716059
404 Error
https://www.vivino.com/US-CA/en/w/980103
404 Error
https://www.vivino.com/US-CA/en/w/976260
404 Error
https://www.vivino.com/US-CA/en/w/847511
404 Error
https://www.vivino.com/US-CA/en/w/708503
404 Error
https://www.vivino.com/US-CA/en/w/913615
404 Error
https://www.vivino.com/US-CA/en/w/737095
404 Error
https://www.vivino.com/US-CA/en/w/721003
404 Error
https://www.vivino.com/US-CA/en/w/44716
404 Error
https://www.vivino.com/US-CA/en/w/511708
404 Error
https://www.vivino.com/US

https://www.vivino.com/US-CA/en/w/814130
404 Error
https://www.vivino.com/US-CA/en/w/970618
404 Error
https://www.vivino.com/US-CA/en/w/396627
404 Error
https://www.vivino.com/US-CA/en/w/726202
404 Error
https://www.vivino.com/US-CA/en/w/591698
404 Error
https://www.vivino.com/US-CA/en/w/321224
404 Error
https://www.vivino.com/US-CA/en/w/285767
404 Error
https://www.vivino.com/US-CA/en/w/714857
404 Error
https://www.vivino.com/US-CA/en/w/334394
404 Error
https://www.vivino.com/US-CA/en/w/846367
404 Error
https://www.vivino.com/US-CA/en/w/946251
404 Error
https://www.vivino.com/US-CA/en/w/465551
404 Error
https://www.vivino.com/US-CA/en/w/748149
404 Error
https://www.vivino.com/US-CA/en/w/623776
404 Error
https://www.vivino.com/US-CA/en/w/81280
404 Error
https://www.vivino.com/US-CA/en/w/778247
404 Error
https://www.vivino.com/US-CA/en/w/157758
404 Error
https://www.vivino.com/US-CA/en/w/503514
404 Error
https://www.vivino.com/US-CA/en/w/366322
404 Error
https://www.vivino.com/US-CA/en/

KeyboardInterrupt: 

In [95]:

#Run the four functions on the save HTML pages 

#random_number_new = [11271, 83688, 61774, 7205, 84469, 2282, 93125, 84905, 11055, 779620, 95171, 76891, 88772, 86711, 9896, 98173, 51864, 77947, 9169, 7541, 91413, 10142, 87538, 94418, 76810, 88311, 15239, 99664, 12397, 12196, 9079, 76916, 2327, 90832, 79647, 95154, 84096, 19775, 94777, 76066, 64814, 15947, 76762, 18365, 80091, 99945, 91377, 1119, 83332, 11004, 97780, 22822, 95312, 94303, 20240, 62554, 4089, 17918, 91195, 80168, 82649, 1013, 21694, 87908, 99251, 87235, 11593, 74154, 39954, 63604, 65174, 22121, 7669, 16790, 96132, 64514, 86816, 21994, 80232, 10421, 4801, 14681, 8114, 19162, 79774, 74884, 23021, 17612, 91351, 5583, 85592, 78208, 79986, 19767, 6203, 23280, 247, 12207, 77300, 5072, 95728, 94587, 85434, 74412, 74319, 92290, 92349, 779597, 9425, 18103, 23370, 78629, 19612, 91473, 79196, 12022, 74004, 77445, 80423, 8879, 20557]
#random_number_new_1 = [11271, 83688, 61774, 7205, 84469]

#random_number is a list we previously created while saving the webpages to save the wine id
for i in random_number:
    if os.path.isfile('vivino_id_4_'+str(i)+'.html'):
            
            scrape_wine = scrape_wine_data('vivino_id_4_'+str(i)+'.html')       
            print(scrape_wine)
            
            wine_reviews = get_wine_reviews('vivino_id_4_'+str(i)+'.html')
            print(wine_reviews)
            
            wine_links = store_links('vivino_id_4_'+str(i)+'.html')
            print(wine_links)
            
            wine_taste = get_wine_taste_info('vivino_id_4_'+str(i)+'.html')
            print(wine_taste)
            
    else:
        continue
        #print("File not found.")

{'ID': '73600', 'Winery_name': 'Odfjell', 'Grapes': ['Carménère'], 'Region_countries': ['Chile'], 'Region': ['Maule Valley', 'Central Valley'], 'wine_styles': ['Chilean Carménère', 'Read more'], 'allergens': 'AllergensContains sulfites', 'Name': 'Armador Carménère', 'avg_rating': '3.7', 'num_rating': '9735 ratings', 'avg_price': '$11.87', 'goes_well_with_obj': ['beef', 'lamb', 'poultry'], 'badges': 'NA', '_id': ObjectId('640ad33efcfe7c8cb88765db')}
[{'ID': '73600', 'num_user_reviews': '126 ratings', 'User Name': 'Andy Kelly ', 'vintage': 'NA', 'star_rating': '4.0', 'USER_ID': 'andy.kelly3', 'Review_Text': 'I love my Chilean wines. Purple/ruby colour in the glass with violet on the nose. Medium bodied with plenty of smoke and hints of pepper spice as it goes down. Just over £10 so its a very cheap wine at the price. Very good carmenere overall!', 'num_like': '6', 'num_comments': '0', '_id': ObjectId('640ad33ffcfe7c8cb88765dd')}, {'ID': '73600', 'num_user_reviews': '1125 ratings', 'User 

{'ID': '1877558', 'Winery_name': 'Staatliche Weinbaudomäne Oppenheim', 'Grapes': ['Silvaner'], 'Region_countries': ['Germany'], 'Region': ['Rheinhessen'], 'wine_styles': ['German Silvaner', 'Read more'], 'allergens': 'AllergensContains sulfites', 'Name': 'Silvaner Trocken', 'avg_rating': 'NA', 'num_rating': 'Not enough ratings', 'avg_price': 'NA', 'goes_well_with_obj': ['veal', 'pork', 'vegetarian', 'poultry'], 'badges': 'NA', '_id': ObjectId('640ad340fcfe7c8cb8876603')}
[{'ID': '1877558', 'num_user_reviews': '1884 ratings', 'User Name': 'Doug ', 'vintage': '2013 vintage', 'star_rating': '3.0', 'USER_ID': 'douglas.me', 'Review_Text': 'For €5? Extremely serviceable.', 'num_like': '2', 'num_comments': '0', '_id': ObjectId('640ad340fcfe7c8cb8876605')}, {'ID': '1877558', 'num_user_reviews': '3111 ratings', 'User Name': 'Eduardo Zen ', 'vintage': '2015 vintage', 'star_rating': '4.0', 'USER_ID': 'eduardozen', 'Review_Text': 'Espetacular ', 'num_like': '22', 'num_comments': '1', '_id': Object

[{'ID': '91822', 'light_bold_scale': '30.3894', 'smooth_tannic_scale': '1.56893', 'dry_sweet_scale': '57.592', 'badges': ['214 mentions of tree fruit notes', '130 mentions of citrus notes', '100 mentions of earthy notes', '29 mentions of yeasty notes', '23 mentions of tropical notes', '22 mentions of ageing notes', '18 mentions of vegetal notes', '12 mentions of floral notes', '11 mentions of oaky notes', '7 mentions of spices notes', '1 mentions of black fruit notes', '1 mentions of red fruit notes'], '_id': ObjectId('640ad340fcfe7c8cb8876623')}]
{'ID': '97235', 'Winery_name': 'Château Cabezac', 'Grapes': ['Carignan'], 'Region_countries': ['France'], 'Region': ['Minervois', 'Languedoc-Roussillon', 'Languedoc'], 'wine_styles': ['Languedoc-Roussillon Red', 'Read more'], 'allergens': 'AllergensContains sulfites', 'Name': 'Cariñu', 'avg_rating': '4.0', 'num_rating': '531 ratings', 'avg_price': '$15.45', 'goes_well_with_obj': ['beef', 'veal'], 'badges': 'NA', '_id': ObjectId('640ad340fcfe7

[{'ID': '1720575', 'light_bold_scale': '42.0618', 'smooth_tannic_scale': '0.357527', 'dry_sweet_scale': '59.1498', 'badges': ['130 mentions of tree fruit notes', '79 mentions of citrus notes', '64 mentions of earthy notes', '17 mentions of vegetal notes', '14 mentions of tropical notes', '12 mentions of ageing notes', '9 mentions of yeasty notes', '9 mentions of oaky notes', '9 mentions of floral notes', '7 mentions of spices notes', '1 mentions of dried fruit notes'], '_id': ObjectId('640ad341fcfe7c8cb887664b')}]
{'ID': '2118', 'Winery_name': 'Kendall-Jackson', 'Grapes': ['Cabernet Sauvignon'], 'Region_countries': ['United States'], 'Region': ['Russian River Valley', 'California', 'North Coast', 'Sonoma County'], 'wine_styles': ['Californian Cabernet Sauvignon', 'Read more'], 'allergens': 'AllergensContains sulfites', 'Name': 'Highland Estates Napa Mountain Cabernet Sauvignon', 'avg_rating': '4.2', 'num_rating': '119 ratings', 'avg_price': '$44.66', 'goes_well_with_obj': ['beef', 'lam

{'ID': '75098', 'Winery_name': 'Cesare Pavese', 'Grapes': ['Moscato'], 'Region_countries': ['Italy'], 'Region': ["Moscato d'Asti", 'Northern Italy', 'Piemonte'], 'wine_styles': ["Italian Moscato d'Asti", 'Read more'], 'allergens': 'AllergensContains sulfites', 'Name': "Moscato d'Asti", 'avg_rating': '4.0', 'num_rating': '195 ratings', 'avg_price': '$7.91', 'goes_well_with_obj': [], 'badges': 'NA', '_id': ObjectId('640ad341fcfe7c8cb8876670')}
[{'ID': '75098', 'num_user_reviews': '74 ratings', 'User Name': 'Evelyn Hoh ', 'vintage': 'NA', 'star_rating': '4.0', 'USER_ID': 'evelyn.hoh', 'Review_Text': "Very sweet even for a moscato d'asti! Nectar, white peach, elderflower notes", 'num_like': '0', 'num_comments': '0', '_id': ObjectId('640ad342fcfe7c8cb8876672')}, {'ID': '75098', 'num_user_reviews': '17 ratings', 'User Name': 'Bendik Fostervoll ', 'vintage': 'NA', 'star_rating': '4.0', 'USER_ID': 'c4d1ef26d02b6200a993fb9632e52bc2', 'Review_Text': 'Fruity and fresh, great wine for starters.', 

[{'ID': '2275745', 'num_user_reviews': '8 ratings', 'User Name': 'Gwen Thomas ', 'vintage': 'NA', 'star_rating': '3.0', 'USER_ID': 'gwen.tho', 'Review_Text': 'Fruity, nice chilled', 'num_like': '0', 'num_comments': '0', '_id': ObjectId('640ad342fcfe7c8cb8876699')}, {'ID': '2275745', 'num_user_reviews': '1777 ratings', 'User Name': 'Texas Novice ', 'vintage': '2012 vintage', 'star_rating': '3.0', 'USER_ID': 'toby.ew', 'Review_Text': 'Run of the mill. Was served room temperature. Not appealing.', 'num_like': '1', 'num_comments': '0', '_id': ObjectId('640ad342fcfe7c8cb887669a')}, {'ID': '2275745', 'num_user_reviews': '196 ratings', 'User Name': 'Daria A. ', 'vintage': '2013 vintage', 'star_rating': '4.0', 'USER_ID': 'daria.allen', 'Review_Text': 'Smooth berry flavors. ', 'num_like': '0', 'num_comments': '0', '_id': ObjectId('640ad342fcfe7c8cb887669b')}, {'ID': '2275745', 'num_user_reviews': '1731 ratings', 'User Name': 'Mike Conca ', 'vintage': '2014 vintage', 'star_rating': '2.0', 'USER_

[]
{'ID': '90933', 'other wine id': ['90931', '2391085', '6214044', '90929'], '_id': ObjectId('640ad343fcfe7c8cb88766bc')}
[]
{'ID': '58494', 'Winery_name': 'Anura', 'Grapes': ['Chardonnay'], 'Region_countries': ['South Africa'], 'Region': ['Stellenbosch', 'Western Cape', 'Coastal Region', 'Paarl'], 'wine_styles': ['South African Chardonnay', 'Read more'], 'allergens': 'AllergensContains sulfites', 'Name': 'Chardonnay', 'avg_rating': '3.6', 'num_rating': '626 ratings', 'avg_price': '$8.43', 'goes_well_with_obj': ['pork', 'vegetarian', 'poultry'], 'badges': 'NA', '_id': ObjectId('640ad343fcfe7c8cb88766bf')}
[{'ID': '58494', 'num_user_reviews': '499 ratings', 'User Name': 'baci e cibo ', 'vintage': 'NA', 'star_rating': '4.0', 'USER_ID': 'legal.cat', 'Review_Text': 'Tasting with Hannah — \nfloral and light on the nose with a lemon butter palate. soft mouthfeel. perfect for rich seafood. guess I’ll have to start changing my mind about chardonnays. ', 'num_like': '3', 'num_comments': '0', '

{'ID': '1201474', 'great alternatives wine id': ['6611013', '1148412', '1864974'], 'other wine id': ['2015996', '430999', '324660', '137093'], '_id': ObjectId('640ad343fcfe7c8cb88766e3')}
[{'ID': '1201474', 'light_bold_scale': '62.1471', 'smooth_tannic_scale': '72.9113', 'dry_sweet_scale': '72.1675', 'badges': ['47 mentions of tree fruit notes', '26 mentions of citrus notes', '24 mentions of ageing notes', '21 mentions of earthy notes', '16 mentions of oaky notes', '15 mentions of yeasty notes', '12 mentions of vegetal notes', '2 mentions of black fruit notes', '1 mentions of spices notes', '1 mentions of floral notes', '1 mentions of red fruit notes', '1 mentions of tropical notes'], '_id': ObjectId('640ad344fcfe7c8cb88766e5')}]
{'ID': '80325', 'Winery_name': 'Alma Negra', 'Grapes': ['Pinot Noir'], 'Region_countries': ['Argentina'], 'Region': ['Mendoza'], 'wine_styles': ['Argentinian Pinot Noir', 'Read more'], 'allergens': 'AllergensContains sulfites', 'Name': 'Pinot Noir', 'avg_ratin