### Import Packages

In [1]:
import pandas as pd
import requests
import sqlite3
import random
import time

### Setting up the requests

In [2]:
user_agents = open('user_agent_list.txt', 'r').read().split('\n')

In [34]:
def get_request(request_url):
    response = requests.get(
        url=request_url,
        headers={
            'user-agent': random.choice(user_agents),
            'Accept-Language': 'en-US,en;q=0.5'
        }
    )
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        return "Error: " + str(e)
    return response

In [4]:
# Define wine type ids as defined by vivino.com

wine_type_ids = {
    'red': 1,
    'white': 2,
    'sparkling_wine': 3,
    'rose': 4,
    'liqueur_wine': 24,
    'dessert_wine': 7
}

### Collecting WINES

In [5]:
con = sqlite3.connect('wine.db')
cur = con.cursor()
cur.execute('CREATE TABLE wine('
            'id, name, price, type_id, is_natural, winemaker, region, country, '
            'grapes, average_rating, has_valid_ratings)')

<sqlite3.Cursor at 0x15ad09c00>

In [6]:
def save_wines(json, wine_type):
    wine_data = []
    for i in range(len(json['explore_vintage']['matches'])):
        grapes = []
        try:
            for grape_index in range(
                    len(json['explore_vintage']['matches'][i]['vintage']['wine']['style']['grapes'])):
                grapes.append(
                    json['explore_vintage']['matches'][i]['vintage']['wine']['style']['grapes'][grape_index][
                        'name'])
            grapes = ', '.join(grapes)
        except:
            grapes = None

        # id
        try:
            var_id = json['explore_vintage']['matches'][i]['vintage']['wine']['id']
        except:
            var_id = None

        # name
        try:
            var_name = json['explore_vintage']['matches'][i]['vintage']['wine']['name']
        except:
            var_name = None

        # price
        try:
            var_price = json['explore_vintage']['matches'][i]['price']['amount']
        except:
            var_price = None

        # is_natural
        try:
            var_is_natural = json['explore_vintage']['matches'][i]['vintage']['wine']['is_natural']
        except:
            var_is_natural = False

        # winery
        try:
            var_winery = json['explore_vintage']['matches'][i]['vintage']['wine']['winery']['name']
        except:
            var_winery = None

        # region
        try:
            var_region = json['explore_vintage']['matches'][i]['vintage']['wine']['region']['name']
        except:
            var_region = None

        # country
        try:
            var_country = json['explore_vintage']['matches'][i]['vintage']['wine']['region']['country']['name']
        except:
            var_country = None

        # wine_ratings_average
        try:
            var_wine_ratings_average = json['explore_vintage']['matches'][i]['vintage']['statistics'][
                'wine_ratings_average']
        except:
            var_wine_ratings_average = None

        # has_valid_ratings
        try:
            var_has_valid_ratings = json['explore_vintage']['matches'][i]['vintage']['has_valid_ratings']
        except:
            var_has_valid_ratings = False

        wine_data.append(
            (var_id, var_name, var_price, wine_type, var_is_natural, var_winery, var_region, var_country, grapes,
             var_wine_ratings_average, var_has_valid_ratings)
        )
    cur.executemany('INSERT INTO wine VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', wine_data)
    con.commit()

In [79]:
request_errors = {}
for wine_type in wine_type_ids.values():
    page = 1
    while True:
        url = 'https://www.vivino.com/api/explore/explore?wine_type_ids[]={}&page={}'.format(wine_type, page)
        try:
            response = get_request(request_url=url)
            if len(response.json()['explore_vintage']['matches']) == 0:
                break
        except:
            print('Requesting wine type ' + str(wine_type) + ' failed (Page ' + str(page) + ')')
            request_errors[page] = response = get_request(request_url=url).status_code
            break
        # Continue with next wine type if all wines have been collected.
        save_wines(response.json(), wine_type)
        time.sleep(random.randint(2, 10))
        page += 1
con.close()

NameError: name 'save_wines' is not defined

In [8]:
con = sqlite3.connect('wine.db')
cur = con.cursor()

In [9]:
df = pd.read_sql_query("SELECT * from wine", con)

In [10]:
df = df.groupby(by='id').agg({'name': 'first', 'price': 'mean', 'type_id': 'first', 'is_natural': 'first',
                              'winemaker': 'first', 'region': 'first', 'country': 'first', 'grapes': 'first',
                              'average_rating': 'first',
                              'has_valid_ratings': 'first'}).reset_index()

In [15]:
df['price'] = df['price'].round(decimals=2)

In [16]:
df.to_sql('wine', con=con, if_exists='replace')

6513

In [17]:
df

Unnamed: 0,id,name,price,type_id,is_natural,winemaker,region,country,grapes,average_rating,has_valid_ratings
0,590,Chardonnay,29.95,2,0,Bernardus,Monterey County,Vereinigte Staaten,Chardonnay,4.3,1
1,1151,Brut (Chardonnay - Pinot Noir),16.90,3,0,Graham Beck,Robertson,Südafrika,,3.8,1
2,1155,Brut Rosé (Chardonnay - Pinot Noir),23.85,4,0,Graham Beck,Robertson,Südafrika,,3.9,1
3,1156,Brut Blanc De Blancs,23.70,3,0,Graham Beck,Robertson,Südafrika,,3.8,1
4,1213,Vin Santo di Montepulciano,158.00,7,0,Avignonesi,Vin Santo di Montepulciano,Italien,,4.6,1
...,...,...,...,...,...,...,...,...,...,...,...
6508,10770481,M Minuty Léamati,18.20,4,0,Minuty,Côtes de Provence,Frankreich,"Shiraz/Syrah, Grenache, Cinsault",4.1,1
6509,10831702,PN TX Brut Champagne,119.00,3,0,Bollinger,Schaumwein,Frankreich,"Chardonnay, Pinot Noir, Pinot Meunier",4.3,0
6510,10845026,Amai Susumaniello Rosé,7.90,4,0,San Marzano,Salento,Italien,,4.2,0
6511,10847847,Rosé Non-Vintage,33.56,3,0,Wiston,Sussex,Vereinigtes Königreich,"Chardonnay, Pinot Noir, Pinot Meunier",4.0,1


### Collecting REVIEWS

In [None]:
con = sqlite3.connect('reviews.db')
cur = con.cursor()
cur.execute('CREATE TABLE reviews('
            'id, name, price, type_id, is_natural, winemaker, region, country, '
            'grapes, average_rating, has_valid_ratings)')

In [82]:
test_response = get_request('https://www.vivino.com/api/wines/5418700/reviews?per_page=10&page=1&year=2018&language=en').json()
test_response['reviews'][0]

{'id': 231146226,
 'rating': 4.5,
 'note': 'Perfectly balanced red whine, ideal for a poultry, smooth cherry, plum and black cherry taste, light honey taste, light acidity, full body ',
 'language': 'en',
 'created_at': '2022-01-07T19:12:27.000Z',
 'aggregated': True,
 'user': {'id': 43388773,
  'seo_name': 'vladislav-ilyin',
  'alias': 'Vladislav Ilyin',
  'is_featured': False,
  'visibility': 'all',
  'image': {'location': '//images.vivino.com/avatars/6t5yiPlYTPiVb1BsUk9L6w.jpg',
   'variations': {'large': '//thumbs.vivino.com/avatars/6t5yiPlYTPiVb1BsUk9L6w_300x300.jpg',
    'small_square': '//thumbs.vivino.com/avatars/6t5yiPlYTPiVb1BsUk9L6w_50x50.jpg'}},
  'statistics': {'followers_count': 3,
   'followings_count': 1,
   'ratings_count': 226,
   'ratings_sum': 871.2,
   'reviews_count': 168,
   'purchase_order_count': 0},
  'language': 'en',
  'background_image': {'location': '//images.vivino.com/users/backgrounds/default_1.jpg',
   'variations': {'large': '//images.vivino.com/users

In [88]:
i = 0
test_response['reviews'][i]['vintage']['image']

{'location': '//images.vivino.com/labels/uhTmauiHRzCAZ0wlfBAsVQ.jpg',
 'variations': {'large': '//images.vivino.com/thumbs/uhTmauiHRzCAZ0wlfBAsVQ_375x500.jpg',
  'medium': '//images.vivino.com/thumbs/uhTmauiHRzCAZ0wlfBAsVQ_150x200.jpg',
  'medium_square': '//images.vivino.com/thumbs/uhTmauiHRzCAZ0wlfBAsVQ_150x150.jpg',
  'small_square': '//images.vivino.com/thumbs/uhTmauiHRzCAZ0wlfBAsVQ_80x80.jpg'}}

In [65]:
def save_reviews(json, wine_id):
    review_data = []
    for i in range(len(json['reviews'])):

        # id
        try:
            var_id = json['reviews'][i]['id']
        except:
            var_id = None

        # rating
        try:
            rating = json['reviews'][i]['rating']
        except:
            rating = None

        # note
        try:
            note = json['reviews'][i]['note']
        except:
            note = None

        # created_at
        try:
            created_at = json['reviews'][i]['created_at']
        except:
            created_at = None

        # likes_count
        try:
            likes_count = json['reviews'][i]['activity']['statistics']['likes_count']
        except:
            likes_count = None

        # comments_count
        try:
            comments_count = json['reviews'][i]['activity']['statistics']['comments_count']
        except:
            comments_count = None

        # scan_image_path
        try:
            scan_image_path = test_response['reviews'][i]['vintage']['image']['location'].strip("/")
        except:
            scan_image_path = None

        review_data.append(
            (var_id, wine_id, rating, note, created_at, likes_count, comments_count, scan_image_path)
        )
    cur.executemany('INSERT INTO wine VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', review_data)
    con.commit()

In [22]:
for wine_id in df.id:
    page = 1
    while True:
        url = 'https://www.vivino.com/api/wines/{}}/reviews?per_page=10&page={}&language=en'.format(wine_id, page)
        try:
            response = get_request(request_url=url)
            if len(response.json()['explore_vintage']['matches']) == 0:
                break
        except:
            print('Requesting wine type ' + str(wine_type) + ' failed (Page ' + str(page) + ')')
            request_errors[page] = response
            break
        save_reviews(json=response.json(), wine_id=wine_id)
        time.sleep(random.randint(2, 10))
        page += 1
con.close()

590
1151
1155
1156
1213
1263
1264
1455
1479
1651
1652
1658
1687
1688
1690
1697
1799
1835
1838
1868
1869
1977
1979
2066
2634
2635
2784
2846
2871
2900
2923
2930
2996
3069
3110
3405
3545
3551
3715
3777
3908
3989
3990
4049
4086
4125
4368
4450
4848
4902
4905
4907
4965
4970
4972
4977
4992
5009
5010
5012
5063
5078
5103
5104
5115
5194
5211
5273
5274
5433
5602
5786
6019
6020
6211
6285
6339
6636
6678
6693
6697
6703
6892
6935
7001
7073
7082
7110
7133
7273
7490
7494
7495
7774
7786
7935
7971
8023
8025
8039
8198
8303
8305
8307
8549
8551
8781
8971
8998
9063
9081
9121
9122
9220
9237
9240
9243
9324
9398
9712
9748
9752
9796
9826
9921
9924
9958
9998
10021
10022
10023
10107
10108
10151
10175
10196
10553
10607
10873
10922
10975
10989
11001
11036
11038
11082
11157
11170
11176
11199
11214
11245
11563
11605
11607
11781
11785
11830
11861
11890
11943
11951
11953
11956
11979
11981
12008
12266
12318
12322
12393
12437
12574
12864
13268
13293
13296
13297
13300
13303
13417
13563
13759
13924
13971
13978
14119
14123
1

In [78]:
request_errors[86].status_code()

TypeError: 'int' object is not callable