### Import Packages

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import requests
import sqlite3
import random
import time

### Setting up the requests

In [None]:
# Load the pool of User-Agent strings (one per line) that get_request picks from.
# The context manager closes the file handle, and blank lines (e.g. the one
# produced by a trailing newline) are dropped so an empty User-Agent header
# can never be selected.
with open('user_agent_list.txt', 'r') as user_agent_file:
    user_agents = [line.strip() for line in user_agent_file if line.strip()]

In [None]:
def cosine_sim_vectors(vec1, vec2):
    """Return the cosine similarity of two vectors as a single scalar.

    Both inputs must provide ``reshape`` (presumably NumPy arrays - confirm
    against callers). Each one is turned into a one-row matrix because
    ``cosine_similarity`` compares matrices row by row; the only entry of
    the resulting 1x1 matrix is returned.
    """
    row_a, row_b = (vector.reshape(1, -1) for vector in (vec1, vec2))
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

In [None]:
def get_request(request_url, timeout=30):
    """Send a GET request with a randomly chosen User-Agent header.

    Args:
        request_url: URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The ``requests.Response`` on success. When the server answers with
        an error status, the ``requests.exceptions.HTTPError`` raised by
        ``raise_for_status`` is *returned* (not raised); this pre-existing
        contract is kept so existing callers continue to work.

    Raises:
        requests.exceptions.RequestException: For failures other than an
            HTTP error status (connection errors, timeouts, ...).
    """
    request_headers = {
        'user-agent': random.choice(user_agents),
        'Accept-Language': 'en-US,en;q=0.5'
    }
    try:
        response = requests.get(
            url=request_url,
            headers=request_headers,
            timeout=timeout
        )
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # Handed back to the caller instead of propagating (see docstring).
        return e
    return response

In [None]:
def get_wine_ids(json):
    """Collect the distinct wine ids found in an explore response.

    Reads every entry of ``json['explore_vintage']['matches']`` and takes the
    ``id`` of its ``['vintage']['wine']`` mapping; a wine without an ``id``
    key contributes ``None``.
    """
    wine_ids = set()
    for match in json['explore_vintage']['matches']:
        wine = match['vintage']['wine']
        wine_ids.add(wine.get('id'))
    return wine_ids

In [None]:
def get_review_ids(json):
    """Collect the distinct review ids found in a reviews response.

    Every entry of ``json['reviews']`` contributes its ``id``; an entry
    without an ``id`` key contributes ``None``.
    """
    review_ids = set()
    for review in json['reviews']:
        review_ids.add(review.get('id'))
    return review_ids

In [None]:
def is_equal_to_previous(current_ids, previous_ids):
    """Tell whether every id in ``current_ids`` also occurs in ``previous_ids``.

    Returns ``True`` when nothing in ``current_ids`` is new (including when
    ``current_ids`` is empty) and ``False`` as soon as one unseen id exists.
    """
    return all(current_id in previous_ids for current_id in current_ids)

In [None]:
# Numeric wine type ids as defined by vivino.com, keyed by a readable name.

wine_type_ids = dict(
    red=1,
    white=2,
    sparkling_wine=3,
    rose=4,
    liqueur_wine=24,
    dessert_wine=7,
)

### Collecting WINES

In [None]:
# Open (or create) wine.db and make sure the table for the raw wine rows exists.
con_wine = sqlite3.connect('wine.db')
cur_wine = con_wine.cursor()
create_wine_table_sql = (
    'CREATE TABLE IF NOT EXISTS wine('
    'id, name, price, type_id, is_natural, winemaker, region, country, grapes, average_rating, '
    ' has_valid_ratings, acidity, fizziness, intensity, sweetness, tannin, user_structure_count)'
)
cur_wine.execute(create_wine_table_sql)

In [None]:
def _wine_field(record, *path, default=None):
    """Walk ``path`` (keys/indexes) through nested containers.

    Returns ``default`` as soon as a step is missing or the current value
    cannot be subscripted (for example because it is ``None``).
    """
    current = record
    try:
        for step in path:
            current = current[step]
    except (KeyError, IndexError, TypeError):
        return default
    return current


def _grape_names(wine):
    """Return the wine's grape names joined by ', ', or None if unavailable."""
    grapes = _wine_field(wine, 'style', 'grapes')
    try:
        return ', '.join([grape['name'] for grape in grapes])
    except (KeyError, IndexError, TypeError):
        # Missing/None grape list, a grape without a name, or a non-string name.
        return None


def save_wines(json, wine_type):
    """Store every wine of an explore response in the ``wine`` table.

    Args:
        json: Decoded explore response; ``json['explore_vintage']['matches']``
            must exist and hold one entry per wine.
        wine_type: Value written to the ``type_id`` column of every row.

    Each optional field that is missing from an entry is stored as ``None``
    (``False`` for ``is_natural`` and ``has_valid_ratings``). All rows are
    inserted through the module-level ``cur_wine`` cursor and committed on
    ``con_wine``.

    Raises:
        KeyError: If ``json`` has no ``['explore_vintage']['matches']``.
    """
    wine_data = []
    for match in json['explore_vintage']['matches']:
        vintage = _wine_field(match, 'vintage')
        wine = _wine_field(vintage, 'wine')
        structure = _wine_field(wine, 'taste', 'structure')

        wine_data.append((
            _wine_field(wine, 'id'),
            _wine_field(wine, 'name'),
            _wine_field(match, 'price', 'amount'),
            wine_type,
            _wine_field(wine, 'is_natural', default=False),
            _wine_field(wine, 'winery', 'name'),
            _wine_field(wine, 'region', 'name'),
            _wine_field(wine, 'region', 'country', 'name'),
            _grape_names(wine),
            _wine_field(vintage, 'statistics', 'wine_ratings_average'),
            _wine_field(vintage, 'has_valid_ratings', default=False),
            _wine_field(structure, 'acidity'),
            _wine_field(structure, 'fizziness'),
            _wine_field(structure, 'intensity'),
            _wine_field(structure, 'sweetness'),
            _wine_field(structure, 'tannin'),
            _wine_field(structure, 'user_structure_count'),
        ))
    cur_wine.executemany('INSERT INTO wine VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', wine_data)
    con_wine.commit()

In [None]:
url_1 = 'https://www.vivino.com/api/explore/explore?min_rating=1&price_range_max=1000&price_range_min=0&wine_type_ids[]={}&page={}'
url_2 = 'https://www.vivino.com/api/explore/explore?min_rating=1&price_range_max=1000&price_range_min=0&wine_type_ids[]={}&page={}&order_by=ratings_average&order=asc'

# Crawl the explore endpoint page by page for every wine type.
# A failing page is retried once and then skipped. Failures are recorded in
# request_errors under (wine_type, page) so that an error for one wine type
# does not suppress the retry for another type's page with the same number.
request_errors = {}
# Stop crawling a wine type after this many skipped pages in a row; without a
# bound an unreachable endpoint would make the loop skip pages forever.
max_consecutive_skipped_pages = 5
for wine_type in wine_type_ids.values():
    page = 1
    previous_wine_ids = set()
    consecutive_skipped_pages = 0
    while True:
        url = url_2.format(wine_type, page)
        try:
            response = get_request(request_url=url)
            # get_request hands back (rather than raises) the HTTPError for an
            # error status; raise it here so one handler covers every failure.
            if isinstance(response, Exception):
                raise response
            payload = response.json()  # decoded once and reused below
        except (requests.exceptions.RequestException, ValueError) as error:
            time.sleep(random.randint(0, 1))
            error_key = (wine_type, page)
            if error_key not in request_errors:
                # First failure for this page: remember the status (None when
                # no response was received) and retry the same page.
                failed_response = getattr(error, 'response', None)
                request_errors[error_key] = getattr(failed_response, 'status_code', None)
                continue
            # Second failure for this page: give up on it and move on.
            page += 1
            consecutive_skipped_pages += 1
            if consecutive_skipped_pages >= max_consecutive_skipped_pages:
                print('Giving up on wine type ' + str(wine_type) + ' after repeated request errors.')
                break
            continue
        consecutive_skipped_pages = 0
        current_wine_ids = get_wine_ids(payload)
        # The crawl for this type ends on an empty page or when a page brings
        # no wine id that the previously saved page did not already contain.
        if len(payload['explore_vintage']['matches']) == 0 or (
                page > 1 and is_equal_to_previous(
                    current_ids=current_wine_ids, previous_ids=previous_wine_ids)):
            print('No more entries for wine type ' + str(wine_type) + ' at Page ' + str(page) + ' could be found.')
            break
        save_wines(payload, wine_type)
        previous_wine_ids = current_wine_ids
        page += 1
        time.sleep(random.randint(0, 1))
con_wine.close()

In [None]:
# Connection to the database holding the collected wine rows.
con_wine = sqlite3.connect('wine.db')
cur_wine = con_wine.cursor()

# Connection to a second database; make sure its wine_distinct table exists.
con_distinct = sqlite3.connect('wine_distinct.db')
cur_distinct = con_distinct.cursor()
create_distinct_table_sql = (
    'CREATE TABLE IF NOT EXISTS wine_distinct('
    'id, name, price, type_id, is_natural, winemaker, region, country, '
    'grapes, average_rating, has_valid_ratings)'
)
cur_distinct.execute(create_distinct_table_sql)


In [None]:
# Load every row of the wine table into a DataFrame.
wine_query = "SELECT * from wine"
df = pd.read_sql_query(sql=wine_query, con=con_wine)

In [None]:
# Display the column labels of the loaded DataFrame (notebook cell output).
df.columns

In [None]:
# Collapse rows sharing a wine id into one row each: the price is averaged
# over the group, every other column uses pandas' 'first' aggregation.
aggregation_by_column = {
    column: 'first'
    for column in (
        'name', 'price', 'type_id', 'is_natural', 'winemaker', 'region',
        'country', 'grapes', 'average_rating', 'acidity', 'fizziness',
        'intensity', 'sweetness', 'tannin', 'user_structure_count',
        'has_valid_ratings',
    )
}
aggregation_by_column['price'] = 'mean'  # keeps its position in the column order
grouped_by_id = df.groupby(by='id')
df = grouped_by_id.agg(aggregation_by_column).reset_index()

In [None]:
# Round the price column to two decimal places.
rounded_price = df['price'].round(2)
df['price'] = rounded_price

In [None]:
# Replace the wine_distinct table with the aggregated wines.
# index=False: to_sql writes the DataFrame index as an additional column by
# default, which would add a meaningless positional "index" column to the table.
df.to_sql('wine_distinct', con=con_distinct, if_exists='replace', index=False)
con_distinct.close()
con_wine.close()

### Collecting REVIEWS

In [None]:
# review.db: one row per collected review.
con_rev = sqlite3.connect('review.db')
cur_rev = con_rev.cursor()
create_review_table_sql = (
    'CREATE TABLE IF NOT EXISTS review('
    'id, wine_id, rating, note, created_at, likes_count, comments_count, scan_image_path, lan_code, user_id)'
)
cur_rev.execute(create_review_table_sql)

# user.db: one row per user; the primary key on id rejects duplicate users.
con_user = sqlite3.connect('user.db')
cur_user = con_user.cursor()
create_user_table_sql = (
    'CREATE TABLE IF NOT EXISTS user('
    'id INTEGER NOT NULL PRIMARY KEY, seo_name, is_featured, followers_count, followings_count, ratings_count,'
    'reviews_count, purchase_order_count, language)'
)
con_user.execute(create_user_table_sql)

# con_wine_dis = sqlite3.connect('wine_distinct.db')
# df = pd.read_sql_query('SELECT id from wine_distinct', con_wine_dis)

In [None]:
def _user_field(record, *path, default=None):
    """Walk ``path`` (keys/indexes) through nested containers.

    Returns ``default`` as soon as a step is missing or the current value
    cannot be subscripted (for example because it is ``None``).
    """
    current = record
    try:
        for step in path:
            current = current[step]
    except (KeyError, IndexError, TypeError):
        return default
    return current


def save_users(json):
    """Store the author of every review in ``json['reviews']`` in the ``user`` table.

    Args:
        json: Decoded reviews response; ``json['reviews']`` must exist.

    Reviews without a user id are skipped. Missing optional fields are stored
    as ``None`` (``False`` for ``is_featured``). A user whose id is already in
    the table is left untouched. Rows are inserted through the module-level
    ``cur_user`` cursor and committed once on ``con_user``.

    Raises:
        KeyError: If ``json`` has no ``'reviews'`` key.
    """
    for review in json['reviews']:
        user = _user_field(review, 'user')
        user_id = _user_field(user, 'id')
        if user_id is None:
            # A NULL id cannot identify a user; SQLite would substitute an
            # auto-generated rowid for it in an INTEGER PRIMARY KEY column.
            continue
        statistics = _user_field(user, 'statistics')
        user_row = (
            user_id,
            _user_field(user, 'seo_name'),
            _user_field(user, 'is_featured', default=False),
            _user_field(statistics, 'followers_count'),
            _user_field(statistics, 'followings_count'),
            _user_field(statistics, 'ratings_count'),
            _user_field(statistics, 'reviews_count'),
            _user_field(statistics, 'purchase_order_count'),
            _user_field(user, 'language'),
        )
        try:
            cur_user.execute('INSERT INTO user VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)', user_row)
        except sqlite3.IntegrityError:
            # Expected: the primary key rejects a user that is already stored.
            continue
        except sqlite3.Error as error:
            # Stay best-effort for a single bad row, but do not hide the problem.
            print('Could not store user ' + str(user_id) + ': ' + str(error))
    con_user.commit()

In [None]:
def _review_field(record, *path, default=None):
    """Walk ``path`` (keys/indexes) through nested containers.

    Returns ``default`` as soon as a step is missing or the current value
    cannot be subscripted (for example because it is ``None``).
    """
    current = record
    try:
        for step in path:
            current = current[step]
    except (KeyError, IndexError, TypeError):
        return default
    return current


def save_reviews(json, wine_id):
    """Store every review in ``json['reviews']`` in the ``review`` table.

    Args:
        json: Decoded reviews response; ``json['reviews']`` must exist.
        wine_id: Value written to the ``wine_id`` column of every row.

    Missing optional fields are stored as ``None``. Rows that are identical
    in every column are written only once, and rows are inserted in the order
    in which they first appear in the response. Inserts go through the
    module-level ``cur_rev`` cursor and are committed on ``con_rev``.

    Raises:
        KeyError: If ``json`` has no ``'reviews'`` key.
    """
    review_rows = []
    for review in json['reviews']:
        image_location = _review_field(review, 'vintage', 'image', 'location')
        # Only a string location can be normalised; anything else counts as missing.
        scan_image_path = image_location.strip("/") if isinstance(image_location, str) else None

        review_rows.append((
            _review_field(review, 'id'),
            wine_id,
            _review_field(review, 'rating'),
            _review_field(review, 'note'),
            _review_field(review, 'created_at'),
            _review_field(review, 'activity', 'statistics', 'likes_count'),
            _review_field(review, 'activity', 'statistics', 'comments_count'),
            scan_image_path,
            _review_field(review, 'language'),
            _review_field(review, 'user', 'id'),
        ))
    # dict.fromkeys drops exact duplicates while keeping first-seen order,
    # which makes the insertion order deterministic.
    review_data = list(dict.fromkeys(review_rows))
    cur_rev.executemany('INSERT INTO review VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', review_data)
    con_rev.commit()

In [None]:
# Crawl the reviews endpoint page by page for every selected wine id.
# A failing page is retried once and then skipped. Failures are recorded in
# request_errors under (wine_id, page) so that an error for one wine does not
# suppress the retry for another wine's page with the same number.
request_errors = {}
# Stop crawling a wine after this many skipped pages in a row; without a
# bound an unreachable endpoint would make the loop skip pages forever.
max_consecutive_skipped_pages = 5
# NOTE(review): new_ids is not defined in the visible part of the notebook -
# confirm it is created before this cell runs.
for wine_id in df[new_ids].id:
    page = 1
    previous_review_ids = set()
    consecutive_skipped_pages = 0
    while True:
        url = 'https://www.vivino.com/api/wines/{}/reviews?per_page=50&page={}&language=en'.format(wine_id, page)
        try:
            response = get_request(request_url=url)
            # get_request hands back (rather than raises) the HTTPError for an
            # error status; raise it here so one handler covers every failure.
            if isinstance(response, Exception):
                raise response
            payload = response.json()  # decoded once and reused below
        except (requests.exceptions.RequestException, ValueError) as e:
            print('Exception: ' + str(e) + ' at page: ' + str(page) + ' for wine (id) ' + str(wine_id))
            time.sleep(random.randint(0, 1))
            error_key = (wine_id, page)
            if error_key not in request_errors:
                # First failure for this page: remember the status (None when
                # no response was received) and retry the same page.
                failed_response = getattr(e, 'response', None)
                request_errors[error_key] = getattr(failed_response, 'status_code', None)
                continue
            # Second failure for this page: give up on it and move on.
            page += 1
            consecutive_skipped_pages += 1
            if consecutive_skipped_pages >= max_consecutive_skipped_pages:
                print('Giving up on wine id ' + str(wine_id) + ' after repeated request errors.')
                break
            continue
        consecutive_skipped_pages = 0
        current_review_ids = get_review_ids(payload)
        # The crawl for this wine ends on an empty page or when a page brings
        # no review id that the previously saved page did not already contain.
        if len(payload['reviews']) == 0 or (
                page > 1 and is_equal_to_previous(
                    current_ids=current_review_ids, previous_ids=previous_review_ids)):
            print('No more entries for wine id ' + str(wine_id) + ' at Page ' + str(page) + ' could be found.')
            break
        save_reviews(json=payload, wine_id=wine_id)
        save_users(json=payload)
        previous_review_ids = current_review_ids
        page += 1
        time.sleep(random.randint(0, 1))
# NOTE(review): only con_rev is closed here; con_user stays open - confirm
# whether a later cell still needs it.
con_rev.close()

In [None]:
# Reopen review.db, load every review row into a DataFrame and preview it.
con_rev = sqlite3.connect('review.db')
review_query = "SELECT * from review"
df = pd.read_sql_query(sql=review_query, con=con_rev)
df.head()

In [None]:
# Keep one row per review id and replace the review table with the result.
# index=False: to_sql writes the DataFrame index as an additional column by
# default, which would leave the table with an 11th column and break the
# 10-value INSERT statement used by save_reviews on any later collection run.
df = df.drop_duplicates(subset='id')
df.to_sql('review', con=con_rev, if_exists='replace', index=False)

In [None]:
# Close the connection to review.db.
con_rev.close()