In [1]:
from sentence_transformers import SentenceTransformer, util
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import numpy as np
import requests
import sqlite3
import torch
import json

In [10]:
def load_from_database(
        db: str,
        table: str,
        columns: str,
) -> pd.DataFrame:
    """Read selected columns of one table from a local SQLite file.

    Args:
        db: Base name of the database file; resolved to ``../database/<db>.db``.
        table: Name of the table to read.
        columns: Column list (or ``*``) placed verbatim after ``SELECT``.

    Returns:
        The query result as a DataFrame.

    Note:
        ``table`` and ``columns`` are concatenated into the SQL text as-is
        (identifiers cannot be bound as parameters), so pass trusted values only.
    """
    connection = sqlite3.connect('../database/' + db + '.db')
    try:
        return pd.read_sql_query(
            'SELECT ' + columns + ' FROM ' + table, con=connection
        )
    finally:
        # Close the handle even when the query fails (e.g. unknown table);
        # otherwise the connection stays open for the lifetime of the kernel.
        connection.close()


def save_to_database(
        db: str,
        table: str,
        df: pd.DataFrame
):
    """Write a DataFrame to a table of a local SQLite file, replacing the table.

    Args:
        db: Base name of the database file; resolved to ``../database/<db>.db``.
        table: Name of the target table; an existing table is replaced.
        df: Data to store (the DataFrame index is written as well, which is
            the ``to_sql`` default).

    Raises:
        Any exception raised by ``DataFrame.to_sql`` is propagated unchanged.
    """
    connection = sqlite3.connect('../database/' + db + '.db')
    try:
        df.to_sql(name=table, con=connection, if_exists='replace')
    finally:
        # Release the file handle whether or not the write succeeded.
        connection.close()
    print('DataFrame has been saved successfully to: ' + db)

### Embed review texts

In [None]:
# Pull only the columns needed for embedding: the wine key, the review text
# and the like counter used for filtering below.
df_rev_en_sent = load_from_database(
    db='review_en_sentiment',
    table='review_en_sentiment',
    columns='wine_id, note, likes_count',
)
df_rev_en_sent.head()

In [None]:
# Share of reviews that received at least one like.
len(df_rev_en_sent[df_rev_en_sent['likes_count'] > 0]) / len(df_rev_en_sent)

In [None]:
# Keep only reviews with at least one like; the two prints show the row count
# before and after the filter. Note that df_rev_en_sent is overwritten here.
print(len(df_rev_en_sent))
df_rev_en_sent = df_rev_en_sent[df_rev_en_sent['likes_count'] > 0]
print(len(df_rev_en_sent))

In [None]:
# Use Apple's Metal backend when it is available and fall back to the CPU
# otherwise, so the cell also runs on machines without MPS support.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
embedder = SentenceTransformer.load('../models/zero-shot').to(device)

In [None]:
# Embed every remaining review note in one call; the result is requested as a
# tensor and is indexed per review further below.
test_embeddings = embedder.encode(
    df_rev_en_sent['note'].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True
)

In [None]:
# Renumber the filtered rows 0..n-1 so row labels line up with the rows of
# test_embeddings. drop=True avoids materialising the old index as an extra
# column, and the non-in-place form gives the same result when the cell is re-run.
df_rev_en_sent = df_rev_en_sent.reset_index(drop=True)

In [None]:
# Map each row label to a [wine_id, embedding] pair so an embedding can be
# traced back to the wine it was written about.
embeddings_dict = dict(
    (row_label, [df_rev_en_sent.loc[row_label, 'wine_id'], test_embeddings[row_label]])
    for row_label in df_rev_en_sent.index
)

In [None]:
# Persist the row -> [wine_id, embedding] mapping built above.
torch.save(embeddings_dict, '../database/test_embeddings_likes.pt')

In [None]:
# Persist the raw embedding tensor on its own (without the wine ids).
torch.save(test_embeddings, '../database/test_embeddings_likes_tensors.pt')

### Load pre-embedded reviews

In [None]:
def get_wine_types():
    """Return the lookup table from numeric wine type id to its display name."""
    id_name_pairs = [
        (1, 'Red'),
        (2, 'White'),
        (3, 'Sparkling'),
        (4, 'Rosé'),
        (24, 'Liqueur'),
        (7, 'Dessert'),
    ]
    return dict(id_name_pairs)

In [None]:
def get_wine_data(json: json, soup: BeautifulSoup) -> dict:
    """Collect the display fields of a wine from its review payload and HTML page.

    Args:
        json: Decoded review payload; the wine is read from
            ``json['reviews'][0]['vintage']['wine']``. (The parameter name
            shadows the ``json`` module inside this function.)
        soup: Parsed wine page; the first ``<img class="image">`` supplies the
            image path.

    Returns:
        A dict with the keys 'Name', 'Wine Type', 'Country', 'Region',
        'Winery', 'Main grapes' and 'Image path'.

    Raises:
        KeyError / IndexError: propagated when an expected field, wine type id
            or image tag is missing.
    """
    wine = json['reviews'][0]['vintage']['wine']
    wine_data = {
        'Name': wine['name'],
        'Wine Type': get_wine_types()[int(wine['type_id'])],
        'Country': wine['region']['country']['name'],
        'Region': wine['region']['name'],
        'Winery': wine['winery']['name'],
    }
    # Leading slashes are stripped from the image source.
    img_path = soup.find_all('img', {'class': 'image'})[0]['src'].lstrip('/')
    # The grape names come from the country's 'most_used_grapes' list.
    grape_names = [
        grape['name'] for grape in wine['region']['country']['most_used_grapes']
    ]
    wine_data['Main grapes'] = ', '.join(grape_names)
    wine_data['Image path'] = img_path
    return wine_data

In [None]:
def request_vivino(wine_id: int, timeout: float = 10.0) -> list:
    """Fetch the review JSON and the HTML detail page of one wine from vivino.com.

    Args:
        wine_id: Wine identifier inserted into both request URLs.
        timeout: Seconds to wait for each request before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled connection.

    Returns:
        ``[api_response, html_response]``: the two ``requests.Response`` objects.

    Raises:
        requests.HTTPError: if either response has an error status.
        requests.Timeout: if either request exceeds ``timeout``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15'
    }
    api_response: requests.Response = requests.get(
        url='https://www.vivino.com/api/wines/{}/reviews?per_page=1'.format(wine_id),
        headers=headers,
        timeout=timeout
    )
    api_response.raise_for_status()
    html_response: requests.Response = requests.get(
        url='https://www.vivino.com/w/{}'.format(wine_id),
        headers=headers,
        timeout=timeout
    )
    html_response.raise_for_status()
    return [api_response, html_response]

In [None]:
# Reload the row -> [wine_id, embedding] mapping saved earlier in this notebook.
# torch.load unpickles the file, so only load files from a trusted source.
embeddings_dict: dict = torch.load('../database/test_embeddings_likes.pt')

In [None]:
# Split the mapping into two parallel sequences: a stacked embedding tensor and
# the wine id belonging to each row. Keys are expected to be 0..len-1.
embeddings_list: torch.Tensor = torch.stack(
    [embeddings_dict[row][1] for row in range(len(embeddings_dict))]
)
wine_id_list: list = [embeddings_dict[row][0] for row in range(len(embeddings_dict))]
# The dict is no longer needed once both sequences exist.
del embeddings_dict

In [None]:
# Use Apple's Metal backend when it is available and fall back to the CPU
# otherwise, so the cell also runs on machines without MPS support.
# NOTE(review): the review embeddings in this notebook were produced with
# '../models/zero-shot', while queries are encoded with '../models/simcse_en';
# confirm the stored embeddings come from the same model as this one.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
embedder = SentenceTransformer.load('../models/simcse_en').to(device)

In [None]:
def get_recommendations(query: str, top_n: int = 5) -> list:
    """Return wine details for the ``top_n`` distinct wines best matching ``query``.

    The query is embedded with the module-level ``embedder`` and compared by
    cosine similarity against every row of ``embeddings_list``;
    ``wine_id_list`` maps each row back to its wine. Several rows can belong to
    the same wine, so the candidate window is widened until ``top_n`` distinct
    wine ids are found or every row has been considered.

    Args:
        query: Free-text description of the desired wine.
        top_n: Number of distinct wines to return.

    Returns:
        A list of dicts as produced by ``get_wine_data``, best match first.
        It holds fewer than ``top_n`` entries when fewer distinct wines exist,
        and is empty when ``top_n`` is smaller than 1.

    Raises:
        requests.HTTPError: propagated from ``request_vivino``.
    """
    if top_n < 1:
        return []
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    similarity_scores = util.cos_sim(query_embedding, embeddings_list)[0]
    n_rows = similarity_scores.shape[0]
    k = min(top_n, n_rows)
    while True:
        ranked_rows = torch.topk(similarity_scores, k=k).indices.tolist()
        # dict.fromkeys drops repeated wine ids while keeping the ranking order.
        top_similar_wines = list(
            dict.fromkeys(wine_id_list[row] for row in ranked_rows)
        )
        if len(top_similar_wines) >= top_n or k >= n_rows:
            break
        # Not enough distinct wines yet: look at a larger slice of the ranking.
        k = min(k * 2, n_rows)
    recommendations_list: list = list()
    for wine_id in top_similar_wines[:top_n]:
        api_response, html_response = request_vivino(wine_id=wine_id)
        recommendations_list.append(
            get_wine_data(
                json=api_response.json(),
                soup=BeautifulSoup(html_response.text, 'html.parser')
            )
        )
    return recommendations_list

In [None]:
# Encode an example query with the currently loaded embedder.
query = 'sparkling dry wine fruity'
embedded_query = embedder.encode(query, convert_to_tensor=True)

In [None]:
# Cosine similarity of the query against every stored embedding, then the 300
# best-scoring rows translated back to their wine ids (repeats are possible).
similarity_scores = util.cos_sim(embedded_query, embeddings_list)
top_results = torch.topk(similarity_scores, k=300)
top_similar_wines = [wine_id_list[row] for row in top_results[1][0]]

In [None]:
# Wine id that occurs most often among the top matches. list.count is evaluated
# once per distinct id, so this is quadratic in the number of matches.
id_var = max(set(top_similar_wines), key=top_similar_wines.count)
id_var

In [None]:
# Positions within the ranking at which the most frequent wine appears.
indices = [
    position
    for position, wine_id in enumerate(top_similar_wines)
    if wine_id == id_var
]
indices

In [None]:
# top_results[0] holds the similarity values returned by torch.topk. .item()
# turns each 0-d tensor into a Python float so NumPy can average them no matter
# which device the tensors live on.
probabilities = [top_results[0][0][i].item() for i in indices]
np.round(np.mean(probabilities), decimals=2)

In [None]:
# Pick one wine per consecutive window of the ranking: the most frequent wine id
# in the window that has not been chosen already. The window is 1000 wide when
# the ranking is long enough and shrinks otherwise, so shorter rankings (e.g.
# k=300 above) no longer produce empty windows that make max() raise.
wine_ids = list()
n_windows = 3
window_size = min(1000, max(1, len(top_similar_wines) // n_windows))
lower, upper = 0, window_size
for _window in range(n_windows):
    wines = [wine for wine in top_similar_wines[lower:upper] if wine not in wine_ids]
    if wines:  # a window may hold only wines that were already chosen
        id_var = max(set(wines), key=wines.count)
        wine_ids.append(id_var)
    lower += window_size
    upper += window_size

In [None]:
# Encode the query used for the timing benchmark below.
query = 'light white wine with creamy texture low acidity fruity'
embedded_query = embedder.encode(query, convert_to_tensor=True)

In [None]:
# Benchmark: for growing corpus sizes n, time 50 repetitions of
# "cosine similarity + top-k" and collect the durations per n as timedeltas.
# NOTE(review): slicing silently yields fewer than n rows when embeddings_list
# is shorter than n; confirm the corpus holds at least 1,000,000 embeddings.
time_dict = dict()
k = 100
for n in range(100000, 1000001, 100000):
    embeddings_list_test = embeddings_list[:n]
    time_dict[n] = list()
    for repetition in range(50):
        start = datetime.now()
        similarity_scores = util.cos_sim(a=embedded_query, b=embeddings_list_test)
        # The top-k result is discarded on purpose; binding it to
        # top_similar_wines would overwrite the wine-id list other cells use.
        torch.topk(k=k, input=similarity_scores)
        duration = datetime.now() - start
        time_dict[n].append(duration)

In [None]:
# Inspect the raw timings: one list of 50 timedelta objects per corpus size.
time_dict

In [None]:
# Median duration per corpus size, expressed in microseconds.
# timedelta.microseconds is only the sub-second component (0..999999), so any
# run of one second or longer would be under-reported; total_seconds() covers
# the whole duration.
df_dict = dict()
for key in time_dict.keys():
    df_dict[key] = np.median([i.total_seconds() * 1e6 for i in time_dict[key]])

In [None]:
# Plot styling and output location for the figure saved below.
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 13
plt.rcParams['font.family'] = 'Palatino'
# NOTE(review): absolute, user-specific path; the figure can only be saved on
# a machine where this directory exists.
fig_path = r'/Users/leonbecker/Library/CloudStorage/OneDrive-UniversitätWürzburg/Universität Würzburg/Master Information-Systems/Thesis/LaTex/graphics/'
plt.rcParams['path.simplify'] = True
# The locale call is disabled, so number formatting follows whatever locale the
# process already has even though use_locale is switched on.
# locale.setlocale(locale.LC_NUMERIC, "de_DE")
plt.rcParams['axes.formatter.use_locale'] = True

In [None]:
# One row per corpus size with its median duration.
df = pd.DataFrame(
    {
        'n': df_dict.keys(),
        'duration': df_dict.values(),
    }
)
df.tail()

In [None]:
# Line plot of the median duration (converted from microseconds to seconds)
# against the number of compared embeddings; saved as a PDF under fig_path.
fig, ax = plt.subplots()

ax.plot(df['n'], df['duration']/1000000, c='black', marker='o')
for side in ('right', 'top'):
    # Hide the right and top frame lines.
    ax.spines[side].set_visible(False)
ax.set_ylabel('Dauer in Sekunden (Median)')
ax.set_xlabel('Anzahl der Vergleiche')
plt.tight_layout()
fig.savefig(fig_path + 'duration_torch_topk.pdf')

In [None]:
# Encode 100,000 copies of one dummy sentence and store the resulting tensor.
test_list = embedder.encode(
    ['this is a review text of a beautiful wine'] * 10**5,
    convert_to_tensor=True
)
torch.save(test_list, '../database/tensor100k_test')

In [None]:
# Fetch one wine page directly; the timeout keeps the cell from hanging
# forever if the server stops responding.
r = requests.get(
    url='https://www.vivino.com/w/1299511',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15'
    },
    timeout=10
)

In [None]:
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium import webdriver

# Render the wine page in headless Chrome using a locally stored chromedriver.
options = Options()
options.add_argument('--headless')
# NOTE(review): absolute, user-specific path to the chromedriver binary.
executable_path = '/Users/leonbecker/Library/CloudStorage/OneDrive-UniversitätWürzburg/Universität Würzburg/Master Information-Systems/Thesis/chromedriver_mac_arm64/chromedriver'
# Selenium 4 takes the driver location through a Service object; passing
# executable_path straight to webdriver.Chrome is no longer supported.
driver = webdriver.Chrome(service=Service(executable_path=executable_path), options=options)
try:
    driver.get('https://www.vivino.com/w/1299511')
    html = driver.page_source
finally:
    # Always shut the browser down, otherwise the Chrome process keeps running.
    driver.quit()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html, features='html.parser')

span_var = soup.find_all('span', {'class': 'purchaseAvailabilityPPC__amount--2_4GT'})
a_var = soup.find_all('a', {'class': 'anchor_anchor__m8Qi-'})


In [5]:
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver

# Time a full "start headless Chrome, load page, parse HTML" round trip.
start = datetime.now()

options = Options()
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15'
options.add_argument('--headless')
options.add_argument(f'user-agent={user_agent}')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
try:
    driver.get('https://www.vivino.com/DE/de/antinori-tuscany-marchesi-antinori-tignanello/w/1652')
    html = driver.page_source
finally:
    # Always shut the browser down, otherwise the Chrome process keeps running.
    driver.quit()
soup = BeautifulSoup(html, features='html.parser')

print(datetime.now() - start)

0:00:02.892209


In [2]:
import ai_sommelier_backend

In [None]:
# NOTE(review): the get_wine_data defined earlier in this notebook takes
# (json, soup); this three-argument call presumably targets a different
# implementation (e.g. in ai_sommelier_backend) and should be confirmed.
wine_data = get_wine_data(soup, 86684, 0.95)

In [4]:
# Ask the backend module for recommendations for a sample query (n=3).
ai_sommelier_backend.get_recommendations(query='extraordinary wine for special occasion', n=3)

0.41
0.4
0.41


[{'Id': 86684,
  'Probability': 0.41,
  'Name': 'Brut Champagne',
  'Wine Type': 'Schaumwein',
  'Country': 'Frankreich',
  'Winery': 'Dom Pérignon',
  'Main grapes': 'Chardonnay, Pinot Noir',
  'Image path': 'https://images.vivino.com/thumbs/s5aXYaQiTu-V_xEYI3KXRg_pb_x600.png',
  'Price': 'N.A.',
  'Match': 0},
 {'Id': 1652,
  'Probability': 0.4,
  'Name': 'Tignanello',
  'Wine Type': 'Rotwein',
  'Country': 'Italien',
  'Winery': 'Antinori',
  'Main grapes': 'Sangiovese, Cabernet Sauvignon, Cabernet Franc',
  'Image path': 'https://images.vivino.com/thumbs/LtqnB-H2QFmN6ixL3FQcdw_pb_x600.png',
  'Price': 'N.A.',
  'Match': 0},
 {'Id': 1684223,
  'Probability': 0.41,
  'Name': 'Pauillac (Premier Grand Cru Classé)',
  'Wine Type': 'Rotwein',
  'Country': 'Frankreich',
  'Winery': 'Château Mouton Rothschild',
  'Main grapes': 'Merlot, Cabernet Sauvignon',
  'Image path': 'https://images.vivino.com/thumbs/DNeQQKoHQCyq7BakiBm_zQ_pb_x600.png',
  'Price': 'N.A.',
  'Match': 0}]

In [None]:
# Mean of two sample values, rounded to two decimals and then scaled by 100.
np.round(np.mean([0.49055444, 0.40551655]), decimals=2) * 100

In [None]:
# FIXME: requests.get() needs at least a URL argument; as written this cell
# raises a TypeError.
r = requests.get()

In [None]:
# Show the wine type field of the record fetched above.
wine_data['Wine Type']

In [7]:
# Collect every anchor carrying the given CSS class from the parsed page.
a_var = soup.find_all('a', {'class': 'anchor_anchor__m8Qi-'}) # anchor_anchor__m8Qi-
a_var

[<a class="anchor_anchor__m8Qi- breadCrumbs__link--1TY6b" data-cy="breadcrumb-country" href="/wine-countries/france">Frankreich</a>,
 <a class="anchor_anchor__m8Qi- breadCrumbs__link--1TY6b" data-cy="breadcrumb-region" href="/wine-regions/champagne">Schaumwein</a>,
 <a class="anchor_anchor__m8Qi- breadCrumbs__link--1TY6b" data-cartitemsource="winery-page-wine-page-header" data-cy="breadcrumb-winery" href="/wineries/dom-perignon">Dom Pérignon</a>,
 <a class="anchor_anchor__m8Qi- breadCrumbs__link--1TY6b" data-cartitemsource="breadcrumb-explore" data-cy="breadcrumb-winetype" href="/explore?page=1&amp;wine_type_ids[]=3">Schaumwein</a>,
 <a class="anchor_anchor__m8Qi- breadCrumbs__link--1TY6b" data-cartitemsource="breadcrumb-explore" data-cy="breadcrumb-grape" href="/explore?grape_ids[]=5&amp;grape_ids[]=14&amp;page=1">Cuvée</a>,
 <a class="anchor_anchor__m8Qi- scroll" href="#all_reviews"><div class="vivinoRating_vivinoRating__RbvjH"><div class="vivinoRating_averageValue__uDdPM">4,6</div><

In [None]:
# Text of the first matching anchor; raises IndexError when none is found.
soup.find_all('a', {'class': 'anchor_anchor__m8Qi-'})[0].text

In [None]:
# Text of the first <span class="vintage"> with line breaks removed.
soup.find_all('span', {'class': 'vintage'})[0].text.replace('\n', '')

In [6]:
# Text of the first price row on the page; raises IndexError when the page
# has no such element.
soup.find_all('div', {'class': 'purchaseAvailability__row--S-DoM purchaseAvailability__prices--1WNrU'})[0].text #purchaseAvailability__currentPrice--3mO4u

'€168'

In [None]:
# Markup of the first anchor whose string form contains 'grape'; raises
# IndexError when no anchor matches.
matching = [str(s) for s in a_var if 'grape' in str(s)][0]
matching

In [None]:
import re

In [None]:
# Extract every "grape_ids[]=<number>" parameter from the anchor markup and
# print the numeric id of each. Raw strings keep the regex escapes (\[ and \d)
# from being treated as invalid string escape sequences.
matches = re.findall(r'grape_ids\[]=\d+', matching)
for match in matches:
    print(re.findall(r'\d+', match)[0])

In [2]:
# Request the wine page with a German Accept-Language header; the timeout keeps
# the cell from hanging forever if the server stops responding.
x = requests.get(
    url='https://www.vivino.com/w/1652',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15',
        'Accept-Language': 'de-DE'
    },
    timeout=10
)

In [3]:
# URL of the response; it differs from the requested URL after a redirect.
x.url

'https://www.vivino.com/DE/de/antinori-tuscany-marchesi-antinori-tignanello/w/1652'

In [6]:
# Report whether the parsed page contains at least one price row.
if soup.find_all('div', {'class': 'purchaseAvailability__row--S-DoM purchaseAvailability__prices--1WNrU'}):
    print('We have data!')
else:
    print('Empty!')

We have data!


In [None]:
# Second <link> element of the fetched page. The parser is named explicitly so
# the result does not depend on which optional parsers are installed.
BeautifulSoup(x.text, 'html.parser').find_all('link')[1]

In [139]:
# Fetch one review of a wine from the reviews API; the timeout keeps the cell
# from hanging forever if the server stops responding.
y = requests.get(
    url='https://www.vivino.com/api/wines/7657778/reviews?per_page=1',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15'
    },
    timeout=10
)

In [141]:
# Wine record nested inside the first review of the API response.
y.json()['reviews'][0]['vintage']['wine']

{'id': 7657778,
 'name': 'Kingpin',
 'seo_name': 'kingpin',
 'type_id': 1,
 'vintage_type': 1,
 'is_natural': False,
 'region': {'id': 839,
  'name': 'Vino de España',
  'name_en': 'Vino de Mesa',
  'seo_name': 'vino-de-mesa',
  'country': {'code': 'es',
   'name': 'Spanien',
   'native_name': 'España',
   'seo_name': 'spain',
   'currency': {'code': 'EUR', 'name': 'Euro', 'prefix': '€', 'suffix': None},
   'regions_count': 152,
   'users_count': 2236620,
   'wines_count': 141671,
   'wineries_count': 18046,
   'most_used_grapes': [{'id': 19,
     'name': 'Tempranillo',
     'seo_name': 'tempranillo',
     'has_detailed_info': True,
     'wines_count': 172842},
    {'id': 142,
     'name': 'Garnacha',
     'seo_name': 'garnacha',
     'has_detailed_info': True,
     'wines_count': 58111},
    {'id': 2,
     'name': 'Cabernet Sauvignon',
     'seo_name': 'cabernet-sauvignon',
     'has_detailed_info': True,
     'wines_count': 801751}]},
  'parent_id': None,
  'background_image': None,


In [149]:
# Fetch the highlights of a wine from the API; the timeout keeps the cell from
# hanging forever if the server stops responding.
z = requests.get(
    url='https://www.vivino.com/api/wines/79852/highlights?per_page=1',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15'
    },
    timeout=10
)
# https://www.vivino.com/api/wines/1143613/tastes?language=de

In [150]:
# Style name stored in the metadata of the first highlight.
z.json()['highlights'][0]['metadata']['style']['name']

'Champagne Frankreich'

In [None]:
# Pull the display fields out of the two API payloads. Each response body is
# decoded once instead of calling .json() again for every field.
# NOTE(review): `reviews` and `highlights` are not defined anywhere in the
# visible notebook; presumably they are the reviews/highlights responses
# fetched above (`y` / `z`). Confirm before running.
review_wine = reviews.json()['reviews'][0]['vintage']['wine']
highlight_meta = highlights.json()['highlights'][0]['metadata']

name = review_wine['name']
style = highlight_meta['style']['name']
price = highlight_meta['price']['amount']
country = highlight_meta['style']['country']['name']
grape_list = [grape['name'] for grape in highlight_meta['style']['grapes']]
grapes = ', '.join(grape_list)
region = review_wine['region']['name']
winery = review_wine['winery']['name']

In [None]:
# NOTE(review): `y` was fetched from the reviews endpoint above; confirm that
# the response it holds when this cell runs has a top-level 'grapes' key.
grapes_dict = y.json()['grapes']

In [None]:
# Index the grape records by their 'id' and look up the record with id 5.
{item['id'] : item for item in grapes_dict}[5]

In [None]:
# Extract every "grape_ids[]=<number>" parameter from the anchor markup and
# print the numeric id of each. Raw strings keep the regex escapes (\[ and \d)
# from being treated as invalid string escape sequences.
matches = re.findall(r'grape_ids\[]=\d+', matching)
for match in matches:
    print(re.findall(r'\d+', match)[0])

In [None]:
# Look up the grape record whose id is the number in the first match. The raw
# string keeps \d from being treated as an invalid string escape sequence.
{item['id'] : item for item in grapes_dict}[int(re.findall(r'\d+', matches[0])[0])]

In [None]:
# Search the body of `y` for anchors with the given class. The parser is named
# explicitly so the result does not depend on which optional parsers are installed.
# NOTE(review): `y` was fetched from a JSON API endpoint above; confirm that
# parsing its body as HTML is intended.
BeautifulSoup(y.text, 'html.parser').find_all('a', {'class': 'anchor_anchor__m8Qi-'})

In [None]:
# 'src' attribute of the first <img class="image">; raises IndexError when the
# page has no such image.
soup.find_all('img', {'class': 'image'})[0]['src']

In [None]:
# Round to two decimals first, then scale by 100.
np.round(0.958462, decimals=2) * 100

In [7]:
# Scratch check: a non-empty string literal is truthy, so the branch always runs.
if 'dd':
    print('Alles gut')

Alles gut
