In [1]:
!pip install requests
!pip install pandas



In [2]:
import requests
import pandas as pd
import time

In [3]:
# Define the Wine Request
def get_wine_data(wine_id, year, page, retries=3, wait_time=2):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    }

    # Vivino User Comments API
    api_url = "https://www.vivino.com/api/wines/{id}/reviews?per_page=50&year={year}&page={page}"

    for attempt in range(retries):
        try:
            response = requests.get(api_url.format(id=wine_id, year=year, page=page), headers=headers)

            if response.status_code != 200:
                print(f"Error: Received response with status code {response.status_code}")
                raise requests.exceptions.RequestException

            data = response.json()
            return data

        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")

            if attempt < retries - 1:
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print("Max retries reached. Returning None.")
                return None

In [5]:
Wine_results = []

# Select the pages
# Collect the wines in first 50 pages in 2024/10/1
for page_num in range(1,51):
    print(f"Getting data from page {page_num}")
    r = requests.get(
        "https://www.vivino.com/api/explore/explore",
        params={
            # "country_code": "FR",
            # "country_codes[]": "pt",
            "currency_code": "EUR",
            # "grape_filter": "varietal",
            "min_rating": "1",
            "order_by": "price",
            "order": "asc",
            "page": page_num,
            "price_range_max": "2000",
            "price_range_min": "0",
            # "wine_type_ids[]": "1",
        },
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
        },
    )

    # Store information in list
    results = [
    (
        t["vintage"]["wine"]["winery"]["name"],  #
        t["vintage"]["year"],
        t["vintage"]["wine"]["id"],  # wine id
        f'{t["vintage"]["wine"]["name"]} {t["vintage"]["year"]}',  # full name
        t["vintage"]["statistics"]["ratings_average"],  # average_ratings
        t["vintage"]["statistics"]["ratings_count"],  # ratings_count

        # Basic information
        t["prices"][0]["amount"] if t.get("prices") and len(t["prices"]) > 0 else None,
        t["vintage"]["wine"]["style"]["acidity"] if t["vintage"]["wine"]["style"] else None,
        t["vintage"]["wine"]["style"]["blurb"] if t["vintage"]["wine"]["style"] else None,
        t["vintage"]["wine"]["style"]["body"] if t["vintage"]["wine"]["style"] else None,
        t["vintage"]["wine"]["style"]["body_description"] if t["vintage"]["wine"]["style"] else None,
        t['vintage']['wine']['region']['country']['name'] if t.get('vintage') and t['vintage'].get('wine') and t['vintage']['wine'].get('region') and t['vintage']['wine']['region'].get('country') else None,
        t['vintage']['wine']['style']['description'] if t["vintage"]["wine"]["style"] else None,

        # Top3 food
        t['vintage']['wine']['style']['food'][0]['name'] if t["vintage"]["wine"]["style"] and t["vintage"]["wine"]["style"].get('food') and len(t['vintage']['wine']['style']['food']) > 0 else None,
        t['vintage']['wine']['style']['food'][1]['name'] if t["vintage"]["wine"]["style"] and t["vintage"]["wine"]["style"].get('food') and len(t['vintage']['wine']['style']['food']) > 1 else None,
        t['vintage']['wine']['style']['food'][2]['name'] if t["vintage"]["wine"]["style"] and t["vintage"]["wine"]["style"].get('food') and len(t['vintage']['wine']['style']['food']) > 2 else None,

        # Top3 grapes
        t['vintage']['wine']['region']['country']['most_used_grapes'][0]['name'] if t.get('vintage') and t['vintage'].get('wine') and t['vintage']['wine'].get('region') and t['vintage']['wine']['region'].get('country') and t['vintage']['wine']['region']['country'].get('most_used_grapes') and len(t['vintage']['wine']['region']['country']['most_used_grapes']) > 0 else None,
        t['vintage']['wine']['region']['country']['most_used_grapes'][1]['name'] if t.get('vintage') and t['vintage'].get('wine') and t['vintage']['wine'].get('region') and t['vintage']['wine']['region'].get('country') and t['vintage']['wine']['region']['country'].get('most_used_grapes') and len(t['vintage']['wine']['region']['country']['most_used_grapes']) > 1 else None,
        t['vintage']['wine']['region']['country']['most_used_grapes'][2]['name'] if t.get('vintage') and t['vintage'].get('wine') and t['vintage']['wine'].get('region') and t['vintage']['wine']['region']['country'].get('most_used_grapes') and len(t['vintage']['wine']['region']['country']['most_used_grapes']) > 2 else None,

        # Sturcture?
        t['vintage']['wine']['taste']['structure']['acidity'] if t["vintage"]["wine"]["taste"].get('structure') else None,
        t['vintage']['wine']['taste']['structure']['calculated_structure_count'] if t["vintage"]["wine"]["taste"].get('structure') else None,
        t['vintage']['wine']['taste']['structure']['intensity'] if t["vintage"]["wine"]["taste"].get('structure') else None,
        t['vintage']['wine']['taste']['structure']['sweetness'] if t["vintage"]["wine"]["taste"].get('structure') else None,
        t['vintage']['wine']['taste']['structure']['tannin'] if t["vintage"]["wine"]["taste"].get('structure') else None,
        t['vintage']['wine']['taste']['structure']['user_structure_count'] if t["vintage"]["wine"]["taste"].get('structure') else None,

        # Top3 favor and count
        t['vintage']['wine']['taste']['flavor'][0]['group'] if t["vintage"]["wine"]["taste"].get('flavor') and len(t["vintage"]["wine"]["taste"]["flavor"]) > 0 else None,
        t['vintage']['wine']['taste']['flavor'][0]['stats']['count'] if t["vintage"]["wine"]["taste"].get('flavor') and len(t["vintage"]["wine"]["taste"]["flavor"]) > 0 else None,
        t['vintage']['wine']['taste']['flavor'][1]['group'] if len(t["vintage"]["wine"]["taste"]["flavor"]) > 1 else None,
        t['vintage']['wine']['taste']['flavor'][1]['stats']['count'] if len(t["vintage"]["wine"]["taste"]["flavor"]) > 1 else None,
        t['vintage']['wine']['taste']['flavor'][2]['group'] if len(t["vintage"]["wine"]["taste"]["flavor"]) > 2 else None,
        t['vintage']['wine']['taste']['flavor'][2]['stats']['count'] if len(t["vintage"]["wine"]["taste"]["flavor"]) > 2 else None,
    )
    for t in r.json()["explore_vintage"]["matches"]
]
    Wine_results.extend(results)

columns = [
    "Winery", "Year", "Wine ID", "Wine", "Rating", "num_review", "Price", "Acidity",
    "Blurb", "Body", "Body Description", "Country", "Wine Description", "Food 1",
    "Food 2", "Food 3", "Grape 1", "Grape 2", "Grape 3", "Structure Acidity",
    "Calculated Structure Count", "Intensity", "Sweetness", "Tannin", "User Structure Count",
    "Flavor 1", "Flavor 1 Count", "Flavor 2", "Flavor 2 Count", "Flavor 3", "Flavor 3 Count"
]




# revert to DataFrame
df = pd.DataFrame(Wine_results, columns=columns)
df.to_csv('wines.csv', index=False)
df.shape


Getting data from page 1
Getting data from page 2
Getting data from page 3
Getting data from page 4
Getting data from page 5
Getting data from page 6
Getting data from page 7
Getting data from page 8
Getting data from page 9
Getting data from page 10
Getting data from page 11
Getting data from page 12
Getting data from page 13
Getting data from page 14
Getting data from page 15
Getting data from page 16
Getting data from page 17
Getting data from page 18
Getting data from page 19
Getting data from page 20
Getting data from page 21
Getting data from page 22
Getting data from page 23
Getting data from page 24
Getting data from page 25
Getting data from page 26
Getting data from page 27
Getting data from page 28
Getting data from page 29
Getting data from page 30
Getting data from page 31
Getting data from page 32
Getting data from page 33
Getting data from page 34
Getting data from page 35
Getting data from page 36
Getting data from page 37
Getting data from page 38
Getting data from pag

(1250, 31)

In [7]:
df.head()

Unnamed: 0,Winery,Year,Wine ID,Wine,Rating,num_review,Price,Acidity,Blurb,Body,...,Intensity,Sweetness,Tannin,User Structure Count,Flavor 1,Flavor 1 Count,Flavor 2,Flavor 2 Count,Flavor 3,Flavor 3 Count
0,Armonizar,2019,9330066,Chenin Blanc 2019,3.6,42,3.13,2.0,,3.0,...,3.0,1.4925,,0.0,tree_fruit,4.0,vegetal,2.0,oak,2.0
1,Pure The Winery,N.V.,7830721,Zero Sugar White N.V.,3.0,131,5.02,3.0,,3.0,...,2.6,2.0,,1.0,microbio,1.0,,,,
2,Casarena,2020,6264234,Areyna Torrontes 2020,3.9,78,5.02,2.0,,3.0,...,3.435484,1.376411,,3.0,citrus_fruit,49.0,tree_fruit,32.0,earth,23.0
3,Glass Mountain,2018,778934,Chardonnay 2018,3.4,33,5.02,2.0,,4.0,...,4.788283,2.761159,,4.0,tree_fruit,12.0,oak,12.0,non_oak,7.0
4,Michel Lelu,2018,7415224,Muscadet 2018,3.6,110,5.33,3.0,,2.0,...,1.839695,1.255114,,3.0,citrus_fruit,11.0,tree_fruit,7.0,earth,6.0


In [8]:
# User describe data
ratings = []


for _, row in df.iterrows():
    page = 1
    while True:
        print(f'Getting info about wine {row["Wine ID"]}-{row["Year"]} Page {page}')

        d = get_wine_data(row["Wine ID"], row["Year"], page)

        if not d["reviews"]:
            break

        for r in d["reviews"]:
            if r["language"] != "en":
                continue

            ratings.append(
                [
                    row["Year"],
                    row["Wine ID"],
                    r["rating"],
                    r["note"],
                    r["created_at"],
                ]
            )

        page += 1


ratings = pd.DataFrame(
    ratings, columns=["Year", "Wine ID", "User Rating", "Note", "CreatedAt"]
)



ratings.to_csv('ratings.csv', index=False)
ratings.head()

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Getting info about wine 1816272-2021 Page 3
Getting info about wine 1816272-2021 Page 4
Getting info about wine 1816272-2021 Page 5
Getting info about wine 1816272-2021 Page 6
Getting info about wine 1816272-2021 Page 7
Getting info about wine 1816272-2021 Page 8
Getting info about wine 1816272-2021 Page 9
Getting info about wine 1816272-2021 Page 10
Getting info about wine 1816272-2021 Page 11
Getting info about wine 1816272-2021 Page 12
Getting info about wine 1816272-2021 Page 13
Getting info about wine 1816272-2021 Page 14
Getting info about wine 1816272-2021 Page 15
Getting info about wine 1816272-2021 Page 16
Getting info about wine 1816272-2021 Page 17
Getting info about wine 1816272-2021 Page 18
Getting info about wine 1816272-2021 Page 19
Getting info about wine 1816272-2021 Page 20
Getting info about wine 1816272-2021 Page 21
Getting info about wine 1816272-2021 Page 22
Getting info about wine 1816272-2021 Page 23
Getting info about wi

Unnamed: 0,Year,Wine ID,User Rating,Note,CreatedAt
0,2019,9330066,4.0,"We liked this. It’s has some acidity, but it’s...",2023-04-07T00:15:06.000Z
1,2019,9330066,3.6,yellow apple pear straw,2023-02-17T02:32:02.000Z
2,2019,9330066,3.8,I liked this wine quite a bit. It’s fairly com...,2024-08-27T02:21:52.000Z
3,2019,9330066,4.0,"Pear high acidity , ripe fruit, dry. Really good",2024-03-16T00:25:52.000Z
4,2019,9330066,2.7,This wine is extremely bland and lacks any fla...,2024-06-09T15:11:08.000Z


In [9]:
ratings.shape

(1233562, 5)

In [10]:
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
df["Price"] = pd.to_numeric(df["Price"], errors="coerce")
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce")

# merge data together
df_out = ratings.merge(df)

df_out = df_out.sort_values(by="Year", ascending=True)

df_out.to_csv("data.csv", index=False)

print("Data saved successfully.")

Data saved successfully.


In [None]:
all = pd.read_csv('data.csv')
all.shape

(22471, 34)

In [None]:
all.head()

Unnamed: 0,Year,Wine ID,User Rating,Note,CreatedAt,Winery,Wine,Rating,num_review,Price,...,Intensity,Sweetness,Tannin,User Structure Count,Flavor 1,Flavor 1 Count,Flavor 2,Flavor 2 Count,Flavor 3,Flavor 3 Count
0,2013,4965593,4.0,Great with grilled salmon,2020-08-09T23:00:54.000Z,Beringer Main & Vine,Pinot Grigio 2013,3.1,32,6.26,...,2.917351,2.152783,,26.0,tree_fruit,117,citrus_fruit,68.0,earth,43.0
1,2013,4965593,3.5,"Decent, fruity, crisp. Good table white.",2020-10-25T01:39:42.000Z,Beringer Main & Vine,Pinot Grigio 2013,3.1,32,6.26,...,2.917351,2.152783,,26.0,tree_fruit,117,citrus_fruit,68.0,earth,43.0
2,2013,4965593,4.0,"Nose of honey, orange peel and green apple. Fl...",2018-09-05T00:50:36.000Z,Beringer Main & Vine,Pinot Grigio 2013,3.1,32,6.26,...,2.917351,2.152783,,26.0,tree_fruit,117,citrus_fruit,68.0,earth,43.0
3,2013,4965593,3.0,Pinot Grigio from Sonoma? Works ok but a bit t...,2021-07-04T00:21:08.000Z,Beringer Main & Vine,Pinot Grigio 2013,3.1,32,6.26,...,2.917351,2.152783,,26.0,tree_fruit,117,citrus_fruit,68.0,earth,43.0
4,2013,4965593,2.5,Ok wine. Better for the price. Color good. ...,2019-09-15T03:52:46.000Z,Beringer Main & Vine,Pinot Grigio 2013,3.1,32,6.26,...,2.917351,2.152783,,26.0,tree_fruit,117,citrus_fruit,68.0,earth,43.0
