In [21]:
import numpy as np
import pandas as pd
import random

In [22]:
# Seed Python's RNG before any labels are generated. Every generation cell
# below draws from the `random` module, so without a fixed seed a
# Restart & Run All would produce a different dataset (and CSV) each time.
SEED = 42
random.seed(SEED)

# Column schema of the synthetic wine-label table. Every column starts empty;
# the per-state generation cells append one row per generated label.
data = {
    "grape": [],        # grape variety named on the label
    "region": [],       # state name/abbreviation or AVA named on the label
    "vintage": [],      # vintage year
    "estate": [],       # True when the label is generated as an estate wine
    "grape_law": [],    # minimum varietal fraction (e.g. 0.75)
    "region_law": [],   # minimum fraction of fruit from the named region
    "vintage_law": [],  # minimum fraction of fruit from the named vintage
    "country": [],      # state the rule set belongs to (e.g. "washington")
    "designation": []   # "Estate", "AVA" or "Country"
}

# Re-running this cell resets both the table and the RNG state.
df = pd.DataFrame(data)

## Washington
Important grapes: Riesling, Chardonnay, Pinot Gris, Cabernet Sauvignon, Merlot, Syrah

Varietal Label: 75%

Region: 95% (100% for estate)

Vintage: 95% if AVA, 85% if state wine

In [23]:
# Washington wine categories
wa_grape = [
    "riesling",
    "chardonnay",
    "pinot gris",
    "cabernet sauvignon",
    "merlot",
    "syrah",
]
# The first two entries are state-level names; the generation loop treats
# every other entry as an AVA.
wa_region = [
    "wa",
    "washington",
    "columbia valley",
    "yakima valley",
    "walla walla valley",
    "red mountain",
    "horse heaven hills",
    "wahluke slope",
    "rattlesnake hills",
    "snipes mountain",
]
# Vintages 2015 through 2024 (the stop value of arange is exclusive).
wa_vintage = np.arange(start=2015, stop=2025)

In [24]:
# Generate Washington labels: 18 per state-level name and 6 per AVA, which
# keeps state wine (2 x 18) and AVA wine (8 x 6) at a similar ratio.
# Honestly can simplify feature as WA or WA AVA
wa_state_names = ["wa", "washington"]

for region in wa_region:
  isAVA = region not in wa_state_names
  num = 6 if isAVA else 18

  for _ in range(num):
    # randint bounds are inclusive, so this is a one-in-four chance.
    isEstate = random.randint(0, 3) == 0

    if isEstate:
      designation = "Estate"
    elif isAVA:
      designation = "AVA"
    else:
      designation = "Country"

    # Keys are kept in the same order as the DataFrame columns.
    label = {
        "grape": random.choice(wa_grape),
        "region": region,
        "vintage": random.choice(wa_vintage),
        "estate": isEstate,
        "grape_law": 0.75,
        "region_law": 1 if isEstate else 0.95,
        "vintage_law": 0.95 if isAVA else 0.85,
        "country": "washington",
        "designation": designation,
    }
    # Append the label as a new row at the next integer index.
    df.loc[len(df)] = label

## Oregon

Important grapes: Pinot Noir, Pinot Gris, Chardonnay, Syrah, Cabernet Sauvignon

Varietal Label: 90%, except for 18 full-bodied red varietals (Cabernet Franc, Cabernet Sauvignon, Carmenere, Petite Sirah, Grenache, Malbec, Marsanne, Merlot, Mourvedre, Petit Verdot, Roussanne, Sangiovese, Sauvignon Blanc, Semillon, Syrah, Tannat, Tempranillo and Zinfandel) that require only 75%

Region: 95% if AVA, 100% if state or estate

Vintage: 95% if AVA, 85% if state

In [25]:
# Oregon grapes that fall under the 90% varietal rule.
or_grape = ["pinot noir", "pinot gris", "chardonnay"]
# The 18 varietals that only need 75% to be named on an Oregon label.
# "semillon" was previously misspelled "semillion", which put a non-existent
# grape name into the generated dataset.
or_grape_75 = [
    "cabernet franc", "cabernet sauvignon", "carmenere", "petite sirah",
    "grenache", "malbec", "marsanne", "merlot", "mourvedre", "petit verdot",
    "roussanne", "sangiovese", "sauvignon blanc", "semillon", "syrah",
    "tannat", "tempranillo", "zinfandel",
]
# The first two entries are state-level names; the generation loop treats
# every other entry as an AVA.
or_region = ["or", "oregon", "willamette valley", "dundee hills", "yamhill-carlton", "mcminnville", "ribbon ridge", "chehalem mountains", "umpqua valley", "rogue valley", "applegate valley"]
# Vintages 2015 through 2024 (the stop value of arange is exclusive).
or_vintage = np.arange(2015, 2025)

In [26]:
# Generate Oregon labels: 18 per state-level name (2 of them) and 6 per AVA
# (9 of them). Each label is a coin flip between a grape from or_grape_75
# (75% varietal rule) and one from or_grape (90% varietal rule).
or_state_names = ["or", "oregon"]

for region in or_region:
  isAVA = region not in or_state_names
  num = 6 if isAVA else 18

  for _ in range(num):
    # randint bounds are inclusive: 50% chance of a 75%-rule grape ...
    is75 = random.randint(0, 1) == 0
    grape = random.choice(or_grape_75 if is75 else or_grape)
    # ... and a one-in-four chance of an estate wine.
    isEstate = random.randint(0, 3) == 0

    if isEstate:
      designation = "Estate"
    elif isAVA:
      designation = "AVA"
    else:
      designation = "Country"

    # Keys are kept in the same order as the DataFrame columns.
    label = {
        "grape": grape,
        "region": region,
        "vintage": random.choice(or_vintage),
        "estate": isEstate,
        "grape_law": 0.75 if is75 else 0.90,
        # Only a non-estate AVA wine gets 95%; state and estate wines need 100%.
        "region_law": 0.95 if (isAVA and not isEstate) else 1,
        "vintage_law": 0.95 if isAVA else 0.85,
        "country": "oregon",
        "designation": designation,
    }
    # Append the label as a new row at the next integer index.
    df.loc[len(df)] = label

## New York
Important grapes: Riesling, Chardonnay, Merlot, Cabernet Sauvignon, Pinot Noir

Varietal Label: 75%

Region: 75% if state, 85% if AVA (100% for estate)

Vintage: 85% if state, 95% if AVA

MISSING SINGLE VINEYARD

In [27]:
# New York wine categories
ny_grape = [
    "riesling",
    "chardonnay",
    "cabernet franc",
    "merlot",
    "cabernet sauvignon",
    "pinot noir",
]
# The first two entries are state-level names; the generation loop treats
# every other entry as an AVA.
ny_region = [
    "ny",
    "new york",
    "finger lakes",
    "cayuga lake",
    "seneca lake",
    "north fork of long island",
    "hamptons",
    "hudson river region",
    "lake erie",
]
# Vintages 2015 through 2024 (the stop value of arange is exclusive).
ny_vintage = np.arange(start=2015, stop=2025)

In [28]:
# Generate New York labels: 18 per state-level name (2 of them) and 8 per AVA
# (7 of them).
for region in ny_region:
  num = 8
  if region in ["ny", "new york"]:
    num = 18
  isAVA = region not in ["ny", "new york"]

  for i in range(num):
    # randint bounds are inclusive, so this is a one-in-four chance.
    isEstate = (random.randint(0, 3) == 0)
    label = {"grape": random.choice(ny_grape),
             "region": region,
             "vintage": random.choice(ny_vintage),
             "estate": isEstate,
             "grape_law": 0.75,
             # Region rule for New York: 100% for estate wines, otherwise 85%
             # for an AVA and 75% for the state. The previous expression
             # reused the vintage thresholds (0.95 / 0.85) and never applied
             # the estate rule to AVA wines.
             "region_law": 1 if isEstate else (0.85 if isAVA else 0.75),
             "vintage_law": 0.95 if isAVA else 0.85,
             "country": "new york",
             "designation": "Estate" if isEstate else ("AVA" if isAVA else "Country")
            }
    # Append the label as a new row at the next integer index.
    df.loc[len(df)] = label

In [31]:
# Write the generated labels to disk; index=False leaves the row index out
# of the file.
output_csv = 'NY_OR_WA_wines.csv'
df.to_csv(output_csv, index=False)
print("NY, OR, WA wines saved to NY_OR_WA_wines.csv")

NY, OR, WA wines saved to NY_OR_WA_wines.csv


## Analysis

In [29]:
# Fixed shortlist of grapes to report; grapes outside this list are not counted.
grapes = [
    "riesling",
    "chardonnay",
    "pinot gris",
    "cabernet sauvignon",
    "merlot",
    "syrah",
    "pinot noir",
    "cabernet franc",
]

print("\nGRAPES")
for grape in grapes:
  grape_mask = df["grape"] == grape
  print(grape, int(grape_mask.sum()))

print("\nREGIONS")
# Explicit comparison with False is kept on purpose: it yields a boolean mask
# whatever dtype the "estate" column ended up with.
not_estate = df['estate'] == False
for region in df['region'].unique().tolist():
  region_mask = df["region"] == region
  print(region, int(region_mask.sum()))
  print(region + "(not estate)", int((region_mask & not_estate).sum()))
  print()

print("\nVINTAGES")
for vintage in df['vintage'].unique().tolist():
  vintage_mask = df["vintage"] == vintage
  print(vintage, int(vintage_mask.sum()))

print("\nESTATES")
estate_mask = df["estate"] == True
print("estate", int(estate_mask.sum()))


GRAPES
riesling 30
chardonnay 32
pinot gris 32
cabernet sauvignon 38
merlot 34
syrah 15
pinot noir 27
cabernet franc 19

REGIONS
wa 18
wa(not estate) 16

washington 18
washington(not estate) 15

columbia valley 6
columbia valley(not estate) 5

yakima valley 6
yakima valley(not estate) 2

walla walla valley 6
walla walla valley(not estate) 5

red mountain 6
red mountain(not estate) 4

horse heaven hills 6
horse heaven hills(not estate) 6

wahluke slope 6
wahluke slope(not estate) 5

rattlesnake hills 6
rattlesnake hills(not estate) 5

snipes mountain 6
snipes mountain(not estate) 5

or 18
or(not estate) 13

oregon 18
oregon(not estate) 13

willamette valley 6
willamette valley(not estate) 5

dundee hills 6
dundee hills(not estate) 5

yamhill-carlton 6
yamhill-carlton(not estate) 4

mcminnville 6
mcminnville(not estate) 3

ribbon ridge 6
ribbon ridge(not estate) 4

chehalem mountains 6
chehalem mountains(not estate) 6

umpqua valley 6
umpqua valley(not estate) 6

rogue valley 6
rogue va

In [30]:
df.head()

Unnamed: 0,grape,region,vintage,estate,grape_law,region_law,vintage_law,country,designation
0,chardonnay,wa,2015,False,0.75,0.95,0.85,washington,Country
1,pinot gris,wa,2024,False,0.75,0.95,0.85,washington,Country
2,pinot gris,wa,2018,False,0.75,0.95,0.85,washington,Country
3,chardonnay,wa,2015,False,0.75,0.95,0.85,washington,Country
4,cabernet sauvignon,wa,2022,False,0.75,0.95,0.85,washington,Country
