In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

In [6]:
# Read the wine table (the file name suggests weather features are already
# merged in) and preview the first rows as a quick sanity check.
wines_path = 'data/wines_with_weather.parquet'
df = pd.read_parquet(wines_path)
df.head(n=5)

Unnamed: 0,Winery,Wine,vintage,vintage_rating,vintage_rating_count,wine_rating_count,wine_rating,region,price,cepages,...,TX_summer,temp_amp_summer,hot_days,rainy_days_summer,rain_June,rain_SepOct,frost_days_Apr,avg_TM_Apr,year,dist_km
0,Château de Lavagnac,Bordeaux 2015,2015,3.6,76,766,3.5,bordeaux,4.1,cabernet-sauvignon,...,28.890323,11.954839,4.0,6.0,34.0,71.5,0.0,14.29,2015.0,5.314533
1,Château de Callac,Graves Rouge 2011,2011,3.6,221,3008,3.7,graves,4.5,cabernet-sauvignon,...,26.356452,11.301613,2.0,5.0,55.5,94.5,0.0,15.75,2011.0,8.487753
2,Château Gravelier,Bordeaux Rouge 2022,2022,3.4,621,621,3.4,bordeaux,4.9,cabernet-sauvignon,...,32.291935,13.583871,17.0,1.0,110.8,82.4,0.0,13.3,2022.0,1.518107
3,Château de Brandey,Bordeaux Rouge 2015,2015,3.6,333,1705,3.4,bordeaux,4.9,cabernet-sauvignon,...,28.890323,11.954839,4.0,6.0,34.0,71.5,0.0,14.29,2015.0,5.314533
4,Les Hauts de Palette,Château Haut-Mondain Bordeaux Rouge 2016,2016,3.4,258,1919,3.6,bordeaux,4.9,cabernet-sauvignon,...,28.667742,12.424194,8.0,0.0,74.5,81.5,0.0,11.876667,2016.0,5.314533


In [11]:
# -------------------------------
# 1. Build 30-km neighbourhoods
# -------------------------------
R_EARTH = 6371  # km

# The haversine metric works on (lat, lon) pairs in radians and measures
# distance as an angle, so the 30-km threshold is divided by the Earth radius.
coords_rad = np.radians(df[['latitude', 'longitude']])
radius = 30 / R_EARTH
tree = BallTree(coords_rad, metric='haversine')

# clusters[k] holds the row positions lying within `radius` of row k.
clusters = tree.query_radius(coords_rad, r=radius)

# Collapse overlapping neighbourhoods into connected groups with a small
# union-find. A merge always hangs the neighbour's root under the current
# row's root, so a group's id is the row position of whichever member ended
# up as root -- a label that depends on row order and is NOT necessarily the
# smallest position in the group.
parent = list(range(len(df)))


def find(i):
    """Return the root of the set containing position ``i``.

    Every node visited on the way up is re-pointed at its grandparent,
    which shortens later lookups without changing any set's root.
    """
    while True:
        above = parent[i]
        if above == i:
            return i
        parent[i] = parent[above]
        i = parent[i]


for row, nearby in enumerate(clusters):
    for other in nearby:
        kept_root = find(row)
        merged_root = find(other)
        if kept_root == merged_root:
            continue
        parent[merged_root] = kept_root

# Final label for every row = root of its set (positional, one per row of df).
cluster_id = list(map(find, range(len(df))))
df['cluster_id'] = cluster_id

# -------------------------------
# 2. Print grouped region names
# -------------------------------
# For each neighbourhood, collect the distinct region names in sorted order.
regions_by_cluster = df.groupby('cluster_id')['region']
group_map = regions_by_cluster.apply(lambda names: sorted(set(names))).to_dict()

print("=== New 'neighbourhood' groups ===")
for cid, regions in group_map.items():
    print(f"Cluster {cid:>4}: {', '.join(regions)}")

# Report how many neighbourhoods were formed.
print(f"\nTotal number of clusters: {len(set(cluster_id))}")


=== New 'neighbourhood' groups ===
Cluster   19: cahors
Cluster   40: cotes-de-gascogne, gers, madiran, saint-mont
Cluster   67: loire-valley, touraine, touraine-amboise
Cluster   78: vin-de-pays-vignobles-de-france
Cluster  103: cevennes, saint-estephe
Cluster  144: valencay
Cluster  155: blaye, blaye-cotes-de-bordeaux, bordeaux, bordeaux-superieur, bouches-du-rhone, buzet, cadillac-cotes-de-bordeaux, chambolle-musigny-aux-combottes, cotes-de-bordeaux, cotes-de-bourg, cotes-de-castillon, cotes-du-marmandais, cotes-du-rhone-villages-valreas, ctes-de-duras, fronsac, graves, haut-medoc, libournais, listrac, lussac-st-emilion, margaux, medoc, montagne-saint-emilion, pauillac, pessac-leognan, pomerol, premieres-cotes-de-bordeaux, saint-emilion, saint-georges-saint-emilion, saint-julien, saint-pourcain, sainte-foy-bordeaux
Cluster  160: southwest
Cluster  222: ardeche, chambolle-musigny-les-hauts-doix, cornas, crozes-hermitage, hermitage, puligny-montrachet-premier-cru, rhone, saint-joseph


In [9]:
# Cardinality check: how many different grape varieties (cepages) and
# winery names appear in the table (missing values are not counted).
distinct_cepages = df['cepages'].nunique()
distinct_winery_names = df['Winery'].nunique()

print(f"\nNumber of distinct cepages: {distinct_cepages}")
print(f"Number of distinct winery names: {distinct_winery_names}")


Number of distinct cepages: 7
Number of distinct winery names: 2608
