In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd

In [2]:
# Load the wine review table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='wine_data.csv')

In [3]:
# Number of rows per grape variety, largest class first (these are the
# classes used for stratification in the split below).
data['variety'].value_counts(sort=True, ascending=False)

Pinot Noir                  12787
Chardonnay                  11080
Cabernet Sauvignon           9386
Red Blend                    8476
Bordeaux-style Red Blend     5340
Riesling                     4972
Sauvignon Blanc              4783
Syrah                        4086
Rosé                         3262
Merlot                       3062
Zinfandel                    2708
Malbec                       2593
Sangiovese                   2377
Nebbiolo                     2331
Portuguese Red               2196
White Blend                  2172
Sparkling Blend              2027
Tempranillo                  1789
Rhône-style Red Blend        1405
Pinot Gris                   1391
Cabernet Franc               1305
Champagne Blend              1211
Grüner Veltliner             1145
Pinot Grigio                 1002
Portuguese White              986
Name: variety, dtype: int64

In [4]:
# Hold out 20% of the rows as a test set, stratified on "variety" so the
# train and test subsets keep the class proportions of the full data.
# random_state pins the shuffle so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1 yields exactly one (train, test) pair, so take it directly
# instead of looping over a single iteration.
train_index, test_index = next(split.split(data, data["variety"]))

# split() returns positional row numbers, not index labels, so select with
# .iloc. Using .loc only works while `data` has the default 0..n-1 index;
# with any other index it would pick the wrong rows or raise KeyError.
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [5]:
# Share of each variety in the test split: per-class count divided by the
# number of test rows. Compare against the full-data proportions.
strat_test_set["variety"].value_counts().div(len(strat_test_set))

Pinot Noir                  0.136245
Chardonnay                  0.118029
Cabernet Sauvignon          0.099973
Red Blend                   0.090280
Bordeaux-style Red Blend    0.056884
Riesling                    0.052996
Sauvignon Blanc             0.050972
Syrah                       0.043515
Rosé                        0.034780
Merlot                      0.032650
Zinfandel                   0.028868
Malbec                      0.027643
Sangiovese                  0.025300
Nebbiolo                    0.024820
Portuguese Red              0.023382
White Blend                 0.023116
Sparkling Blend             0.021571
Tempranillo                 0.019068
Rhône-style Red Blend       0.014967
Pinot Gris                  0.014807
Cabernet Franc              0.013901
Champagne Blend             0.012889
Grüner Veltliner            0.012197
Pinot Grigio                0.010652
Portuguese White            0.010493
Name: variety, dtype: float64

In [6]:
# Share of each variety in the full dataset: per-class count divided by the
# total number of rows. Baseline for checking the stratified split.
data["variety"].value_counts().div(len(data))

Pinot Noir                  0.136217
Chardonnay                  0.118033
Cabernet Sauvignon          0.099987
Red Blend                   0.090293
Bordeaux-style Red Blend    0.056886
Riesling                    0.052966
Sauvignon Blanc             0.050952
Syrah                       0.043527
Rosé                        0.034749
Merlot                      0.032619
Zinfandel                   0.028848
Malbec                      0.027623
Sangiovese                  0.025322
Nebbiolo                    0.024832
Portuguese Red              0.023394
White Blend                 0.023138
Sparkling Blend             0.021593
Tempranillo                 0.019058
Rhône-style Red Blend       0.014967
Pinot Gris                  0.014818
Cabernet Franc              0.013902
Champagne Blend             0.012901
Grüner Veltliner            0.012197
Pinot Grigio                0.010674
Portuguese White            0.010504
Name: variety, dtype: float64

In [7]:
# Training dataset: preview the first five rows of the stratified train split.
strat_train_set.head(n=5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
86173,US,While this Tempranillo is straightforward in s...,,83,15,California,Paso Robles,Central Coast,,,Casa de Arcilla 2010 Tempranillo (Paso Robles),Tempranillo,Casa de Arcilla
16816,US,The delicate nose on this reserve Chard offers...,Estate Bottled Reserve,86,20,New York,Finger Lakes,Finger Lakes,Susan Kostrzewa,@suskostrzewa,Lamoreaux Landing 2007 Estate Bottled Reserve ...,Chardonnay,Lamoreaux Landing
75580,England,The crisp nose conjures up shades of green: me...,Blanc de Blancs,91,75,England,,,Anne Krebiehl MW,@AnneInVino,Bride Valley Vineyard 2013 Blanc de Blancs Spa...,Sparkling Blend,Bride Valley Vineyard
45481,US,There's a menthol-infused sense of chaparral s...,Winemaker's Reserve,87,45,California,Temecula Valley,South Coast,Matt Kettmann,@mattkettmann,Callaway 2010 Winemaker's Reserve Cabernet Sau...,Cabernet Sauvignon,Callaway
410,Austria,"That herbal, yeasty tang of Grüner is on displ...",,89,18,Kamptal,,,Anne Krebiehl MW,@AnneInVino,Steininger 2016 Grüner Veltliner (Kamptal),Grüner Veltliner,Steininger


In [8]:
# Testing dataset: preview the first five rows of the stratified test split.
strat_test_set.head(n=5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
34011,US,"A closed, tight and grippy wine, this has a bi...",Crazy Creek,84,21,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Katherine Goldschmidt 2013 Crazy Creek Caberne...,Cabernet Sauvignon,Katherine Goldschmidt
69923,Italy,"Dark berry, oak and chopped mint scents lead t...",,87,19,Tuscany,Chianti Classico,,Kerin O’Keefe,@kerinokeefe,Casaloste 2013 Chianti Classico,Red Blend,Casaloste
14931,France,It's an evocative name and that's what Provenc...,Summer Dreams,87,17,Provence,Côtes de Provence,,Roger Voss,@vossroger,Château Sainte-Béatrice 2016 Summer Dreams Ros...,Rosé,Château Sainte-Béatrice
44646,France,"Simple, but up-front strawberry fruit reigns o...",Tradition,86,27,Alsace,Alsace,,Anne Krebiehl MW,@AnneInVino,Domaine Sipp-Mack 2013 Tradition Pinot Noir (A...,Pinot Noir,Domaine Sipp-Mack
26713,US,"Tart with acidity but lush with fruit, this is...",,91,34,California,Sonoma Coast,Sonoma,,,Failla 2012 Chardonnay (Sonoma Coast),Chardonnay,Failla
