In [1]:
import os
import sys

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize # sentence tokenization
import spacy

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from lightgbm import LGBMClassifier

In [2]:
# Working directory that holds the project's CSV files; later cells read them by bare file name.
# NOTE(review): this looks like a Google Colab Drive mount path - confirm Drive is mounted before running.
path = '/content/drive/My Drive/wine_tasting'
os.chdir(path)

In [3]:
# # upload the cleaned data
# !wget header =wget --header="Host: storage.googleapis.com" --header="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" --header="Accept-Language: en-US,en;q=0.9" --header="Referer: https://www.kaggle.com/" "https://storage.googleapis.com/kagglesdsdata/datasets/945559/1605343/top_varieties_count.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20201108%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20201108T045531Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=34b012de713fc37be30721e2df0f4fd0b7b18f731d7c9c556e3b28857cd3a445b70d6d7974cbfe968eec3bf0811e66fc1385764fea131b92ead4e0c52021bccdd305eafd8deb3b46b97f42ab781a0dec8259ac345656608b51c43112be8f2880c0a680c4beff697f41e191b501a67a11a696f9da3290d3fb9bb2236cf57f958d1f40261c2ce1837a6a0f5203dd50c198d685d7741cb512f055bc3ae43a1b5587aa695d506aab5d6aceaab3d1bf8f646e19ca59788bce5425d67dd1619ac76df4d4ecaaec2085771fd3e3837839617b85728b8beaad35d670c5763e8fdf97fa0b0c49b9d200667072d32b61853136d5a2673c0cfb4c43b741178b9e1249c28eaf" -c -O 'top_varieties_count.csv'

In [4]:
# !unzip CurlWget916

In [5]:
!ls

archive.zip	      top_varieties_count.csv	 winemag-data_first150k.csv
CurlWget916	      winemag-data-130k-v2.csv
top_40_varieties.csv  winemag-data-130k-v2.json


In [6]:
# Load the wine reviews from the CSV in the current working directory
top_40_varieties = pd.read_csv('top_40_varieties.csv')

In [7]:
# Preview the first rows to check which columns were loaded
top_40_varieties.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,not_vintage,vintage
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,Italy Vulkà Bianco Sicily & Sardinia Etna Nicosia,2013.0
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,Portugal Avidagos Douro Quinta dos Avidagos,2011.0
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,US Oregon Willamette Valley Willamette Valley ...,2013.0
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,US Reserve Late Harvest Michigan Lake Michigan...,2013.0
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,US Vintner's Reserve Wild Child Block Oregon W...,2012.0


In [8]:

# Replace the country value 'US' with 'United States of America' (exact-value match).
# The result is assigned back instead of using `inplace=True` on a column selection:
# the in-place form is chained assignment and does not update the frame under
# pandas copy-on-write.
top_40_varieties['country'] = top_40_varieties['country'].replace('US', 'United States of America')

# Replace 'US' inside the free-text `not_vintage` column only where it stands as a whole
# word, so tokens that merely contain the letters "US" are left untouched.
# The vectorised `.str` accessor also tolerates missing values.
top_40_varieties['not_vintage'] = top_40_varieties['not_vintage'].str.replace(
    r'\bUS\b', 'United States of America', regex=True)

# Sanity check: number of rows whose `not_vintage` now contains the full country name
top_40_varieties['not_vintage'].str.contains('United States of America', regex=False, na=False).sum()

50734

In [9]:
# Load the per-variety review counts. As read from the file, the column 'Unnamed: 0' holds the
# variety name and the column 'variety' holds the count.
top_varieties_count = pd.read_csv('top_varieties_count.csv')
top_varieties_count.head()

Unnamed: 0.1,Unnamed: 0,variety
0,Pinot Noir,13272
1,Chardonnay,11753
2,Cabernet Sauvignon,9472
3,Red Blend,8946
4,Bordeaux-style Red Blend,6915


In [10]:
# Reshape the two-column frame into a Series of counts indexed by variety name:
# fix the column labels, use the variety name as the index, then keep only the counts.
top_varieties_count = (
    top_varieties_count
    .rename(columns={'variety': 'count', 'Unnamed: 0': 'variety'})
    .set_index('variety')['count']
)
top_varieties_count

variety
Pinot Noir                       13272
Chardonnay                       11753
Cabernet Sauvignon                9472
Red Blend                         8946
Bordeaux-style Red Blend          6915
Riesling                          5189
Sauvignon Blanc                   4967
Syrah                             4142
Rosé                              3564
Merlot                            3102
Nebbiolo                          2804
Zinfandel                         2714
Sangiovese                        2707
Malbec                            2652
Portuguese Red                    2466
White Blend                       2360
Sparkling Blend                   2153
Tempranillo                       1810
Rhône-style Red Blend             1471
Pinot Gris                        1455
Champagne Blend                   1396
Cabernet Franc                    1353
Grüner Veltliner                  1345
Portuguese White                  1159
Bordeaux-style White Blend        1066
Pinot Grigio     

Since there is a wide gap between the counts of the first few varieties and those of the bottom varieties, we will work with only the first 20 to avoid biasing the model towards the most frequent classes.

In [11]:
# Keep only the reviews whose variety is one of the first 20 entries of top_varieties_count
first_20_varieties = top_varieties_count.index[:20]
is_in_first_20 = top_40_varieties['variety'].isin(first_20_varieties)
wine_df = top_40_varieties[is_in_first_20]
wine_df['variety'].value_counts()

Pinot Noir                  13272
Chardonnay                  11753
Cabernet Sauvignon           9472
Red Blend                    8946
Bordeaux-style Red Blend     6915
Riesling                     5189
Sauvignon Blanc              4967
Syrah                        4142
Rosé                         3564
Merlot                       3102
Nebbiolo                     2804
Zinfandel                    2714
Sangiovese                   2707
Malbec                       2652
Portuguese Red               2466
White Blend                  2360
Sparkling Blend              2153
Tempranillo                  1810
Rhône-style Red Blend        1471
Pinot Gris                   1455
Name: variety, dtype: int64

These first 20 look much better. However, there is still a wide gap between _Pinot Noir_ and _Pinot Gris_. Therefore, we will oversample the _varieties_ that have value counts of less than 5000.

In [12]:
threshold = 5000

# Varieties with fewer than `threshold` reviews are treated as the minority classes
minority_varieties = top_varieties_count.loc[top_varieties_count.lt(threshold)].index
minority_df = wine_df[wine_df['variety'].isin(minority_varieties)]
print(f'minority_df shape: {minority_df.shape}')
print(f"No. of unique varieties: {len(minority_df['variety'].unique())}","\n")
minority_df['variety'].value_counts()

minority_df shape: (38367, 15)
No. of unique varieties: 14 



Sauvignon Blanc          4967
Syrah                    4142
Rosé                     3564
Merlot                   3102
Nebbiolo                 2804
Zinfandel                2714
Sangiovese               2707
Malbec                   2652
Portuguese Red           2466
White Blend              2360
Sparkling Blend          2153
Tempranillo              1810
Rhône-style Red Blend    1471
Pinot Gris               1455
Name: variety, dtype: int64

In [13]:
oversampled_miniority_lst = [] # accumulates one record (list of field values) per sentence

def over_sample_miniority(row):
    """Split one review into sentences and store one record per sentence.

    For every sentence found in ``row['description']`` a copy of the row's
    values is appended to the module-level ``oversampled_miniority_lst``,
    with the description field replaced by that single sentence. All other
    fields are repeated unchanged, in the row's own field order.

    The incoming row is not modified: pandas does not support functions that
    mutate the object handed to them by ``DataFrame.apply``.

    Parameters
    ----------
    row : pandas.Series
        One review, indexed by column name; must contain a 'description' entry.

    Returns
    -------
    None
        Results are collected in ``oversampled_miniority_lst`` as a side effect.
    """
    base_record = list(row)
    description_pos = list(row.index).index('description')
    for sentence in sent_tokenize(row['description']):
        sentence_record = list(base_record)  # fresh copy so records do not share state
        sentence_record[description_pos] = sentence
        oversampled_miniority_lst.append(sentence_record)

In [14]:
# Download NLTK's 'punkt' tokenizer data (presumably what sent_tokenize relies on - run this before
# the sentence-splitting step)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
# Start from an empty accumulator so that re-running this cell on its own does not
# append every sentence row a second time.
oversampled_miniority_lst.clear()

# Apply row by row on the minority df; the function returns None for each row and
# collects its results in oversampled_miniority_lst.
minority_df.apply(over_sample_miniority, axis=1)

0         None
1         None
2         None
7         None
13        None
          ... 
111781    None
111782    None
111787    None
111791    None
111795    None
Length: 38367, dtype: object

In [16]:
# Inspect the first generated record: the row's fields, with a single sentence as the description
oversampled_miniority_lst[0]

['Italy',
 'Aromas include tropical fruit, broom, brimstone and dried herb.',
 'Vulkà Bianco',
 87,
 nan,
 'Sicily & Sardinia',
 'Etna',
 nan,
 'Kerin O’Keefe',
 '@kerinokeefe',
 'Nicosia 2013 Vulkà Bianco  (Etna)',
 'White Blend',
 'Nicosia',
 'Italy Vulkà Bianco Sicily & Sardinia Etna Nicosia',
 2013.0]

In [17]:
# Build a dataframe from the per-sentence records. Each record was produced with list(row),
# so its field order matches minority_df's columns, which are reused as the column names.
oversampled_miniority_df = pd.DataFrame(oversampled_miniority_lst, columns=minority_df.columns)
print(f'oversampled_minority_df shape: {oversampled_miniority_df.shape}','\n')
oversampled_miniority_df.head(2)

oversampled_minority_df shape: (104101, 15) 



Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,not_vintage,vintage
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,Italy Vulkà Bianco Sicily & Sardinia Etna Nicosia,2013.0
1,Italy,"The palate isn't overly expressive, offering u...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,Italy Vulkà Bianco Sicily & Sardinia Etna Nicosia,2013.0


We can observe that after applying the `over_sample_miniority` function to `minority_df`, the new dataframe (`oversampled_miniority_df`) has a length of __104101__, compared with the initial __38367__. This is because the function oversamples the minority dataframe by splitting each entry in the _description_ column into its sentences and emitting one row per sentence.

In [18]:
# Rows per variety after the sentence-level split
oversampled_miniority_df['variety'].value_counts()

Sauvignon Blanc          12801
Syrah                    11128
Rosé                      9207
Nebbiolo                  8107
Malbec                    8047
Merlot                    7874
Portuguese Red            7734
Sangiovese                7432
Zinfandel                 6512
White Blend               6034
Sparkling Blend           5740
Tempranillo               5552
Rhône-style Red Blend     3972
Pinot Gris                3961
Name: variety, dtype: int64

We can see a clear difference between this dataframe and the minority dataframe. Now that the minority classes are oversampled, we can concatenate this dataframe with the rows of the top varieties whose value counts are at least 5000.

In [19]:
# Varieties with at least `threshold` reviews keep their original, unsplit reviews
majority_varieties = top_varieties_count.loc[top_varieties_count.ge(threshold)].index
majority_df = wine_df[wine_df['variety'].isin(majority_varieties)]

# Stack the majority reviews on top of the sentence-level minority rows and
# give the result a fresh 0..n-1 index
balanced_df = pd.concat([majority_df, oversampled_miniority_df], ignore_index=True)
print(f'balanced_df shape: {balanced_df.shape}','\n')
balanced_df.head(2)

balanced_df shape: (159648, 15) 



Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,not_vintage,vintage
0,United States of America,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,United States of America Reserve Late Harvest ...,2013.0
1,United States of America,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,United States of America Vintner's Reserve Wil...,2012.0


In [20]:
# Rows per variety in the combined (majority + oversampled minority) frame
balanced_df['variety'].value_counts()

Pinot Noir                  13272
Sauvignon Blanc             12801
Chardonnay                  11753
Syrah                       11128
Cabernet Sauvignon           9472
Rosé                         9207
Red Blend                    8946
Nebbiolo                     8107
Malbec                       8047
Merlot                       7874
Portuguese Red               7734
Sangiovese                   7432
Bordeaux-style Red Blend     6915
Zinfandel                    6512
White Blend                  6034
Sparkling Blend              5740
Tempranillo                  5552
Riesling                     5189
Rhône-style Red Blend        3972
Pinot Gris                   3961
Name: variety, dtype: int64

The classes are now much closer in size, so we can move on to the next steps: preprocessing and model building.

In [21]:
# Count missing values per column
balanced_df.isnull().sum()

country                     69
description                  0
designation              43512
points                       0
price                     9325
province                    69
region_1                 25858
region_2                 95561
taster_name              30409
taster_twitter_handle    35171
title                        0
variety                      0
winery                       0
not_vintage                  0
vintage                   4522
dtype: int64

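Since this project is a univariate problem, using the _description_ to predict the _variety_, we do not need to handle the null values in the other columns: neither of these two columns contains any null value (and neither does _not_vintage_, whose text is appended to the description later).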

In [24]:
# Create the spaCy pipeline with the tagger, parser and NER components disabled;
# only tokenization and lemma lookup are used downstream, so this speeds up processing.
# NOTE(review): the 'en' shortcut name appears tied to older spaCy releases - confirm the installed version.
nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

# spaCy's built-in English stop-word collection
stop_words = spacy.lang.en.STOP_WORDS

In [25]:
# Lower-cased lemmas of spaCy's stop words, extended with a hand-picked set of extra tokens
# (the pronoun placeholder, a few filler strings and some number strings).
stop_words_lemma = {token.lemma_.lower() for token in nlp(' '.join(stop_words))}.union({
    '-pron-', 'aa', 'aaa', 'aah', 'aand', 'ab',
    '2', '4', '5', '6', '7', '8', '9', '10', '12', '16', '20', '30', '40',
})

def tokenizer(text):
    """Return the lower-cased lemmas of the alphabetic, non-stop-word tokens in ``text``.

    The text is run through the ``nlp`` pipeline; tokens that are not purely
    alphabetic are skipped, and so are tokens whose lower-cased lemma appears
    in ``stop_words_lemma``. Token order is preserved.
    """
    kept_lemmas = []
    for token in nlp(text):
        if not token.is_alpha:
            continue
        lemma = token.lemma_.lower()
        if lemma not in stop_words_lemma:
            kept_lemmas.append(lemma)
    return kept_lemmas

In [26]:
from gensim.models import Word2Vec

def get_doc_vector(model, doc):
    """Return the document vector of ``doc``: the mean of its tokens' word vectors.

    Parameters
    ----------
    model : trained Word2Vec-like model
        Must expose its keyed vectors as ``model.wv`` (supporting ``word in wv``,
        ``wv[word]`` and ``wv.vector_size``).
    doc : iterable of str
        The tokens of one document.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the tokens that are in the model's vocabulary.
        If no token is in the vocabulary (or ``doc`` is empty), a zero vector of
        length ``vector_size`` is returned, so every document yields an array of
        the same shape instead of a scalar NaN.
    """
    keyed_vectors = model.wv  # look vectors up on .wv; indexing the model itself is deprecated
    vectors = [keyed_vectors[word] for word in doc if word in keyed_vectors]
    if not vectors:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [27]:
# for sentence oversampling
# Join each description with the matching `not_vintage` text, separated by a single space
sentence_corpus = [' '.join(text_pair) for text_pair in zip(balanced_df['description'],
                                                            balanced_df['not_vintage'])]

# Tokenize every combined document: the corpus becomes a list of token lists
sentence_corpus = list(map(tokenizer, sentence_corpus))
print(sentence_corpus[:2])

[['pineapple', 'rind', 'lemon', 'pith', 'orange', 'blossom', 'start', 'aroma', 'palate', 'bite', 'opulent', 'note', 'honey', 'drizzle', 'guava', 'mango', 'way', 'slightly', 'astringent', 'semidry', 'finish', 'united', 'states', 'america', 'reserve', 'late', 'harvest', 'michigan', 'lake', 'michigan', 'shore', 'julian'], ['like', 'regular', 'bottle', 'come', 'rough', 'tannic', 'rustic', 'earthy', 'herbal', 'characteristic', 'nonetheless', 'think', 'pleasantly', 'unfussy', 'country', 'wine', 'companion', 'hearty', 'winter', 'stew', 'united', 'states', 'america', 'vintner', 'reserve', 'wild', 'child', 'block', 'oregon', 'willamette', 'valley', 'willamette', 'valley', 'sweet', 'cheeks']]


In [28]:
# Positions of documents that ended up with no tokens at all after tokenizing
empty_list = [position for position, tokens in enumerate(sentence_corpus) if not tokens]
print('there are {} observations with no sentences after sentence oversampling and tokenizing'.format(
    len(empty_list)))

there are 0 observations with no sentences after sentence oversampling and tokenizing


In [29]:

# Train Word2Vec on the tokenized corpus (`sentence_corpus`): 300-dimensional vectors,
# min_count of 5 and 10 training passes. The original note records that min_count=5 with
# iter (epoch) of 10 seemed to give the best results.
# NOTE(review): `size` and `iter` look like gensim 3.x keyword names - confirm the installed gensim version.
sent_w2v_model = Word2Vec(sentence_corpus, size=300, min_count=5, iter=10)

print(sent_w2v_model) # summarizes the trained model (vocab size, vector size, alpha)


Word2Vec(vocab=19825, size=300, alpha=0.025)


In [30]:
# Build the model input: one document vector per entry of sentence_corpus, in the same order,
# stacked into a numpy array
sent_embedding = np.array([get_doc_vector(sent_w2v_model, doc) for doc in sentence_corpus])
sent_embedding[:1]

  """


array([[-0.9990358 , -0.383886  , -0.3056177 , -0.11168904,  0.41223928,
         0.66684955, -0.05332007, -0.31344813,  0.1440147 ,  0.3939431 ,
         0.2621555 ,  0.1077801 ,  0.1573572 , -0.13811217, -0.0762599 ,
         0.0225034 ,  0.317035  ,  0.05327213,  0.15135902,  0.49565867,
         0.6109521 ,  0.49093014,  0.36285442, -0.16635248, -0.01810125,
        -0.7246843 ,  0.23477423,  0.4604282 , -0.17538744,  0.04294502,
        -0.30166888,  0.18816115,  0.46937692, -0.5760046 , -0.219445  ,
        -0.24310288, -0.22793418,  0.18048073,  0.00119619, -0.13347885,
        -0.40587994, -0.58731323, -0.00963013,  0.7194553 , -0.6239001 ,
        -0.16996983, -0.43085515, -0.15492512, -0.00873299,  0.35536936,
         0.27765507,  0.46206495,  0.22072515,  0.43608215, -0.11256321,
         0.48017535, -0.01921457,  0.18857384, -0.6853629 , -0.45218825,
        -1.024661  ,  0.09894302,  0.30259958,  0.11836885, -0.33269846,
         0.05732047,  0.23127097,  0.5567293 , -0.0

In [31]:
# Encode the variety names as integer class labels (row order follows balanced_df)
encoder = LabelEncoder()
sentence_labels = encoder.fit_transform(balanced_df['variety'].tolist())

sentence_labels

array([11,  7,  1, ...,  6,  6,  6])

### Model Building with LGBM

In [32]:
# Guard: there must be exactly one embedding per label before fitting
assert len(sent_embedding)==len(sentence_labels)

In [None]:
# from sklearn.model_selection import train_test_split
# train_sent_embedding, test_sent_embedding, train_sent_labels, test_sent_labels = train_test_split(sent_embedding,
#                                                                                                   sentence_labels, test_size=0.25,
#                                                                                                   random_state=1)

In [33]:
# Stratified 4-fold cross-validation repeated 3 times (12 fits in total), with a fixed seed
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=1)

In [34]:
# Initialize the classifier with balanced class weights and a fixed seed
lgb_model = LGBMClassifier(class_weight='balanced', random_state=1)

# Cross-validated weighted-F1 scores, one per fold/repeat of `cv`, computed with 4 parallel jobs.
# NOTE(review): minority reviews were split into one row per sentence, and every such row carries
# the same `not_vintage` text. Rows from one review can therefore land in both the training and the
# validation fold, so this score may be optimistic - consider a group-aware split per original review.
scores = cross_val_score(lgb_model, sent_embedding, sentence_labels, \
                         scoring='f1_weighted', cv=cv, n_jobs=4, verbose=True)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed: 95.0min finished


In [35]:
# Report the mean of the cross-validation scores, rounded to 4 decimals
print(f'f1_weighted score = {round(np.mean(scores), 4)}')

f1_weighted score = 0.784
