## 1.0 Getting Data

In [1]:
# importing useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

### Downloading the data from our github repository

In [2]:
# raw-file URLs of the two datasets stored in the project's GitHub repository:
# url1 -> zipped file for the top 40 varieties, url2 -> CSV with the variety counts
url1 = 'https://raw.github.com/HamoyeHQ/stage-f-06-wine-tasting/master/data/top_40_varieties.zip'
url2 = 'https://raw.github.com/HamoyeHQ/stage-f-06-wine-tasting/master/data/top_varieties_count.csv'

In [3]:
# loading both datasets straight from their URLs into dataframes
# (url1 ends in .zip; no compression argument is given, so read_csv is left to infer it)
top_40_varieties = pd.read_csv(url1)
top_varieties_count = pd.read_csv(url2)

In [4]:
# replacing every occurrence of 'US' in country with 'United States of America'.
# The result is assigned back to the column: calling replace(..., inplace=True) on a column
# selected with [] is a chained in-place update, which may act on a temporary object and leave
# top_40_varieties unchanged (it does under pandas Copy-on-Write).
top_40_varieties['country'] = top_40_varieties['country'].replace('US', 'United States of America')

# replacing every occurrence of the substring 'US' in not_vintage with 'United States of America'
# (plain str.replace on each value, so every value is expected to be a string)
top_40_varieties['not_vintage'] = top_40_varieties['not_vintage'].apply(
    lambda text: text.replace('US', 'United States of America'))

In [5]:
# top_varieties_count is read with the variety names under 'Unnamed: 0' and the counts under
# 'variety': relabel both columns, index by variety name and keep only the counts as a Series
top_varieties_count = (
    top_varieties_count
    .rename(columns={'variety': 'count', 'Unnamed: 0': 'variety'})
    .set_index('variety')['count']
)

In [6]:
# number of varieties to work with, taken from the head of top_varieties_count. note 1 < top <= 40
top = 20

# making a dataframe of the rows whose variety is among the first `top` entries of
# top_varieties_count
working_varieties = top_varieties_count.index[:top]
top_df = top_40_varieties[top_40_varieties['variety'].isin(working_varieties)]

In [7]:
# varieties whose count is below this threshold are treated as minority classes and are
# over sampled (their sentences are used as documents instead of the whole description)
minority_threshold = 5000

# making a dataframe of the minority classes
is_minority = top_varieties_count < minority_threshold
minority_varieties = top_varieties_count[is_minority].index
minority_df = top_df[top_df['variety'].isin(minority_varieties)]

In [8]:
from nltk.tokenize import sent_tokenize # importing useful library

In [9]:
oversampled_miniority_lst = [] # collects one row (as a list of values) per sentence of the minority corpus

# creating a function to use sentences as documents for the minority classes
def over_sample_miniority(row):
    """Store one copy of `row` per sentence of its description.

    The description is split with sent_tokenize; for every sentence a copy of the row is made,
    its 'description' is replaced by that sentence, and the copy's values are appended (as a
    list, in the row's column order) to the module-level oversampled_miniority_lst.

    Parameters
    ----------
    row : pandas.Series
        One dataframe row containing a 'description' entry.

    Returns
    -------
    None
        Results are collected in oversampled_miniority_lst as a side effect.
    """
    for sent in sent_tokenize(row['description']):
        # work on a copy: functions passed to DataFrame.apply must not mutate the object
        # they receive (pandas does not support it and the frame may be altered)
        sentence_row = row.copy()
        sentence_row['description'] = sent
        oversampled_miniority_lst.append(list(sentence_row))

In [10]:
# over sample the minority classes. Every call appends to oversampled_miniority_lst, so re-run the
# cell that resets that list before re-running this one, otherwise the rows are duplicated.
# The trailing ';' hides the Series of None values returned by apply.
minority_df.apply(over_sample_miniority, axis=1);

In [11]:
# converts oversampled_miniority_lst to a dataframe; every stored row is a list of values in
# minority_df's column order, so minority_df.columns supplies the column labels
oversampled_miniority_df = pd.DataFrame(oversampled_miniority_lst, columns=minority_df.columns)

In [12]:
# selecting the majority classes (count >= minority_threshold) as a dataframe to concatenate
# with oversampled_miniority_df
majority_varieties = top_varieties_count[top_varieties_count >= minority_threshold].index
majority_df = top_df[top_df['variety'].isin(majority_varieties)]

# concatenates majority_df and oversampled_miniority_df
balanced_df = pd.concat([majority_df, oversampled_miniority_df])

# renumbers the rows 0..n-1. drop=True discards the old index directly; the previous
# reset_index().drop('index', axis=1) would have removed a genuine data column named 'index'
balanced_df = balanced_df.reset_index(drop=True)

In [13]:
# gets a Series of the number of rows per variety in balanced_df, i.e. the class sizes
# after over sampling
balanced_variety = balanced_df['variety'].value_counts()

In [14]:
# importing useful libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [15]:
# for sentence oversampling: each document is a row's description followed by its
# not_vintage text, separated by a single space
sent_oversample_corpus = [
    description + ' ' + not_vintage
    for description, not_vintage in zip(balanced_df['description'], balanced_df['not_vintage'])
]

In [16]:
sent_oversample_le = LabelEncoder() # initializes a LabelEncoder object (variety name -> integer id)
sent_oversample_one_hot = OneHotEncoder(sparse=False) # initializes a OneHotEncoder returning dense arrays

labels = list(balanced_df['variety'])

# encodes the labels as integers
encoded_labels = sent_oversample_le.fit_transform(labels)

# one_hot encoding the integer labels; the encoder is given a single column, hence the reshape
label_column = encoded_labels.reshape(-1, 1)
one_hot_labels = sent_oversample_one_hot.fit_transform(label_column)


## 2.0 Data Preprocessing

In [17]:
import spacy

In [18]:
# creating a spacy pipeline and disabling tagger, parser and ner to speed up tokenizer.
# NOTE(review): 'en' is a model shortcut name; presumably this targets spaCy 2.x, where such
# shortcuts are available — confirm against the installed version.
nlp = spacy.load('en', disable=['tagger', 'parser', 'ner']) 

In [19]:
# getting spacy's English stop-words (a collection of strings)
stop_words = spacy.lang.en.STOP_WORDS

In [20]:
# extra tokens to drop in addition to spacy's stop-words
extra_stop_words = {
    '-pron-', '10', '12', 'aah', 'aa', 'ab', 'aaa', 'aand',
    '16', '2', '20', '30', '4', '40', '5', '6', '7', '8', '9',
}

# lower-cased lemmas of spacy's stop-words (all stop-words are run through nlp as one
# space-joined text), merged with the extra tokens above
stop_words_lemma = {token.lemma_.lower() for token in nlp(' '.join(stop_words))} | extra_stop_words

### creating custom transformers to encapsulate our preprocessing

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

In [22]:
class GetTokens(BaseEstimator, TransformerMixin):
    """Transformer that reduces raw documents to lower-cased lemmas without stop-words.

    Parameters
    ----------
    stop_words : collection of str, default=stop_words_lemma
        Lower-cased lemmas to leave out.
    list_of_lists : bool, default=False
        If True, transform returns one list of tokens per document; otherwise it returns
        one space-joined string of tokens per document.

    Attributes
    ----------
    tokens : list
        Result of the most recent call to transform.
    """
    def __init__(self, stop_words=stop_words_lemma, list_of_lists=False):
        self.stop_words = stop_words
        self.list_of_lists = list_of_lists

    def tokenize(self, text):
        """Return the lower-cased lemmas of the alphabetic tokens of `text`, minus stop-words."""
        kept = []
        for token in nlp(text):
            if not token.is_alpha:
                continue
            lemma = token.lemma_.lower()
            if lemma not in self.stop_words:
                kept.append(lemma)
        return kept

    def fit(self, X, y=None):
        """Nothing is learned; returns self so the transformer can be used in a pipeline."""
        return self

    def transform(self, X):
        """Tokenize every document in X and return (and store in self.tokens) the result."""
        tokenized_docs = (self.tokenize(doc) for doc in X)
        if self.list_of_lists:
            self.tokens = list(tokenized_docs)
        else:
            self.tokens = [' '.join(doc_tokens) for doc_tokens in tokenized_docs]

        return self.tokens

In [23]:
# tokenizer step with the default settings: one space-joined string of tokens per document
tokens = GetTokens()

In [24]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [25]:
class Text2Sequence(BaseEstimator, TransformerMixin):
    """Transformer that maps texts to sequences of integer word indices with a Keras Tokenizer.

    Attributes
    ----------
    sequence_tokenizer : Tokenizer
        The underlying Keras tokenizer (out-of-vocabulary token -99); replaced on every fit.
    words_indices : dict
        word -> index mapping of the fitted tokenizer, set by fit.
    get_sequences : list
        Result of the most recent call to transform.
    """
    def __init__(self):
        self.sequence_tokenizer = Tokenizer(oov_token=-99)

    def fit(self, X, y=None):
        """Learn the vocabulary of X and return self."""
        # fit_on_texts only ever adds to the tokenizer's running word counts, so a tokenizer
        # that was already fitted would keep its old vocabulary. Start from a fresh one so
        # that every fit depends on X alone (this step is fitted more than once in this
        # notebook: on its own and again inside the full model pipeline).
        self.sequence_tokenizer = Tokenizer(oov_token=-99)
        self.sequence_tokenizer.fit_on_texts(X)
        self.words_indices = self.sequence_tokenizer.word_index
        return self

    def transform(self, X):
        """Return (and store in self.get_sequences) the index sequences of the texts in X."""
        self.get_sequences = self.sequence_tokenizer.texts_to_sequences(X)
        return self.get_sequences

In [26]:
# text -> integer sequence step
text_2_seq = Text2Sequence()

In [27]:
from keras.preprocessing.sequence import pad_sequences

In [28]:
class Padding(BaseEstimator, TransformerMixin):
    """Transformer that pads integer sequences to one common length.

    Parameters
    ----------
    pad : str, default='post'
        Passed to pad_sequences as `padding` ('pre' or 'post').

    Attributes
    ----------
    maxlen_ : int or None
        Length of the longest sequence seen by fit (None if fit received no sequences).
    get_paddings : array
        Result of the most recent call to transform.
    """
    def __init__(self, pad='post'):
        self.pad = pad

    def fit(self, X, y=None):
        """Remember the length of the longest sequence in X and return self."""
        # Without a remembered length, transform pads to the longest sequence of whatever
        # batch it is given, so a single document at prediction time would get a different
        # width than the training data the downstream model was built for.
        self.maxlen_ = max((len(seq) for seq in X), default=None)
        return self

    def transform(self, X):
        """Pad the sequences in X to the fitted length.

        Sequences longer than the fitted length are truncated by pad_sequences (its default
        truncation side is used). If fit was never called, the sequences are padded to the
        longest one in X, as before.
        """
        maxlen = getattr(self, 'maxlen_', None)
        self.get_paddings = pad_sequences(X, maxlen=maxlen, padding=self.pad)
        return self.get_paddings

In [29]:
# padding step with the default 'post' padding
pad = Padding()

In [30]:
from sklearn.pipeline import Pipeline

In [31]:
# chaining the three custom transformers: raw text -> string of tokens -> integer sequence -> padded array
data_prep_pipe = Pipeline([('get_tokens', tokens), ('text_2_sequence', text_2_seq), ('padding', pad)])

In [32]:
# fitting the preprocessing pipeline on the whole corpus and getting the padded sequences
X_train = data_prep_pipe.fit_transform(sent_oversample_corpus)

## Training

#### Word2Vec

In [33]:
from gensim.models import Word2Vec # importing Word2Vec

In [34]:
# tokenizing the corpus again with list_of_lists=True so that every document is a list of
# tokens (the form passed to Word2Vec below); note this runs the tokenizer over the whole
# corpus a second time
list_of_lists_tokens = GetTokens(list_of_lists=True)
sent_oversample_corpus2 = list_of_lists_tokens.fit_transform(sent_oversample_corpus)

In [35]:
# trains Word2Vec on sent_oversample_corpus2 with 300-dimensional vectors and iter (epoch) of 10.
# NOTE(review): the original comment stated that a min_count of 5 seems to be the best for
# sent_oversample_corpus2, but the call passes min_count=1 — confirm which value is intended.
sent_w2v_model = Word2Vec(sent_oversample_corpus2, size=300, min_count=1, iter=10)

In [36]:
def get_embedding_matrix(model, word_index):
    """Build an embedding matrix whose row i holds the vector of the word with index i.

    Parameters
    ----------
    model : trained Word2Vec-like model
        Must expose `wv.vector_size`, the vocabulary (`wv.key_to_index` or, in older gensim
        releases, `wv.vocab`) and vector lookup through `wv[word]`.
    word_index : dict
        word -> row index mapping; indices are expected to lie in 1..len(word_index).

    Returns
    -------
    numpy.ndarray of shape (len(word_index) + 1, vector_size)
        Rows of words without a vector (and row 0) are left as zeros.
    """
    vocab_size = len(word_index) + 1
    embedding_dim = model.wv.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # newer gensim exposes the vocabulary as wv.key_to_index, older releases as wv.vocab
    vocab = getattr(model.wv, 'key_to_index', None)
    if vocab is None:
        vocab = model.wv.vocab

    for word in vocab:
        ind = word_index.get(word)
        if ind is None:
            # the model knows a word the index does not; it has no row to fill
            continue
        # look vectors up through wv: indexing the model object itself is deprecated
        embedding_matrix[ind] = model.wv[word]

    return embedding_matrix

In [37]:
# word -> index mapping learned by the fitted text_2_sequence step of data_prep_pipe
words_indices = data_prep_pipe.named_steps['text_2_sequence'].words_indices
# matrix holding, at each word's index, that word's Word2Vec vector
embedding_matrix = get_embedding_matrix(sent_w2v_model, words_indices)

In [38]:
from sklearn.utils import class_weight

In [39]:
# 'balanced' weight of every class present in encoded_labels. The classes are derived from the
# labels instead of a hard-coded np.arange(20), so changing `top` keeps this cell correct, and
# classes/y are passed by keyword because newer scikit-learn versions no longer accept them
# positionally.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(encoded_labels), y=encoded_labels)

# Keras expects a {class id: weight} dict; np.unique returns the ids sorted (0, 1, 2, ...),
# so the position of each weight is its class id
class_weights = dict(enumerate(class_weights))

In [40]:
# importing deep learning libraries
from tensorflow.keras.layers import Embedding, Dense, GlobalMaxPool1D, Conv1D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

#### Model

In [41]:
def multi_class_fbeta(ytrue , ypred, beta=1, weighted=True, raw=False, epsilon=1e-7):
    """F-beta score for multi-class predictions, computed with TensorFlow ops.

    Every row of `ypred` is turned into a 0/1 prediction by marking the entries equal to the
    row maximum (tied maxima are all marked). True positives, predicted positives and actual
    positives are then summed per class over axis 0.

    Parameters
    ----------
    ytrue : tensor-like
        True labels; assumed to be 0/1 indicators of shape (samples, classes) — TODO confirm.
    ypred : tensor-like
        Predicted scores with the same shape as `ytrue`.
    beta : number, default=1
        Weight of recall relative to precision.
    weighted : bool, default=True
        If True, average the per-class scores weighted by class support; otherwise take
        their plain mean.
    raw : bool, default=False
        If True, return the per-class scores without averaging.
    epsilon : float, default=1e-7
        Added to denominators to avoid division by zero.

    Returns
    -------
    tensor
        Per-class scores if `raw`, otherwise a scalar.
    """
    b2 = beta**2

    y_true = tf.cast(ytrue, tf.float32)
    y_score = tf.cast(ypred, tf.float32)

    # hard predictions: 1 where a score equals its row maximum, 0 elsewhere
    row_max = tf.reduce_max(y_score, axis=-1, keepdims=True)
    y_hat = tf.cast(tf.equal(y_score, row_max), tf.float32)

    # per-class counts
    true_pos = tf.reduce_sum(y_true*y_hat, axis=0)
    pred_pos = tf.reduce_sum(y_hat, axis=0)
    support = tf.reduce_sum(y_true, axis=0)

    precision = true_pos/(pred_pos+epsilon)
    recall = true_pos/(support+epsilon)

    per_class = (1+b2)*precision*recall / (b2*precision + recall + epsilon)

    if raw:
        return per_class

    if not weighted:
        return tf.reduce_mean(per_class)

    return tf.reduce_sum(per_class*support / tf.reduce_sum(support))

In [42]:
# length of the padded training sequences; used as the Embedding layer's input_length
input_length = X_train.shape[1]

In [43]:
def build_cnn_model(n_classes=None):
    """Build and compile the CNN text classifier.

    Architecture: frozen Embedding initialised from the module-level embedding_matrix, two
    Conv1D layers (128 filters, kernel size 3, relu), global max pooling, dropout of 0.2 and
    a softmax output layer. Compiled with adam, categorical cross-entropy and the metrics
    accuracy and multi_class_fbeta.

    Parameters
    ----------
    n_classes : int, optional
        Number of output units. Defaults to the number of classes known to the fitted
        sent_oversample_le, so the output layer follows the chosen number of varieties
        instead of a hard-coded 20.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled model.
    """
    if n_classes is None:
        n_classes = len(sent_oversample_le.classes_)

    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], 
                           weights=[embedding_matrix], 
                           input_length=input_length, 
                           trainable=False)) # pre-trained vectors are kept fixed
    
    model.add(Conv1D(128, 3, activation='relu'))
    model.add(Conv1D(128, 3, activation='relu'))
    
    model.add(GlobalMaxPool1D())
    
    model.add(Dropout(0.2))
    
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', multi_class_fbeta])

    return model

In [44]:
# wrapping build_cnn_model in a scikit-learn compatible classifier so it can be the last
# step of a Pipeline
cnn_model = KerasClassifier(
    build_fn=build_cnn_model,
    epochs=5,
    batch_size=128,
    class_weight=class_weights,
    verbose=0,
)

In [45]:
# full model: the preprocessing pipeline followed by the CNN classifier, so raw text goes in
model = Pipeline([('data_prep', data_prep_pipe), ('model', cnn_model)], verbose=0)

In [46]:
# training on the raw corpus with the integer-encoded labels; data_prep_pipe is a step of this
# pipeline, so it is fitted again here
model.fit(sent_oversample_corpus, encoded_labels)

Pipeline(steps=[('data_prep',
                 Pipeline(steps=[('get_tokens', GetTokens()),
                                 ('text_2_sequence', Text2Sequence()),
                                 ('padding', Padding())])),
                ('model',
                 <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7f74f6b1b8d0>)],
         verbose=0)

## Prediction

### Note: lines of code starting with a triple '#' should be uncommented when running the deployment. I commented them out with a triple '#' in this notebook so that they would not be run here.

In [47]:
### users_input = user's description will be inputed here as a string enclosed with double quotes and not single

In [48]:
# predicting user's description
### pred = model.predict([users_input])

#printing the prediction (variety) of the user's description
### print(sent_oversample_le.inverse_transform(pred)[0])

# Note: the lines of code below are for demonstration purposes and should not be included in our deployment.

### Our model only predicts the top 20 varieties and they are as listed below

In [49]:
# displaying the distinct varieties present in balanced_df
balanced_df['variety'].unique()

array(['Riesling', 'Pinot Noir', 'Cabernet Sauvignon', 'Chardonnay',
       'Red Blend', 'Bordeaux-style Red Blend', 'White Blend',
       'Portuguese Red', 'Pinot Gris', 'Malbec', 'Merlot',
       'Sauvignon Blanc', 'Sangiovese', 'Rosé', 'Zinfandel', 'Syrah',
       'Nebbiolo', 'Rhône-style Red Blend', 'Sparkling Blend',
       'Tempranillo'], dtype=object)

### Let's do a demo prediction of a Bordeaux-style Red Blend using a recent review (published 12/1/2020) from [Wine Enthusiast](https://www.winemag.com/buying-guide/alpha-omega-2017-era-red-napa-valley/) that is not in our training set

In [50]:
# demo review text; the adjacent string literals are joined into one string
user_input = (
    "Juicy and richly interwoven with a fresh underlay of acidity, "
    "this powerhouse blend is substantially etched in cassis, plum and red currant. "
    "The oak and tannin are equally substantial and present, contributing weight, "
    "breadth and length. This will do well to cellar; enjoy best from 2027–2032"
)

### Notice the double quotes we used. This is to prevent an error should the user's input contain an apostrophe.

In [51]:
# displaying the assembled review text
user_input

'Juicy and richly interwoven with a fresh underlay of acidity, this powerhouse blend is substantially etched in cassis, plum and red currant. The oak and tannin are equally substantial and present, contributing weight, breadth and length. This will do well to cellar; enjoy best from 2027–2032'

In [52]:
# predicting user's description; the text is wrapped in a list because the pipeline
# works on a collection of documents
pred = model.predict([user_input])

# mapping the predicted label back to its variety name with the fitted label encoder and
# printing the prediction (variety) of the user's description
print(sent_oversample_le.inverse_transform(pred)[0])

Bordeaux-style Red Blend


### ...and voila, our model's prediction was as good as right!