In [1]:
# Regular libs
import pandas as pd
import numpy as np

# Scikit Learn
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

# nltk
from nltk import word_tokenize

# other
from collections import defaultdict
from collections import Counter

In [120]:
# Dataset: https://www.kaggle.com/zynicide/wine-reviews

df = pd.read_csv('wine-reviews/wine_data_150k.csv')

In [121]:
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [122]:
# sample description
df['description'][10]

"Elegance, complexity and structure come together in this drop-dead gorgeous winethat ranks among Italy's greatest whites. It opens with sublime yellow spring flower, aromatic herb and orchard fruit scents. The creamy, delicious palate seamlessly combines juicy white peach, ripe pear and citrus flavors while white almond and savory mineral notes grace the lingering finish."

In [123]:
df.shape

(150930, 11)

In [124]:
df.isna().sum()

Unnamed: 0         0
country            5
description        0
designation    45735
points             0
price          13695
province           5
region_1       25060
region_2       89977
variety            0
winery             0
dtype: int64

In [125]:
# feature selection
data = df.filter(['description','variety'], axis=1)

In [126]:
data.head()

Unnamed: 0,description,variety
0,This tremendous 100% varietal wine hails from ...,Cabernet Sauvignon
1,"Ripe aromas of fig, blackberry and cassis are ...",Tinta de Toro
2,Mac Watson honors the memory of a wine once ma...,Sauvignon Blanc
3,"This spent 20 months in 30% new French oak, an...",Pinot Noir
4,"This is the top wine from La Bégude, named aft...",Provence red blend


In [127]:
data.shape

(150930, 2)

## Data Cleaning

In [128]:
# count varieties
variety_count = Counter(data['variety'].tolist())
variety_count

Counter({'Cabernet Sauvignon': 12800,
         'Tinta de Toro': 221,
         'Sauvignon Blanc': 6320,
         'Pinot Noir': 14291,
         'Provence red blend': 25,
         'Friulano': 137,
         'Tannat': 140,
         'Chardonnay': 14482,
         'Tempranillo': 2556,
         'Malbec': 3208,
         'Rosé': 2817,
         'Tempranillo Blend': 756,
         'Syrah': 5825,
         'Mavrud': 4,
         'Sangiovese': 3345,
         'Sparkling Blend': 2004,
         'Rhône-style White Blend': 409,
         'Red Blend': 10062,
         'Mencía': 148,
         'Palomino': 25,
         'Petite Sirah': 897,
         'Riesling': 5524,
         'Cabernet Sauvignon-Syrah': 151,
         'Portuguese Red': 2216,
         'Nebbiolo': 2241,
         'Pinot Gris': 1365,
         'Meritage': 317,
         'Baga': 22,
         'Glera': 622,
         'Malbec-Merlot': 109,
         'Merlot-Malbec': 15,
         'Ugni Blanc-Colombard': 14,
         'Viognier': 1263,
         'Cabernet Sauvignon

In [129]:
# filter data to contain only top 10 varieties, i.e. get rid of the noise

top_10_varieties = {i[0]: idx for idx, i in enumerate(variety_count.most_common(10))}
top_10_varieties

{'Chardonnay': 0,
 'Pinot Noir': 1,
 'Cabernet Sauvignon': 2,
 'Red Blend': 3,
 'Bordeaux-style Red Blend': 4,
 'Sauvignon Blanc': 5,
 'Syrah': 6,
 'Riesling': 7,
 'Merlot': 8,
 'Zinfandel': 9}

In [130]:
data = data[data['variety'].map(lambda x: x in top_10_varieties)]

In [131]:
data.shape

(85520, 2)

## Data Preprocessing 

In [14]:
# transform description to vectorized count of the words - optimizing the data for algorithm

description_list = data['description'].tolist()

varietal_list = [top_10_varieties[i] for i in data['variety'].tolist()]
varietal_list = np.array(varietal_list)

print(varietal_list[0])
print(description_list[0])

2
This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.


In [15]:
# Vectorize the count of words in each [description]

# Default settings
#
# analyzer = 'word'
# default preprocessor
# stopword removal
# lowercase = True
# default tokenizer
# max_features = None
# default ngram = (1, 1)


count_vect = CountVectorizer()

x_train_counts = count_vect.fit_transform(description_list)

print(x_train_counts)

  (0, 20568)	1
  (0, 21010)	1
  (0, 27)	1
  (0, 21832)	1
  (0, 22616)	1
  (0, 9495)	1
  (0, 8543)	2
  (0, 13952)	1
  (0, 1149)	4
  (0, 22319)	1
  (0, 804)	1
  (0, 14335)	1
  (0, 20605)	1
  (0, 22845)	2
  (0, 10291)	2
  (0, 13938)	1
  (0, 10972)	1
  (0, 16467)	1
  (0, 3981)	1
  (0, 8566)	1
  (0, 4630)	1
  (0, 9876)	1
  (0, 14031)	2
  (0, 3464)	1
  (0, 9279)	1
  :	:
  (85519, 621)	1
  (85519, 20778)	1
  (85519, 17685)	1
  (85519, 14186)	1
  (85519, 11002)	1
  (85519, 13826)	1
  (85519, 18258)	1
  (85519, 22500)	1
  (85519, 20673)	1
  (85519, 20211)	1
  (85519, 13361)	1
  (85519, 11705)	1
  (85519, 18793)	1
  (85519, 9880)	1
  (85519, 8961)	1
  (85519, 15729)	1
  (85519, 13833)	1
  (85519, 2333)	1
  (85519, 4677)	1
  (85519, 6398)	1
  (85519, 1636)	1
  (85519, 15395)	1
  (85519, 15189)	1
  (85519, 22177)	1
  (85519, 21755)	1


In [16]:
print(count_vect.get_feature_names())



In [17]:
# Transform counts to frequency

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [18]:
# dividing the data into train(70%) and test(30%)

train_x, test_x, train_y, test_y = train_test_split(x_train_tfidf, varietal_list, test_size=0.3)

## Naive Bayes

In [21]:
NB_model = MultinomialNB().fit(train_x, train_y)

In [22]:
y_score = NB_model.predict(test_x)

In [23]:
num_correct = 0

for i in range(len(y_score)):
    if y_score[i] == test_y[i]:
        num_correct += 1

print("Accuracy: %.2f%%" % ((num_correct/float(len(test_y)) * 100)))

Accuracy: 64.67%


## SVM model

In [24]:
from sklearn.svm import SVC

%time SVM_model = SVC(kernel='linear').fit(train_x, train_y)

CPU times: user 8min 47s, sys: 4.92 s, total: 8min 52s
Wall time: 8min 53s


In [25]:
y_score = SVM_model.predict(test_x)

In [26]:
num_correct = 0

for i in range(len(y_score)):
    if y_score[i] == test_y[i]:
        num_correct += 1

print("Accuracy: %.2f%%" % ((num_correct/float(len(test_y)) * 100)))

Accuracy: 83.59%


## Deep learning model using Keras

In [27]:
def count_top_x_words(corpus, top_x, skip_top_n):
    
    count = defaultdict(lambda: 0)
    
    for c in corpus:
        for w in word_tokenize(c):
            count[w] += 1
    count_tuples = sorted([(w, c) for w, c in count.items()], key=lambda x: x[1], reverse=True)
    
    return [i[0] for i in count_tuples[skip_top_n: skip_top_n + top_x]]


In [28]:
def replace_top_x_words_with_vectors(corpus, top_x):
    
    topx_dict = {top_x[i]: i for i in range(len(top_x))}

    return [ [topx_dict[w] for w in word_tokenize(s) if w in topx_dict] for s in corpus], topx_dict

In [29]:
def filter_to_top_x(corpus, n_top, skip_n_top=0):
    
    top_x = count_top_x_words(corpus, n_top, skip_n_top)
    
    return replace_top_x_words_with_vectors(corpus, top_x)

In [30]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
from collections import Counter
from sklearn.model_selection import train_test_split

### Data Preprocessing 2

In [31]:
data = df.filter(['description','variety'], axis=1)

c = Counter(data['variety'].tolist())

top_10_varieties = {i[0]: idx for idx, i in enumerate(c.most_common(10))}

data = data[data['variety'].map(lambda x: x in top_10_varieties)]

In [103]:
description_list = data['description'].str.lower().tolist()

# Tokenization of top 2500 words, skip 10 most frequent words

mapped_list, word_list = filter_to_top_x(description_list, 2500,10)

varietal_list_o = [top_10_varieties[i] for i in data['variety'].tolist()]
varietal_list = to_categorical(varietal_list_o)

In [108]:
mapped_list_df = pd.DataFrame(mapped_list)
mapped_list_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,1397.0,338.0,38.0,373.0,0.0,14.0,1903.0,266.0,469.0,105.0,...,,,,,,,,,,
1,0.0,891.0,133.0,24.0,967.0,2.0,130.0,82.0,102.0,114.0,...,,,,,,,,,,
2,1661.0,677.0,474.0,2.0,951.0,38.0,104.0,363.0,21.0,5.0,...,,,,,,,,,,
3,129.0,266.0,2489.0,30.0,65.0,462.0,374.0,2319.0,193.0,493.0,...,,,,,,,,,,
4,752.0,1737.0,14.0,365.0,129.0,11.0,46.0,198.0,112.0,2300.0,...,,,,,,,,,,


In [91]:
df2 = pd.DataFrame(list(word_list.items()),columns = ['word','token'])
df2.head(10)

Unnamed: 0,word,token
0,wine,0
1,flavors,1
2,in,2
3,'s,3
4,to,4
5,fruit,5
6,but,6
7,that,7
8,on,8
9,finish,9


In [109]:
max_review_length = 80

mapped_list = sequence.pad_sequences(mapped_list, maxlen=max_review_length)

train_x, test_x, train_y, test_y = train_test_split(mapped_list, varietal_list, test_size=0.3)

In [110]:
embedding_vector_length = 64
model = Sequential()

model.add(Embedding(2500, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(50, 5))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(max(varietal_list_o) + 1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
%time model.fit(train_x, train_y, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 1min 24s, sys: 1min 2s, total: 2min 26s
Wall time: 30.8 s


<tensorflow.python.keras.callbacks.History at 0x7f7f882ab890>

In [111]:
y_score = model.predict(test_x)
y_score = [[1 if i == max(sc) else 0 for i in sc] for sc in y_score]

num_correct = 0

for i in range(len(y_score)):
    if all(y_score[i][j] == test_y[i][j] for j in range(len(y_score[i]))):
        num_correct += 1

print("Accuracy: %.2f%%" % ((num_correct/float(len(test_y)) * 100)))

Accuracy: 82.80%


## RandomForest model

In [39]:
import re
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [137]:
train_index, test_index = list(), list()
for i in range (0, len(data)):
    if np.random.uniform(0,1) < 0.7:
        train_index += [i]
    else:
        test_index +=[i]

train_data = data.iloc[train_index]
test_data = data.iloc[test_index]

In [138]:
train_data = train_data[train_data['variety'].map(lambda x: x in top_10_varieties)]
test_data = test_data[test_data['variety'].map(lambda x: x in top_10_varieties)]

In [139]:
train_data.reset_index(inplace=True)
train_data.drop(['index'], axis=1, inplace=True)
train_data.head()

test_data.reset_index(inplace=True)
test_data.drop(['index'], axis=1, inplace=True)
test_data.head()

Unnamed: 0,description,variety
0,This tremendous 100% varietal wine hails from ...,Cabernet Sauvignon
1,This re-named vineyard was formerly bottled as...,Pinot Noir
2,The producer sources from two blocks of the vi...,Pinot Noir
3,"First made in 2006, this succulent luscious Ch...",Chardonnay
4,"Dark in color and in flavor profile, this medi...",Syrah


### Data Preprocessing 3

In [141]:
def text_to_words( text ):
    
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    
    return( " ".join( meaningful_words ))

In [142]:
# Only letter words, we remove stopwords

clean_text = []

for i in range( 0, len(train_data) ):
    clean_text.append( text_to_words( train_data["description"][i] ) )

In [146]:
# Same as first preprocesing, but limited features

vectorizer = CountVectorizer(max_features = 2500)

train_data_features = vectorizer.fit_transform(clean_text)

train_data_features = train_data_features.toarray()

In [147]:
forest = RandomForestClassifier(n_estimators = 100)

%time forest = forest.fit( train_data_features, train_data["variety"] )

CPU times: user 2min 20s, sys: 1.06 s, total: 2min 21s
Wall time: 2min 22s


In [148]:
clean_test_text = []

for i in range( 0, len(test_data) ):
    clean_test_text.append( text_to_words( test_data["description"][i] ) )
    
test_data_features = vectorizer.fit_transform(clean_test_text)
test_data_features = test_data_features.toarray()

scores = cross_val_score(forest, test_data_features, test_data['variety'], cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std() * 2))

Accuracy: 79.54 (+/- 0.05)
