In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from nltk import word_tokenize
from collections import defaultdict

In [2]:
# Dataset: https://www.kaggle.com/zynicide/wine-reviews

df = pd.read_csv('wine-reviews/wine_data_150k.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [6]:
# sample description
df['description'][10]

"Elegance, complexity and structure come together in this drop-dead gorgeous winethat ranks among Italy's greatest whites. It opens with sublime yellow spring flower, aromatic herb and orchard fruit scents. The creamy, delicious palate seamlessly combines juicy white peach, ripe pear and citrus flavors while white almond and savory mineral notes grace the lingering finish."

In [4]:
df.shape

(150930, 11)

In [5]:
df.isna().sum()

Unnamed: 0         0
country            5
description        0
designation    45735
points             0
price          13695
province           5
region_1       25060
region_2       89977
variety            0
winery             0
dtype: int64

In [6]:
# feature selection
data = df.filter(['description','variety'], axis=1)

In [7]:
data.head()

Unnamed: 0,description,variety
0,This tremendous 100% varietal wine hails from ...,Cabernet Sauvignon
1,"Ripe aromas of fig, blackberry and cassis are ...",Tinta de Toro
2,Mac Watson honors the memory of a wine once ma...,Sauvignon Blanc
3,"This spent 20 months in 30% new French oak, an...",Pinot Noir
4,"This is the top wine from La Bégude, named aft...",Provence red blend


In [8]:
data.shape

(150930, 2)

In [13]:
# count varieties
variety_count = Counter(data['variety'].tolist())

In [14]:
# filter data to contain only top 10 varieties, i.e. get rid of the noise
top_10_varieties = {i[0]: idx for idx, i in enumerate(variety_count.most_common(10))}
top_10_varieties

{'Chardonnay': 0,
 'Pinot Noir': 1,
 'Cabernet Sauvignon': 2,
 'Red Blend': 3,
 'Bordeaux-style Red Blend': 4,
 'Sauvignon Blanc': 5,
 'Syrah': 6,
 'Riesling': 7,
 'Merlot': 8,
 'Zinfandel': 9}

In [15]:
data = data[data['variety'].map(lambda x: x in top_10_varieties)]

In [16]:
data.shape

(85520, 2)

In [17]:
# transform description to vectorized count of the words (optimizing the data for algorithm)
description_list = data['description'].tolist()

varietal_list = [top_10_varieties[i] for i in data['variety'].tolist()]
varietal_list = np.array(varietal_list)

In [18]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(description_list)

In [19]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [20]:
# dividing the data into train(70%) and test(30%)
train_x, test_x, train_y, test_y = train_test_split(x_train_tfidf, varietal_list, test_size=0.3)

In [21]:
# creating Naive Bayes model
NB_model = MultinomialNB().fit(train_x, train_y)

In [22]:
y_score = NB_model.predict(test_x)

In [23]:
num_correct = 0

for i in range(len(y_score)):
    if y_score[i] == test_y[i]:
        num_correct += 1

print("Accuracy: %.2f%%" % ((num_correct/float(len(test_y)) * 100)))

Accuracy: 64.30%


In [24]:
### Creating SVM model

from sklearn.svm import SVC

SVM_model = SVC(kernel='linear').fit(train_x, train_y)

In [25]:
y_score = SVM_model.predict(test_x)

In [26]:
num_correct = 0

for i in range(len(y_score)):
    if y_score[i] == test_y[i]:
        num_correct += 1

print("Accuracy: %.2f%%" % ((num_correct/float(len(test_y)) * 100)))

Accuracy: 83.24%


In [27]:
### Deep learning model using Keras

In [25]:
def count_top_x_words(corpus, top_x, skip_top_n):
    
    count = defaultdict(lambda: 0)
    
    for c in corpus:
        for w in word_tokenize(c):
            count[w] += 1
    count_tuples = sorted([(w, c) for w, c in count.items()], key=lambda x: x[1], reverse=True)
    
    return [i[0] for i in count_tuples[skip_top_n: skip_top_n + top_x]]


In [26]:
def replace_top_x_words_with_vectors(corpus, top_x):
    
    topx_dict = {top_x[i]: i for i in range(len(top_x))}

    return [ [topx_dict[w] for w in word_tokenize(s) if w in topx_dict] for s in corpus], topx_dict

In [27]:
def filter_to_top_x(corpus, n_top, skip_n_top=0):
    
    top_x = count_top_x_words(corpus, n_top, skip_n_top)
    
    return replace_top_x_words_with_vectors(corpus, top_x)

In [177]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
from collections import Counter
from sklearn.model_selection import train_test_split

In [178]:
data = df.filter(['description','variety'], axis=1)

c = Counter(data['variety'].tolist())

top_10_varieties = {i[0]: idx for idx, i in enumerate(c.most_common(10))}

data = data[data['variety'].map(lambda x: x in top_10_varieties)]

In [179]:
description_list = data['description'].tolist()
mapped_list, word_list = filter_to_top_x(description_list, 2500, 10)
varietal_list_o = [top_10_varieties[i] for i in data['variety'].tolist()]
varietal_list = to_categorical(varietal_list_o)

In [180]:
max_review_length = 150

mapped_list = sequence.pad_sequences(mapped_list, maxlen=max_review_length)

train_x, test_x, train_y, test_y = train_test_split(mapped_list, varietal_list, test_size=0.3)

In [181]:
embedding_vector_length = 64
model = Sequential()

model.add(Embedding(2500, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(50, 5))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(max(varietal_list_o) + 1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_x, train_y, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a2fc9e7f0>

In [182]:
y_score = model.predict(test_x)
y_score = [[1 if i == max(sc) else 0 for i in sc] for sc in y_score]

num_correct = 0

for i in range(len(y_score)):
    if all(y_score[i][j] == test_y[i][j] for j in range(len(y_score[i]))):
        num_correct += 1

print("Accuracy: %.2f%%" % ((num_correct/float(len(test_y)) * 100)))

Accuracy: 81.49%


In [34]:
### RandomForest model

In [152]:
import re
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [141]:
data = df.filter(['description','variety'], axis=1)
c = Counter(data['variety'])

top_10_varieties = {i[0]: idx for idx, i in enumerate(c.most_common(10))}

data = data[data['variety'].map(lambda x: x in top_10_varieties)]

In [142]:
train_index, test_index = list(), list()
for i in range (0, len(data)):
    if np.random.uniform(0,1) < 0.75:
        train_index += [i]
    else:
        test_index +=[i]

train_data = data.iloc[train_index]
test_data = data.iloc[test_index]

In [154]:
train_data = train_data[train_data['variety'].map(lambda x: x in top_10_varieties)]
test_data = test_data[test_data['variety'].map(lambda x: x in top_10_varieties)]

In [155]:
train_data.reset_index(inplace=True)
train_data.drop(['index'], axis=1, inplace=True)
train_data.head()

test_data.reset_index(inplace=True)
test_data.drop(['index'], axis=1, inplace=True)
test_data.head()

Unnamed: 0,description,variety
0,This tremendous 100% varietal wine hails from ...,Cabernet Sauvignon
1,Bergström has made a Shea designate since 2003...,Pinot Noir
2,"Cranberry, baked rhubarb, anise and crushed sl...",Pinot Noir
3,"Dark in color and in flavor profile, this medi...",Syrah
4,"A blend of 90% Sangiovese and 10% Canaiolo, th...",Red Blend


In [120]:
def text_to_words( text ):
    
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    
    return( " ".join( meaningful_words ))

In [146]:
clean_text = []

for i in range( 0, len(train_data) ):
    clean_text.append( text_to_words( train_data["description"][i] ) )

In [147]:
vectorizer = CountVectorizer(analyzer = "word", \
tokenizer = None, \
preprocessor = None, \
stop_words = None, \
max_features = 5000)

train_data_features = vectorizer.fit_transform(clean_text)

train_data_features = train_data_features.toarray()

In [148]:
forest = RandomForestClassifier(n_estimators = 100)

forest = forest.fit( train_data_features, train_data["variety"] )

In [183]:
clean_test_text = []

for i in range( 0, len(test_data) ):
    clean_test_text.append( text_to_words( test_data["description"][i] ) )
    
test_data_features = vectorizer.fit_transform(clean_test_text)
test_data_features = test_data_features.toarray()

scores = cross_val_score(forest, test_data_features, test_data['variety'], cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std() * 2))

Accuracy: 78.16 (+/- 0.05)
