In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from itertools import chain
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

import matplotlib

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\spong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\spong\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\spong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Summary as of 11/21

#### My second hypothesis: wine from different area (in this dataset, I mean province) have different falvors
1. I did lift analysis like what we did in text analysis to match flavors and area (province). 
2. Wine from California taste dry, more like pineapple, smoky, blackberry
3. Wine from Washington taste like coffee and herb 
4. Wine from Bordeaux taste like wood (the lift is 5.7, crazy!!!)
5. Wine from Tuscany taste like berry and raspberry and goes well with tobacco and meat
6. Application is that next time when you are talking about wine with your friends, you can claim these are all backed by data analysis


In [2]:
df = pd.read_csv('winemag-data_first150k.csv')

## Preprocess

#### Change df['description'] to whatever you are using

In [3]:
%%time
# tokenization
df['tokens'] = df['description'].map(nltk.word_tokenize)
df['bigrams'] = df['tokens'].map(lambda x: list(nltk.bigrams(x)))
df['trigrams'] = df['tokens'].map(lambda x: list(nltk.trigrams(x)))
df['tokens'] = df['tokens'].map(lambda x: set(x))

# lemmatization
lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df ['tokens'].map(lambda x: [lemmatizer.lemmatize(word) for word in x])
df['lemmatized'] = df['lemmatized'].map(lambda x: set(x))

# remove stop words
stop_words = set(stopwords.words('english'))
df['processed'] = df['lemmatized'].map(lambda x: [word for word in x if word.lower() not in stop_words])

# remove punctuations
punc = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~..."
df['processed'] = df['processed'].map(lambda x: [word for word in x if word.lower() not in punc])

# remove some other stuff and return lower case
others = ["''", "``", "n't", "l", "oh", "lol", "'m", "'s"]
df['processed'] = df['processed'].map(lambda x: [word.lower() for word in x if word.lower() not in others])

Wall time: 1min 16s


In [4]:
all_words = pd.Series(chain(*list(df['processed'])))
word_frequency = all_words.value_counts()

## Choose some flavors

In [5]:
flavors = 'fruit cherry acidity spice dry rich sweet berry blackberry oak vanilla apple raspberry chocolate citrus peach herb spicy pear pineapple creamy lemon wood smoky tobacco coffee orange melon strawberry grapefruit floral meat cinnamon'.split()
flavors

['fruit',
 'cherry',
 'acidity',
 'spice',
 'dry',
 'rich',
 'sweet',
 'berry',
 'blackberry',
 'oak',
 'vanilla',
 'apple',
 'raspberry',
 'chocolate',
 'citrus',
 'peach',
 'herb',
 'spicy',
 'pear',
 'pineapple',
 'creamy',
 'lemon',
 'wood',
 'smoky',
 'tobacco',
 'coffee',
 'orange',
 'melon',
 'strawberry',
 'grapefruit',
 'floral',
 'meat',
 'cinnamon']

In [6]:
## for sanity check

a = 'fruit'
b = 'California'
print(np.sum((df['processed'].map(lambda x: a in x)) & (df['province'] == b)))
print(sum(df['processed'].map(lambda x: a in x)))
print(sum(df['province'] == b))
# df['processed'].map(lambda x: a in x)


10330
60584
44508


In [7]:
def lift(a, b):
    N = df['processed'].shape[0]
    N_ab = sum((df['processed'].map(lambda x: a in x)) & (df['province'] == b))
    N_a = sum(df['processed'].map(lambda x: a in x))
    N_b = sum(df['province'] == b)
#     print('N', 'N_ab', 'N_a', 'N_b')
#     print(N, N_ab, N_a, N_b)
    if N_a*N_b == 0:
        return(None)
    else:
        return(N*N_ab/(N_a*N_b))

Then I choose 5 province I want to look at

#### If you want to look at more area, you can change number_of_province variable. It allows you to access the analysis for top areas (the areas with the most data).

In [11]:
number_of_province = 5

provinces = df['province'].value_counts().iloc[:number_of_province].index.values
provinces

provinces = pd.DataFrame(provinces, columns=['province'])
for flavor in flavors:
#     print(flavor)
    provinces[flavor] = provinces['province'].map(lambda x: lift(flavor, x))
provinces = provinces.set_index('province')
# provinces.T

## Print out top 5 regions and their top flavors

In [9]:
for province in provinces.index:
    print("Wine from {} taste like:".format(province), "==============")
    print(provinces.T[province].sort_values(ascending=False)[:5])

dry           2.094528
pineapple     1.965826
smoky         1.935283
blackberry    1.914715
oak           1.848095
Name: California, dtype: float64
coffee        2.650685
herb          2.409139
strawberry    1.873638
fruit         1.612798
spicy         1.558598
Name: Washington, dtype: float64
tobacco    3.788644
meat       2.617308
spice      2.262905
cherry     2.057002
coffee     1.914722
Name: Tuscany, dtype: float64
wood       5.693182
acidity    1.680284
fruit      1.659204
rich       1.573435
smoky      1.261768
Name: Bordeaux, dtype: float64
berry        2.444871
raspberry    2.004492
tobacco      1.827583
coffee       1.660355
vanilla      1.597128
Name: Northern Spain, dtype: float64


#### Look at only texas'wine
#### If you want to look at wine from specific areas, you can add area in the list below.

In [10]:
provinces = ['Texas']

provinces = pd.DataFrame(provinces, columns=['province'])
for flavor in flavors:
#     print(flavor)
    provinces[flavor] = provinces['province'].map(lambda x: lift(flavor, x))
provinces = provinces.set_index('province')
provinces.T

for province in provinces.index:
    print("Wine from {} taste like:".format(province), "==============")
    print(provinces.T[province].sort_values(ascending=False)[:5])

spicy      4.779676
floral     3.795072
smoky      3.639967
vanilla    3.408011
pear       2.327234
Name: Texas, dtype: float64
