# Using NLTK to process reviews of beer to classify as "ipa-like" or "not ipa-like"

In [18]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
%matplotlib inline
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import nltk
import numpy
import random

In [19]:
def main():
    ## Put stuff in notebook here
    print "main!"

In [20]:
#In Python: Define a database name (we're using a dataset on births, so I call it 
# birth_db), and your username for your computer (CHANGE IT BELOW). 
dbname = 'beer_db_2'
username = 'postgres'
mypassword = 'simple'
engine = create_engine('postgres://%s:%s@localhost/%s'%(username,mypassword,dbname))

con = psycopg2.connect(database = dbname, user = username,host='/var/run/postgresql',password=mypassword)

# Queries

In [22]:
beer_style_query = '''
SELECT
    breweries.style_name,count(distinct breweries.beer_name) as number_of_beers
FROM
    breweries
GROUP BY
    breweries.style_name
ORDER BY
    number_of_beers desc;
'''
beer_style_rank = pd.read_sql_query(beer_style_query,con)

beer_reviews_query = '''
SELECT
    reviews.review_text,
    reviews.beer_key,
    reviews.username,
    reviews.look,
    reviews.taste,
    reviews.feel,
    reviews.overall,
    breweries.style_key,
    breweries.style_name
FROM 
    reviews,breweries
WHERE
    breweries.beer_name_key = reviews.beer_key
ORDER BY
    reviews.username
'''
beer_reviews = pd.read_sql_query(beer_reviews_query,con)

ratings_per_user_query = '''
SELECT
    reviews.username,count(reviews.review_text) as number_of_reviews
FROM 
    reviews,breweries
WHERE
    reviews.beer_key = breweries.beer_name_key
GROUP BY
    reviews.username
ORDER BY 
    number_of_reviews desc;
'''

user_ratings = pd.read_sql_query(ratings_per_user_query,con)

# IPA, Double IPA, English IPA
ipa_query = """
SELECT
    reviews.username,
    reviews.review_text,
    reviews.beer_key,
    breweries.brewery_key,
    breweries.beer_name,
    reviews.look,
    reviews.taste,
    reviews.feel,
    reviews.overall,
    breweries.style_key,
    breweries.style_name
FROM 
    reviews,breweries
WHERE
    breweries.beer_name_key = reviews.beer_key
AND breweries.ratings_count > 5
AND 
( 
 breweries.style_key = 116
 OR breweries.style_key = 140
 OR breweries.style_key = 150
)
ORDER BY
    breweries.avg_score desc;
"""

# american porter, english porter, oatmeal stout, milk/sweet stout
not_ipa_query = """
SELECT
    reviews.username,
    reviews.review_text,
    reviews.beer_key,
    breweries.brewery_key,
    breweries.beer_name,
    reviews.look,
    reviews.taste,
    reviews.feel,
    reviews.overall,
    breweries.style_key,
    breweries.style_name
FROM 
    reviews,breweries
WHERE
    breweries.beer_name_key = reviews.beer_key
AND breweries.ratings_count > 5
AND 
(
 breweries.style_key = 159
 OR breweries.style_key = 101
 OR breweries.style_key = 67
 OR breweries.style_key = 82
)
ORDER BY
    reviews.overall desc;
"""

# Extracting Data From the Queries

In [40]:
ipa = pd.read_sql_query(ipa_query,con)
not_ipa = pd.read_sql_query(not_ipa_query,con)

In [41]:
print 'ipa reviews:',ipa.shape[0]
print 'not_ipa reviews:',not_ipa.shape[0]

ipa reviews: 7557
not_ipa reviews: 1471


In [59]:
# Put this stuff into a form that nltk can use
beer_reviews = []
hoppy_reviews = []
malty_reviews = []

all_words = []
hoppy_words = []
malty_words = []
counter = 0

for index,row in ipa.iterrows():
    try:
        review_words = row['review_text'].split()
        beer_reviews.append((review_words,"ipa"))
        hoppy_reviews.append((review_words,"ipa"))
        for w in review_words:
            all_words.append(w.lower())
            hoppy_words.append(w.lower())
        counter += 1
        if counter == 500:
            print counter,"hoppy reviews gotten"
            break
    except:
        "skipping, no review data" 

counter = 0

for index,row in not_ipa.iterrows():
    try:
        review_words = row['review_text'].split()
        beer_reviews.append((review_words,"not_ipa"))
        malty_reviews.append((review_words,"not_ipa"))
        for w in review_words:
            all_words.append(w.lower())
            malty_words.append(w.lower())
        counter += 1
        if counter == 500:
            print counter,"malty top reviews gotten"
            break
    except:
        "skipping, no review data"
print "got",len(beer_reviews),'reviews, with',len(all_words),'words.'

500 hoppy reviews gotten
500 malty top reviews gotten
got 1000 reviews, with 65857 words.


In [60]:
random.shuffle(beer_reviews)

# Compile Words

Here, we can get the frequency distribution of keywords, which are not stemmed.

In [61]:
all_words = nltk.FreqDist(all_words)
hoppy_words = nltk.FreqDist(hoppy_words)
malty_words = nltk.FreqDist(malty_words)
print "Hoppy Words:",hoppy_words.most_common(30),"\n"
print "Malty Words:",malty_words.most_common(30),"\n"
print "All words:",all_words.most_common(30),"\n"

Hoppy Words: [('i', 777), ('beer', 497), ('hops', 310), ('the', 309), ('citrus', 302), ('ipa', 275), ('taste', 275), ('hop', 266), ('head', 256), ('orange', 246), ('tropical', 235), ('good', 216), ('grapefruit', 212), ('like', 201), ('a', 193), ('malt', 192), ('pine', 191), ('fruit', 189), ('nice', 184), ('white', 183), ('one', 177), ('this', 173), ('light', 171), ('finish', 168), ('great', 164), ('bitterness', 160), ('flavor', 159), ('aroma', 159), ('carbonation', 157), ('medium', 157)] 

Malty Words: [('i', 679), ('coffee', 660), ('beer', 529), ('chocolate', 517), ('dark', 462), ('the', 398), ('head', 384), ('porter', 307), ('taste', 302), ('roasted', 299), ('black', 276), ('like', 268), ('nice', 266), ('a', 236), ('one', 234), ('malt', 231), ('good', 227), ('flavor', 216), ('light', 212), ('brown', 208), ('well', 205), ('vanilla', 204), ('this', 195), ('medium', 179), ('sweet', 177), ('smooth', 177), ('glass', 175), ('creamy', 170), ('finish', 166), ('great', 158)] 

All words: [('i

# Create sparse vectors of words for bayesian classificaiton

In [66]:
word_features = list(all_words.keys())[:3000]

def find_features(beer_review):
    words = set(beer_review)
    features = {}
    for w in word_features:
        features[w] = (w in words) # boolean assignment
    return features
print find_features(malty_reviews[0][0])

{'clamoring': False, 'desirable': False, 'swirls': False, 'yellow': False, 'four': False, 'maltier': False, 'woods': False, 'hanging': False, 'ringlets': False, 'woody': False, 'centimeter': False, 'chili': False, 'adorning': False, 'payoff': False, 'increase': False, 'quaffable': False, 'canes': False, 'buddy': False, 'quadruple': False, 'tickle': False, 'tingle': False, 'swap': False, 'lord': False, 'gracious': False, 'worth': False, 'deli': False, 'blanket': False, 'manic': False, 'expierences': False, 'figs': False, 'bringing': False, 'caramels': False, 'basics': False, 'daydream': False, 'caramely': False, 'pinch': False, 'piling': False, 'specialties': False, 'differentiates': False, 'nondescript': False, 'shows': False, 'whirlwind': False, 'chew': False, 'specially': False, 'tired': False, 'blasphemy': False, 'companyalpine': False, 'preface': False, 'bacon': False, 'elegant': False, 'second': False, 'glassl': False, 'rigorous': False, 'glassa': False, 'beigebrown': False, 'luci

In [70]:
featuresets = [(find_features(rev),category) for (rev,category) in beer_reviews]
training_set = feature_sets[:500]
testing_set = feature_sets[500:]

# Bayes Classifier

In [75]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print "Naive Bayes Algo % accuracy:",(nltk.classify.accuracy(classifier,testing_set))*100.
classifier.show_most_informative_features(15)


 Naive Bayes Algo % accuracy: 96.8
Most Informative Features
                     ipa = True              ipa : not_ip =     66.1 : 1.0
                  coffee = True           not_ip : ipa    =     44.6 : 1.0
                   brown = True           not_ip : ipa    =     35.0 : 1.0
                 roasted = True           not_ip : ipa    =     26.8 : 1.0
                  roasty = True           not_ip : ipa    =     24.2 : 1.0
                   cream = True           not_ip : ipa    =     13.3 : 1.0
                  barrel = True           not_ip : ipa    =     13.3 : 1.0
                   pliny = True              ipa : not_ip =     12.0 : 1.0
                    dark = True           not_ip : ipa    =     11.9 : 1.0
                  golden = True              ipa : not_ip =     11.3 : 1.0
                  fruity = True              ipa : not_ip =     10.3 : 1.0
                   mocha = True           not_ip : ipa    =      9.6 : 1.0
                   clear = True        