## Web Crawling

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import pandas as pd
from selenium.webdriver.common.keys import Keys

### Reviews of top15 beer brands

In [2]:
driver = webdriver.Chrome(service=Service("chromedriver.exe"))
url = "https://www.beeradvocate.com/beer/top-rated/"
driver.get(url)

In [3]:
links = []
for i in range(2,17):
    l1 = '//*[@id="ba-content"]/table/tbody/tr['+ str(i) +']/td[2]/a'
    l2 = driver.find_elements(By.XPATH, l1)
    links.extend(l2)

In [4]:
top = []
for l in links:
    l.send_keys(Keys.CONTROL +"\n") 
    time.sleep(3)
    
    main_window = driver.current_window_handle
    driver.switch_to.window(driver.window_handles[1])
    
    end = len(driver.find_elements_by_id("rating_fullview_content_2"))
    idid = [driver.find_elements_by_id("rating_fullview_content_2")[s].text for s in range(0, end)]
    
    for r in idid:
        if '\n' in r:
            top.append(r)
   
    driver.close()
    driver.switch_to.window(main_window)
    time.sleep(3)

  end = len(driver.find_elements_by_id("rating_fullview_content_2"))
  idid = [driver.find_elements_by_id("rating_fullview_content_2")[s].text for s in range(0, end)]


In [5]:
top1 = [top[i].split("\n\n")[1:-1] for i in range(len(top))]
top2 = [''.join(top1[i]).replace("\n", "") for i in range(len(top))]

### Reviews of the worst15 beer brands

In [6]:
driver = webdriver.Chrome(service=Service("chromedriver.exe"))
url = "https://www.beeradvocate.com/beer/worst/"
driver.get(url)

In [7]:
links = []
for i in range(2,17):
    l1 = '//*[@id="ba-content"]/table/tbody/tr['+ str(i) +']/td[2]/a'
    l2 = driver.find_elements(By.XPATH, l1)
    links.extend(l2)

In [8]:
worst = []
for l in links:
    r_count = 0
    l.send_keys(Keys.CONTROL +"\n") 
    time.sleep(3)
    
    main_window = driver.current_window_handle
    driver.switch_to.window(driver.window_handles[1])
    
    end = len(driver.find_elements_by_id("rating_fullview_content_2"))
    idid = [driver.find_elements_by_id("rating_fullview_content_2")[s].text for s in range(0, end)]
    
    for r in idid:
        if '\n' in r:
            worst.append(r)
   
    driver.close()
    driver.switch_to.window(main_window)
    time.sleep(3)

  end = len(driver.find_elements_by_id("rating_fullview_content_2"))
  idid = [driver.find_elements_by_id("rating_fullview_content_2")[s].text for s in range(0, end)]


In [9]:
worst1 = [worst[i].split("\n\n")[1:-1] for i in range(len(worst))]
worst2 = [''.join(worst1[i]).replace("\n", "") for i in range(len(worst))]

### Reviews of our target brands (Kloud, Hite, Cass)

In [10]:
klinks = ['https://www.beeradvocate.com/beer/profile/34664/120494/', 
          'https://www.beeradvocate.com/beer/profile/1472/3981/',
         'https://www.beeradvocate.com/beer/profile/874/8750/']

In [12]:
driver = webdriver.Chrome(service=Service("chromedriver.exe"))
kbeer = []
kbeer1 = []
for l in klinks:
    driver.get(l)
    main_window = driver.current_window_handle
    end = len(driver.find_elements_by_id("rating_fullview_content_2"))
    idid = [driver.find_elements_by_id("rating_fullview_content_2")[s].text for s in range(0, end)]
    
    for r in idid:
        if '\n' in r:
            kbeer.append(r)
    
    kbeer1.extend(kbeer[0:10])   
    kbeer = []

  end = len(driver.find_elements_by_id("rating_fullview_content_2"))
  idid = [driver.find_elements_by_id("rating_fullview_content_2")[s].text for s in range(0, end)]


In [13]:
kbeer2 = [kbeer1[i].split("\n\n")[1:-1] for i in range(len(kbeer1))]
kbeer3 = [''.join(kbeer2[i]).replace("\n", "") for i in range(len(kbeer1))]
kloud = kbeer3[0:10]
hite = kbeer3[10:20]
cass = kbeer3[20:]

## Preprocessing

In [14]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk_stopwords = set(stopwords.words("english"))
my_stopwords = ['beer', 'look', 'smell', 'taste', 'feel', 'mouthfeel', 'overall', 'good', 'terrible', 'well', 'korean']
new_stopwords = nltk_stopwords.union(my_stopwords)

In [15]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
from nltk import pos_tag
from nltk.corpus import wordnet

In [16]:
def get_pos(w):
    tag = pos_tag([w])[0][1][0].upper()
    if tag == "V":
        return wordnet.VERB
    elif tag == "N":
        return wordnet.NOUN
    elif tag == "J":
        return wordnet.ADJ
    elif tag == "R":
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [17]:
def my_tokenizer(r):
    r1 = word_tokenize(r)
    r2 = [w.lower() for w in r1 if w.isalpha()]
    r3 = [w for w in r2 if not w in new_stopwords]
    r4 = [lem.lemmatize(w, get_pos(w)) for w in r3]
    return r4

# Topic Modeling

## 1. LSA (using SVD)

### top15

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(tokenizer = my_tokenizer)
tfidf_top = tfidf.fit_transform(top2)
tfidf_terms = tfidf.get_feature_names()

In [29]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=3, random_state=0)
svd.fit(tfidf_top)

TruncatedSVD(n_components=3, random_state=0)

In [30]:
svd.components_

array([[ 0.00924106,  0.00714241,  0.00366116, ...,  0.00640376,
         0.00765156,  0.01025254],
       [-0.00512379,  0.00802232, -0.00341113, ..., -0.00487367,
         0.00648551,  0.02080725],
       [-0.00095513, -0.00060882,  0.00068192, ..., -0.00220068,
         0.00641947,  0.00761665]])

In [31]:
svd_topics = svd.components_.argsort()[:, ::-1]
for i in range(0, 3):
    top_words = [tfidf_terms[x] for x in svd_topics[i, :5]]
    print(top_words)

['maple', 'chocolate', 'thick', 'barrel', 'like']
['orange', 'citrus', 'hop', 'mango', 'tropical']
['bottle', 'courtesy', 'johnnyhops', 'share', 'thx']


### worst15

In [32]:
tfidf_worst = tfidf.fit_transform(worst2)
tfidf_terms = tfidf.get_feature_names()

In [33]:
svd = TruncatedSVD(n_components=4, random_state=0)
svd.fit(tfidf_worst)

TruncatedSVD(n_components=4, random_state=0)

In [34]:
svd.components_

array([[ 0.00470303,  0.00554007,  0.01223386, ...,  0.00362638,
         0.00705661,  0.0163984 ],
       [-0.00013567,  0.00515128,  0.00525685, ...,  0.01099566,
         0.00641434, -0.03322886],
       [-0.00078342,  0.00523268,  0.00054709, ..., -0.00611817,
        -0.00041922, -0.0265412 ],
       [ 0.0031838 , -0.00803221,  0.01832461, ..., -0.00484437,
         0.01200559, -0.00283707]])

In [35]:
svd_topics = svd.components_.argsort()[:, ::-1]
for i in range(0, 4):
    top_words = [tfidf_terms[x] for x in svd_topics[i, :5]]
    print(top_words)

['light', 'like', 'water', 'taste', 'drink']
['ice', 'best', 'drink', 'get', 'price']
['water', 'like', 'taste', 'miller', 'coors']
['light', 'bud', 'miller', 'lite', 'easy']


### Korean brands

### Kloud

In [36]:
tfidf_kloud = tfidf.fit_transform(kloud)
tfidf_terms = tfidf.get_feature_names()

In [37]:
svd = TruncatedSVD(n_components=1, random_state=0)
svd.fit(tfidf_kloud)

TruncatedSVD(n_components=1, random_state=0)

In [38]:
svd.components_

array([[0.02659428, 0.02025397, 0.05044032, 0.02025397, 0.01103005,
        0.04157749, 0.0924181 , 0.09985318, 0.01103005, 0.04050795,
        0.053487  , 0.04050795, 0.01103005, 0.02025397, 0.04266514,
        0.09594895, 0.04157749, 0.01103005, 0.1560938 , 0.09985318,
        0.08699785, 0.05044032, 0.02025397, 0.02025397, 0.1617596 ,
        0.02025397, 0.13987002, 0.05044032, 0.01103005, 0.04050795,
        0.03856473, 0.05619323, 0.02025397, 0.07731433, 0.10088063,
        0.02025397, 0.02025397, 0.02025397, 0.02025397, 0.02025397,
        0.02025397, 0.01103005, 0.02025397, 0.05044032, 0.05619323,
        0.13064762, 0.02025397, 0.04266514, 0.06126847, 0.02025397,
        0.05044032, 0.02025397, 0.02025397, 0.01103005, 0.05044032,
        0.05619323, 0.1214427 , 0.02025397, 0.01103005, 0.04157749,
        0.04050795, 0.04266514, 0.02025397, 0.05044032, 0.02025397,
        0.01103005, 0.01103005, 0.02025397, 0.05619323, 0.01103005,
        0.05044032, 0.01103005, 0.01103005, 0.04

In [39]:
svd_topics = svd.components_.argsort()[:, ::-1]
for i in range(0, 1):
    top_words = [tfidf_terms[x] for x in svd_topics[i, :5]]
    print(top_words)

['malt', 'light', 'hop', 'grassy', 'slightly']


### Hite

In [40]:
tfidf_hite = tfidf.fit_transform(hite)
tfidf_terms = tfidf.get_feature_names()

In [41]:
svd = TruncatedSVD(n_components=1, random_state=0)
svd.fit(tfidf_hite)

TruncatedSVD(n_components=1, random_state=0)

In [42]:
svd.components_

array([[0.04706187, 0.08487791, 0.02764983, 0.02764983, 0.02865599,
        0.08487791, 0.04587161, 0.09354658, 0.02865599, 0.05219714,
        0.05827533, 0.13787549, 0.16699719, 0.02764983, 0.05219714,
        0.04587161, 0.05827533, 0.05827533, 0.02865599, 0.02764983,
        0.05443721, 0.04706187, 0.09373209, 0.02764983, 0.06351177,
        0.06335526, 0.17875065, 0.08487791, 0.05827533, 0.05219714,
        0.05827533, 0.05219714, 0.04587161, 0.02764983, 0.02865599,
        0.02764983, 0.0816052 , 0.04587161, 0.02865599, 0.09412374,
        0.08138699, 0.13451405, 0.20354603, 0.05529965, 0.04587161,
        0.04706187, 0.0816052 , 0.05443721, 0.05827533, 0.02764983,
        0.11872573, 0.04706187, 0.04786509, 0.04587161, 0.02865599,
        0.02865599, 0.13988651, 0.12158045, 0.07304423, 0.04587161,
        0.02764983, 0.06978148, 0.02764983, 0.12718619, 0.05219714,
        0.02865599, 0.02764983, 0.02764983, 0.05827533, 0.05219714,
        0.11374422, 0.10836693, 0.15733892, 0.04

In [43]:
svd_topics = svd.components_.argsort()[:, ::-1]
for i in range(0, 1):
    top_words = [tfidf_terms[x] for x in svd_topics[i, :5]]
    print(top_words)

['well', 'cold', 'light', 'bottle', 'asian']


### Cass

In [44]:
tfidf_cass = tfidf.fit_transform(cass)
tfidf_terms = tfidf.get_feature_names()

In [46]:
svd = TruncatedSVD(n_components=1, random_state=0)
svd.fit(tfidf_cass)

TruncatedSVD(n_components=1, random_state=0)

In [47]:
svd.components_

array([[ 5.24975517e-02,  5.98811003e-02,  5.98811003e-02,
         1.31064489e-02,  5.98811003e-02,  1.93448928e-01,
         1.04995103e-01,  9.59618863e-02,  9.35378363e-02,
         9.35378363e-02,  1.31064489e-02,  2.16900744e-02,
        -9.95286401e-17,  1.31064489e-02,  5.98811003e-02,
         5.01515864e-02,  5.01515864e-02,  9.65331783e-02,
         3.45327072e-02,  1.31064489e-02,  9.59618863e-02,
         1.91748001e-01,  4.33801489e-02,  9.65331783e-02,
         4.33801489e-02,  2.95802238e-02,  1.31064489e-02,
         5.24975517e-02,  5.01515864e-02,  1.26204086e-01,
         1.24209801e-01,  5.24975517e-02,  2.21438127e-01,
         1.31064489e-02,  5.01515864e-02,  5.98811003e-02,
         9.59618863e-02,  2.16900744e-02,  2.16900744e-02,
         2.95802238e-02,  5.98811003e-02, -9.95286401e-17,
         2.16900744e-02,  2.16900744e-02,  3.45327072e-02,
         5.24975517e-02,  1.31064489e-02,  2.16900744e-02,
         2.16900744e-02,  8.02603620e-02,  5.01515864e-0

In [48]:
svd_topics = svd.components_.argsort()[:, ::-1]
for i in range(0, 1):
    top_words = [tfidf_terms[x] for x in svd_topics[i, :5]]
    print(top_words)

['come', 'lager', 'american', 'grassy', 'bottle']


## 2. LDA

### TOP15

In [49]:
top3 = []

for d in top2:
    d1 = my_tokenizer(d)
    top3.append(d1)

top3

[['didnt',
  'think',
  'go',
  'give',
  'perfect',
  'score',
  'look',
  'smell',
  'taste',
  'however',
  'mouth',
  'mean',
  'equal',
  'pardon',
  'sit',
  'mouth',
  'yeh',
  'like',
  'twice',
  'anything',
  'ive',
  'ever',
  'grab',
  'face',
  'hand',
  'say',
  'go',
  'throat',
  'taste',
  'familiar',
  'like',
  'real',
  'pure',
  'butter',
  'real',
  'buttermilk',
  'pancake',
  'fresh',
  'highbush',
  'blueberry',
  'real',
  'vermont',
  'maple',
  'syrup',
  'madagascar',
  'vanilla',
  'serve',
  'seattle',
  'coffee',
  'shot',
  'pappy'],
 ['read',
  'review',
  'call',
  'legendary',
  'kbbs',
  'kinda',
  'make',
  'nervous',
  'dig',
  'stout',
  'thought',
  'could',
  'miss',
  'white',
  'stout',
  'happily',
  'guy',
  'drinking',
  'something',
  'else',
  'maybe',
  'meant',
  'kbbs',
  'like',
  'mountain',
  'dew',
  'enjoy',
  'reviewer',
  'afraid',
  'bah',
  'care',
  'kbbs',
  'look',
  'great',
  'pour',
  'wine',
  'stem',
  'really',
  'da

In [52]:
import gensim
from gensim import corpora

In [62]:
gensim_terms = corpora.Dictionary(top3)

In [63]:
top_matrix = [gensim_terms.doc2bow(w) for w in top3]

In [64]:
lda = gensim.models.ldamodel.LdaModel

In [65]:
lda_model = lda(top_matrix, num_topics=3, id2word=gensim_terms, random_state=0)

In [66]:
lda_model.print_topics(num_words=5)

[(0,
  '0.013*"maple" + 0.010*"chocolate" + 0.009*"barrel" + 0.007*"flavor" + 0.007*"coffee"'),
 (1,
  '0.008*"hop" + 0.007*"like" + 0.006*"balance" + 0.006*"head" + 0.006*"fruit"'),
 (2,
  '0.010*"note" + 0.009*"like" + 0.009*"head" + 0.008*"thick" + 0.007*"sweet"')]

### worst15

In [67]:
worst3 = []

for d in worst2:
    d1 = my_tokenizer(d)
    worst3.append(d1)

worst3

[['hold',
  'review',
  'one',
  'go',
  'back',
  'forth',
  'consider',
  'straight',
  'light',
  'rat',
  'jut',
  'poor',
  'high',
  'end',
  'extremely',
  'watery',
  'flat',
  'taste',
  'rank',
  'low',
  'calorie',
  'brew',
  'solely',
  'would',
  'rank',
  'halfway',
  'poor',
  'average'],
 ['ever',
  'use',
  'found',
  'ground',
  'kid',
  'smell',
  'fizzy',
  'watery',
  'literally',
  'reminiscent',
  'tonic',
  'water',
  'soda',
  'bad',
  'bubbly',
  'carbonate',
  'pallette',
  'feel',
  'like',
  'caloric',
  'intake',
  'name',
  'make',
  'diet',
  'restricts',
  'many',
  'calorie',
  'consume',
  'decent',
  'replacement',
  'regular',
  'light',
  'miller',
  'lite',
  'bud',
  'light',
  'coors',
  'light',
  'review',
  'beer',
  'let',
  'know',
  'substitute',
  'beer',
  'really',
  'water',
  'though'],
 ['one',
  'golf',
  'course',
  'great',
  'well',
  'thought',
  'would',
  'enough',
  'short',
  'review',
  'review',
  'pretty',
  'much',
  'i

In [68]:
gensim_terms = corpora.Dictionary(worst3)

In [69]:
worst_matrix = [gensim_terms.doc2bow(w) for w in worst3]

In [70]:
lda = gensim.models.ldamodel.LdaModel

In [71]:
lda_model = lda(worst_matrix, num_topics=4, id2word=gensim_terms, random_state=0)

In [72]:
lda_model.print_topics(num_words=5)

[(0,
  '0.034*"light" + 0.018*"like" + 0.010*"flavor" + 0.010*"one" + 0.009*"water"'),
 (1,
  '0.013*"like" + 0.012*"flavor" + 0.008*"light" + 0.008*"drink" + 0.007*"would"'),
 (2,
  '0.009*"drink" + 0.007*"water" + 0.007*"like" + 0.007*"bad" + 0.007*"best"'),
 (3,
  '0.018*"light" + 0.011*"like" + 0.010*"drink" + 0.009*"head" + 0.008*"malt"')]

### Korean brands

### Kloud

In [73]:
kloud2 = []

for d in kloud:
    d1 = my_tokenizer(d)
    kloud2.append(d1)

kloud2

[['fl',
  'oz',
  'store',
  'englewood',
  'village',
  'colorado',
  'abv',
  'malt',
  'classic',
  'whatever',
  'mean',
  'premium',
  'quality',
  'lager',
  'website',
  'kloud',
  'make',
  'ferment',
  'concentrate',
  'without',
  'dilution',
  'water',
  'use',
  'original',
  'gravity',
  'method',
  'make',
  'raw',
  'material',
  'chosen',
  'strict',
  'selection',
  'process',
  'provide',
  'authentic',
  'flavor',
  'name',
  'kloud',
  'composite',
  'k',
  'korea',
  'cloud',
  'similar',
  'appearance',
  'foam',
  'hope',
  'provide',
  'represent',
  'korea',
  'express',
  'product',
  'name',
  'appearance',
  'look',
  'filter',
  'carbonate',
  'slightly',
  'foamy',
  'head',
  'vibrant',
  'look',
  'innocuous',
  'bit',
  'insipid',
  'aroma',
  'slightly',
  'doughy',
  'slighty',
  'raw',
  'malty',
  'sweetness',
  'heavy',
  'across',
  'board',
  'honeyed',
  'sweetness',
  'hint',
  'cidery',
  'cane',
  'sugar',
  'hop',
  'aromatics',
  'yeast',
 

In [74]:
gensim_terms = corpora.Dictionary(kloud2)

In [75]:
kloud_matrix = [gensim_terms.doc2bow(w) for w in kloud2]

In [77]:
lda = gensim.models.ldamodel.LdaModel
lda_model = lda(kloud_matrix, num_topics=1, id2word=gensim_terms, random_state=0)
lda_model.print_topics(num_words=5)

[(0,
  '0.014*"malt" + 0.011*"light" + 0.011*"hop" + 0.009*"body" + 0.009*"slightly"')]

### Cass

In [78]:
cass2 = []

for d in cass:
    d1 = my_tokenizer(d)
    cass2.append(d1)

cass2

[['bottle',
  'prince',
  'korea',
  'town',
  'los',
  'feel',
  'straight',
  'pretty',
  'flat',
  'van',
  'gogh',
  'zoloft',
  'like',
  'lot',
  'go',
  'vincent',
  'dull',
  'blank',
  'face',
  'sad',
  'even',
  'suppose',
  'wild',
  'spirit',
  'one',
  'hurray',
  'ok'],
 ['hey',
  'come',
  'definitely',
  'awful',
  'yeah',
  'know',
  'adjunct',
  'cr',
  'p',
  'yeah',
  'water',
  'aftertaste',
  'quite',
  'bad',
  'feel',
  'well',
  'japanese',
  'beer',
  'japan',
  'lager',
  'always',
  'well',
  'mention',
  'go',
  'food',
  'even'],
 ['ml',
  'bottle',
  'green',
  'pea',
  'stavanger',
  'abv',
  'clear',
  'pale',
  'golden',
  'colour',
  'moderate',
  'white',
  'head',
  'sweetish',
  'aroma',
  'pilsener',
  'malt',
  'starch',
  'grass',
  'candy',
  'fizzy',
  'medium',
  'sweet',
  'flavour',
  'element',
  'aroma',
  'moderate',
  'hop'],
 ['secret',
  'drinking',
  'ca',
  'step',
  'buy',
  'large',
  'plastic',
  'bottle',
  'variety',
  'multip

In [79]:
gensim_terms = corpora.Dictionary(cass2)

In [80]:
cass_matrix = [gensim_terms.doc2bow(w) for w in cass2]

In [81]:
lda = gensim.models.ldamodel.LdaModel
lda_model = lda(cass_matrix, num_topics=1, id2word=gensim_terms, random_state=0)
lda_model.print_topics(num_words=5)

[(0,
  '0.016*"bottle" + 0.014*"come" + 0.012*"soju" + 0.012*"lager" + 0.009*"hop"')]

### Hite

In [82]:
hite2 = []

for d in hite:
    d1 = my_tokenizer(d)
    hite2.append(d1)

hite2

[['light',
  'easy',
  'drinking',
  'hint',
  'sweetness',
  'even',
  'bit',
  'soda',
  'nose',
  'reflect',
  'one',
  'lot',
  'laud',
  'complain'],
 ['ml',
  'bottle',
  'green',
  'pea',
  'stavanger',
  'abv',
  'crystal',
  'clear',
  'pale',
  'golden',
  'colour',
  'moderate',
  'white',
  'head',
  'sweetish',
  'aroma',
  'pilsener',
  'malt',
  'grass',
  'maize',
  'light',
  'body',
  'citric',
  'sweet',
  'citric',
  'flavour',
  'note',
  'grass',
  'maize',
  'modest',
  'bitterness'],
 ['drink',
  'cold',
  'cold',
  'get',
  'pours',
  'clear',
  'minimal',
  'aroma',
  'bland',
  'fizzy',
  'flavor',
  'quick',
  'finish',
  'come',
  'can',
  'bottle',
  'even',
  'plastic',
  'bottle',
  'something'],
 ['great',
  'even',
  'really',
  'one',
  'love',
  'reason',
  'always',
  'seem',
  'case',
  'house',
  'bland',
  'boring',
  'real',
  'another',
  'fantastic',
  'hot',
  'day',
  'want',
  'hop',
  'anything',
  'beyond',
  'malt',
  'rice',
  'go',
  '

In [83]:
gensim_terms = corpora.Dictionary(hite2)

In [84]:
hite_matrix = [gensim_terms.doc2bow(w) for w in hite2]

In [85]:
lda = gensim.models.ldamodel.LdaModel
lda_model = lda(hite_matrix, num_topics=1, id2word=gensim_terms, random_state=0)
lda_model.print_topics(num_words=5)

[(0,
  '0.014*"light" + 0.010*"flavor" + 0.010*"drink" + 0.010*"lager" + 0.010*"cold"')]