In [2]:
from pymongo import MongoClient
from bs4 import BeautifulSoup
import logging
import pandas as pd

we_eat_client = MongoClient()
we_eat_db = we_eat_client['we_eat']
website_collection = we_eat_db['websites']
review_collection = we_eat_db['reviews']

In [11]:
website_collection.count_documents({})

657

In [12]:
website = website_collection.find_one()

In [15]:
def parse_review(review, alias):
    r = {}
    r['alias'] = alias
    r['username'] = review.select_one('li.user-name').text.strip()
    r['href'] = review.select_one('li.user-name a').attrs['href']
    r['userid'] = r['href'].partition('?userid=')[2]
    r['rating'] = float(review.select_one('div.i-stars').attrs['title'].partition(' ')[0])
    r['date'] = review.select_one('span.rating-qualifier').text.strip()
    r['review_text'] = review.select_one('div.review-content p').text
    return r
    

In [16]:
def collect_reviews(website_collection):
    for website in website_collection.find():
        url = website['url']
        print(url)
        logging.info(url)
        alias = url.rpartition('/')[2]
        html = website['html']
        soup = BeautifulSoup(html, 'html.parser')
        reviews = soup.select('div.review')
        for review in reviews[1:]:
            review_data = parse_review(review, alias)
            save_review(review_data)
            

In [17]:
def save_review(review_data):
    review_collection.delete_many({
        'alias': review_data['alias'],
        'userid': review_data['userid']
    })
    review_collection.insert_one(review_data)
    

In [14]:
collect_reviews(website_collection)

https://www.yelp.com/biz/amazon-go-seattle-5


ValueError: invalid literal for int() with base 10: '5.0'

In [22]:
html = website['html']

In [24]:
soup = BeautifulSoup(html, 'html.parser')

In [28]:
reviews = soup.select('div.review')

In [None]:
#[review.text for review in reviews]

In [29]:
review = reviews[1]

In [38]:
review.select_one('li.user-name a')

<a class="user-display-name js-analytics-click" data-analytics-label="about_me" data-hovercard-id="Ytg6yFIK8NbNcVHBboKJPA" href="/user_details?userid=JuLCP_tSVkfgs6884f_SvQ" id="dropdown_user-name">Yan Z.</a>

In [39]:
review.select_one('div.i-stars')

<div class="i-stars i-stars--regular-5 rating-large" title="5.0 star rating">
<img alt="5.0 star rating" class="offscreen" height="303" src="https://s3-media2.fl.yelpcdn.com/assets/srv0/yelp_design_web/9b34e39ccbeb/assets/img/stars/stars.png" width="84"/>
</div>

In [42]:
review.select_one('span.rating-qualifier')

<span class="rating-qualifier">
        11/1/2018
    </span>

In [18]:
df = pd.DataFrame(list(review_collection.find()))

In [19]:
df.shape

(11431, 8)

In [20]:
names = df.userid.value_counts()

In [27]:
multiple_reviewers = names[names>1]

In [28]:
multiple_reviewers

iNIxjH-qoi0DW-0M2ML2DA    65
YQ01_0vr9TahbQiBSr69Nw    42
PSnUXFT5luMvb73mQQSjUg    36
4IsfIY1Df9dxEeDu7lHd5g    33
NfU0zDaTMEQ4-X9dbQWd9A    32
nfL3ilIVq58nvT1iqS_ZpQ    28
on72W_uXxaomHGxxcM3S_w    27
iztl5NRH01qvWA0LEICw8Q    26
OGLkivfkMESvh-2Fkg3bjw    24
a2osd_dBVDYaO-kL_Whjng    21
oTae-gLGMpNNFxcBn2D77A    21
5ju4tmqmdYS3QquclUxIkg    20
T0HWgeviH7722ZJpWFrB4Q    17
4teLaidLkOwhWuee9PET7g    17
cKTA-iJbfrioKWFiDreghw    16
KTz8KGjhIW3ejJ_RlTKH_g    16
QtIPWTVbngTszYtBU8tCkQ    16
P1FObQIDR0ErchMIKc05Xg    16
s3OwzXmv5Xrd2OuYu8YmHw    15
7u5sOo6-W-iwAgg-yrXE6Q    14
7f6ZDbtiFvVK0Ajhb0fuAA    13
6yw_4m8IBhdPeAvQNbFMDw    13
Mof6BV_iz_JP1RZvyg1sUw    13
TiAunhv-wmGaUZvtox7xvg    13
y79iXFlBPRO588FvlJVZWQ    13
6gMaEJU2xjRCPjiKYsB2Nw    13
ATo6Yuweoi3ZVB0qNz34sw    13
jC_eLNR8Le-FvQOK5jMi8g    12
lcQRKGDkc_uRTK_Yqx5z-A    12
zMKWSl7SxxPW3nidytb91g    12
                          ..
rrfHL-1xvGM3kDaXsNDkQA     2
XRMArbiZOGNYFGlap9fBDA     2
cczCUack5vpprurWhc0ckg     2
but2s3yQA5LokH

In [22]:
more_than_one_review = df[df.duplicated('userid', keep=False)]

In [23]:
more_than_one_review.shape

(5216, 8)

In [24]:
more_than_one_review.sort_values(by='userid')

Unnamed: 0,_id,alias,date,href,rating,review_text,userid,username
159,5c07264e91d56fb8e2f0ac86,elliotts-oyster-house-seattle-2,11/26/2018\n \n\n\n\n\n\nUpdated re...,/user_details?userid=-0EvRubXjITOMSvLrMu76A,4.0,Got the fish and chips - first two pieces of f...,-0EvRubXjITOMSvLrMu76A,Brian R.
4178,5c0726b191d56fb8e2f0bc39,le-pichet-seattle,10/19/2018,/user_details?userid=-0EvRubXjITOMSvLrMu76A,5.0,Hidden gem in Seattle. If you are looking for ...,-0EvRubXjITOMSvLrMu76A,Brian R.
8629,5c07272891d56fb8e2f0cd9c,cafe-nordstrom-seattle-5,7/7/2018,/user_details?userid=-Cys-ssZV8WqUc2Zvu-0Pg,4.0,"The staff at this cafe is amazing, from the ca...",-Cys-ssZV8WqUc2Zvu-0Pg,Rebeca G.
6328,5c0726e991d56fb8e2f0c49f,delaurenti-food-and-wine-seattle-2,8/6/2018,/user_details?userid=-Cys-ssZV8WqUc2Zvu-0Pg,5.0,"Large variety, super cool placeIf you like to ...",-Cys-ssZV8WqUc2Zvu-0Pg,Rebeca G.
10296,5c07275691d56fb8e2f0d41f,chip-and-drews-seattle,9/4/2014,/user_details?userid=-D4T2oVxSjvVN9_owkEmyw,1.0,You've gotta be kidding me.I work in the build...,-D4T2oVxSjvVN9_owkEmyw,Jonelle T.
8083,5c07271991d56fb8e2f0cb7a,new-star-seafood-restaurant-seattle,3/14/2018,/user_details?userid=-D4T2oVxSjvVN9_owkEmyw,3.0,I feel like New Star is really expensive for C...,-D4T2oVxSjvVN9_owkEmyw,Jonelle T.
4654,5c0726be91d56fb8e2f0be15,fado-irish-pub-seattle-3,10/19/2018,/user_details?userid=-DrzglrYFZG_qg_YtbknSQ,4.0,Nice find if you like funky bars it's an Irish...,-DrzglrYFZG_qg_YtbknSQ,Karen V.
5486,5c0726d491d56fb8e2f0c155,turkish-delight-seattle,10/19/2018,/user_details?userid=-DrzglrYFZG_qg_YtbknSQ,4.0,Stopped in on a cold day and had the lentil so...,-DrzglrYFZG_qg_YtbknSQ,Karen V.
7663,5c07270b91d56fb8e2f0c9d6,tats-truck-seattle,11/12/2018,/user_details?userid=-EnSMkETzd9oKlu3Kc3_Ew,2.0,Ordered an Italian hoagie which is always sold...,-EnSMkETzd9oKlu3Kc3_Ew,Gun H.
8308,5c07271f91d56fb8e2f0cc5b,buffalo-wild-wings-seattle-2,5/2/2018,/user_details?userid=-EnSMkETzd9oKlu3Kc3_Ew,1.0,"Absolutely terrible experience here, and a gig...",-EnSMkETzd9oKlu3Kc3_Ew,Gun H.


# Build dataframe for ALS model

In [29]:
aliases = more_than_one_review['alias'].unique()

In [30]:
alias_ids = zip(aliases, range(len(aliases)))

In [31]:
alias_dict = dict(alias_ids)

In [32]:
more_than_one_review['item_id'] = more_than_one_review['alias'].apply(alias_dict.get)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [34]:
user_ids = more_than_one_review['userid'].unique()

In [35]:
user_id = zip(user_ids, range(len(user_ids)))

In [36]:
user_dict = dict(user_id)

In [37]:
more_than_one_review['user_id'] = more_than_one_review['userid'].apply(user_dict.get)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [54]:
more_than_one_review['date'] = pd.to_datetime(more_than_one_review['date'], errors='input')

AssertionError: 

In [53]:
s = '8/24/2018\n            \n\n\n\n\n\nUpdated review'

s.partition('\n')[0]

'8/24/2018'

In [58]:
more_than_one_review['date'] = more_than_one_review['date'].apply(lambda x: x.partition('\n')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [60]:
more_than_one_review['date'] = pd.to_datetime(more_than_one_review['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [70]:
#more_than_one_review.sort_values('date')

In [65]:
pruned_df = more_than_one_review.drop(['_id', 'href', 'review_text', 'userid', 'username'], axis=1)

In [68]:
sorted_pruned_df = pruned_df.sort_values('date')

In [69]:
sorted_pruned_df

Unnamed: 0,alias,date,rating,item_id,user_id
11108,chocolate-and-ice-cream-delight-seattle,2005-08-02,3.0,596,1520
11032,unconventional-pizza-seattle,2005-09-14,4.0,592,1520
9153,chew-chews-and-eatery-seattle,2006-05-13,4.0,480,1369
11174,subway-seattle-6,2006-05-19,5.0,601,1369
9291,moghul-express-seattle,2006-05-22,5.0,488,1369
10280,taco-del-mar-seattle-15,2007-01-19,4.0,544,1470
11369,subway-seattle-3,2007-02-06,3.0,615,1499
10048,king-dome-deli-seattle,2007-02-08,4.0,530,293
7143,rays-deli-seattle,2007-02-18,5.0,370,1369
9137,king-street-kafe-seattle,2007-06-20,2.0,479,293
