In [0]:
import json
import warnings
import pandas as pd
from lxml import html
from requests import Session
from concurrent.futures import ThreadPoolExecutor as Executor
import requests


base_url = "https://www.yelp.com/biz/" # add business id
api_url = "/review_feed?sort_by=date_desc&start="
bid = 'Rc1lxc5lSKJYd162JHNMfQ'

class Scraper():
    def __init__(self):
        self.data = pd.DataFrame()

    def get_data(self, n, bid=bid):
        with Session() as s:
            with s.get(base_url+bid+api_url+str(n*20)) as resp: #makes an http get request to given url and returns response as json
                r = json.loads(resp.content) #converts json response into a dictionary
                _html = html.fromstring(r['review_list']) #loads from dictionary

                dates = _html.xpath("//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()")
                reviews = [el.text for el in _html.xpath("//div[@class='review-content']/p")]
                ratings = _html.xpath("//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title")

                df = pd.DataFrame([dates, reviews, ratings]).T

                self.data = pd.concat([self.data,df])

    def scrape(self): #makes it faster
        # multithreaded looping
        with Executor(max_workers=40) as e:
            list(e.map(self.get_data, range(10)))

s = Scraper()
s.scrape()
df = s.data

In [0]:
df.head(3)

Unnamed: 0,0,1,2
0,\n 11/2/2019\n,Hells Kitchen is having a resurgence in good r...,5.0 star rating
1,\n 11/1/2019\n,"Delicious tapas, great chef, delicious wine. L...",5.0 star rating
2,\n 11/1/2019\n,If you want to see my personal brunch experien...,5.0 star rating


In [0]:
    # df = pd.DataFrame(reviews, columns=['date', 'text','stars'])
    # df['date']= pd.to_datetime(df['date']) 
    df = df.rename(columns = {0:'date', 2:'stars',1:'text'})
    df['date'] = df['date'].str.replace('\n','')
    df['date'] = df['date'].str.replace(' ','')
    df['date'] = df['date'].astype('datetime64[ns]')
    ratingDict = {'5.0 star rating':5,'4.0 star rating':4, '3.0 star rating':3, '2.0 star rating':2, '1.0 star rating':1}
    df['stars'] = df['stars'].map(ratingDict) 
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['week_number_of_year'] = df['date'].dt.week
    df.head(2)

Unnamed: 0,date,text,stars,month,year,week_number_of_year
0,2019-11-02,Hells Kitchen is having a resurgence in good r...,5.0,11.0,2019.0,44.0
1,2019-11-01,"Delicious tapas, great chef, delicious wine. L...",5.0,11.0,2019.0,44.0


In [0]:
    bydate = df.groupby(['year', 'month','week_number_of_year']).mean()
    bydate = pd.DataFrame(bydate.to_records())#flatten groupby column
    bydate = bydate.iloc[::-1]
    bydate = bydate.head(8)
    bydate['cumulative_avg_rating'] = bydate['stars'].mean()
    # bydate['last'] = bydate['week_number_of_year'].astype(int)
    bydate['year'] = bydate['year'].astype(int)
    bydate['week_number_of_year'] = bydate['week_number_of_year'].astype(int)
    bydate['date_of_week'] = bydate['year'].astype(str) + '-' + bydate['week_number_of_year'].astype(str) + '-2'
    bydate['date_of_week'] = bydate['date_of_week'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%W-%w'))
    bydate.head(2)

Unnamed: 0,year,month,week_number_of_year,stars,cumulative_avg_rating,date_of_week
34,2020,1.0,2,3.666667,4.302083,2020-01-14
33,2020,1.0,1,5.0,4.302083,2020-01-07


In [0]:
for row in bydate[['date_of_week','stars']].values:
  print(row)

[Timestamp('2020-01-14 00:00:00') 3.6666666666666665]
[Timestamp('2020-01-07 00:00:00') 5.0]
[Timestamp('2019-12-31 00:00:00') 4.166666666666667]
[Timestamp('2019-12-24 00:00:00') 3.75]
[Timestamp('2019-12-17 00:00:00') 3.6666666666666665]
[Timestamp('2019-12-10 00:00:00') 4.666666666666667]
[Timestamp('2019-12-03 00:00:00') 4.5]
[Timestamp('2019-01-08 00:00:00') 5.0]


In [0]:
b = bydate

In [0]:
import datetime
d = "2013-W26-6"
b['week_number_of_year']
r = datetime.datetime.strptime(d, "%Y-W%W-%w")
print(r)

2013-07-06 00:00:00


In [0]:
results = {'viztype0':{'positive': [{'term': pos_term, 'score': pos_score} 
                        for pos_term, pos_score in zip(positive_df['term'], positive_df['score'])], 
                      'negative': [{'term': neg_term, 'score': neg_score} 
                                    for neg_term, neg_score in zip(negative_df['term'], negative_df['score'])]},
          'viztype3':{'const star_data': [{'date': term, 'cumulative_avg_rating': current_rating, 'weekly_avg_rating': week_rating}
                                  for term, current_rating, week_rating in zip(bydate['week_number_of_year'], bydate['cumulative_avg_rating'], bydate['stars'])]}
          }
results