In [1]:
import requests
import pandas as pd
from src import utilities as utils
import random
from retry import retry
from src import config
from pathlib import Path
from src import clogger as logger
import hashlib
import time
import json
from datetime import datetime
import re

In [2]:

from concurrent.futures import ThreadPoolExecutor

# BASE_DIR = config.PROJECT_DIR / "data" / "zomato"
CACHE_DIR = Path("/media/jyotiraditya/Ultra Touch/Spatic/cache_home/cache_zomato_reviews")
log = logger.get_logger('ZomatoRestaurants')
logging = log


def get_random_proxy(index=None):
    available_proxies = utils.get_other_proxies()
    if len(available_proxies) == 0:
        raise Exception('No proxy available')

    if index is not None and 0 <= index < len(available_proxies):
        index = index % len(available_proxies)
        current_proxy = available_proxies[index]
    else:
        current_proxy_ind = random.choice(range(len(available_proxies)))
        current_proxy = available_proxies[current_proxy_ind]
    proxy = {
        "https": current_proxy
    }
    return proxy


class EmptyPageError(Exception):
    pass


In [7]:

@retry(exceptions=(EmptyPageError,), tries=3, delay=3)
def get_html_response_google(hex_id, num_reviews, proxy):
    # proxy=get_random_proxy()
    BASE_DIR = config.PROJECT_DIR / "data" / "google"
    CACHE_DIR = Path("/media/jyotiraditya/Ultra Touch/Spatic/cache_home/cache_google_reviews_oldest")
    log = logger.get_logger('GoogleReviews')
    logging = log
    dig = hashlib.md5(f"{hex_id}_{num_reviews}_fetch_latest=true".encode()).hexdigest()
    cache_dirs = [BASE_DIR / 'cache', CACHE_DIR, ]
    lambda_url = random.choice(["https://u7ub4ix7quehlb4mptoucjvndu0brrnf.lambda-url.us-east-2.on.aws",
                                "https://sadxe5vx7h3wdbi3j5z5za3o5a0mxgtk.lambda-url.us-east-1.on.aws"])
    url = f"{lambda_url}/?hex_id={hex_id}&num_reviews={num_reviews}&fetch_latest=true"
    for cache_dir in cache_dirs:
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_file = cache_dir / f'{dig}.txt'
        if cache_file.exists():
            with open(cache_file) as f:
                resp = f.read()
                if "Expecting value" in resp:
                    print('deleting file')
                    cache_file.unlink()
                    continue
                return resp
    try:
        print("making request")
        response = requests.get(
            url,
            headers=None,
            proxies=None, timeout=10,
        )
        if response.status_code == 200:
            time.sleep(1)
            text = response.text
            if "Expecting value" in text:
                raise EmptyPageError
            with open(cache_file, 'w+') as f:
                f.write(text)
            data = json.loads(text)
            if data['statusCode'] == 200:
                return text
            else:
                log.error(text)
                raise ValueError
        else:
            log.error(response.text, response.status_code)
            raise EmptyPageError
    except Exception as e:
        log.error(f'{proxy} | {e}')
        raise e


def get_google_reviews(hex_id, num_reviews):
    # print(f"querying : {hex_id = } ")
    if num_reviews < 11:
        return
    # if num_reviews > 0 and num_reviews % 10 == 0:
    #     num_reviews -= 1
    try:
        data = get_html_response_google(hex_id, num_reviews=num_reviews, proxy=None)
    except:
        return None
    if 'Expecting' in data:
        raise ValueError
    else:
        pass
    data = json.loads(data)
    if data['statusCode'] != 200:
        print(data)
        return 0
    # resp = dict(
    #     date=data['data']['date'], number_of_reviews=data['data']['number_of_reviews'])
    return data['data']


def get_reviews_per_day_(cid, number_of_votes, source):
    if number_of_votes < 30 and source == 'gmaps_v2':
        return {'date': 0, "number_of_reviews": 0}
    f = {'gmaps_v2': get_google_reviews, 'zomato': get_reviews_zomato}
    try:
        res = f[source](cid, number_of_votes)
    except KeyError:
        res = {'date': 0, "number_of_reviews": 0}
    return res





@retry(exceptions=(EmptyPageError,), tries=3, delay=1)
def get_html_response(url, proxy):
    proxy = get_random_proxy()
    # proxy=None
    BASE_DIR = config.PROJECT_DIR / "data" / "zomato"

    dig = hashlib.md5(url.encode()).hexdigest()
    cache_dirs = [BASE_DIR / 'cache', CACHE_DIR, ]
    for cache_dir in cache_dirs:
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_file = cache_dir / f'{dig}.txt'
        if cache_file.exists():
            with open(cache_file) as f:
                resp = f.read()
                return resp
    # headers = get_headers()
    headers = {"User-Agent": "PostmanRuntime/7.32.3",
               "Accept": "*/*",
               "Cache-Control": "no-cache",
               "Postman-Token": "51227458-1214-44a6-8f4b-f761ec0099ce",
               "Host": "www.zomato.com",
               "Accept-Encoding": "gzip, deflate, br",
               "Connection": "keep-alive",
               "Cookie": r"PHPSESSID=d211e97ee90f64010ad7b769d4d04ecf; ak_bmsc=4B18F182E4143B6397D227D52152B8BB~000000000000000000000000000000~YAAQbW4/F7lGi0KKAQAAw6m+ShSNTUWoLjzwHd3EHr5WBV5Knrg6PnMYUNEMXodRwWzhxg6ngrqA00b8kNSNTmwUnJn0m+6GqjEK/lCRk/1SRHCTcMSdSiBk2JgAqWYYDekRP05CDc20hjCwhzA1/jgF9NE2t1zShtAEnOVc2zgjGa4bWLEnhQxu/GNs1B6YYs9GomL6oq7tXBDd2ntUtWWCG/G8EKN3YH3VUnTB5Glc77VFkJtMLgX2dO8Vi+NT8x194KT0hRbdTH8rXLUo2O6qQjSiHRxSKoIv60oLeu52ch9Bm3VDiHMhWSRrL02BGipXnZkidWjwLpmx51j6FgBgrnW0qoQklpVhQ1lTsT/oFssWZChORJwRq4s=; csrf=21d1af5ae835429f207597a6cec658d6; fbcity=3; fbtrack=eeb9eaaeb90b494e2b326e56643a9bad; fre=0; rd=1380000; zl=en; AWSALBTG=RK0ccuPffBh8VFY/BolIJO3spzo2UtOXUYpj1ruvnVmsCj/pwymk0pgOYRb1U2QtgR2mhV2DYWDmeuqOVVMTRhpSS0+TcPNYh1IElZN9vXVqZ/9G1isrxtMou4ghKKSzdD6Yx8zwzx9Kpjb+Vhe0gDFvXnubSa+oliwYNbKfhlQg; AWSALBTGCORS=RK0ccuPffBh8VFY/BolIJO3spzo2UtOXUYpj1ruvnVmsCj/pwymk0pgOYRb1U2QtgR2mhV2DYWDmeuqOVVMTRhpSS0+TcPNYh1IElZN9vXVqZ/9G1isrxtMou4ghKKSzdD6Yx8zwzx9Kpjb+Vhe0gDFvXnubSa+oliwYNbKfhlQg; locus=%7B%22addressId%22%3A0%2C%22lat%22%3A19.017656%2C%22lng%22%3A72.856178%2C%22cityId%22%3A3%2C%22ltv%22%3A3%2C%22lty%22%3A%22city%22%2C%22fetchFromGoogle%22%3Afalse%2C%22dszId%22%3A77482%2C%22fen%22%3A%22Mumbai%22%7D; ltv=3; lty=3"
               }

    try:
        response = requests.get(
            url,
            headers=headers,
            proxies=proxy, timeout=10,
        )
        if response.status_code == 200:
            with open(cache_file, 'w+') as f:
                f.write(response.text)
            return response.text
        else:
            log.error(response.text, response.status_code)
            raise EmptyPageError
    except Exception as e:
        log.error(f'{proxy} | {e}')
        raise EmptyPageError


def get_reviews_zomato(rest_id, number_of_reviews):
    url = 'https://www.zomato.com/webroutes/reviews/loadMore?res_id={rest_id}&filter=reviews-dd&sort=da'
    url_ = url.format(rest_id=rest_id)
    # print(f"querying : {url_}")
    data = get_html_response(url_, None)
    data = json.loads(data)
    reviews = data['entities']["REVIEWS"]
    timestamps = [reviews[k]['timestamp'] for k in reviews]
    if not timestamps:
        return None
    date = timestamps[0]
    s = data['page_data']['sections']['SECTION_REVIEWS']['pageReviewsText']
    pattern = re.compile("of (?P<number_of_reviews>\d+) reviews")
    try:
        res = pattern.search(s).groupdict()
    except:
        res = {}
    date = datetime.strptime(date, "%b %d, %Y").timestamp()
    res['date'] = int(date)
    return res


In [13]:
def get_reviews_per_day(df, suffix="_1"):
    for i in range(10):
        dd = df.to_dict(orient='records')
        cids = [i[f'cid_{suffix}'] for i in dd]
        reviews = [i[f'number_of_votes_{suffix}'] for i in dd]
        sources = [i[f'source_{suffix}'] for i in dd]
        with ThreadPoolExecutor(max_workers=10) as executor:
            dd_ = executor.map(get_reviews_per_day_, cids, reviews, sources)
    df['additional_data'] = list(dd_)
    df = pd.concat([df, pd.json_normalize(df['additional_data'])], axis=1).drop(
        columns="additional_data")
    df=df.rename(columns={"date":f"date_{suffix}"})
    df[f'reviews_per_day_{suffix}'] = 86400 * df[f'number_of_votes_{suffix}'] / (df[f'created_at_{suffix}'] - df[f'date_{suffix}'])
    df = df.sort_values(by=f'reviews_per_day_{suffix}', ascending=False)
    return df

In [6]:
get_reviews_per_day()

2024-01-31 19:33:40,349 ERROR ZomatoRestaurants:143 - {'https': 'http://vsatyen:unWLWiVPCd@107.181.187.120:10003'} | HTTPSConnectionPool(host='www.zomato.com', port=443): Max retries exceeded with url: /webroutes/reviews/loadMore?res_id=54199&filter=reviews-dd&sort=da (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 502 Bad Gateway')))


{'number_of_reviews': '1214', 'date': 1360953000}

In [42]:
df=pd.read_csv("zomato_gmaps_deduped_data.csv").head(40)

In [43]:
df

Unnamed: 0,id_1,id_2,cid_1,cid_2,source_1,source_2,name_1,number_of_votes_1,created_at_1,name_2,number_of_votes_2,created_at_2,service_options_1,service_options_2,brand_id_1,brand_id_2
0,ChIJR5XRNnc9rjsROCln6pfY2ww,19573302,0x3bae3d7736d19547:0xcdbd897ea672938,19573302,gmaps_v2,zomato,Hello GOWDA BIRIYANI KABAB ADDA,1,1687847039,H Gowda Biryani Kabab Adda,43,1688639100,[],[delivery],,N_A
1,ChIJq96HAxUXrjsRTEP8a6kyxeo,19269962,0x3bae17150387deab:0xeac532a96bfc434c,19269962,gmaps_v2,zomato,Plan B,2154,1699615207,Plan B,1264,1688639100,[],"[dining, delivery]",,zomato_plan_b
2,ChIJnThGvO4XrjsREMVeMmh_GAA,18872888,0x3bae17eebc46389d:0x187f68325ec510,18872888,gmaps_v2,zomato,Chef Kitchen,47,1699615207,Chef Kitchen,1842,1688639100,[],"[dining, delivery]",,N_A
3,ChIJrTLShas9rjsRyvhmFLp2JY0,20024640,0x3bae3dab85d232ad:0x8d2576ba1466f8ca,20024640,gmaps_v2,zomato,TOLL PARADISE,16,1687847039,Toll Paradise,42,1688639100,[],"[dining, delivery]",,N_A
4,ChIJcyMT4X49rjsRnKUPyPqI12k,18742896,0x3bae3d7ee1132373:0x69d788fac80fa59c,18742896,gmaps_v2,zomato,1992 Chats Space,154,1699615207,1992 Chats - Space,650,1688639100,[],"[dining, delivery]",,N_A
5,ChIJrbeWFr49rjsRgrYVepIjUN4,51305,0x3bae3dbe1696b7ad:0xde5023927a15b682,51305,gmaps_v2,zomato,Kentaky Fast Food,463,1699615207,Kentaky Fast Food,373,1688639100,[],"[dining, delivery]",,N_A
6,ChIJAbD-pT09rjsR_XnP7qgnSE4,19449687,0x3bae3d3da5feb001:0x4e4827a8eecf79fd,19449687,gmaps_v2,zomato,Cafe Coffee Day,633,1699615207,Cafe Coffee Day,480,1688639100,[],"[dining, delivery]",cafecoffeeday,zomato_cafe_coffee_day
7,ChIJg3bVF-09rjsRozhTJiYrS8g,18892504,0x3bae3ded17d57683:0xc84b2b26265338a3,18892504,gmaps_v2,zomato,Chai Point - Orion Mall,183,1699615207,Chai Point,314,1688639100,[],"[dining, delivery]",chaipoint,zomato_chai_point
8,ChIJbVs8CNA9rjsRLGTel5n5FWk,19487952,0x3bae3dd0083c5b6d:0x6915f99997de642c,19487952,gmaps_v2,zomato,DOSE MANE,268,1699615207,Dose Mane,137,1688639100,[],"[dining, delivery]",,N_A
9,ChIJyUvDu68XrjsREIYDxjUGIag,19016086,0x3bae17afbbc34bc9:0xa8210635c6038610,19016086,gmaps_v2,zomato,Kenzai,541,1699615207,Kenzai,785,1688639100,[],"[dining, delivery]",,N_A


In [44]:
df=get_reviews_per_day(df,"1")
df=get_reviews_per_day(df,"2")

In [45]:
df

Unnamed: 0,id_1,id_2,cid_1,cid_2,source_1,source_2,name_1,number_of_votes_1,created_at_1,name_2,...,service_options_2,brand_id_1,brand_id_2,date_1,number_of_reviews,url,reviews_per_day_1,number_of_reviews.1,date_2,reviews_per_day_2
23,ChIJ3de7cd8XrjsRpDs2CgrLksE,18232664,0x3bae17df71bbd7dd:0xc192cb0a0a363ba4,18232664,gmaps_v2,zomato,Delhi Mess,1957,1699615207,Delhi Mess,...,"[dining, delivery]",,N_A,1467182000.0,1966,https://www.google.com/maps/preview/review/lis...,0.727456,113.0,1607020200,26.252743
31,ChIJeXWlTmA9rjsRwayijh__N44,18160086,0x3bae3d604ea57579:0x8e37ff1f8ea2acc1,18160086,gmaps_v2,zomato,Domino's Pizza,3385,1699615207,Domino's Pizza,...,"[dining, delivery]",dominos,zomato_domino's_pizza,1471520000.0,3417,https://www.google.com/maps/preview/review/lis...,1.282201,11.0,1635532200,20.010959
37,ChIJs1OiG489rjsR3OA2ita5gm0,55655,0x3bae3d8f1ba253b3:0x6d82b9d68a36e0dc,55655,gmaps_v2,zomato,Hotel Sagar,19392,1699615207,Sagar Hotel,...,"[dining, delivery]",,N_A,1563794000.0,19964,https://www.google.com/maps/preview/review/lis...,12.335861,29.0,1636655400,17.784036
12,ChIJdR6R3YU9rjsR0qHeJqZ_IoE,18640097,0x3bae3d85dd911e75:0x81227fa626dea1d2,18640097,gmaps_v2,zomato,Green Chilly,1031,1699615207,Green Chilly,...,"[dining, delivery]",business,zomato_green_chilly,1475485000.0,1055,https://www.google.com/maps/preview/review/lis...,0.39744,6.0,1576521000,3.96559
13,ChIJb0fMPSwWrjsRdpbRbGluIT0,50282,0x3bae162c3dcc476f:0x3d216e696cd19676,50282,gmaps_v2,zomato,Chung's Chinese Corner,2643,1699615207,Chung's Chinese Corner,...,"[dining, delivery]",,zomato_chung's_chinese_corner,1329236000.0,2664,https://www.google.com/maps/preview/review/lis...,0.616544,234.0,1493145000,2.493964
21,ChIJVVVVxXQ-rjsR3rUjTglWt9o,18419851,0x3bae3e74c5555555:0xdab756094e23b5de,18419851,gmaps_v2,zomato,New Shanthi Sagar,8068,1699615207,New Shanthi Sagar,...,"[dining, delivery]",,zomato_new_shanthi_sagar,,8151,https://www.google.com/maps/preview/review/lis...,,27.0,1600194600,2.202873
10,ChIJNTptIjkWrjsRHAmI3yRH5qc,54225,0x3bae1639226d3a35:0xa7e64724df88091c,54225,gmaps_v2,zomato,Gullu's Chaats,4782,1699615207,Gullu's Chat,...,"[dining, delivery]",,zomato_gullu's_chat,1314083000.0,4810,https://www.google.com/maps/preview/review/lis...,1.071673,831.0,1323455400,2.034702
28,ChIJU-Z9rmE9rjsR_i85IdrDr7U,60560,0x3bae3d61ae7de653:0xb5afc3da21392ffe,60560,gmaps_v2,zomato,Amma's Pastries,3705,1687847039,Amma's Pastries,...,"[dining, delivery]",ammaspastries,zomato_amma's_pastries,1410584000.0,3728,https://www.google.com/maps/preview/review/lis...,1.154544,263.0,1639506600,2.031079
38,ChIJpxVZZz0XrjsRrJi4ytC_B_I,18774218,0x3bae173d675915a7:0xf207bfd0cab898ac,18774218,gmaps_v2,zomato,Third Wave Coffee,1946,1687847039,Third Wave Coffee,...,"[dining, delivery]",thirdwavecoffee,zomato_third_wave_coffee,1535700000.0,2053,https://www.google.com/maps/preview/review/lis...,1.105078,14.0,1608575400,1.530222
22,ChIJxa6khY89rjsRmTG3TXrGxkU,52333,0x3bae3d8f85a4aec5:0x45c6c67a4db73199,52333,gmaps_v2,zomato,Jalpaan,5359,1699615207,Jalpaan,...,"[dining, delivery]",dhyaanfoods,N_A,1367998000.0,5459,https://www.google.com/maps/preview/review/lis...,1.396239,369.0,1536777000,1.120806


In [47]:
df['ratio']=df['number_of_votes_2']/df['number_of_votes_1']
df

Unnamed: 0,id_1,id_2,cid_1,cid_2,source_1,source_2,name_1,number_of_votes_1,created_at_1,name_2,...,brand_id_1,brand_id_2,date_1,number_of_reviews,url,reviews_per_day_1,number_of_reviews.1,date_2,reviews_per_day_2,ratio
23,ChIJ3de7cd8XrjsRpDs2CgrLksE,18232664,0x3bae17df71bbd7dd:0xc192cb0a0a363ba4,18232664,gmaps_v2,zomato,Delhi Mess,1957,1699615207,Delhi Mess,...,,N_A,1467182000.0,1966,https://www.google.com/maps/preview/review/lis...,0.727456,113.0,1607020200,26.252743,12.672458
31,ChIJeXWlTmA9rjsRwayijh__N44,18160086,0x3bae3d604ea57579:0x8e37ff1f8ea2acc1,18160086,gmaps_v2,zomato,Domino's Pizza,3385,1699615207,Domino's Pizza,...,dominos,zomato_domino's_pizza,1471520000.0,3417,https://www.google.com/maps/preview/review/lis...,1.282201,11.0,1635532200,20.010959,3.633678
37,ChIJs1OiG489rjsR3OA2ita5gm0,55655,0x3bae3d8f1ba253b3:0x6d82b9d68a36e0dc,55655,gmaps_v2,zomato,Hotel Sagar,19392,1699615207,Sagar Hotel,...,,N_A,1563794000.0,19964,https://www.google.com/maps/preview/review/lis...,12.335861,29.0,1636655400,17.784036,0.551774
12,ChIJdR6R3YU9rjsR0qHeJqZ_IoE,18640097,0x3bae3d85dd911e75:0x81227fa626dea1d2,18640097,gmaps_v2,zomato,Green Chilly,1031,1699615207,Green Chilly,...,business,zomato_green_chilly,1475485000.0,1055,https://www.google.com/maps/preview/review/lis...,0.39744,6.0,1576521000,3.96559,4.991271
13,ChIJb0fMPSwWrjsRdpbRbGluIT0,50282,0x3bae162c3dcc476f:0x3d216e696cd19676,50282,gmaps_v2,zomato,Chung's Chinese Corner,2643,1699615207,Chung's Chinese Corner,...,,zomato_chung's_chinese_corner,1329236000.0,2664,https://www.google.com/maps/preview/review/lis...,0.616544,234.0,1493145000,2.493964,2.135074
21,ChIJVVVVxXQ-rjsR3rUjTglWt9o,18419851,0x3bae3e74c5555555:0xdab756094e23b5de,18419851,gmaps_v2,zomato,New Shanthi Sagar,8068,1699615207,New Shanthi Sagar,...,,zomato_new_shanthi_sagar,,8151,https://www.google.com/maps/preview/review/lis...,,27.0,1600194600,2.202873,0.279499
10,ChIJNTptIjkWrjsRHAmI3yRH5qc,54225,0x3bae1639226d3a35:0xa7e64724df88091c,54225,gmaps_v2,zomato,Gullu's Chaats,4782,1699615207,Gullu's Chat,...,,zomato_gullu's_chat,1314083000.0,4810,https://www.google.com/maps/preview/review/lis...,1.071673,831.0,1323455400,2.034702,1.798411
28,ChIJU-Z9rmE9rjsR_i85IdrDr7U,60560,0x3bae3d61ae7de653:0xb5afc3da21392ffe,60560,gmaps_v2,zomato,Amma's Pastries,3705,1687847039,Amma's Pastries,...,ammaspastries,zomato_amma's_pastries,1410584000.0,3728,https://www.google.com/maps/preview/review/lis...,1.154544,263.0,1639506600,2.031079,0.311741
38,ChIJpxVZZz0XrjsRrJi4ytC_B_I,18774218,0x3bae173d675915a7:0xf207bfd0cab898ac,18774218,gmaps_v2,zomato,Third Wave Coffee,1946,1687847039,Third Wave Coffee,...,thirdwavecoffee,zomato_third_wave_coffee,1535700000.0,2053,https://www.google.com/maps/preview/review/lis...,1.105078,14.0,1608575400,1.530222,0.728674
22,ChIJxa6khY89rjsRmTG3TXrGxkU,52333,0x3bae3d8f85a4aec5:0x45c6c67a4db73199,52333,gmaps_v2,zomato,Jalpaan,5359,1699615207,Jalpaan,...,dhyaanfoods,N_A,1367998000.0,5459,https://www.google.com/maps/preview/review/lis...,1.396239,369.0,1536777000,1.120806,0.367606
