# 01-01 : Retrieve Hellopeter Reviews

In [1]:
import datetime
from pathlib import Path
from time import sleep

import pandas as pd
import requests

## Functions

In [2]:
class Hellopeter():
    """
    This class is used to retrieve Hellopeter reviews via the `https://api.hellopeter.com/consumer/business/` API.

    Parameters
    ----------
    business : str
        The business name to retrieve reviews for.
    api_url : str
        The base URL used to invoke the Hellopeter API.
    """
    # Status codes treated as a successful page request. 200 is the standard success code
    # for a GET; 202 is retained because the previous implementation accepted (only) it.
    SUCCESS_STATUS_CODES = (200, 202)

    # Seconds to wait for the API before a single attempt is abandoned (and retried).
    REQUEST_TIMEOUT = 30

    def __init__(self, business:str, api_url:str='https://api.hellopeter.com/consumer/business/') -> None:
        self.business = business
        self.api_url = api_url
        self.url_template = self.api_url + self.business + '/reviews?page='

        # initialize the session to use for requests to the API
        self.request_session = requests.Session()

    def request_page(self, page_number:int, retries:int=5) -> dict:
        """
        Request a specific review page for the business.

        Failed attempts (unexpected status code or a network level error) are retried
        with a linearly increasing delay of 2, 4, 6, ... seconds.

        Parameters
        ----------
        page_number : int
            The page number to retrieve the reviews from.
        retries : int
            The maximum number of attempts to make before giving up (at least 1).

        Returns
        -------
        response_json : dict
            The decoded JSON body of the response.

        Raises
        ------
        ValueError
            If `retries` is smaller than 1.
        Exception
            If the last attempt returns an unexpected status code.
        requests.RequestException
            If the last attempt fails at network level (e.g. timeout).
        """
        if retries < 1:
            raise ValueError('retries must be at least 1, got %s' % retries)

        # set the full url for the request
        url = self.url_template + str(page_number)

        # set the request headers
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'accept': 'application/json'
        }

        for retry in range(retries):
            last_attempt = retry == retries - 1

            try:
                # request the review page; the headers must be passed explicitly to be sent,
                # and the timeout prevents a stalled connection from blocking forever
                response = self.request_session.get(url=url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as error:
                if last_attempt:
                    raise

                # wait before retrying
                print('The request failed: %s' % error)
                sleep((retry + 1)*2)
                continue

            # implement basic error handling
            if response.status_code in self.SUCCESS_STATUS_CODES:
                return response.json()

            if last_attempt:
                raise Exception('An unexpected response code were received: %s' % response.status_code)

            # wait before retrying
            print('An unexpected response code were received: %s' % response.status_code)
            sleep((retry + 1)*2)

    def process_request_page(self, page_json:dict) -> pd.DataFrame:
        """
        Process the raw JSON data of a review page and convert it to a Pandas DataFrame.

        Parameters
        ----------
        page_json : dict
            The raw JSON (represented as a Python dictionary) that was retrieved from the API.
            The reviews are read from its `data` entry.

        Returns
        -------
        page_data : pandas.DataFrame
            The page data converted to a DataFrame. When the page holds no reviews an
            empty DataFrame is returned and no type conversions are attempted.
        """
        # create the dataframe
        page_data = pd.DataFrame(page_json['data'])

        # add the business name to the dataframe
        page_data['business'] = self.business

        # a page without reviews has no `created_at` / `replied` columns to convert
        if page_data.empty:
            return page_data

        # basic data type conversions
        page_data['created_at'] = pd.to_datetime(page_data['created_at'], format='%Y-%m-%d %H:%M:%S')
        page_data['replied'] = page_data['replied'].astype('bool')

        # return the processed page data
        return page_data

    def retrieve_reviews(self, stop_at:datetime.datetime) -> pd.DataFrame:
        """
        Retrieve reviews for the business up to, and including the `stop_at` date.

        Pages are requested one after the other until a page contains a review older than
        `stop_at`, or until an empty page is returned (no more reviews available).
        NOTE(review): stopping on the first page with an older review assumes the API
        returns reviews newest first - confirm against the API.

        Parameters
        ----------
        stop_at : datetime.datetime
            The date of the last review to retrieve.

        Returns
        -------
        review_data : pandas.DataFrame
            A DataFrame containing the reviews retrieved (empty when there are none).
        """
        # the pages are collected in a list and concatenated once at the end
        pages = []
        current_page = 1

        while True:
            # retrieve the reviews for the current page
            current_reviews = self.process_request_page(self.request_page(current_page))

            # an empty page means there are no (more) reviews for the business
            if current_reviews.empty:
                break

            # add the current page to the output
            pages.append(current_reviews)

            # increment the page counter
            current_page += 1

            oldest_review = current_reviews.created_at.min()

            # print a progress indicator and pause briefly every 100 pages
            if current_page % 100 == 0:
                print(current_page, oldest_review)
                sleep(2)

            # determine if data retrieval should be stopped
            if oldest_review < stop_at:
                break

        # nothing was retrieved at all
        if not pages:
            return pd.DataFrame()

        # perform the final filter for the stop date
        page_data = pd.concat(pages)
        page_data = page_data.query('created_at >= @stop_at')

        # return the result dataframe
        return page_data

In [3]:
def retrieve_bussiness_reviews(business:str, stop_at:datetime.datetime, output_path='../../data/raw/') -> pd.DataFrame:
    """
    Retrieve reviews for a business and store the output in Parquet format.

    The dataset is written to `<output_path>/<business>.parquet.gz`, with any `-` in the
    business name replaced by `_` in the file name.

    Parameters
    ----------
    business : str
        The business name to retrieve reviews for.
    stop_at : datetime.datetime
        The date of the last review to retrieve.
    output_path : str
        The directory to write the Parquet file to; it is created when missing.
        The default matches the location that was previously hard coded, so callers
        relying on the default keep writing to the same place.

    Returns
    -------
    review_data : pandas.DataFrame
        A DataFrame containing the reviews retrieved.
    """
    # make sure the output location exists *before* the (slow) retrieval starts,
    # so a missing directory cannot discard the retrieved data at the very end
    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / ('%s.parquet.gz' % business.replace('-', '_'))

    # retrieve the reviews
    peter = Hellopeter(business)
    review_data = peter.retrieve_reviews(stop_at)

    # save the dataset
    review_data.to_parquet(str(output_file), compression='gzip', index=False)

    # return the retrieved data for further processing
    return review_data

## Data Retrieval

In [4]:
# Vodacom: fetch the reviews created on or after 1 January 2021 (also stored as Parquet)
vodacom_reviews = retrieve_bussiness_reviews(
    business='vodacom',
    stop_at=datetime.datetime(year=2021, month=1, day=1),
)

100 2023-06-09 21:01:59
200 2023-04-25 12:44:07
300 2023-03-08 17:40:34
400 2023-01-31 07:17:38
An unexpected response code were received: 502
500 2022-12-22 13:45:13
600 2022-11-21 20:08:41
700 2022-10-18 16:31:03
An unexpected response code were received: 502
800 2022-09-12 11:14:23
900 2022-08-06 16:15:17
1000 2022-07-01 13:06:40
1100 2022-05-20 12:13:14
1200 2022-04-11 09:24:41
1300 2022-03-07 11:09:42
1400 2022-02-03 17:18:26
1500 2022-01-03 18:21:18
1600 2021-11-30 19:41:01
1700 2021-11-02 05:34:55
1800 2021-10-01 14:38:48
1900 2021-08-27 09:26:43
An unexpected response code were received: 502
2000 2021-07-21 14:24:50
2100 2021-06-17 10:21:26
2200 2021-05-17 07:54:04
An unexpected response code were received: 524
2300 2021-04-19 09:59:22
2400 2021-03-23 08:42:45
2500 2021-02-28 08:16:01
2600 2021-02-05 09:32:05
2700 2021-01-13 08:24:11


In [5]:
# MTN: fetch the reviews created on or after 1 January 2021 (also stored as Parquet)
mtn_reviews = retrieve_bussiness_reviews(
    business='mtn',
    stop_at=datetime.datetime(year=2021, month=1, day=1),
)

100 2023-05-15 20:26:40
200 2023-03-03 11:03:02
300 2023-01-05 10:01:09
400 2022-11-11 11:23:31
500 2022-09-23 13:24:34
600 2022-08-02 20:22:11
700 2022-06-10 13:09:56
800 2022-04-19 20:23:34
900 2022-02-28 20:39:15
1000 2022-01-12 12:11:20
1100 2021-11-18 13:49:18
An unexpected response code were received: 524
1200 2021-09-26 20:41:59
1300 2021-08-04 08:14:47
1400 2021-06-10 10:49:22
1500 2021-04-14 13:18:25
1600 2021-03-02 10:40:20
1700 2021-01-23 10:14:19


In [6]:
# Telkom: fetch the reviews created on or after 1 January 2021 (also stored as Parquet)
telkom_reviews = retrieve_bussiness_reviews(
    business='telkom',
    stop_at=datetime.datetime(year=2021, month=1, day=1),
)

100 2023-04-17 13:37:05
200 2023-01-26 09:38:16
300 2022-11-03 11:05:14
400 2022-08-15 09:23:33
500 2022-05-17 08:32:30
600 2022-03-03 09:37:12
700 2021-12-31 08:28:23
800 2021-10-28 11:04:47
900 2021-08-27 09:41:38
1000 2021-06-30 17:25:16
1100 2021-05-07 10:12:36
1200 2021-03-17 16:47:59
1300 2021-02-06 06:43:34


In [7]:
# retrieve the cell-c dataset
# `cell_c_reviews` follows the `<business>_reviews` naming used by the other retrieval cells;
# the original `dflancell_c_reviews` name is kept as an alias so existing references still resolve.
cell_c_reviews = dflancell_c_reviews = retrieve_bussiness_reviews(business='cell-c', stop_at=datetime.datetime(2021, 1, 1))

100 2023-03-27 12:02:36
An unexpected response code were received: 524
200 2022-12-02 17:10:52
300 2022-09-06 12:33:31
400 2022-06-06 12:03:05
500 2022-03-08 15:11:02
600 2021-12-20 09:16:05
700 2021-10-08 10:32:58
800 2021-08-09 15:40:44
900 2021-06-01 11:48:04
1000 2021-03-16 11:27:39
1100 2021-01-16 08:10:46


In [8]:
# Rain: fetch the reviews created on or after 1 January 2021 (also stored as Parquet)
rain_reviews = retrieve_bussiness_reviews(
    business='rain-internet-service-provider',
    stop_at=datetime.datetime(year=2021, month=1, day=1),
)

100 2023-06-28 04:56:53
200 2023-05-29 15:26:51
300 2023-04-11 20:02:51
400 2023-02-22 17:55:45
500 2023-01-16 20:06:39
600 2022-12-10 06:45:41
700 2022-11-07 17:52:05
800 2022-09-02 13:55:17
900 2022-06-13 12:44:34
1000 2022-04-01 15:48:57
An unexpected response code were received: 500
1100 2021-12-21 13:52:38
1200 2021-09-18 09:50:36
1300 2021-06-01 14:31:54
1400 2021-03-04 13:34:50
1500 2021-01-05 08:45:06
