In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from itertools import chain

In [2]:
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0'

In [3]:
class Requester():
    def __init__(self):
        self.__requests_tries = 3
        self.timeout = 15
        self.__init_session()

    def __init_session(self):
        """
        Initializes a session object for making requests.

        Returns:
            requests.Session: The session object.

        """
        self.session = requests.Session()

    def get_requests(self, url, headers, proxy=None):
        """
        Sends a GET request to the specified URL using the provided session, headers, and proxy.

        Args:
            session (requests.Session): The session object to use for making the request.
            url (str): The URL to send the request to.
            headers (dict): The headers to include in the request.
            proxy (dict, optional): The proxy to use for the request. Defaults to None.

        Returns:
            requests.Response or None: The response object if the request is successful (status code 200), 
            otherwise None.

        Raises:
            requests.exceptions.RequestException: If an error occurs while making the request.
            requests.exceptions.Timeout: If the request times out.

        """
        for _ in range(self.__requests_tries):
            try:
                response = self.session.get(
                    url,
                    headers=headers,
                    proxies=proxy,
                    timeout=self.timeout
                )
                if response.status_code == 200:
                    return response
            except requests.exceptions.RequestException as e:
                print(e)
            except requests.exceptions.Timeout as e:
                print(e)
        return None


    def post_requests(self, url, headers, data=None, proxy=None):
        """
        Sends a POST request to the specified URL with the given headers, data, and proxy.

        Parameters:
        url (str): The URL to send the POST request to.
        headers (dict): The headers to include in the request.
        data (dict, optional): The data to include in the request body. Defaults to None.
        proxy (dict, optional): The proxy to use for the request. Defaults to None.

        Returns:
        response (requests.Response): The response object if the request is successful and the status code is 200.
        None: If the request fails or the status code is not 200.
        """
        for _ in range(self.__requests_tries):
            try:
                response = self.session.post(
                    url,
                    headers=headers,
                    data=data,
                    proxies=proxy
                )
                if response.status_code == 200:
                    return response
            except requests.exceptions.RequestException as e:
                print(e)
            except requests.exceptions.Timeout as e:
                print(e)
        return None

In [4]:
url_json_sale = 'https://century21mexico.com/v/resultados/operacion_venta?json=true'
url_json_rent = 'https://century21mexico.com/v/resultados/operacion_renta?json=true'
headers_json = {
    'Host': 'century21mexico.com',
    'User-Agent': user_agent,
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-CA,en-US;q=0.7,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'DNT' : '1',
    'Sec-GPC' : '1',
    'Connection' : 'keep-alive',
    'Referer' : 'https://century21mexico.com/v/resultados',
    'Sec-Fetch-Dest' : 'empty',
    'Sec-Fetch-Mode' : 'cors',
    'Sec-Fetch-Site' : 'same-origin'
}

In [6]:
def get_price_range(req, url, header):
    '''
    Generates a list of URLs with price ranges to extract product data by making multiple requests.

    This function takes a base URL and queries the API to retrieve data about products within certain price ranges. 
    The price range is incremented dynamically based on the total number of hits (results) returned by the API. 
    It generates a list of URLs corresponding to specific price ranges and ensures that all results are fetched.

    Args:
        req: An object that has a method `get_requests` to perform HTTP GET requests.
        url (str): The base URL to which the requests will be made.
        header (dict): A dictionary containing the headers required for the HTTP request.

    Returns:
        list: A list of URLs that correspond to different price ranges for querying the API.

    Process:
        1. The function makes an initial request to the provided URL to get the total number of hits (totalHits).
        2. Depending on whether a certain filter is active (`response_json['filtros'][1]['validValues'][1]['isActive']`),
           the price range starts at either 12,000 or 500,000 units.
        3. It iterates over the total hits, adjusting the price range dynamically, and generates new URLs for each request.
        4. The generated URLs are stored in a list `urls_price`, which is returned at the end.

    '''
    start_price = 0
    hits_counter = 0
    subtotal_hits_results = 0
    urls_price = []

    # Initial request to get total number of hits and determine initial price range
    response = req.get_requests(url, header)
    response_json = response.json()
    total_hits_results = int(response_json['totalHits'].replace(',',''))

     # Determine the price range based on the filter, True if rent and false if sale
    if response_json['filtros'][1]['validValues'][1]['isActive'] == True:
        end_price = 12000
        increment = end_price
    else:
        end_price = 500000
        increment = end_price
    
     # Remove '?json=true' from the base URL to format future requests
    url_final = url.replace('?json=true', '')

    # Loop until all hits are processed, creating URLs for specific price ranges
    while hits_counter < total_hits_results - subtotal_hits_results:
        # Build the URL depending on how many hits remain to be processed
        if (total_hits_results - hits_counter) > 1500:
            url_price = f'{url_final}/precio-desde_{start_price}/precio-hasta_{end_price}?json=true'
        else:
            url_price = f'{url_final}/precio-desde_{start_price}?json=true'

        # Make a request to get the results for the current price range
        response_price = req.get_requests(url_price, header)
        response_price_json = response_price.json()
        subtotal_hits_results = int(response_price_json['totalHits'].replace(',',''))

        # Update the hit counter and adjust the price range for the next iteration
        hits_counter += subtotal_hits_results
        start_price = end_price + 1
        end_price += increment

        # Append the generated URL to the list
        urls_price.append(url_price)
        
    return urls_price

In [7]:
def get_page_number(req, url, header):
    '''
     Extracts and generates URLs for each page based on the pagination information from the API response.

    This function makes an HTTP request to the provided URL to retrieve a JSON response, which contains information
    about available pages. It then constructs and returns a list of URLs, each corresponding to a specific page.

    Args:
        req: An object that has a method `get_requests` to perform HTTP GET requests.
        url (str): The base URL to which the requests will be made.
        header (dict): A dictionary containing the headers required for the HTTP request.

    Returns:
        list: A list of URLs, each corresponding to a different page in the pagination.

    Process:
        1. The function makes a request to the given URL and parses the JSON response.
        2. It extracts the pagination information from the key `validValues` located within the filter at index 20.
        3. For each page, it generates a new URL by replacing the `?` in the base URL with the respective page URL.
        4. The generated page URLs are stored in a list and returned.
    '''
    response = req.get_requests(url, header)
    response_json = response.json()

    # Extract the pagination info from the filter at index 20 in the JSON response which contains the number of page for each price range given in the url
    pages = response_json['filtros'][20]['validValues']

    # List comprehension to extract URLs from the 'validValues'
    url_page = [page['url'] for page in pages]

    # Construct new URLs for each page and append them to the list 'urls_page'
    urls_page = []
    for i in range(len(url_page)):
        urls_page.append(url.replace('?', f'{url_page[i]}?'))
        
    return urls_page

In [8]:
def get_results_info(req, url, header):
    '''
    Retrieves and returns the 'results' section from the JSON response of an HTTP request.

    This function makes an HTTP GET request to the provided URL and parses the JSON response. 
    It extracts the 'results' field from the response and returns it.

    Args:
        req: An object that has a method `get_requests` to perform HTTP GET requests.
        url (str): The URL to which the request will be made.
        header (dict): A dictionary containing the headers required for the HTTP request.

    Returns:
        dict or list: The 'results' section from the JSON response. The structure of 'results' depends on the API response, 
        which could be a dictionary or a list, depending on the API.

    Process:
        1. The function sends an HTTP request to the given URL using the `req.get_requests` method.
        2. The response is parsed as JSON to retrieve the 'results' field.
        3. The 'results' field is returned to the caller.
    '''
    response = req.get_requests(url, header)
    response_results = response.json()

    # Extract and return the 'results' field from the JSON response which contains the data 
    result = response_results['results']

    return result

In [9]:
def scrape_data(url, header, df):
    '''
     Scrapes data from multiple pages and appends the results to an existing DataFrame.

    This function scrapes data from a given URL by first generating price ranges, then retrieving pagination URLs for each price range, 
    and finally scraping the data from all the pages. The scraped data is appended to the provided DataFrame and returned.

    Args:
        url (str): The base URL from which the data will be scraped.
        header (dict): A dictionary containing the headers required for the HTTP requests.
        df (pandas.DataFrame): A DataFrame to which the scraped data will be appended.

    Returns:
        pandas.DataFrame: The DataFrame containing the original data with the newly scraped data appended.

    Process:
        1. A `Requester` object is initialized to handle HTTP requests.
        2. `get_price_range` is called to generate a list of URLs corresponding to different price ranges.
        3. For each price range, `get_page_number` is called to generate pagination URLs.
        4. All the pagination URLs are flattened into a single list.
        5. For each pagination URL, `get_results_info` is used to extract data from the page, and the data is converted to a DataFrame.
        6. The new DataFrame is concatenated with the original `df`, and the updated DataFrame is returned.

    '''
    req = Requester()

    # Get the list of price range URLs
    price_list = get_price_range(req, url, header)

    # Get the pagination URLs for each price range
    price_list_pages = [get_page_number(req, url_price, header) for url_price in price_list]

    # Flatten the list of lists into a single list of URLs
    price_list_pages_p = list(chain(*price_list_pages))

    # Loop through each page URL, scrape data, and append it to the DataFrame
    for url_page in price_list_pages_p:
        data = get_results_info(req, url_page, header)
        df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)
        
    return df

In [10]:
df = pd.DataFrame()
df

In [11]:
df = scrape_data(url_json_rent, headers_json, df)
df

Unnamed: 0,precios,mantenimiento,precioFormat,precioMapa,precioSecundarioFormat,monedaSecundaria,mostrarMonedaSecundaria,tipoOperacionTxt,tipoPropiedadEnTipoOperacion,diasModificacionTxt,...,idAsesor,telefono,whatsapp,email,asesorNombre,asesorThumbnail,asesorFotoMostrarEnInternet,logoOficina,nombreAfiliado,fotos
0,"{'vista': {'precio': 11500.000000000002, 'mone...",,"$11,500 MXN",11500,"$11,500 MXN",MXN,False,en renta,Casa en renta,Hoy,...,3899,+52 55 3239 2705,+525532392705,anel@century21doniz.com,Maria Anel Esqueda Segura,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Doniz & Asociados,"{'totalFotos': 16, 'propiedadThumbnail': ['htt..."
1,"{'vista': {'precio': 7500, 'moneda': 'MXN', 'p...",,"$7,500 MXN",7500,"$7,500 MXN",MXN,False,en renta,Casa en condominio en renta,Hoy,...,15680,+52 442 359 9277,+524423599277,mtrujillo141089@gmail.com,Paola Trujillo Toledo,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Trums-AG,"{'totalFotos': 32, 'propiedadThumbnail': ['htt..."
2,"{'vista': {'precio': 6300, 'moneda': 'MXN', 'p...",,"$6,300 MXN",6300,"$6,300 MXN",MXN,False,en renta,Local en renta,Hoy,...,30026,+52 771 454 0334,,contacto@c21novareal.com,CENTURY 21 Nova Real,https://cdn.21online.lat/mexico/cache/awsTest1...,False,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Nova Real,"{'totalFotos': 8, 'propiedadThumbnail': ['http..."
3,"{'vista': {'precio': 11000, 'moneda': 'MXN', '...",,"$11,000 MXN",11000,"$11,000 MXN",MXN,False,en renta,Casa en renta,Hoy,...,21788,+52 646 113 9696,+526461139696,info.c21f@gmail.com,Lic. Sonia Escobedo Medina,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Fortaleza,"{'totalFotos': 12, 'propiedadThumbnail': ['htt..."
4,"{'vista': {'precio': 5000, 'moneda': 'MXN', 'p...",,"$5,000 MXN",5000,"$5,000 MXN",MXN,False,en renta,Casa en renta,Hoy,...,20677,+52 722 868 9076,,contacto@c21houseempire.com,Erick Gabriel Chavez Hernandez,https://cdn.21online.lat/mexico/cache/awsTest1...,False,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 House Empire,"{'totalFotos': 21, 'propiedadThumbnail': ['htt..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4535,"{'vista': {'precio': 50000, 'moneda': 'MXN', '...",,"$50,000 MXN",50000,"$50,000 MXN",MXN,False,en renta,Bodega en renta,Actualizado hace 713 días,...,11491,+52 921 267 1668,+529212671668,asesor16@c21habitat.com.mx,Maria Martinez Gomez,https://cdn.21online.lat/mexico/cache/awsTest1...,False,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Hábitat,"{'totalFotos': 20, 'propiedadThumbnail': ['htt..."
4536,"{'vista': {'precio': 75400, 'moneda': 'MXN', '...",,"$75,400 MXN",75400,"$75,400 MXN",MXN,False,en renta,Local en renta,Actualizado hace 642 días,...,11764,+52 921 212 0462,,gerencia@c21habitat.com.mx,Amelia Valencia Villanueva,https://cdn.21online.lat/mexico/cache/awsTest1...,False,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Hábitat,"{'totalFotos': 5, 'propiedadThumbnail': ['http..."
4537,"{'vista': {'precio': 67280, 'moneda': 'MXN', '...",,"$67,280 MXN",67280,"$67,280 MXN",MXN,False,en renta,Local en renta,Actualizado hace 642 días,...,11764,+52 921 212 0462,,gerencia@c21habitat.com.mx,Amelia Valencia Villanueva,https://cdn.21online.lat/mexico/cache/awsTest1...,False,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Hábitat,"{'totalFotos': 8, 'propiedadThumbnail': ['http..."
4538,"{'vista': {'precio': 52839, 'moneda': 'MXN', '...",,"$52,839 MXN",52839,"$52,839 MXN",MXN,False,en renta,Local en renta,Actualizado hace 633 días,...,5565,+52 662 182 0023,+526621820023,Leonorcampoy@hotmail.com,Maria Leonor Campoy Burboa,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Elga,"{'totalFotos': 4, 'propiedadThumbnail': ['http..."


In [12]:
df = scrape_data(url_json_sale, headers_json, df)
df

  df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)


Unnamed: 0,precios,mantenimiento,precioFormat,precioMapa,precioSecundarioFormat,monedaSecundaria,mostrarMonedaSecundaria,tipoOperacionTxt,tipoPropiedadEnTipoOperacion,diasModificacionTxt,...,idAsesor,telefono,whatsapp,email,asesorNombre,asesorThumbnail,asesorFotoMostrarEnInternet,logoOficina,nombreAfiliado,fotos
0,"{'vista': {'precio': 11500.000000000002, 'mone...",,"$11,500 MXN",11500,"$11,500 MXN",MXN,False,en renta,Casa en renta,Hoy,...,3899,+52 55 3239 2705,+525532392705,anel@century21doniz.com,Maria Anel Esqueda Segura,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Doniz & Asociados,"{'totalFotos': 16, 'propiedadThumbnail': ['htt..."
1,"{'vista': {'precio': 7500, 'moneda': 'MXN', 'p...",,"$7,500 MXN",7500,"$7,500 MXN",MXN,False,en renta,Casa en condominio en renta,Hoy,...,15680,+52 442 359 9277,+524423599277,mtrujillo141089@gmail.com,Paola Trujillo Toledo,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Trums-AG,"{'totalFotos': 32, 'propiedadThumbnail': ['htt..."
2,"{'vista': {'precio': 6300, 'moneda': 'MXN', 'p...",,"$6,300 MXN",6300,"$6,300 MXN",MXN,False,en renta,Local en renta,Hoy,...,30026,+52 771 454 0334,,contacto@c21novareal.com,CENTURY 21 Nova Real,https://cdn.21online.lat/mexico/cache/awsTest1...,False,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Nova Real,"{'totalFotos': 8, 'propiedadThumbnail': ['http..."
3,"{'vista': {'precio': 11000, 'moneda': 'MXN', '...",,"$11,000 MXN",11000,"$11,000 MXN",MXN,False,en renta,Casa en renta,Hoy,...,21788,+52 646 113 9696,+526461139696,info.c21f@gmail.com,Lic. Sonia Escobedo Medina,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Fortaleza,"{'totalFotos': 12, 'propiedadThumbnail': ['htt..."
4,"{'vista': {'precio': 5000, 'moneda': 'MXN', 'p...",,"$5,000 MXN",5000,"$5,000 MXN",MXN,False,en renta,Casa en renta,Hoy,...,20677,+52 722 868 9076,,contacto@c21houseempire.com,Erick Gabriel Chavez Hernandez,https://cdn.21online.lat/mexico/cache/awsTest1...,False,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 House Empire,"{'totalFotos': 21, 'propiedadThumbnail': ['htt..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23006,"{'vista': {'precio': 33000000, 'moneda': 'MXN'...",,"$33,000,000 MXN",33.0M,"$33,000,000 MXN",MXN,False,en venta,Terreno en venta,Actualizado hace 713 días,...,845,+52 55 5662 9700,,informes@c21platinum.com.mx,Laura Maldonado,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Platinum,"{'totalFotos': 6, 'propiedadThumbnail': ['http..."
23007,"{'vista': {'precio': 33499999.999999996, 'mone...",,"$33,500,000 MXN",33.5M,"$33,500,000 MXN",MXN,False,en venta,Terreno en venta,Actualizado hace 383 días,...,16054,+52 55 3906 1677,+525539061677,ventas@century21eden.com,Rene Zepeda Hurtado,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Edén,"{'totalFotos': 7, 'propiedadThumbnail': ['http..."
23008,"{'vista': {'precio': 45000000, 'moneda': 'MXN'...",,"$45,000,000 MXN",45.0M,"$45,000,000 MXN",MXN,False,en venta,Edificio en venta,Actualizado hace 634 días,...,10085,+52 744 484 3000,,direccion@century21lacunza.com.mx,José Luis Infante Tafolla,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Lacunza & Asociados,"{'totalFotos': 19, 'propiedadThumbnail': ['htt..."
23009,"{'vista': {'precio': 111258710.75, 'moneda': '...",,"$5,750,000 USD",5.8M,"$111,258,711 MXN",MXN,False,en venta,Casa en venta,Actualizado hace 75 días,...,10110,+52 744 484 3000,,direccion@century21lacunza.com.mx,Deyanira Terrazas Castro,https://cdn.21online.lat/mexico/cache/awsTest1...,True,https://cdn.21online.lat/mexico/cache/awsTest1...,CENTURY 21 Lacunza & Asociados,"{'totalFotos': 67, 'propiedadThumbnail': ['htt..."


In [13]:
df.to_csv('soriana_scrape.csv', index=False)  