In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

## Atributos

In [2]:
# Browser-like User-Agent string (Firefox 130 on 64-bit Windows 10) reused by the request headers.
user_agent = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) '
    'Gecko/20100101 Firefox/130.0'
)

## Clase de requests

In [3]:
class Requester():
    """Small wrapper around a ``requests.Session`` that retries GET/POST requests.

    Every request is attempted up to ``tries`` times. A response is only
    returned when its status code is 200; network errors are printed and
    swallowed, so callers must handle a ``None`` return value.

    Attributes:
        timeout (int | float): Seconds to wait for the server on every request.
        session (requests.Session): Session shared by all requests of this instance.
    """

    def __init__(self, tries=3, timeout=15):
        """
        Args:
            tries (int, optional): Maximum attempts per request. Defaults to 3.
            timeout (int | float, optional): Per-request timeout in seconds. Defaults to 15.
        """
        self.__requests_tries = tries
        self.timeout = timeout
        self.__init_session()

    def __init_session(self):
        """Create the session object and store it in ``self.session``."""
        self.session = requests.Session()

    def _send(self, method, url, **kwargs):
        """Shared retry loop used by ``get_requests`` and ``post_requests``.

        Args:
            method (str): Name of the session method to call (``'get'`` or ``'post'``).
            url (str): The URL to send the request to.
            **kwargs: Extra keyword arguments forwarded to the session method.

        Returns:
            requests.Response or None: The first response with status code 200,
            or None when every attempt failed.
        """
        sender = getattr(self.session, method)
        for _ in range(self.__requests_tries):
            try:
                # The timeout is always applied: without it requests waits indefinitely.
                response = sender(url, timeout=self.timeout, **kwargs)
            except requests.exceptions.RequestException as e:
                # Timeout is a subclass of RequestException, so this single
                # handler also covers timed-out requests.
                print(e)
                continue
            if response.status_code == 200:
                return response
            # Make non-200 answers visible instead of retrying silently.
            print(f'{method.upper()} {url} -> HTTP {response.status_code}')
        return None

    def get_requests(self, url, headers, proxy=None):
        """
        Sends a GET request to the specified URL using the instance session.

        Args:
            url (str): The URL to send the request to.
            headers (dict): The headers to include in the request.
            proxy (dict, optional): The proxy to use for the request. Defaults to None.

        Returns:
            requests.Response or None: The response object if the request is successful
            (status code 200), otherwise None. Request errors (including timeouts)
            are printed and not raised.

        """
        return self._send('get', url, headers=headers, proxies=proxy)

    def post_requests(self, url, headers, data=None, proxy=None):
        """
        Sends a POST request to the specified URL using the instance session.

        Args:
            url (str): The URL to send the POST request to.
            headers (dict): The headers to include in the request.
            data (dict, optional): The data to include in the request body. Defaults to None.
            proxy (dict, optional): The proxy to use for the request. Defaults to None.

        Returns:
            requests.Response or None: The response object if the request is successful
            (status code 200), otherwise None. Request errors (including timeouts)
            are printed and not raised.
        """
        return self._send('post', url, headers=headers, data=data, proxies=proxy)

In [4]:
# JSON search endpoints: one for sale listings, one for rental listings.
url_json_sale = 'https://century21mexico.com/v/resultados/operacion_venta?json=true'
url_json_rent = 'https://century21mexico.com/v/resultados/operacion_renta?json=true'

# Headers sent with every JSON request; insertion order is kept exactly as listed.
# NOTE(review): 'br' and 'zstd' are advertised in Accept-Encoding; confirm the
# environment has the optional decoders installed, otherwise such bodies cannot be decoded.
headers_json = dict([
    ('Host', 'century21mexico.com'),
    ('User-Agent', user_agent),
    ('Accept', 'application/json, text/plain, */*'),
    ('Accept-Language', 'en-CA,en-US;q=0.7,en;q=0.3'),
    ('Accept-Encoding', 'gzip, deflate, br, zstd'),
    ('DNT', '1'),
    ('Sec-GPC', '1'),
    ('Connection', 'keep-alive'),
    ('Referer', 'https://century21mexico.com/v/resultados'),
    ('Sec-Fetch-Dest', 'empty'),
    ('Sec-Fetch-Mode', 'cors'),
    ('Sec-Fetch-Site', 'same-origin'),
])

In [5]:
def get_price_range(req, url, header, rent_increment=12000, sale_increment=500000,
                    max_hits_per_query=1500):
    """Split a search URL into price-bucket URLs that together cover all results.

    The total number of hits is read from ``url``. Consecutive price ranges of
    a fixed width are then queried until the number of results still uncovered
    is at most ``max_hits_per_query``; a final open-ended URL
    (``precio-desde_<start>``) covers that remainder.

    Args:
        req: Object exposing ``get_requests(url, headers)`` that returns a
            response with a ``json()`` method, or None on failure.
        url (str): Search URL ending in ``?json=true``.
        header (dict): Headers forwarded to ``req.get_requests``.
        rent_increment (int, optional): Bucket width used when the flag read
            from ``filtros[1]['validValues'][1]['isActive']`` is set. Defaults to 12000.
        sale_increment (int, optional): Bucket width used otherwise. Defaults to 500000.
        max_hits_per_query (int, optional): Largest remainder that is left to
            a single open-ended query. Defaults to 1500.

    Returns:
        list[str]: The price-bucket URLs, in ascending price order. Empty when
        the search reports zero hits.

    Raises:
        RuntimeError: If any request fails (``req.get_requests`` returned None).
    """
    response = req.get_requests(url, header)
    if response is None:
        raise RuntimeError(f'Request failed for {url}')
    response_json = response.json()
    total_hits_results = int(response_json['totalHits'].replace(',', ''))

    # NOTE(review): presumably this flag marks the "rent" operation as active
    # (the rent output uses 12000-wide buckets) — the positional indices are
    # taken as-is; confirm against the API payload.
    if response_json['filtros'][1]['validValues'][1]['isActive']:
        increment = rent_increment
    else:
        increment = sale_increment

    url_final = url.replace('?json=true', '')
    urls_price = []
    start_price = 0
    end_price = increment
    hits_counter = 0

    # Loop until every reported hit is covered. The previous condition also
    # subtracted the last bucket's hit count, which could stop the loop while
    # results above the last bucket were still uncovered.
    while hits_counter < total_hits_results:
        if (total_hits_results - hits_counter) <= max_hits_per_query:
            # The remainder fits in one query: an open-ended range covers
            # everything from start_price upwards, so nothing else is needed.
            urls_price.append(f'{url_final}/precio-desde_{start_price}?json=true')
            break

        url_price = f'{url_final}/precio-desde_{start_price}/precio-hasta_{end_price}?json=true'
        response_price = req.get_requests(url_price, header)
        if response_price is None:
            raise RuntimeError(f'Request failed for {url_price}')
        hits_counter += int(response_price.json()['totalHits'].replace(',', ''))
        urls_price.append(url_price)

        # Next bucket starts right after the current one.
        start_price = end_price + 1
        end_price += increment
    return urls_price

In [27]:
def get_page_number(req, url, header):
    """Build one URL per result page for the given search URL.

    The page path fragments are read from ``filtros[20]['validValues']`` of
    the JSON payload and inserted right before the query string of ``url``.

    Args:
        req: Object exposing ``get_requests(url, headers)`` that returns a
            response with a ``json()`` method, or None on failure.
        url (str): Search URL containing a ``?`` query string.
        header (dict): Headers forwarded to ``req.get_requests``.

    Returns:
        list[str]: One URL per entry of the page filter, in payload order.

    Raises:
        RuntimeError: If the request fails (``req.get_requests`` returned None).
    """
    response = req.get_requests(url, header)
    if response is None:
        raise RuntimeError(f'Request failed for {url}')
    # NOTE(review): index 20 is presumably the pagination filter — the position
    # is taken as-is from the payload layout; confirm it is stable.
    pages = response.json()['filtros'][20]['validValues']
    # Only the first '?' starts the query string, so limit the replacement to it.
    return [url.replace('?', f"{page['url']}?", 1) for page in pages]

In [7]:
# Single Requester instance; the following cells pass it to every helper that performs requests.
req = Requester()

In [8]:
# Display the cookie jar currently held by the shared session.
req.session.cookies

<RequestsCookieJar[]>

In [28]:
# Build the rental price-bucket URLs first, then expand each bucket into its page URLs.
# The previous version iterated over `list`, a name this notebook never defines
# (it is the builtin type on a fresh kernel, so the cell could not run).
urls_price_rent = get_price_range(req, url_json_rent, headers_json)
list2 = [get_page_number(req, url_price, headers_json) for url_price in urls_price_rent]
list2[0]

['https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000/pagina_1?json=true',
 'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000/pagina_2?json=true',
 'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000/pagina_3?json=true',
 'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000/pagina_4?json=true',
 'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000/pagina_5?json=true',
 'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000/pagina_6?json=true',
 'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000/pagina_7?json=true',
 'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000/pagina_8?json=true',
 'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_1

In [11]:
# Request the rental search endpoint and display the resulting response object.
response_rent = req.get_requests(url=url_json_rent, headers=headers_json)
response_rent

<Response [200]>

In [None]:
def get_results_info(req, url, header):
    """Fetch ``url`` and return its parsed JSON payload.

    Args:
        req: Object exposing ``get_requests(url, headers)`` that returns a
            response with a ``json()`` method, or None on failure.
        url (str): URL to request.
        header (dict): Headers forwarded to ``req.get_requests``.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        RuntimeError: If the request fails (``req.get_requests`` returned None).
    """
    response = req.get_requests(url, header)
    if response is None:
        raise RuntimeError(f'Request failed for {url}')
    # The payload used to be parsed and then dropped (implicit None return).
    # TODO: extract the specific listing fields once the target schema is decided.
    return response.json()

In [10]:
# Request the sale search endpoint and display the resulting response object.
response_sale = req.get_requests(url=url_json_sale, headers=headers_json)
response_sale

<Response [200]>

In [11]:
# NOTE(review): response_sale comes from the `?json=true` endpoint, so its body is presumably
# JSON rather than HTML (other cells read these responses with .json()) — confirm this
# HTML parse is still needed.
soup_sale = BeautifulSoup(response_sale.content, 'html.parser')

In [168]:
# NOTE(review): this cell used an undefined `response` and called get_page_number(soup),
# which no longer matches its (req, url, header) signature. The sale response and the
# sale endpoint are assumed here because the preceding cells work on sales — confirm.
soup = BeautifulSoup(response_sale.content, 'html.parser')
urls_page = get_page_number(req, url_json_sale, headers_json)
df = pd.DataFrame()

In [169]:
def get_result_page(url):
    """Return the ``results`` entry of the JSON payload served at ``url``.

    Uses the notebook-level ``req`` and ``headers_json`` objects.

    Args:
        url (str): Page URL to request.

    Returns:
        The value stored under ``'results'`` in the payload, or an empty list
        when the request failed. (Previously a failed request raised
        UnboundLocalError because ``results`` was never assigned.)
    """
    response = req.get_requests(url, headers_json)
    if not response:
        return []
    return response.json()['results']

In [170]:
# Build one frame per page, then concatenate once: calling pd.concat inside the
# loop re-copies the accumulated frame on every iteration.
page_frames = [pd.DataFrame(get_result_page(url)) for url in urls_page]
df = pd.concat([df, *page_frames], ignore_index=True)
