In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

## Atributos

In [2]:
# Desktop Firefox User-Agent string; it is sent as the 'User-Agent' entry of headers_json.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0'

## Clase de requests

In [3]:
class Requester():
    """Retrying wrapper around a single ``requests.Session``.

    Every call is attempted up to a fixed number of times and is bounded by
    ``self.timeout``. Network errors are printed and swallowed; a call that
    never obtains an HTTP 200 returns ``None`` instead of raising.
    """

    def __init__(self):
        self.__requests_tries = 3  # attempts per call before giving up
        self.timeout = 15  # passed as ``timeout=`` to every request
        self.__init_session()

    def __init_session(self):
        """Create the ``requests.Session`` shared by all calls and store it on ``self.session``."""
        self.session = requests.Session()

    def _send(self, method, url, **kwargs):
        """
        Call ``method(url, timeout=self.timeout, **kwargs)`` with retries.

        Args:
            method (callable): Bound session method to invoke (``self.session.get`` or ``self.session.post``).
            url (str): The URL to send the request to.
            **kwargs: Extra keyword arguments forwarded to ``method``.

        Returns:
            requests.Response or None: The first response whose status code is 200,
            or None when every attempt failed or returned another status.
        """
        for _ in range(self.__requests_tries):
            try:
                response = method(url, timeout=self.timeout, **kwargs)
            except requests.exceptions.RequestException as e:
                # RequestException is the base class of requests' errors
                # (Timeout included), so a single handler covers them all.
                print(e)
                continue
            if response.status_code == 200:
                return response
        return None

    def get_requests(self, url, headers, proxy=None):
        """
        Sends a GET request to the specified URL through the shared session.

        Args:
            url (str): The URL to send the request to.
            headers (dict): The headers to include in the request.
            proxy (dict, optional): The proxy to use for the request. Defaults to None.

        Returns:
            requests.Response or None: The response object if the request is successful (status code 200),
            otherwise None. Request errors are printed, not raised.

        """
        return self._send(self.session.get, url, headers=headers, proxies=proxy)

    def post_requests(self, url, headers, data=None, proxy=None):
        """
        Sends a POST request to the specified URL through the shared session.

        Args:
            url (str): The URL to send the POST request to.
            headers (dict): The headers to include in the request.
            data (dict, optional): The data to include in the request body. Defaults to None.
            proxy (dict, optional): The proxy to use for the request. Defaults to None.

        Returns:
            requests.Response or None: The response object if the request is successful (status code 200),
            otherwise None. Request errors are printed, not raised.

        """
        # The timeout is applied here too (via _send); without it a stalled
        # server would block the call indefinitely.
        return self._send(self.session.post, url, headers=headers, data=data, proxies=proxy)

In [4]:
# JSON search endpoints: one for sale listings, one for rent listings.
url_json_sale = 'https://century21mexico.com/v/resultados/operacion_venta?json=true'
url_json_rent = 'https://century21mexico.com/v/resultados/operacion_renta?json=true'
# Request headers sent with every JSON query; 'User-Agent' reuses the
# user_agent string defined earlier in the notebook.
headers_json = {
    'Host': 'century21mexico.com',
    'User-Agent': user_agent,
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-CA,en-US;q=0.7,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'DNT' : '1',
    'Sec-GPC' : '1',
    'Connection' : 'keep-alive',
    'Referer' : 'https://century21mexico.com/v/resultados',
    'Sec-Fetch-Dest' : 'empty',
    'Sec-Fetch-Mode' : 'cors',
    'Sec-Fetch-Site' : 'same-origin'
}

In [44]:
def get_price_range(req, url, header, max_hits_per_query=1500):
    """
    Split a search into consecutive price buckets and return one URL per bucket.

    The search at ``url`` is queried once to read its total hit count. Fixed-width
    buckets (``precio-desde_X/precio-hasta_Y``) are then emitted while more than
    ``max_hits_per_query`` hits are still unaccounted for; the last URL is
    open-ended (``precio-desde_X`` only) so it covers everything above.

    Args:
        req: Object exposing ``get_requests(url, headers)`` that returns a response
            with a ``.json()`` method, or None on failure (e.g. ``Requester``).
        url (str): Search URL ending in ``?json=true``.
        header (dict): Headers forwarded to ``req.get_requests``.
        max_hits_per_query (int, optional): Remaining-hit threshold below which the
            final open-ended bucket is emitted. Defaults to 1500 (presumably the
            most hits the site serves for one query -- TODO confirm).

    Returns:
        list[str]: Bucket URLs in ascending price order; empty when the search has no hits.

    Raises:
        RuntimeError: If any request yields no successful response.
    """
    def fetch_json(target_url):
        # Fail with the offending URL instead of an AttributeError on None.
        response = req.get_requests(target_url, header)
        if response is None:
            raise RuntimeError(f'No successful response for {target_url}')
        return response.json()

    def parse_hits(payload):
        # 'totalHits' arrives as a string with thousands separators, e.g. "4,685".
        return int(payload['totalHits'].replace(',', ''))

    first_payload = fetch_json(url)
    total_hits_results = parse_hits(first_payload)

    # filtros[1]['validValues'][1] is the flag this notebook reads as "rent is
    # active"; rent searches use a much narrower bucket than sale searches.
    rent_is_active = first_payload['filtros'][1]['validValues'][1]['isActive'] is True
    increment = 12000 if rent_is_active else 500000

    url_final = url.replace('?json=true', '')
    urls_price = []
    start_price = 0
    end_price = increment
    hits_counter = 0

    # NOTE(review): if more than max_hits_per_query listings never match any
    # price bucket, this loop has no natural end -- confirm against the site.
    while hits_counter < total_hits_results:
        if (total_hits_results - hits_counter) > max_hits_per_query:
            url_price = f'{url_final}/precio-desde_{start_price}/precio-hasta_{end_price}?json=true'
            urls_price.append(url_price)
            hits_counter += parse_hits(fetch_json(url_price))
            start_price = end_price + 1
            end_price += increment
        else:
            # The open-ended bucket covers every remaining price, so the walk is
            # complete regardless of how many hits it reports.
            urls_price.append(f'{url_final}/precio-desde_{start_price}?json=true')
            break
    return urls_price

In [82]:
def get_page_number(req, url, header):
    """
    Return the JSON URL of every result page listed for the search at ``url``.

    Args:
        req: Object exposing ``get_requests(url, headers)`` that returns a response
            with a ``.json()`` method, or None on failure (e.g. ``Requester``).
        url (str): Search URL, normally ending in ``?json=true``.
        header (dict): Headers forwarded to ``req.get_requests``.

    Returns:
        list[str]: One URL per page, built as ``<url without query><page path>?json=true``.

    Raises:
        RuntimeError: If the request yields no successful response.
    """
    response = req.get_requests(url, header)
    if response is None:
        raise RuntimeError(f'No successful response for {url}')

    # filtros[20]['validValues'] holds the pagination entries; each 'url' is a
    # path suffix such as '/pagina_1'.
    page_entries = response.json()['filtros'][20]['validValues']

    # Derive the base from the argument rather than a notebook-level global so
    # the function works on a fresh kernel and for any search URL.
    url_base = url.partition('?')[0]
    return [f"{url_base}{entry['url']}?json=true" for entry in page_entries]

In [5]:
# Single Requester (and therefore a single HTTP session) reused by the cells below.
req = Requester()

In [6]:
# Display the session's cookie jar.
req.session.cookies

<RequestsCookieJar[]>

In [46]:
# Price-bucket URLs for the rent search; show the first one.
# NOTE(review): the name `list` shadows the builtin for the rest of the kernel
# session. Rename it together with the next cell, which indexes `list[0]`.
list = get_price_range(req, url_json_rent, headers_json)
list[0]

'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000?json=true'

In [49]:
# Fetch the first price bucket and read its pagination entries from filtros[20].
response = req.get_requests(list[0], headers_json)
response_json = response.json()
urls = response_json['filtros'][20]['validValues']
urls_page = [url['url'] for url in urls]
# Build a page URL by inserting the first page path just before the query string.
test = list[0].replace('?', f'{urls_page[0]}?')
test

'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000/pagina_1?json=true'

In [11]:
# Query the unfiltered rent search; get_requests returns None if no attempt got a 200.
response_rent = req.get_requests(url_json_rent, headers_json)
response_rent

<Response [200]>

In [71]:
# NOTE(review): a later cell reads this same response with .json(), so the body
# looks like JSON rather than HTML; confirm this parse is still needed.
soup_rent = BeautifulSoup(response_rent.content, 'html.parser')

{
    "pathContacto": "https:\/\/plus.21onlinemx.com\/interfaz\/wh\/c21wpLocal",
    "totalHitsRelation": "eq",
    "totalHits": "4,685",
    "totalTime": 37,
    "zoom": 6,
    "aggregations": {
        "ubicacion": [
            {
                "key": "estado_de_mexico",
                "doc_count": 595,
                "centroid": {
                    "location": {
                        "lat": 19.409704226460466,
                        "lon": -99.46402642068242
                    },
                    "count": 595
                },
                "url": "\/layout_mapa\/coordenadas_19.41470422646,-99.514026420682,19.40470422646,-99.414026420682,12",
                "num_propiedades": "595 Propiedades",
                "label": "ESTADO DE MEXICO"
            },
            {
                "key": "chihuahua",
                "doc_count": 541,
                "centroid": {
                    "location": {
                        "lat": 28.76438895870489,
                   

In [23]:
# Total number of rent listings; 'totalHits' is a string with thousands
# separators (e.g. "4,685"), so the commas are stripped before int().
soup_json_rent = response_rent.json()
total_rent = int(soup_json_rent['totalHits'].replace(',',''))
total_rent

4694

In [37]:
# Manual walk over the rent search in 12000-wide price buckets, printing the
# hit count of each bucket. While more than 1500 listings are unaccounted for,
# a bounded bucket is queried; otherwise one open-ended query is issued.
desde, hasta = 0, 12000
contador = sub_total_int = 0

while total_rent - sub_total_int > contador:
    bounded = (total_rent - contador) > 1500
    if bounded:
        url_inicio = f'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_{desde}/precio-hasta_{hasta}?json=true'
        query_url = url_inicio
    else:
        url_final = f'https://century21mexico.com/v/resultados/operacion_renta/precio-desde_{desde}?json=true'
        query_url = url_final

    # Both branches share the same request and parse steps.
    response_renta = req.get_requests(query_url, headers_json)
    soup_json_renta = response_renta.json()
    sub_total = soup_json_renta['totalHits']
    sub_total_int = int(sub_total.replace(',', ''))
    contador += sub_total_int

    print(sub_total_int)
    print(f'contador: {contador}')
    if bounded:
        print(f'desde: {desde} hasta: {hasta}')
    print(query_url)

    # Only bounded buckets advance the price window.
    if bounded:
        desde, hasta = hasta + 1, hasta + 12000


1089
contador: 1089
desde: 0 hasta: 12000
https://century21mexico.com/v/resultados/operacion_renta/precio-desde_0/precio-hasta_12000?json=true
1409
contador: 2498
desde: 12001 hasta: 24000
https://century21mexico.com/v/resultados/operacion_renta/precio-desde_12001/precio-hasta_24000?json=true
734
contador: 3232
desde: 24001 hasta: 36000
https://century21mexico.com/v/resultados/operacion_renta/precio-desde_24001/precio-hasta_36000?json=true
1465
contador: 4697
https://century21mexico.com/v/resultados/operacion_renta/precio-desde_36001?json=true


In [10]:
# Query the unfiltered sale search; get_requests returns None if no attempt got a 200.
response_sale = req.get_requests(url_json_sale, headers_json)
response_sale

<Response [200]>

In [11]:
# NOTE(review): the next cell reads this same response with .json(), so the body
# looks like JSON rather than HTML; confirm this parse is still needed.
soup_sale = BeautifulSoup(response_sale.content, 'html.parser')

In [12]:
# Total number of sale listings; 'totalHits' is a comma-formatted string, so
# the separators are stripped before converting to int.
soup_json_sale = response_sale.json()
total_sale = soup_json_sale['totalHits']
total_sale = int(total_sale.replace(',',''))
total_sale

18721

In [16]:
# Read the two 'isActive' flags of filtros[1]; by these variable names, index 0
# is the sale option and index 1 the rent option (presumably the operation-type
# filter -- TODO confirm against the payload).
sale_type = soup_json_sale['filtros'][1]['validValues'][0]['isActive']

rent_type = soup_json_sale['filtros'][1]['validValues'][1]['isActive']


False

In [65]:
# Manual walk over the sale search in 500000-wide price buckets, printing the
# hit count of each bucket. While more than 1500 listings are unaccounted for,
# a bounded bucket is queried; otherwise one open-ended query is issued.
desde, hasta = 0, 500000
contador = sub_total_int = 0

while total_sale - sub_total_int > contador:
    bounded = (total_sale - contador) > 1500
    if bounded:
        url_inicio = f'https://century21mexico.com/v/resultados/operacion_venta/precio-desde_{desde}/precio-hasta_{hasta}?json=true'
        query_url = url_inicio
    else:
        url_final = f'https://century21mexico.com/v/resultados/operacion_venta/precio-desde_{desde}?json=true'
        query_url = url_final

    # Both branches share the same request and parse steps (the *_renta names
    # are kept from the rent cell this was copied from).
    response_renta = req.get_requests(query_url, headers_json)
    soup_json_renta = response_renta.json()
    sub_total = soup_json_renta['totalHits']
    sub_total_int = int(sub_total.replace(',', ''))
    contador += sub_total_int

    print(sub_total_int)
    print(f'contador: {contador}')
    if bounded:
        print(f'desde: {desde} hasta: {hasta}')
    print(query_url)

    # The price window advances after every query, bounded or not.
    desde, hasta = hasta + 1, hasta + 500000

803
contador: 803
desde: 0 hasta: 500000
https://century21mexico.com/v/resultados/operacion_venta/precio-desde_0/precio-hasta_500000?json=true
1209
contador: 2012
desde: 500001 hasta: 1000000
https://century21mexico.com/v/resultados/operacion_venta/precio-desde_500001/precio-hasta_1000000?json=true
1128
contador: 3140
desde: 1000001 hasta: 1500000
https://century21mexico.com/v/resultados/operacion_venta/precio-desde_1000001/precio-hasta_1500000?json=true
1236
contador: 4376
desde: 1500001 hasta: 2000000
https://century21mexico.com/v/resultados/operacion_venta/precio-desde_1500001/precio-hasta_2000000?json=true
1279
contador: 5655
desde: 2000001 hasta: 2500000
https://century21mexico.com/v/resultados/operacion_venta/precio-desde_2000001/precio-hasta_2500000?json=true
1253
contador: 6908
desde: 2500001 hasta: 3000000
https://century21mexico.com/v/resultados/operacion_venta/precio-desde_2500001/precio-hasta_3000000?json=true
1252
contador: 8160
desde: 3000001 hasta: 3500000
https://centur

In [168]:
# NOTE(review): get_page_number as defined in this notebook takes
# (req, url, header); this one-argument call matches some other version of it
# and raises TypeError against the visible definition. Confirm the intended URL.
soup = BeautifulSoup(response.content, 'html.parser')
urls_page = get_page_number(soup)
# Empty frame that the result-collection cell appends to.
df = pd.DataFrame()

In [169]:
def get_result_page(url):
    """
    Fetch one result page and return its listing records.

    Uses the notebook-level ``req`` and ``headers_json``.

    Args:
        url (str): JSON URL of a single result page.

    Returns:
        The value stored under 'results' in the page's JSON payload, or an empty
        list when the request did not succeed (so callers can still build a
        DataFrame from the return value).
    """
    response = req.get_requests(url, headers_json)
    if not response:
        # Previously `results` was left unbound here, raising UnboundLocalError.
        return []
    return response.json()['results']

In [170]:
# Build one frame per page and concatenate once: calling pd.concat inside the
# loop re-copies all previously accumulated rows on every iteration.
page_frames = []
for url in urls_page:
    result = get_result_page(url)
    page_frames.append(pd.DataFrame(result))
df = pd.concat([df, *page_frames], ignore_index=True)
