# Building a Rental Apartment Database with Web Scraping

In [1]:
import requests  # Importing the requests library to make HTTP requests
from bs4 import BeautifulSoup as bs  # Importing BeautifulSoup to parse HTML documents
import pandas as pd  # Importing Pandas library to manipulate data

In [2]:
url = 'https://www.argenprop.com/inmuebles-alquiler-localidad-capital-federal'  # Search-results page to scrape
# GET the page with a browser-like User-Agent. The timeout keeps this cell from
# hanging indefinitely if the server never answers.
search = requests.get(url, verify=True, headers={"User-Agent": 'Mozilla/5.0'}, timeout=10)
print(f'El status es: {search.status_code}')  # HTTP status code of the response
parse_search = bs(search.content, 'html.parser')  # Parse the returned HTML with BeautifulSoup

El status es: 200


In [3]:
# Inspect what the parsed page gives us, starting with the first listing container
tag_apartment_1 = parse_search.find('div', {'class': 'listing__item'})  # <div> wrapping the first apartment's metadata
print(tag_apartment_1.prettify()[:200])  # Only the first 200 characters of its pretty-printed HTML

<div class="listing__item">
 <a class="card" data-item-card="8343599" data-item-id-visibilidad="11882835" data-track-dormitorios="" data-track-idaviso="8343599" data-track-idbarrio="" data-track-idloc


In [8]:
# Bind the card to its own name instead of overwriting tag_apartment_1: with the
# self-reassignment, running this cell a second time searched inside the <a> tag,
# got None back and failed on .prettify().
card_apartment_1 = tag_apartment_1.find(name='a', attrs={'class': 'card'})  # <a> tag that holds the apartment's link
print(card_apartment_1.prettify()[:200])
print(card_apartment_1.attrs['href'])  # The href is a path relative to the site root, not a full URL

<a class="card" data-item-card="8343599" data-item-id-visibilidad="11882835" data-track-dormitorios="" data-track-idaviso="8343599" data-track-idbarrio="" data-track-idlocalidad="2102" data-track-idmo
/departamento-en-alquiler-en-belgrano--8343599


In [4]:
# To collect every apartment shown on the page, repeat the first search with find_all
# (findAll is only a deprecated alias of the same method).
tag_apartments = parse_search.find_all(name='div', attrs={'class': 'listing__item'})
print(f'The object type is: {type(tag_apartments)}, which can be treated as a list (i.e. we can iterate over it)')
print(f'The number of apartments we found is: {len(tag_apartments)}')

The object type is: <class 'bs4.element.ResultSet'>, which can be treated as a list (i.e. we can iterate over it)
The number of apartments we found is: 20


In [None]:
# To get the URL of every apartment found above, iterate over the listing tags:
base = 'https://www.argenprop.com'  # Site root, prepended to each apartment's relative href

# A listing container may not hold an apartment card, or the card may lack an href;
# skip those instead of failing with AttributeError/KeyError.
apart_urls = [
    base + card['href']
    for card in (t.find(name='a', attrs={'class': 'card'}) for t in tag_apartments)
    if card is not None and card.has_attr('href')
]

apart_urls  # Full URLs of the apartments listed on the first results page

In [None]:
apart_url = apart_urls[1]  # Second url of the list built above
# The timeout keeps this cell from hanging indefinitely if the server never answers.
apart_search = requests.get(apart_url, verify=True, headers={"User-Agent": 'Mozilla/5.0'}, timeout=10)
print(f'The status is: {apart_search.status_code}')  # Check that everything went well
parse_apart_search = bs(apart_search.content, 'html.parser')
print(parse_apart_search.prettify()[2000:4000])  # Peek at a 2000-character slice of the page's HTML

In [12]:
# The element that follows the map container carries the coordinates as data-* attributes.
# find_next replaces the deprecated findNext alias; the lookup itself is unchanged.
location = parse_apart_search.find(name='div', attrs={'class': 'map-container'}).find_next().attrs
print(location)
print(f'Latitude : {location["data-latitude"]}')
print(f'Longitude : {location["data-longitude"]}')

{'class': ['leaflet-container'], 'data-location-map': '', 'data-url': 'https://static1.sosiva451.com/Mapas/{z}/{x}/{y}', 'data-latitude': '-34,55875', 'data-longitude': '-58,46048', 'data-attribution': 'Argenprop © en colaboración con', 'data-syst': 'Argenprop', 'data-location': 'Ficha', 'data-ad': '12697669', 'data-origin': '5U09_2'}
Latitude : -34,55875
Longitude : -58,46048


In [None]:
features = parse_apart_search.find_all('ul', attrs={'class': 'property-features'})
# prettify must be *called*; printing the bare attribute only shows the bound-method repr.
# The feature lists hold what the page calls Caracteristicas, Basic Data, Surface, etc.
print(features[2].prettify())

## **Process Automation**

In [14]:
import time 
import numpy as np

In [15]:
def aux_apartment_url(apartment, *args):
    """
    Build the absolute url of one listing, or return None when it cannot be built.

    This auxiliary function avoids errors associated with tags that are not for
    apartments or do not have the url defined.

    Parameters
    ----------
    apartment : bs4.element.Tag
        A listing tag expected to contain an ``<a class="card">`` element.
    *args
        Optional. When given, the first extra positional argument is the base
        url prepended to the card's href. When omitted, the module-level
        ``base`` variable is used.

    Returns
    -------
    string or None
        The base url concatenated with the card's href, or None if the tag has
        no card anchor or the anchor has no href.
    """
    # Honour the base url handed in by the caller instead of silently relying
    # on whatever the global `base` happens to be.
    base_url = args[0] if args else base
    try:
        return base_url + apartment.find(name='a', attrs={'class': 'card'}).attrs['href']
    except (AttributeError, KeyError):
        # AttributeError: no card anchor was found (find returned None).
        # KeyError: the anchor has no href attribute.
        return None

def detect_apartment_urls(search_page_url, base_url, timeout=10):
    """
    This function searches for the urls of apartments given a specific page and returns a list with the urls of each apartment.

    Parameters
    ----------
    search_page_url : string
        The url to scrape in search of apartments.
    base_url : string
        The base url of the page. It is used to return the apartment url ready to use.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot block the crawl forever.

    Returns
    -------
    apartment_urls : list
        One entry per ``listing__item`` div found on the page, as returned by
        ``aux_apartment_url``: the apartment url, or None when no url could be
        built for that listing.
    """
    response = requests.get(
        search_page_url,
        verify=True,
        headers={"User-Agent": 'Mozilla/5.0'},
        timeout=timeout,
    )
    soup = bs(response.content, 'html.parser')

    # find_all is the supported spelling of the deprecated findAll alias.
    listing_tags = soup.find_all(name='div', attrs={'class': 'listing__item'})

    return [aux_apartment_url(apartment, base_url) for apartment in listing_tags]

def get_apartment_info(apartment_url, timeout=10):
    """
    Given an apartment url, this function downloads the page and returns the information associated with it.

    Parameters
    ----------
    apartment_url : string
        The url of an apartment.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    apartment_info : dict
        A dictionary of apartment attributes. Scraped values are kept as the
        strings found on the page. If an attribute is not found, its value is
        None. ``url`` is always set to ``apartment_url``.

    Notes
    -----
    Exceptions raised by ``requests.get`` (connection errors, timeouts) are not
    caught here. Only missing page elements are tolerated.
    """
    apartment_info = {
        'location': None,
        'latitude': None,
        'longitude': None,
        'bedrooms': None,
        'bathrooms': None,
        'antiquity': None,
        'expenses': None,
        'price': None,
        'currency': None,
        'covered_area': None,
        'uncovered_area': None,
        'url': None
    }

    response = requests.get(
        apartment_url,
        verify=True,
        headers={"User-Agent": 'Mozilla/5.0'},
        timeout=timeout,
    )
    soup = bs(response.content, 'html.parser')
    apartment_info['url'] = apartment_url

    # Each lookup below tolerates a missing element (AttributeError on None)
    # or a missing attribute (KeyError) and leaves the field as None.

    # Coordinates are data-* attributes of the element after the map container.
    # The decimal comma is replaced with a dot.
    try:
        map_attrs = soup.find(name='div', attrs={'class': 'map-container'}).find_next().attrs
        apartment_info['latitude'] = map_attrs['data-latitude'].replace(',', '.')
        apartment_info['longitude'] = map_attrs['data-longitude'].replace(',', '.')
    except (AttributeError, KeyError):
        pass

    try:
        apartment_info['location'] = soup.find('h3', {'class': 'titlebar__address'}).text.lower()
    except AttributeError:
        pass

    # First guess for bathrooms/bedrooms: the text of the second element
    # following the matching icon. The feature lists below overwrite these
    # values when they carry the same attribute.
    try:
        apartment_info['bathrooms'] = soup.find('i', {'class': 'icono-cantidad_banos'}).find_next().find_next().text
    except AttributeError:
        pass
    try:
        apartment_info['bedrooms'] = soup.find('i', {'class': 'icono-cantidad_dormitorios'}).find_next().find_next().text
    except AttributeError:
        pass

    for feature in soup.find_all('ul', attrs={'class': 'property-features'}):
        for child in feature.find_all(name='p'):
            # Tokenise the text: colons and dots are dropped (so '90.000'
            # becomes '90000') and the rest is split on whitespace.
            tokens = child.text.replace(':', '').replace('.', '').split()
            if 'Dormitorios' in tokens:
                apartment_info['bedrooms'] = tokens[-1]
            elif 'Baños' in tokens:
                apartment_info['bathrooms'] = tokens[-1]
            elif 'Antiguedad' in tokens:
                apartment_info['antiquity'] = tokens[-1]
            elif 'Expensas' in tokens and '$' in tokens:
                apartment_info['expenses'] = tokens[-1]
            elif 'Precio' in tokens and '$' in tokens:
                apartment_info['price'] = tokens[-1]
                apartment_info['currency'] = '$'
            elif 'Precio' in tokens and 'USD' in tokens:
                apartment_info['price'] = tokens[-1]
                apartment_info['currency'] = 'USD'
            elif 'Sup' in tokens and 'Cubierta' in tokens:
                # The value is the second-to-last token (the last one is
                # presumably the unit); decimal comma becomes a dot.
                apartment_info['covered_area'] = tokens[-2].replace(',', '.')
            elif 'Sup' in tokens and 'Descubierta' in tokens:
                apartment_info['uncovered_area'] = tokens[-2].replace(',', '.')

    return apartment_info

def get_apartment_urls_info(search_page_url, base_url, page_count, from_page=1):
    """
    This function gathers all the previous ones in order to loop through the specified
    pages and extract information from each apartment, storing it as a dictionary.

    Parameters
    ----------
    search_page_url : string
        Url of the first page of search results, without any ``-pagina-N`` suffix.
    base_url : string
        The base url of the site, used to build each apartment url.
    page_count : int
        Number of the last results page to scrape (inclusive).
    from_page : int, optional
        Number of the first results page to scrape. Defaults to 1.

    Returns
    -------
    apartments_info : dict
        Maps a running integer id (starting at 1) to the dictionary returned by
        ``get_apartment_info`` for each apartment found.
    """
    apartments_info = {}
    apartment_id = 1
    for page_number in range(from_page, page_count + 1):
        # Page 1 is the plain search url; every later page appends '-pagina-N'.
        # Deriving the url from the page number (rather than patching the
        # previous url) makes a crawl with from_page > 1 start on that page
        # instead of on page 1.
        if page_number == 1:
            page_url = search_page_url
        else:
            page_url = '{}-pagina-{}'.format(search_page_url, page_number)

        for apartment_url in detect_apartment_urls(page_url, base_url):
            if apartment_url is None:
                # Listing without a usable url.
                continue
            apartments_info[apartment_id] = get_apartment_info(apartment_url)
            # Random pause of up to one second between apartment requests.
            time.sleep(1 * np.random.random())
            apartment_id += 1

    return apartments_info

Let's try it on several pages of search results.

In [16]:
base = 'https://www.argenprop.com'  # Site root used to turn relative hrefs into full urls
# First page of the search results to crawl
url = 'https://www.argenprop.com/departamento-alquiler-localidad-capital-federal-orden-masnuevos'

# Crawl results pages 1 and 2 and collect the details of every apartment found
apartment_info = get_apartment_urls_info(search_page_url=url, base_url=base, page_count=2)

In [17]:
# Report how many apartments the crawl collected
print(
    f'We obtained information from {len(apartment_info)} apartments'
)

We obtained information from 40 apartments


In [18]:
# The dict maps apartment id -> attribute dict, so the raw frame has one column
# per apartment; transpose it to get one row per apartment.
apartment_info_df = pd.DataFrame(apartment_info).transpose()
apartment_info_df

Unnamed: 0,location,latitude,longitude,bedrooms,bathrooms,antiquity,expenses,price,currency,covered_area,uncovered_area,url
1,"dr j valentin gomez 2600, piso 8",-34.605545,-58.403893,1,1.0,50.0,10000.0,90000,$,35.0,,https://www.argenprop.com/departamento-en-alqu...
2,nogoya 3900,-34.608437,-58.501083,2,3.0,,,1000,USD,210.0,187.0,https://www.argenprop.com/departamento-en-alqu...
3,"av. del libertador 2200, piso 15",-34.581425,-58.40362,3,2.0,30.0,100000.0,2200,USD,138.0,9.0,https://www.argenprop.com/departamento-en-alqu...
4,paraguay 4700,-34.58258,-58.427128,1,2.0,,,2450,USD,160.0,,https://www.argenprop.com/departamento-en-alqu...
5,"av san juan 2100, piso 11",-34.622864,-58.395615,1,,,10000.0,75000,$,40.0,,https://www.argenprop.com/departamento-en-alqu...
6,correa al 2400,-34.54275,-58.460144,3,2.0,14.0,60000.0,1000,USD,79.0,7.0,https://www.argenprop.com/departamento-en-alqu...
7,nicolas rodriguez peña 1100,-34.59514,-58.3921,3,4.0,70.0,80000.0,510000,$,250.0,22.0,https://www.argenprop.com/departamento-en-alqu...
8,paraguay al 1400,-34.59845,-58.387478,1,1.0,50.0,19772.0,120000,$,,,https://www.argenprop.com/departamento-en-alqu...
9,araoz 700,,,,,,,550,USD,,,https://www.argenprop.com/departamento-en-alqu...
10,luis maría campos 1200,-34.5646,-58.441048,1,1.0,55.0,20000.0,95000,$,44.0,,https://www.argenprop.com/departamento-en-alqu...


# Palermo

Now let's repeat the scrape for a specific neighbourhood: Palermo.


In [19]:
base = 'https://www.argenprop.com'  # Site root used to turn relative hrefs into full urls
# First page of the Palermo search results to crawl
url = 'https://www.argenprop.com/departamento-alquiler-barrio-palermo-localidad-capital-federal-orden-masnuevos'

# Crawl results pages 1 and 2 and collect the details of every apartment found
apartment_info_palermo = get_apartment_urls_info(search_page_url=url, base_url=base, page_count=2)

In [20]:
# Report how many Palermo apartments the crawl collected
print(
    f'We obtained information from {len(apartment_info_palermo)} apartments'
)

We obtained information from 40 apartments


In [None]:
# One column per apartment in the raw frame; transpose to get one row per apartment.
apartment_info_palermo_df = pd.DataFrame(apartment_info_palermo).transpose()
apartment_info_palermo_df

In [24]:
# Turn the scraped price strings into numbers; anything unparseable becomes NaN
apartment_info_palermo_df['price'] = pd.to_numeric(
    apartment_info_palermo_df['price'],
    errors='coerce',
)

# Ascending by price. '$' and 'USD' amounts are compared as raw numbers here,
# with no currency conversion.
df = apartment_info_palermo_df.sort_values('price')
df

Unnamed: 0,location,latitude,longitude,bedrooms,bathrooms,antiquity,expenses,price,currency,covered_area,uncovered_area,url
32,scalabrini ortiz 2700,-34.58404,-58.41445,Monoambiente,,,39500.0,600,USD,50.0,,https://www.argenprop.com/departamento-en-alqu...
31,vidt al al 2000. disponible desde el 20/03/2023,-34.588806,-58.4138,2,2.0,70.0,20000.0,900,USD,60.0,,https://www.argenprop.com/departamento-en-alqu...
15,juncal al 4500,-34.577045,-58.422913,1,1.0,15.0,,1100,USD,,,https://www.argenprop.com/departamento-en-alqu...
16,ocampo al 2500,-34.58153,-58.405003,3,2.0,,30000.0,1200,USD,107.0,,https://www.argenprop.com/departamento-en-alqu...
14,ruggieri 2900,-34.579674,-58.400246,2,1.0,25.0,43000.0,1200,USD,65.0,5.0,https://www.argenprop.com/departamento-en-alqu...
26,guatemala 5629 2°,-34.58,-58.432076,2,2.0,,27000.0,1500,USD,67.0,,https://www.argenprop.com/departamento-en-alqu...
29,paraguay al 4800,-34.58184,-58.42777,2,2.0,1.0,58000.0,1500,USD,65.0,,https://www.argenprop.com/departamento-en-alqu...
11,cabello 3500,-34.58188,-58.40687,1,1.0,35.0,1.0,1500,USD,85.0,,https://www.argenprop.com/departamento-en-alqu...
18,av santa fe al 3400,-34.586468,-58.41369,3,1.0,,,1600,USD,,,https://www.argenprop.com/departamento-en-alqu...
6,ruggieri al 2900,-34.58026,-58.40709,2,2.0,15.0,46000.0,1700,USD,88.0,4.0,https://www.argenprop.com/departamento-en-alqu...


## **Statistics**
*ToDo*

Number of apartments priced in US dollars (USD) and in pesos ($)

In [25]:
# Count apartments per price currency in the first (city-wide) crawl
apartment_info_df.loc[:, 'currency'].value_counts()

$      21
USD    19
Name: currency, dtype: int64

In [26]:
# Count apartments per price currency in the Palermo crawl
apartment_info_palermo_df.loc[:, 'currency'].value_counts()

USD    22
$      18
Name: currency, dtype: int64