## Web scraping from metroscubicos.com

In [1]:
# import instances and libraries
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import requests
import re
import pandas as pd
# To show a progress bar in loops:
from tqdm import tqdm_notebook as tqdm
import csv

In [2]:
# Display names of every entity listed on metroscubicos.com.
entidades = [
    'Aguascalientes',
    'Baja California',
    'Baja California Sur',
    'Campeche',
    'Chiapas',
    'Chihuahua',
    'Coahuila',
    'Colima',
    'Distrito Federal',
    'Durango',
    'Estado De México',
    'Guanajuato',
    'Guerrero',
    'Hidalgo',
    'Jalisco',
    'Michoacán',
    'Morelos',
    'Nayarit',
    'Nuevo León',
    'Oaxaca',
    'Puebla',
    'Querétaro',
    'Quintana Roo',
    'San Luis Potosí',
    'Sinaloa',
    'Sonora',
    'Tabasco',
    'Tamaulipas',
    'Tlaxcala',
    'Veracruz',
    'Yucatán',
    'Zacatecas',
]

In [6]:
# Turn each display name into a slug: spaces become dashes, letters lowercase.
entidadesLower = [nombre.replace(' ', '-').lower() for nombre in entidades]

In [7]:
# Display the converted entity names to check the result.
entidadesLower

['aguascalientes',
 'baja-california',
 'baja-california-sur',
 'campeche',
 'chiapas',
 'chihuahua',
 'coahuila',
 'colima',
 'distrito-federal',
 'durango',
 'estado-de-méxico',
 'guanajuato',
 'guerrero',
 'hidalgo',
 'jalisco',
 'michoacán',
 'morelos',
 'nayarit',
 'nuevo-león',
 'oaxaca',
 'puebla',
 'querétaro',
 'quintana-roo',
 'san-luis-potosí',
 'sinaloa',
 'sonora',
 'tabasco',
 'tamaulipas',
 'tlaxcala',
 'veracruz',
 'yucatán',
 'zacatecas']

In [8]:
# Shared state and settings for the scraping cells.
# One dictionary of scraped fields per listing is collected here.
cityList = list()
# Links whose page could not be retrieved.
listLinksPassed = list()
# Property-type names.
tipos = ["casas", "departamentos", "terrenos"]
# Root URL of the listings site.
baseUrl = "https://inmuebles.metroscubicos.com/"
# Directory holding the per-entity CSV files.
csvPath = "./Resources/"

### Script to run the web scraping by city

In [None]:
# Load the CSV of a single entity (the first one in the list) and preview it.
ciudad = entidadesLower[0]
csvfile = f"{csvPath}{ciudad}.csv"
data = pd.read_csv(csvfile)
data.head()

In [10]:
# Select which entities to process: every slug from index 14 onward.
ciudades = entidadesLower[14:]
# Display the selection to confirm it.
ciudades

['jalisco',
 'michoacán',
 'morelos',
 'nayarit',
 'nuevo-león',
 'oaxaca',
 'puebla',
 'querétaro',
 'quintana-roo',
 'san-luis-potosí',
 'sinaloa',
 'sonora',
 'tabasco',
 'tamaulipas',
 'tlaxcala',
 'veracruz',
 'yucatán',
 'zacatecas']

In [None]:
# Seconds to wait on a single listing request; without a timeout one stalled
# connection would block the entire run.
REQUEST_TIMEOUT = 30

for ciudad in tqdm(ciudades):
    # Read the entity's CSV file and collect the listing links to visit.
    csvfile = csvPath + ciudad + ".csv"
    data = pd.read_csv(csvfile)
    # Skip missing values and repeated links so each page is fetched only once.
    links = data["link"].dropna().drop_duplicates().tolist()

    # Per-entity accumulators. Keeping them local to this iteration means the
    # files written below only contain this entity's rows and failures, not
    # whatever earlier entities (or earlier runs of this cell) left behind.
    city_rows = []
    city_failed = []

    # Visit every link and store the scraped fields in one dictionary each.
    for link in tqdm(links):
        row = {"link": link}
        try:
            response = requests.get(link, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            # Network-level failure: remember the link and carry on.
            print(f"Cannot retrieve {link} ... passing")
            city_failed.append(link)
            continue
        asset_soup = soup(response.content, 'html.parser')

        # Price is the currency symbol followed by the amount; either part is
        # replaced by an empty string when its element is absent.
        simbolo_tag = asset_soup.find('span', class_="andes-money-amount__currency-symbol")
        monto_tag = asset_soup.find('span', class_="andes-money-amount__fraction")
        simbolo = simbolo_tag.get_text() if simbolo_tag is not None else ""
        monto = monto_tag.get_text() if monto_tag is not None else ""
        row["precio"] = simbolo + monto

        # The last breadcrumb link is stored as 'colonia' and the one before it
        # as 'municipio'; missing entries become empty strings. Without a
        # breadcrumb element neither key is set.
        divBreadcrumbs = asset_soup.find('ol', class_="andes-breadcrumb")
        if divBreadcrumbs is not None:
            crumbs = [a.get_text() for a in divBreadcrumbs.find_all('a')]
            row['municipio'] = crumbs[-2] if len(crumbs) > 1 else ""
            row['colonia'] = crumbs[-1] if crumbs else ""

        # Details table: each header cell names a column and is paired with
        # the data cell at the same position.
        if asset_soup.find('tbody', class_="andes-table__body") is not None:
            tablaDatos = asset_soup.find('table', class_="andes-table")
            if tablaDatos is not None:
                nombreDatos = tablaDatos.find_all('th')
                dataDatos = tablaDatos.find_all('td')
                # zip stops at the shorter list, so a table with fewer data
                # cells than headers no longer aborts the run.
                for nombre, valor in zip(nombreDatos, dataDatos):
                    row[nombre.get_text()] = valor.get_text()

        city_rows.append(row)

    # Save results. An empty frame still needs a "link" column, otherwise the
    # merge fails when every request for this entity was unsuccessful.
    if city_rows:
        results_df = pd.DataFrame(city_rows)
    else:
        results_df = pd.DataFrame(columns=["link"])
    final_df = pd.merge(data, results_df, on="link")
    file = csvPath + ciudad + "-webscrapeResults.csv"
    final_df.to_csv(file, index=False)

    # Save the links that could not be retrieved for this entity.
    passed_df = pd.DataFrame(city_failed)
    passed_df.to_csv(csvPath + ciudad + "_passed.csv", index=False)

    # Keep the notebook-level accumulators up to date for later cells.
    cityList.extend(city_rows)
    listLinksPassed.extend(city_failed)

            

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/18 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/5753 [00:00<?, ?it/s]

### Multithreading

In [None]:
from multiprocessing.dummy import Pool  # This is a thread-based Pool
from multiprocessing import cpu_count

In [None]:
def parseWeb(url, timeout=30):
    """Download one listing page and return its scraped fields.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block a worker thread indefinitely.

    Returns:
        A dictionary with:
          * "link": the requested url, so the row can be matched back to the
            CSV it came from;
          * "precio": currency symbol followed by the amount (missing parts
            are empty strings);
          * "colonia" / "municipio": text of the last and second-to-last
            breadcrumb links, or "" when that link does not exist (both keys
            are absent when the page has no breadcrumb element);
          * one entry per header cell of the details table, mapped to the data
            cell at the same position.
        An empty dictionary is returned when the page cannot be retrieved.
    """
    row = {}
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        print(f"Cannot retrieve {url} ... passing")
        return row
    asset_soup = soup(response.content, 'html.parser')
    row["link"] = url

    # Get price and currency symbol.
    simbolo_tag = asset_soup.find('span', class_="andes-money-amount__currency-symbol")
    monto_tag = asset_soup.find('span', class_="andes-money-amount__fraction")
    simbolo = simbolo_tag.get_text() if simbolo_tag is not None else ""
    monto = monto_tag.get_text() if monto_tag is not None else ""
    row["precio"] = simbolo + monto

    # Breadcrumb links: guard against pages with fewer than two of them
    # instead of failing with an IndexError.
    divBreadcrumbs = asset_soup.find('ol', class_="andes-breadcrumb")
    if divBreadcrumbs is not None:
        crumbs = [a.get_text() for a in divBreadcrumbs.find_all('a')]
        row['colonia'] = crumbs[-1] if crumbs else ""
        row['municipio'] = crumbs[-2] if len(crumbs) > 1 else ""

    # Find the details table and pair each header cell with its data cell.
    if asset_soup.find('tbody', class_="andes-table__body") is not None:
        tablaDatos = asset_soup.find('table', class_="andes-table")
        if tablaDatos is not None:
            nombreDatos = tablaDatos.find_all('th')
            dataDatos = tablaDatos.find_all('td')
            # zip stops at the shorter list, so unmatched headers are skipped.
            for nombre, valor in zip(nombreDatos, dataDatos):
                row[nombre.get_text()] = valor.get_text()
    return row

In [None]:
if __name__ == '__main__':
    # Scrape every link of the first entity using a pool of worker threads.
    ciudad = entidadesLower[0]
    csvfile = csvPath + ciudad + ".csv"
    data = pd.read_csv(csvfile)
    links = data["link"].tolist()

    # Never start more threads than there are links, and always at least one
    # (a pool of size 0 is rejected).
    # NOTE(review): 200 threads per core is the original tuning; confirm the
    # site tolerates that many simultaneous requests.
    n_threads = max(1, min(len(links), cpu_count() * 200))

    # The context manager releases the worker threads once the map is done.
    with Pool(n_threads) as pool:
        results = pool.map(parseWeb, links)

    # Store each scraped row individually (not the whole list as one element),
    # skipping empty rows from failed downloads and rows already collected.
    for fila in results:
        if fila and fila not in cityList:
            cityList.append(fila)
    if results:
        print(f"Processed {len(results)} links")