# Data Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import time

def GetRegionsLinks(session):

    '''
    Fetch 'https://www.municipio.com.co/' and return the absolute links of the
    regions listed in its region box.

    session: object whose get(url) returns a response exposing .text
             (main() passes a requests.Session).

    Returns a list of URL strings. The San Andres entry is left out on purpose
    (see the comment below); a missing entry is simply ignored.
    Raises AttributeError if the expected region list is not found in the page.
    '''

    base_url = 'https://www.municipio.com.co'

    # Archipielago de San Andres is excluded because it isn't a region but rather a town
    excluded_link = base_url + '/departamento-archipielago-de-san-andres.html'

    response = session.get(base_url + '/')
    soup = BeautifulSoup(response.text,'html')
    regions = soup.find('div',class_='box mr list-box').find('ul').find_all('li')

    Regions_Links = [base_url + region.find('a').get('href') for region in regions]

    # Filtering (instead of list.remove) does not raise when the link is absent
    return [link for link in Regions_Links if link != excluded_link]

def GetRegionPageCount(Region_Link,session):

    '''
    Fetch a region page and return its total number of result pages.

    Region_Link: URL of the region page.
    session: object whose get(url) returns a response exposing .text.

    Returns the integer text of the last link inside the 'pagination' cell,
    or 1 when that cell, its links, or a numeric label cannot be found.
    '''

    response = session.get(Region_Link)
    soup = BeautifulSoup(response.text,'html')

    try:
        page_count = int(soup.find('td',class_='pagination').find_all('a')[-1].text)
    except (AttributeError, IndexError, ValueError):
        # AttributeError: no pagination cell (find returned None)
        # IndexError: pagination cell without links
        # ValueError: last link text is not an integer
        page_count = 1

    return page_count

def GetTownsLinks(Region_Link,session):

    '''
    Collect the links of every town listed under a region.

    Region_Link: URL of the region page.
    session: object whose get(url) returns a response exposing .text.

    The number of pages comes from GetRegionPageCount; each page is requested
    as '<Region_Link>?page=<n>' and the links found in its <h2> headings are
    returned as absolute URLs, in page order.
    '''

    total_pages = GetRegionPageCount(Region_Link,session)
    Towns_Links = []

    page_number = 1
    while page_number <= total_pages:
        listing = session.get(Region_Link+f'?page={page_number}')
        soup = BeautifulSoup(listing.text,'html')

        # Every town on the page is an <h2> wrapping its link
        headings = soup.find('div',class_='annuaire').find_all('h2')
        Towns_Links += ['https://www.municipio.com.co' + heading.find('a').get('href')
                        for heading in headings]

        page_number += 1

    return Towns_Links

def GetTownInformation(Town_Link,session):

    '''
    Scrape one town page and return its data as a one-row DataFrame.

    Town_Link: URL of the town page.
    session: object whose get(url) returns a response exposing .text.

    Returns a pandas DataFrame with a single row and the columns
    Town, Region, Country, Town Code, Website, Phone, Postal Code, Mayor,
    Population, Area, Altitude, LAT, LONG, URL.

    Website, Phone, Mayor, Town Code and Postal Code fall back to 'N/A' when
    they are missing. The other sections are required: a page without them
    raises (AttributeError / IndexError / ValueError).
    '''

    response = session.get(Town_Link)
    soup = BeautifulSoup(response.text,'html')

    # General Info

    town = soup.find('a',{'id':'city'}).text.split('cipio de')[-1].strip()

    try:
        website = soup.find('a',{'title':'Sitio internet del municipio'}).get('href')
    except AttributeError:
        # find() returned None: the page has no website link
        website = 'N/A'

    # Administration Info

    admindata = soup.find('div',{'id':'div_admindata'}).find_all('a')
    region = admindata[1].text
    country = admindata[0].text

    # CityHall Info

    cityhall = soup.find('div',{'id':'div_cityhall'}).find_all('tr')

    # Drop the first word of the phone cell and glue the rest together;
    # a result of 'disponible' is treated as "no phone"
    phone = "".join(cityhall[1].find('span').text.split()[1:])
    if phone == 'disponible':
        phone = 'N/A'

    mayor = cityhall[-1].find('td').text
    if mayor == '':
        mayor = 'N/A'

    # Demographic Info

    demography = soup.find('div',{'id':'div_demography'}).find_all('tr')

    # The last whitespace-separated token is dropped, the remaining ones form the number
    population = int("".join(demography[2].find('td').text.split()[:-1]))

    # Official Numbers Info

    # Both values start as 'N/A' and are only overwritten once parsed, so a
    # failure on the postal code can never leave the columns with different lengths
    town_code = 'N/A'
    postal_code = 'N/A'
    try:
        number = soup.find('div',{'id':'div_number'}).find_all('tr')
        town_code = number[0].find('td').text
        postal_text = number[1].find('td').text
        if postal_text != '':
            postal_code = postal_text
    except (AttributeError, IndexError):
        # Section, row or cell missing: keep whatever was parsed so far
        pass

    # Territory Info

    territory = soup.find('div',{'id':'div_territory'}).find_all('tr')

    area = float(territory[0].find('td').text.split('s')[1].split()[0].replace(',','.'))
    altitude = int(territory[1].find('td').text.split('m')[0].replace(' ',''))

    coordinates = territory[-3].find('td').find_all('span')
    latitude = coordinates[0].text
    longitude = coordinates[1].text

    # One-element lists so the DataFrame gets exactly one row
    Data = {'Town':[town],
            'Region':[region],
            'Country':[country],
            'Town Code':[town_code],
            'Website':[website],
            'Phone':[phone],
            'Postal Code':[postal_code],
            'Mayor':[mayor],
            'Population':[population],
            'Area':[area],
            'Altitude':[altitude],
            'LAT':[latitude],
            'LONG':[longitude],
            'URL':[Town_Link]}

    df = pd.DataFrame(Data)

    return df

def main():
    '''
    Scrape every town of every region and return the combined data.

    Returns a tuple (df, total_time): df is the concatenation of the one-row
    DataFrames produced by GetTownInformation (rows are in completion order of
    the worker threads, index reset), total_time is the elapsed time in
    seconds measured with time.time().

    An exception raised in any worker is re-raised here by future.result().
    '''
    start_time = time.time()

    # The context manager closes the session once scraping is done
    with requests.Session() as session:

        Regions_Links = GetRegionsLinks(session)

        Towns_Links = []

        # First pass: collect the town links of every region in parallel
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(GetTownsLinks,Region_Link,session) for Region_Link in Regions_Links]
            for future in concurrent.futures.as_completed(futures):
                Towns_Links.extend(future.result())

        dfs = []

        # Second pass: scrape each town page in parallel
        with ThreadPoolExecutor(max_workers = 28) as executor:
            futures = [executor.submit(GetTownInformation,Town_Link,session) for Town_Link in Towns_Links]
            for future in concurrent.futures.as_completed(futures):
                dfs.append(future.result())

    df = pd.concat(dfs, ignore_index=True)

    end_time = time.time()

    total_time = end_time - start_time

    return df, total_time

# Run the whole scrape; df holds one row per town, total_time the elapsed seconds
df,total_time = main()

# Report the elapsed time rounded to two decimals
print(f'Total Time: {(round(total_time,2))} s')

Total Time: 143.25 s


# Data Analysis

In [2]:
df

Unnamed: 0,Town,Region,Country,Town Code,Website,Phone,Postal Code,Mayor,Population,Area,Altitude,LAT,LONG,URL
0,El Encanto,Amazonas,Colombia,91263,,,913018,Mary Sofía Muñoz Romero,4376,11251.0,147,-1.67792,-73.2288,https://www.municipio.com.co/corregimiento-dep...
1,La Chorrera,Amazonas,Colombia,91405,,,914057,,3337,12517.0,152,-1.45,-72.583,https://www.municipio.com.co/corregimiento-dep...
2,Puerto Nariño,Amazonas,Colombia,91540,http://www.puertonarino-amazonas.gov.co,+5785223010,911010,Nelso Ruiz Ahue,6816,1475.0,96,-3.76286,-70.3745,https://www.municipio.com.co/municipio-puerto-...
3,Mirití-Paraná,Amazonas,Colombia,,,,,José Ávila Yucuna Matapi,1613,16864.0,116,-0.883461,-70.9835,https://www.municipio.com.co/corregimiento-dep...
4,Tarapacá,Amazonas,Colombia,91798,,,911030,Flor Ángela Martínez Bernardino,3100,9153.0,54,-2.867,-69.733,https://www.municipio.com.co/corregimiento-dep...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,Páramo,Santander,Colombia,68533,http://paramo-santander.gov.co,+573234936623,683527,Filemón Solano Cala,3643,74.0,1342,6.417,-73.167,https://www.municipio.com.co/municipio-paramo....
1117,Contratación,Santander,Colombia,68211,http://www.contratacion-santander.gov.co,+5777271231,683071,MARTHA SUAREZ,3904,117.0,1655,6.29277,-73.4739,https://www.municipio.com.co/municipio-contrat...
1118,Chipatá,Santander,Colombia,68179,http://www.chipata-santander.gov.co,+5777565494,685557,Argemiro Angulo Diaz,4972,85.0,1793,6.06323,-73.6362,https://www.municipio.com.co/municipio-chipata...
1119,Güepsa,Santander,Colombia,68327,http://www.guepsa-santander.gov.co,+5772283051,685547,Osmar Arias,4200,27.0,1528,6.02525,-73.5732,https://www.municipio.com.co/municipio-guepsa....


In [3]:
# Population density of each town, rounded to one decimal
df['Inhabitants/Km2'] = df['Population'].div(df['Area']).round(1)
df

Unnamed: 0,Town,Region,Country,Town Code,Website,Phone,Postal Code,Mayor,Population,Area,Altitude,LAT,LONG,URL,Inhabitants/Km2
0,El Encanto,Amazonas,Colombia,91263,,,913018,Mary Sofía Muñoz Romero,4376,11251.0,147,-1.67792,-73.2288,https://www.municipio.com.co/corregimiento-dep...,0.4
1,La Chorrera,Amazonas,Colombia,91405,,,914057,,3337,12517.0,152,-1.45,-72.583,https://www.municipio.com.co/corregimiento-dep...,0.3
2,Puerto Nariño,Amazonas,Colombia,91540,http://www.puertonarino-amazonas.gov.co,+5785223010,911010,Nelso Ruiz Ahue,6816,1475.0,96,-3.76286,-70.3745,https://www.municipio.com.co/municipio-puerto-...,4.6
3,Mirití-Paraná,Amazonas,Colombia,,,,,José Ávila Yucuna Matapi,1613,16864.0,116,-0.883461,-70.9835,https://www.municipio.com.co/corregimiento-dep...,0.1
4,Tarapacá,Amazonas,Colombia,91798,,,911030,Flor Ángela Martínez Bernardino,3100,9153.0,54,-2.867,-69.733,https://www.municipio.com.co/corregimiento-dep...,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,Páramo,Santander,Colombia,68533,http://paramo-santander.gov.co,+573234936623,683527,Filemón Solano Cala,3643,74.0,1342,6.417,-73.167,https://www.municipio.com.co/municipio-paramo....,49.2
1117,Contratación,Santander,Colombia,68211,http://www.contratacion-santander.gov.co,+5777271231,683071,MARTHA SUAREZ,3904,117.0,1655,6.29277,-73.4739,https://www.municipio.com.co/municipio-contrat...,33.4
1118,Chipatá,Santander,Colombia,68179,http://www.chipata-santander.gov.co,+5777565494,685557,Argemiro Angulo Diaz,4972,85.0,1793,6.06323,-73.6362,https://www.municipio.com.co/municipio-chipata...,58.5
1119,Güepsa,Santander,Colombia,68327,http://www.guepsa-santander.gov.co,+5772283051,685547,Osmar Arias,4200,27.0,1528,6.02525,-73.5732,https://www.municipio.com.co/municipio-guepsa....,155.6


## Demographics

In [4]:
# Total Colombia Population

df['Population'].sum()

46576228

In [5]:
# Top 10 - Most populated cities (original row index is kept)

top_population = (
    df[['Town', 'Population']]
    .rename(columns={'Town': 'City'})
    .sort_values('Population', ascending=False)
    .head(10)
)

top_population

Unnamed: 0,City,Population
751,Bogotá,10763453
319,Medellín,2222213
962,Cali,2075380
121,Barranquilla,1112889
137,Cartagena de Indias,895400
772,Soacha,831259
657,Cúcuta,585543
990,Ibagué,541101
320,Bello,532154
1034,Bucaramanga,509918


In [6]:
# Top 10 - Most densely populated cities (original row index is kept)

top_density = (
    df[['Town', 'Inhabitants/Km2']]
    .rename(columns={'Town': 'City'})
    .sort_values('Inhabitants/Km2', ascending=False)
    .head(10)
)

top_density

Unnamed: 0,City,Inhabitants/Km2
310,Itagüí,13718.5
125,Soledad,6802.9
121,Barranquilla,6704.2
751,Bogotá,6060.6
309,Sabaneta,6049.5
319,Medellín,5742.2
772,Soacha,4445.2
962,Cali,3759.7
320,Bello,3524.2
302,Envigado,3438.0


## Geographics

In [7]:
# Top 10 - Highest Cities (by altitude, descending)

highest_city = (
    df[['Town', 'Altitude']]
    .rename(columns={'Town': 'City'})
    .sort_values('Altitude', ascending=False)
    .head(10)
)

highest_city

Unnamed: 0,City,Altitude
504,Jericó,3418
1105,Vetas,3259
849,Cumbal,3203
465,El Cocuy,3120
893,Cuaspud,3050
426,Aquitania,3042
855,Guachucal,3034
437,Ventaquemada,3033
903,Sapuyes,3011
853,Túquerres,3010


In [8]:
# Top 10 - Lowest Cities (by altitude, ascending)

lowest_city = (
    df[['Town', 'Altitude']]
    .rename(columns={'Town': 'City'})
    .sort_values('Altitude')
    .head(10)
)

lowest_city

Unnamed: 0,City,Altitude
661,Pueblo Viejo,2
314,Turbo,2
845,El Charco,2
291,Riohacha,3
862,Tumaco,3
287,Manaure,3
179,Talaigua Nuevo,3
914,Santiago de Tolú,3
622,Pivijay,3
623,Ciénaga,3


In [9]:
# Cities / Region

# Number of towns per region (non-null 'Town' values)
count_cities = df.groupby('Region')['Town'].count()

# Turn the grouped counts into a two-column table and keep the ten largest
top_region = (
    count_cities.rename('# of Cities')
    .reset_index()
    .sort_values('# of Cities', ascending=False)
    .head(10)
)

top_region

Unnamed: 0,Region,# of Cities
1,Antioquia,125
5,Boyacá,123
12,Cundinamarca,117
25,Santander,87
20,Nariño,64
27,Tolima,47
4,Bolívar,47
9,Cauca,42
28,Valle Del Cauca,42
21,Norte de Santander,40


In [10]:
# Population Density / Region

sum_pop = df.groupby('Region')['Population'].sum()

sum_area = df.groupby('Region')['Area'].sum()

# Both series come from the same groupby, so they share the same region index
# and the division pairs each region's population with its own area
top_dens = (
    sum_pop.div(sum_area)
    .rename('Inhabitants / Km2')
    .reset_index()
    .sort_values('Inhabitants / Km2', ascending=False)
    .head(10)
)

top_dens

Unnamed: 0,Region,Inhabitants / Km2
3,Atlántico,634.659091
12,Cundinamarca,557.202859
23,Quindío,281.133333
24,Risaralda,207.648792
28,Valle Del Cauca,183.041328
6,Caldas,113.905933
1,Antioquia,91.621354
4,Bolívar,70.835291
26,Sucre,69.823486
25,Santander,61.961074


In [11]:
# Area / Region

# Total area per region
sum_areas = df.groupby('Region')['Area'].sum()

top_areas = (
    sum_areas.reset_index()
    .sort_values('Area', ascending=False)
    .head(10)
)

top_areas

Unnamed: 0,Region,Area
0,Amazonas,109665.0
29,Vaupés,107001.71
30,Vichada,100242.0
7,Caquetá,88965.0
19,Meta,85635.0
14,Guainía,64427.0
1,Antioquia,63669.0
15,Guaviare,53460.0
11,Chocó,46530.0
8,Casanare,44640.0


In [12]:
# Mean Height / Region

# Unweighted mean of the town altitudes in each region
mean_height = df.groupby('Region')['Altitude'].mean()

top_heights = (
    mean_height.rename('Mean Height')
    .reset_index()
    .sort_values('Mean Height', ascending=False)
    .head(10)
)

top_heights

Unnamed: 0,Region,Mean Height
5,Boyacá,2178.325203
20,Nariño,1878.0625
12,Cundinamarca,1766.102564
6,Caldas,1546.407407
23,Quindío,1538.666667
24,Risaralda,1526.0
9,Cauca,1480.047619
25,Santander,1479.977011
1,Antioquia,1326.608
21,Norte de Santander,1179.325


In [13]:
df

Unnamed: 0,Town,Region,Country,Town Code,Website,Phone,Postal Code,Mayor,Population,Area,Altitude,LAT,LONG,URL,Inhabitants/Km2
0,El Encanto,Amazonas,Colombia,91263,,,913018,Mary Sofía Muñoz Romero,4376,11251.0,147,-1.67792,-73.2288,https://www.municipio.com.co/corregimiento-dep...,0.4
1,La Chorrera,Amazonas,Colombia,91405,,,914057,,3337,12517.0,152,-1.45,-72.583,https://www.municipio.com.co/corregimiento-dep...,0.3
2,Puerto Nariño,Amazonas,Colombia,91540,http://www.puertonarino-amazonas.gov.co,+5785223010,911010,Nelso Ruiz Ahue,6816,1475.0,96,-3.76286,-70.3745,https://www.municipio.com.co/municipio-puerto-...,4.6
3,Mirití-Paraná,Amazonas,Colombia,,,,,José Ávila Yucuna Matapi,1613,16864.0,116,-0.883461,-70.9835,https://www.municipio.com.co/corregimiento-dep...,0.1
4,Tarapacá,Amazonas,Colombia,91798,,,911030,Flor Ángela Martínez Bernardino,3100,9153.0,54,-2.867,-69.733,https://www.municipio.com.co/corregimiento-dep...,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,Páramo,Santander,Colombia,68533,http://paramo-santander.gov.co,+573234936623,683527,Filemón Solano Cala,3643,74.0,1342,6.417,-73.167,https://www.municipio.com.co/municipio-paramo....,49.2
1117,Contratación,Santander,Colombia,68211,http://www.contratacion-santander.gov.co,+5777271231,683071,MARTHA SUAREZ,3904,117.0,1655,6.29277,-73.4739,https://www.municipio.com.co/municipio-contrat...,33.4
1118,Chipatá,Santander,Colombia,68179,http://www.chipata-santander.gov.co,+5777565494,685557,Argemiro Angulo Diaz,4972,85.0,1793,6.06323,-73.6362,https://www.municipio.com.co/municipio-chipata...,58.5
1119,Güepsa,Santander,Colombia,68327,http://www.guepsa-santander.gov.co,+5772283051,685547,Osmar Arias,4200,27.0,1528,6.02525,-73.5732,https://www.municipio.com.co/municipio-guepsa....,155.6


# Exploratory Data Analysis

### Estimates of Location

In [64]:
# Mean
# Only numeric columns are visited, so no exception has to be swallowed
for column in df.select_dtypes(include='number').columns:
    print(column, df[column].mean())

Population 41548.82069580731
Area 1060.3360303300624
Altitude 1133.157894736842
Inhabitants/Km2 142.9750223015165


In [65]:
# Median
# Only numeric columns are visited, so no exception has to be swallowed
for column in df.select_dtypes(include='number').columns:
    print(column, df[column].median())

Population 11432.0
Area 287.0
Altitude 1090.0
Inhabitants/Km2 41.2


In [70]:
# Trimmed Mean
from scipy import stats

# Mean after cutting 10% of the observations from each end of the sorted data.
# Only numeric columns are visited, so no exception has to be swallowed
for column in df.select_dtypes(include='number').columns:
    print(column,stats.trim_mean(df[column],0.1))

Population 14629.91973244147
Area 431.4838350055741
Altitude 1072.3701226309922
Inhabitants/Km2 52.06700111482721


In [17]:
import numpy as np
# Weighted Mean
np.average(df['Altitude'],weights=df['Area'])

420.11676078247257

In [18]:
import weightedstats as ws
# Weighted Median
ws.weighted_median(df['Altitude'],weights=df['Area'])

165

### Estimates of Variability/Dispersion

In [31]:
# Mean Absolute Deviation

In [96]:
def mean_abs_dev(data):
    '''
    Return the mean absolute deviation of data around its arithmetic mean:
    mean(|mean(data) - x|) over every x in data.

    data: a pandas Series, NumPy array or plain sequence of numbers.
    '''
    # Local import keeps the function usable regardless of which cells ran before
    import numpy as np

    mad = np.mean(np.abs(np.mean(data) - data))
    return mad

# Only numeric columns are visited, so no exception has to be swallowed
for column in df.select_dtypes(include='number').columns:
    print(column,mean_abs_dev(df[column]))

Population 51044.25014463161
Area 1244.0621519113254
Altitude 811.9131414620405
Inhabitants/Km2 174.78157309844255


In [63]:
# Standard Deviation
# Only numeric columns are visited, so no exception has to be swallowed
for column in df.select_dtypes(include='number').columns:
    print(column,df[column].std())

Population 340801.64010157593
Area 3610.6440545684345
Altitude 928.1789183725723
Inhabitants/Km2 654.5834501262073


In [72]:
# Variance (computed as the square of the standard deviation)
# Only numeric columns are visited, so no exception has to be swallowed
for column in df.select_dtypes(include='number').columns:
    std = df[column].std()
    print(column, std ** 2)

Population 116145757895.92409
Area 13036750.488790384
Altitude 861516.1045112782
Inhabitants/Km2 428479.49317912885


In [84]:
# Interquartile Range

def IQR(data,qua1,qua2):
    '''
    Return the absolute distance between two percentiles of data.

    data: object with a quantile(q) method (e.g. a pandas Series).
    qua1, qua2: percentiles on a 0-100 scale; their order does not matter.
    '''
    # The builtin abs replaces the undefined name `absolute`
    iqr = abs(data.quantile(qua1/100) - data.quantile(qua2/100))
    return iqr

# Only numeric columns are visited, so no exception has to be swallowed
for column in df.select_dtypes(include='number').columns:
    print(column, IQR(df[column],75,25))

Population 16720.0
Area 585.0
Altitude 1670.0
Inhabitants/Km2 64.0


In [103]:
# Median Absolute Deviation From the Median
# NOTE(review): `sm` is not imported in the visible code - presumably `statsmodels.api`; confirm.
for column in df.columns:
    try:
        print(column,sm.robust.scale.mad(df[column]))
    except:
        # Any failure is skipped silently, including a NameError if `sm` is undefined
        pass

Population 9625.053602538368
Area 286.14222817158117
Altitude 1282.4509190073456
LAT 2.036917535960477
LONG 1.6797883135668403
Inhabitants/Km2 39.1406985685479


In [114]:
# Range
def Range(data):
    '''
    Return the spread of data: its maximum minus its minimum.

    data: object exposing max() and min() (e.g. a pandas Series).
    '''
    # No local named `range`, so the builtin is not shadowed
    return data.max() - data.min()

# Only numeric columns are visited, so no exception has to be swallowed
for column in df.select_dtypes(include='number').columns:
    print(column, Range(df[column]))

Population 10763228
Area 65659.0
Altitude 3416
Inhabitants/Km2 13718.5


In [119]:
# Percentile

# 75th percentile of every numeric column; non-numeric columns are not visited,
# so no exception has to be swallowed
for column in df.select_dtypes(include='number').columns:
    print(f'{column}: Percentil 75 = {df[column].quantile(0.75)}')

Population: Percentil 75 = 22878.0
Area: Percentil 75 = 718.0
Altitude: Percentil 75 = 1842.0
Inhabitants/Km2: Percentil 75 = 82.9


In [136]:
# Deciles (10% ... 100%) of every numeric column; the column name is printed
# after its table. Non-numeric columns are not visited, so no exception has
# to be swallowed
for column in df.select_dtypes(include='number').columns:
    print(df[column].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]))
    print(column)

0.1        3580.0
0.2        5281.0
0.3        7078.0
0.4        9090.0
0.5       11432.0
0.6       14736.0
0.7       19054.0
0.8       27443.0
0.9       48898.0
1.0    10763453.0
Name: Population, dtype: float64
Population
0.1       73.0
0.2      115.0
0.3      157.0
0.4      214.0
0.5      287.0
0.6      413.0
0.7      576.0
0.8      917.0
0.9     1739.0
1.0    65674.0
Name: Area, dtype: float64
Area
0.1      25.0
0.2     114.0
0.3     262.0
0.4     752.0
0.5    1090.0
0.6    1450.0
0.7    1711.0
0.8    2042.0
0.9    2542.0
1.0    3418.0
Name: Altitude, dtype: float64
Altitude
0.1        7.8
0.2       15.9
0.3       22.1
0.4       30.9
0.5       41.2
0.6       53.6
0.7       69.0
0.8       97.6
0.9      186.7
1.0    13718.5
Name: Inhabitants/Km2, dtype: float64
Inhabitants/Km2
