# Data Cleaning and Plots - Data scraped from Wikipedia

In [11]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from requests.exceptions import HTTPError
import time
import numpy as np
from random import randint

## Scraping data

In [21]:
# Scrape the Wikipedia category "Naturais de Minas Gerais", which is split
# across the three listing pages below, and store one entry per subcategory
# (key: subcategory link text, value: the text of its counter span) in
# citizens.csv.
list_of_urls = ['https://pt.wikipedia.org/wiki/Categoria:Naturais_de_Minas_Gerais',
                'https://pt.wikipedia.org/w/index.php?title=Categoria:Naturais_de_Minas_Gerais&subcatfrom=Itamonte%0ANaturais+de+Itamonte#mw-subcategories',
                'https://pt.wikipedia.org/w/index.php?title=Categoria:Naturais_de_Minas_Gerais&subcatfrom=Sao+Pedro+Uniao%0ANaturais+de+S%C3%A3o+Pedro+da+Uni%C3%A3o#mw-subcategories']
citizens = []        # one list of counter <span> tags per successfully scraped page
city_name = []       # one list of CategoryTreeItem <div> tags per successfully scraped page
flat_citizens = []   # all counter tags, flattened across pages
flat_city_name = []  # all CategoryTreeItem tags, flattened across pages
for url in list_of_urls:
    try:
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(url, timeout=30)
        # Without this call a 4xx/5xx answer would be parsed as if it were data
        # and the HTTPError handler below could never trigger.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        page_citizens = soup.select("div > span:nth-of-type(2)")
        page_city_name = soup.find_all('div', class_='CategoryTreeItem')
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    else:
        # Store the page's own results directly rather than indexing the
        # per-page lists by URL position: a failed page would otherwise shift
        # every later lookup.
        citizens.append(page_citizens)
        city_name.append(page_city_name)
        flat_citizens.extend(page_citizens)
        flat_city_name.extend(page_city_name)
        # Random pause between requests to avoid hammering the server.
        time.sleep(randint(3, 10))
if len(flat_city_name) != len(flat_citizens):
    # The two selectors are expected to match one-to-one; a mismatch means the
    # pairing below may be misaligned and the output should be inspected.
    print(f'Warning: {len(flat_city_name)} category items but '
          f'{len(flat_citizens)} counter spans were found')
cities = pd.Series(
    {item.a.text: counter.text for item, counter in zip(flat_city_name, flat_citizens)})
# This subcategory groups people by occupation, not by city.
cities = cities.drop("Naturais de Minas Gerais por ocupação")
cities.to_csv('citizens.csv')

## Reading the CSV with the scraped data

In [23]:
# Load the scraped data back from disk and give the two columns written by
# Series.to_csv (the unnamed index and the "0" value column) descriptive names.
column_names = {"Unnamed: 0": "NAME", "0": "PESSOAS NA WIKIPEDIA"}
cities = pd.read_csv("citizens.csv").rename(columns=column_names)
cities.head(11)

Unnamed: 0,NAME,PESSOAS NA WIKIPEDIA
0,Naturais de Abadia dos Dourados,(4 P)
1,Naturais de Abaeté,(11 P)
2,Naturais de Abre-Campo,(2 P)
3,Naturais de Acaiaca,(2 P)
4,Naturais de Açucena (Minas Gerais),(1 P)
5,Naturais de Água Boa (Minas Gerais),(2 P)
6,Naturais de Aimorés (Minas Gerais),(12 P)
7,Naturais de Aiuruoca,(6 P)
8,Naturais de Além Paraíba,(10 P)
9,Naturais de Alfenas,(18 P)


## Removing unwanted characters using regex

In [14]:
# Strip the scraping boilerplate from both columns.
# Raw strings are used for the patterns: "\(" and "\)" are not valid escape
# sequences in a normal string literal and trigger a warning on recent Python
# versions.
# NAME: drop the "Naturais de" prefix and any trailing parenthesised
# disambiguation such as "(Minas Gerais)", then trim the leftover whitespace.
cities["NAME"] = cities["NAME"].replace(to_replace=r'\(.*|Naturais de', value='', regex=True).str.strip()
# PESSOAS NA WIKIPEDIA: "(4 P)" -> "4 "; the "vazia)" alternative handles the
# "(vazia)" marker, which is left as an empty string.
cities["PESSOAS NA WIKIPEDIA"] = cities["PESSOAS NA WIKIPEDIA"].replace(to_replace=r'\(|P\)|vazia\)', value='', regex=True)
cities.head(10)

Unnamed: 0,NAME,PESSOAS NA WIKIPEDIA
0,Abadia dos Dourados,4
1,Abaeté,11
2,Abre-Campo,2
3,Acaiaca,2
4,Açucena,1
5,Água Boa,2
6,Aimorés,12
7,Aiuruoca,6
8,Além Paraíba,10
9,Alfenas,18


## Eliminating wrong values

In [15]:
# Discard three rows by index label, then store the counter column as integers.
# NOTE(review): presumably these are the rows whose counter could not be
# cleaned into a number - confirm against citizens.csv.
cities = cities.drop(index=[303, 152, 439])
cities = cities.astype({"PESSOAS NA WIKIPEDIA": "int64"})
cities.head()

Unnamed: 0,NAME,PESSOAS NA WIKIPEDIA
0,Abadia dos Dourados,4
1,Abaeté,11
2,Abre-Campo,2
3,Acaiaca,2
4,Açucena,1


## Reading the CSV of cities with their population
### Filtering and applying some changes

In [16]:
# Load the population table, keep only the rows for the state of MG and drop
# the two columns that are not needed for the merge.
cities_pop = (
    pd.read_csv("sorted_cities_pop.csv")
    .loc[lambda frame: frame["STATE"] == "MG"]
    .drop(columns=["Unnamed: 0", "DISTANCE POP"])
)
# Keep only the part of each name that comes before the first "/".
cities_pop["NAME"] = cities_pop["NAME"].map(lambda full_name: full_name.split("/")[0])
cities_pop.head()

Unnamed: 0,STATE,NAME,POPULAÇÃO ESTIMADA
2114,MG,Belo Horizonte,2502557
2115,MG,Contagem,648766
2116,MG,Betim,417307
2117,MG,Ribeirão das Neves,322659
2118,MG,Santa Luzia,216254


## Correcting wrong names

In [17]:
# Scraped city names that have no exact match in the population table.
unmatched = ~cities['NAME'].isin(cities_pop['NAME'])
wrong = cities[unmatched]
wrong_list = wrong["NAME"].tolist()
print(wrong_list)
# Explicit Wikipedia-spelling -> population-table-spelling mapping. Pairing the
# names by key instead of by list position keeps the fix correct even if the
# scraped data changes order or gains new mismatches.
name_fixes = {
    "Abre-Campo": "Abre Campo",
    "Amparo da Serra": "Amparo do Serra",
    "Bocaiuva": "Bocaiúva",
    "Brazópolis": "Brasópolis",
    "Galileia": "Galiléia",
    "Passa-Quatro": "Passa Quatro",
    "São João del-Rei": "São João del Rei",
    "São Tomé das Letras": "São Thomé das Letras",
}
# Kept so that any code still referring to the old list name keeps working.
correct_list = list(name_fixes.values())
# Surface mismatches that have no known correction instead of failing or
# silently renaming the wrong city.
missing_fixes = [name for name in wrong_list if name not in name_fixes]
if missing_fixes:
    print(f'No correction defined for: {missing_fixes}')
# Only rows that failed to match are rewritten.
cities.loc[unmatched, "NAME"] = cities.loc[unmatched, "NAME"].replace(name_fixes)


['Abre-Campo', 'Amparo da Serra', 'Bocaiuva', 'Brazópolis', 'Galileia', 'Passa-Quatro', 'São João del-Rei', 'São Tomé das Letras']


## Merging the DataFrames using an outer join

In [18]:
# Combine the Wikipedia counts with the population table. No join key is
# given, so pandas joins on the columns the two frames have in common; the
# outer join keeps rows that appear in only one of the two frames.
cities = pd.merge(cities, cities_pop, how="outer")
cities.head()

Unnamed: 0,NAME,PESSOAS NA WIKIPEDIA,STATE,POPULAÇÃO ESTIMADA
0,Abadia dos Dourados,4.0,MG,7015
1,Abaeté,11.0,MG,23535
2,Abre Campo,2.0,MG,13719
3,Acaiaca,2.0,MG,4056
4,Açucena,1.0,MG,10140


## Final fixes to the DataFrame and export to CSV

In [19]:
# Rows without a Wikipedia count (NaN after the outer join) get a count of 0;
# the STATE column is dropped before exporting.
cities = cities.fillna({"PESSOAS NA WIKIPEDIA": 0}).drop(columns=["STATE"])
# Export an independent copy so later changes to `cities` do not affect it.
cities_with_famous = cities.copy()
cities_with_famous.to_csv("merged_cities_famous.csv")