# MATPLOTLIB and SEABORN
- Link [Matplotlib](https://matplotlib.org/)
- Link [Seaborn](https://seaborn.pydata.org/#)

Vamos proceder a visualizacao de dados e a uma rapida analise dos mesmos. Para isso, vamos introduzir as bibliotecas `Matplotlib` e `Seaborn`, usando-as para visualizar dados presentes na plataforma [BAROMETRO MUNDIAL - WORLDOMETERS](https://www.worldometers.info/).



| Alias | Description |
|-|-|
| B | business day frequency |
| C | custom business day frequency |
| D | calendar day frequency |
| W | weekly frequency |
| M | month end frequency |
| SM | semi-month end frequency (15th and end of month) |
| BM | business month end frequency |
| CBM | custom business month end frequency |
| MS | month start frequency |
| SMS | semi-month start frequency (1st and 15th) |
| BMS | business month start frequency |
| CBMS | custom business month start frequency |
| Q | quarter end frequency |
| BQ | business quarter end frequency |
| QS | quarter start frequency |
| BQS | business quarter start frequency |
| A, Y | year end frequency |
| BA, BY | business year end frequency |
| AS, YS | year start frequency |
| BAS, BYS | business year start frequency |
| BH | business hour frequency |
| H | hourly frequency |
| T, min | minutely frequency |
| S | secondly frequency |
| L, ms | milliseconds |
| U, us | microseconds |
| N | nanoseconds |


## 1. Importacao das bibliotecas

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [1]:
# PARA VISUALIZACAO DE DADOS
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd

# ficheiro para captura de dados - WORLDOMETERS
from populacao_barometro_mundial import EvolucaoPopulacionalBM





# Country names of the CPLP (Community of Portuguese Language Countries).
# They are used further down as row labels to select from `paises_df`.
paises_cplp = [
    'Angola',
    'Sao Tome & Principe',
    'Cabo Verde',
    'Mozambique',
    'Guinea-Bissau',
    'Portugal',
    'Brazil',
]

## 2. Leitura de dados para extracao de urls (de paises da CPLP)
O ficheiro de dados a seguir foi criado com codigo cuja explicacao foi apresentada atraves de video.


In [2]:
# Load the CSV with population data for every country (and some
# dependencies), then use the country-name column as the row index.
paises_df = pd.read_csv('../a1_webscraping/Population (2020).csv').set_index('paises')
paises_df.head()

Unnamed: 0_level_0,regioes,continentes,url,Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
paises,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
India,Southern Asia,Asia,https://www.worldometers.info/world-population...,1380004000.0,0.99,13586631.0,464.0,2973190.0,-532687.0,2.2,28.0,35.0,17.7
Pakistan,Southern Asia,Asia,https://www.worldometers.info/world-population...,220892300.0,2.0,4327022.0,287.0,770880.0,-233379.0,3.6,23.0,35.0,2.83
Bangladesh,Southern Asia,Asia,https://www.worldometers.info/world-population...,164689400.0,1.01,1643222.0,1265.0,130170.0,-369501.0,2.1,28.0,39.0,2.11
Iran,Southern Asia,Asia,https://www.worldometers.info/world-population...,83992950.0,1.3,1079043.0,52.0,1628550.0,-55000.0,2.2,32.0,76.0,1.08
Afghanistan,Southern Asia,Asia,https://www.worldometers.info/world-population...,38928350.0,2.33,886592.0,60.0,652860.0,-62920.0,4.6,18.0,25.0,0.5


In [10]:
class EvolucaoPopulacionalBM:
    """
    Scrape, clean and organise the per-country population-history tables
    published by Worldometers.

    Parameters
    ----------
    urls : str or iterable of str
        A single country page URL, or a collection of them (list, tuple,
        pandas Series, ...).

    Methods
    -------
    - extrair_dados: download one page and return its table as a DataFrame
    - converter_p_numerico: lenient text -> float conversion (NaN on failure)
    - dados_paises: download every URL and stack the tables
    - para_numerico: strict text -> int conversion (digits only)
    - evo_populacao: year x country table of population values
    """

    def __init__(self, urls):
        self.urls = urls

    # HTML data extraction
    def extrair_dados(self, url):
        """
        Download `url` and return the first table found inside a
        ``<div class="table-responsive">`` as a DataFrame of raw strings.

        A ``Pais`` column is appended holding the last path segment of the
        URL (the text after the final ``/``), which identifies the country.

        Raises
        ------
        ValueError
            If the page does not contain the expected table.
        """
        # A timeout prevents a stalled connection from hanging the notebook.
        resposta = requests.get(url, timeout=30)
        contentor = BeautifulSoup(resposta.text, 'html.parser').find(
            'div', {'class': 'table-responsive'}
        )
        table = contentor.find('table') if contentor is not None else None
        if table is None:
            raise ValueError(f'Tabela de dados nao encontrada em {url}')

        # Column names come from the <th> cells of the first row.
        cabecalho = [entrada.text for entrada in table.find('tr').find_all('th')]

        # One list of cell texts per body row.
        dados = [
            [valor.text for valor in linha.find_all('td')]
            for linha in table.find('tbody').find_all('tr')
        ]

        pais_df = pd.DataFrame(dados, columns=cabecalho)
        pais_df['Pais'] = url.replace('/', ' ').split()[-1]

        return pais_df

    # DATA CLEANING
    def converter_p_numerico(self, string):
        """
        Convert a formatted cell such as ``'1,234.5 %'`` to ``float``.

        Only decimal digits, ``.`` and ``-`` are kept. If what remains is
        not a valid number (empty, ``'1.2.3'``, ``'-'``, ...) ``np.nan`` is
        returned instead of raising.
        """
        # isdecimal() (not isnumeric()) so that characters such as a
        # superscript two, which float() rejects, are filtered out.
        resultado = ''.join(
            digito for digito in string if digito.isdecimal() or digito in '.-'
        )

        try:
            return float(resultado)
        except ValueError:
            return np.nan

    # Stacking of the per-country tables
    def dados_paises(self):
        """
        Download the table of every URL in ``self.urls``.

        ``self.urls`` may be a single URL string or a collection of URLs.
        Returns one DataFrame with all country tables stacked, or ``None``
        (after printing a hint and the underlying error) when the download
        or parsing fails.
        """
        if isinstance(self.urls, str):
            try:
                return self.extrair_dados(self.urls)
            except Exception as erro:
                print('Verifique a url (website link de Worldometers)')
                print(f'Causa: {type(erro).__name__}: {erro}')
                return None

        try:
            # Materialise as a list so positional access works regardless of
            # the container's own index (e.g. a label-indexed Series).
            urls = list(self.urls)

            tabelas = [self.extrair_dados(urls[0])]

            # Progress line for every URL after the first.
            for url in urls[1:]:
                print(f"{url[:-12].split('/')[-1]} \t {url}".expandtabs(20))
                tabelas.append(self.extrair_dados(url))

            # Single concat instead of re-concatenating on every iteration.
            return pd.concat(tabelas)

        except Exception as erro:
            print('Verifique o conjunto de urls (Worldometers)')
            print(f'Causa: {type(erro).__name__}: {erro}')
            return None

    def para_numerico(self, obj):
        """
        Return the decimal digits found in `obj` joined into an ``int``
        (``'1,380,004'`` -> ``1380004``).

        Raises
        ------
        ValueError
            If `obj` contains no decimal digit.
        """
        # isdecimal() keeps exactly the characters int() can parse.
        digitos = ''.join(car for car in obj if car.isdecimal())
        if not digitos:
            raise ValueError(f'Sem digitos para converter em {obj!r}')
        return int(digitos)

    def evo_populacao(self):
        """
        Build the population-evolution table for the configured URLs.

        Returns a DataFrame indexed by ``Year`` (datetime, ascending) with
        one column per country. The column name is the ``Pais`` value with
        its ``-population`` suffix removed, and the values are the
        ``Population`` cells converted with `para_numerico`.

        Raises
        ------
        ValueError
            If `dados_paises` could not retrieve any data.
        """
        df = self.dados_paises()
        if df is None:
            raise ValueError('Nao foi possivel obter dados para as urls fornecidas')

        sufixo = '-population'
        colunas = {}

        for pais in df['Pais'].unique():
            # Exact match: a substring test would make 'guinea' also pick
            # up the rows of 'guinea-bissau'.
            linhas = df[df['Pais'] == pais]
            nome_pais = pais[:-len(sufixo)] if pais.endswith(sufixo) else pais

            # Index each series by its own years so countries are aligned
            # by year rather than by row position.
            colunas[nome_pais] = pd.Series(
                [self.para_numerico(valor) for valor in linhas['Population']],
                index=pd.to_datetime(linhas['Year'].to_numpy()),
            )

        evo_populacional = pd.DataFrame(colunas).sort_index()
        evo_populacional.index.name = 'Year'

        return evo_populacional


In [12]:
# Scraper for the first three African countries listed in `paises_df`.
url_africa = EvolucaoPopulacionalBM(
    paises_df.loc[paises_df['continentes'] == 'Africa', 'url'].iloc[:3]
)
url_africa.dados_paises()

Verifique o conjunto de urls (Worldometers)


In [5]:
# Bare expression: in a notebook cell this shows the object's repr as output.
url_africa

## 3. Captura de dados para analise/visualizacao

In [None]:
# Rows of `paises_df` for the CPLP countries, in the order of `paises_cplp`.
lista_de_paises = paises_df.loc[paises_cplp, :]
lista_de_paises

In [None]:
# Worldometers page URL of each CPLP country.
urls_cplp = lista_de_paises['url']

### Captura de dados

In [None]:
# Scrape the CPLP pages and build the year x country population table.
b_m = EvolucaoPopulacionalBM(urls=urls_cplp)
evop_cplp = b_m.evo_populacao()
evop_cplp

# 4. Visualizacao
### Grafico Linear

In [None]:
# Line chart of the `angola` column on a 15x8 inch figure.
plt.figure(figsize=(15, 8))
plt.plot(evop_cplp['angola'])

In [None]:
# One line per country, all on the same axes.
evop_cplp.plot.line(figsize=(15, 8))

#### Total de crescimento anual

In [None]:
# Row-to-row change of each column (the first row has no predecessor).
evop_dif = evop_cplp.diff(periods=1)

### Histograma

In [None]:
# Histogram of the per-period change for Sao Tome and Principe.
plt.hist(evop_dif.loc[:, 'sao-tome-and-principe'], edgecolor='black')

### Barras

In [None]:
# Bar chart of the `cabo-verde` column, one bar per year of the index.
plt.figure(figsize=(15, 8))
sns.barplot(
    x=evop_cplp.index.year,
    y=evop_cplp['cabo-verde'],
    palette=['green', 'yellow', 'red'],
)
plt.title('CV EVOP [1955 - 2020]')


### BoxPlot

In [None]:
# Box plot of the per-period change for Angola.
plt.figure(figsize=(15, 8))
# Pass the vector by keyword: seaborn 0.11 deprecated positional data
# vectors, and from 0.12 the only positional argument is `data`.
sns.boxplot(x=evop_dif['angola'])

### Violino

In [None]:
# Violin plot of the per-period change for Angola.
plt.figure(figsize=(15, 8))
# Pass the vector by keyword: seaborn 0.11 deprecated positional data
# vectors, and from 0.12 the only positional argument is `data`.
sns.violinplot(x=evop_dif['angola'])

### Heatmap
#### Correlacao

In [None]:
# Pairwise Pearson correlation between the country columns.
evop_corr = evop_cplp.corr(method='pearson')

In [None]:
# Annotated heatmap of the correlation matrix with square cells.
plt.figure(figsize=(15, 8))
sns.heatmap(
    data=evop_corr,
    annot=True,
    square=True,
)

# 5. Analise
### Valores absolutos

In [None]:
# Absolute population values, one line per country.
evop_cplp.plot.line(figsize=(15, 8))

## Valores relativos

In [None]:
# Express every column relative to its own value on 1955-01-01, so each
# series equals 1.0 on that date.
evop_racio = evop_cplp.div(evop_cplp.loc['1955-01-01'], axis='columns')

In [None]:
# Bare expression: in a notebook cell this shows the ratio table as output.
evop_racio

In [None]:
# Relative growth of each country, one line per column.
evop_racio.plot.line(figsize=(15, 8))

## Dados normalizados na escala [0, 1]

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale every column independently to the [0, 1] range.
escala = MinMaxScaler(feature_range=(0, 1))
evop_escalada = escala.fit_transform(X=evop_cplp)

In [None]:
# Bare expression: in a notebook cell this shows the scaler output
# (the return value of fit_transform) as the cell result.
evop_escalada

In [None]:
# Wrap the scaled values in a DataFrame that reuses the original year
# index and country column labels.
evop_escalada = pd.DataFrame(
    data=evop_escalada,
    index=evop_cplp.index,
    columns=evop_cplp.columns,
)
evop_escalada

In [None]:
# Scaled series, one line per country.
evop_escalada.plot.line(figsize=(15, 8))

In [None]:
# Line chart of the `portugal` column on its own.
evop_cplp.portugal.plot.line(figsize=(15, 8))