In [1]:
import numpy as np
import pandas as pd
import requests
import lxml
from bs4 import BeautifulSoup

# Web Scraping

- **Spletna stran:** https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population
- **Tabela:** Sovereign states and dependencies by population        

### Naloga

Tabelo na zgornji strani pretvorite v urejen pandas dataframe, ki vsebuje naslednje stolpce (pozor na ustrezen tip in index):
- Rank: (Index) - int
- Country name: - object
- Population - int
- Date - Datetime
- % of world population - int


In [89]:
wp = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population', skiprows=1)

In [90]:
tabela = wp[0]
tabela = tabela.drop(tabela.columns[-1], axis=1)
tabela.columns = ['Rank', 'Country name', 'Population', 'Date', '% of world population']

In [91]:
tabela.head()

Unnamed: 0,Rank,Country name,Population,Date,% of world population
0,1,China[Note 2],1397540000,"May 29, 2019",18.1%
1,2,India[Note 3],1347800000,"May 29, 2019",17.5%
2,3,United States[Note 4],329283000,"May 29, 2019",4.27%
3,4,Indonesia,268074600,"July 1, 2019",3.48%
4,5,Brazil,209968000,"May 29, 2019",2.72%


In [92]:
#Stolpec z datumi pretovorim v datetime
tabela['Date'] = pd.to_datetime(tabela['Date'])

In [93]:
#zadnji stolpec, kjer so %, pretvorim v int
#ker je treba dati v int (ne float), se stevilo zaokrozi
#ce bi pustili kot float, bi bilo dovolj samo to_numeric, brez .astype(int)
#ker je int, je veliko stevil enakih 0
tabela['% of world population'] = pd.to_numeric(tabela['% of world population'].map(lambda x: x.rstrip('%'))).astype(int)

In [96]:
#pretvorim se stolpec Rank v Int
#najprej pretvorim s to_numeric, kjer hkrati '-' zamenjam z NaN
#zdi se mi smiselno, da vrstice, kjer je NaN napolnim z vrednostmi, zato uporabim ffill, ki jim dodeli pravo mesto
#glede na lestvico
tabela['Rank'] = pd.to_numeric(tabela['Rank'], errors='coerce').fillna(method='ffill').astype(int)

In [105]:
#pobrisem se dopise pri drzavah, ki so v obliki [Note#]
tabela['Country name'] = tabela['Country name'].apply(lambda x: x.split('[')[0])

In [107]:
tabela.head(10)

Unnamed: 0,Rank,Country name,Population,Date,% of world population
0,1,China,1397540000,2019-05-29,18
1,2,India,1347800000,2019-05-29,17
2,3,United States,329283000,2019-05-29,4
3,4,Indonesia,268074600,2019-07-01,3
4,5,Brazil,209968000,2019-05-29,2
5,6,Pakistan,204820000,2019-05-29,2
6,7,Nigeria,200962417,2019-07-01,2
7,8,Bangladesh,166633000,2019-05-29,2
8,9,Russia,146793744,2019-01-01,1
9,10,Mexico,126577691,2019-07-01,1


In [109]:
tabela.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 5 columns):
Rank                     240 non-null int32
Country name             240 non-null object
Population               240 non-null int64
Date                     240 non-null datetime64[ns]
% of world population    240 non-null int32
dtypes: datetime64[ns](1), int32(2), int64(1), object(1)
memory usage: 6.6+ KB


In [110]:
#za konec se stolpec 'Rank' nastavim za indeks
#sicer niso unikatne vrednosti, ampak tak je polozaj drzav na lestvici
#ce bi zelel nastaviti narascajoci unikaten indeks, bi lahko uporabil tabela.reset_index() in tabela.index +=1, da bi se
#indeks zacel z 1
tabela.set_index('Rank', inplace=True)