This notebook scrapes Wikipedia's "List of cities in China by population" page and saves each listed city's name, province, and urban population to a CSV file.

In [1]:
#lets import whatever is required
import re

import lxml.html as lh
import numpy as np
import pandas as pd
import requests

In [2]:
# Download the Wikipedia page that lists Chinese cities by population.
url = "https://en.wikipedia.org/wiki/List_of_cities_in_China_by_population"
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
# Parse the raw response body into an lxml HTML tree.
doc = lh.fromstring(page.content)
# Collect every <tr> (table row) element found anywhere in the document.
tr_elements = doc.xpath('//tr')

In [3]:
# Keep only the table rows that have exactly 4 child cells; every other
# <tr> on the page is ignored.
elements4 = list(filter(lambda row: len(row) == 4, tr_elements))
# Counter left over from the original notebook; no visible cell reads it.
i = 0
# Per the original author's note, the first 4-cell row is the table heading.
# Only the city name, its province and its urban population are kept.
col = ['Name', 'Province', 'Urban Population']
# Start from an empty frame that already carries the three target columns.
chinaCitiesAndPopulation = pd.DataFrame(columns=col)
chinaCitiesAndPopulation

Unnamed: 0,Name,Province,Urban Population


In [4]:
# Fill the frame from the scraped rows.
# All records are collected first and the DataFrame is built once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration. Rebuilding from scratch also means re-running
# this cell does not duplicate rows.
def _first_line(cell):
    """Return the text content of an lxml cell up to its first newline."""
    return cell.text_content().split('\n')[0]


# elements4[0] is skipped (the heading row, per the earlier cell's comment).
# Cell 0 of each row is not used; cells 1-3 hold name, province and population.
records = [
    {
        'Name': _first_line(row[1]),
        'Province': _first_line(row[2]),
        'Urban Population': _first_line(row[3]),
    }
    for row in elements4[1:]
]
# columns=col keeps the column order (and the columns themselves) even if
# no rows were scraped.
chinaCitiesAndPopulation = pd.DataFrame(records, columns=col)
chinaCitiesAndPopulation

Unnamed: 0,Name,Province,Urban Population
0,Shanghai#,—,26317104
1,Beijing⍟#,—,21542000
2,Guangzhou*#,Guangdong,14904400
3,Shenzhen†,Guangdong,10358381
4,Tianjin#,—,9583277
5,Wuhan*#,Hubei,7541527
6,Dongguan,Guangdong,7271322
7,Chengdu*#,Sichuan,7112045
8,Foshan,Guangdong,6771895
9,Chongqing#,—,6263790


In [5]:
# Clean up the scraped text: strip the marker characters (⍟ # † *) and commas
# (presumably thousands separators) from the name and population columns,
# then convert the population to integers.
regex = re.compile(r"[,⍟#†*]")
chinaCitiesAndPopulation['Name'] = (
    chinaCitiesAndPopulation['Name'].str.replace(regex, '', regex=True)
)
# astype(str) first so that re-running this cell, when the column is already
# int, does not raise a TypeError inside the regex substitution.
chinaCitiesAndPopulation['Urban Population'] = (
    chinaCitiesAndPopulation['Urban Population']
    .astype(str)
    .str.replace(regex, '', regex=True)
    .astype(int)
)
chinaCitiesAndPopulation

Unnamed: 0,Name,Province,Urban Population
0,Shanghai,—,26317104
1,Beijing,—,21542000
2,Guangzhou,Guangdong,14904400
3,Shenzhen,Guangdong,10358381
4,Tianjin,—,9583277
5,Wuhan,Hubei,7541527
6,Dongguan,Guangdong,7271322
7,Chengdu,Sichuan,7112045
8,Foshan,Guangdong,6771895
9,Chongqing,—,6263790


In [6]:
# Save the cleaned table. index=False leaves the row index out of the file so
# the CSV holds only the three data columns. (The original `index=` had no
# value, which is a SyntaxError.)
chinaCitiesAndPopulation.to_csv('chinaCitiesAndPopulation.csv', index=False)