In [381]:
import pandas as pd
import requests
import pymongo
import requests

In [382]:
# Read the GDP per capita data from wikipedia
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28PPP%29_per_capita')

In [383]:
# get exact data from table
gdp_per_capita = tables[2]

In [384]:
# clean data and rename the columns 
gdp_per_capita_df = gdp_per_capita.loc[gdp_per_capita.loc[:,'Rank']!= '—']
gdp_per_capita_df.columns = ['rank', 'country', 'gdp_per_capita']
gdp_per_capita_df = gdp_per_capita_df.iloc[:,1:3]
gdp_per_capita_clean_df = gdp_per_capita_df.reset_index(drop=True)
gdp_per_capita_clean_df.head()

Unnamed: 0,country,gdp_per_capita
0,Qatar,138910.0
1,Luxembourg,112045.0
2,Singapore,105689.0
3,Ireland,86988.0
4,Brunei,85011.0


In [386]:
# Read the population from wikipedia
tables = pd.read_html('https://worldpopulationreview.com/countries')
tables[0]

Unnamed: 0,Rank,Country,2020 Population,2019 Population,Growth Rate,Area (km²),2018 Density
0,1,China,1439323776,1433783686,0.39%,9706961,148/km²
1,2,India,1380004385,1366417754,0.99%,3287590,416/km²
2,3,United States,331002651,329064917,0.59%,9372610,35/km²
3,4,Indonesia,273523615,270625568,1.07%,1904569,142/km²
4,5,Pakistan,220892340,216565318,2.00%,881912,246/km²
...,...,...,...,...,...,...,...
227,228,Montserrat,4992,4989,0.06%,102,49/km²
228,229,Falkland Islands,3480,3377,3.05%,12173,0/km²
229,230,Niue,1626,1615,0.68%,260,6/km²
230,231,Tokelau,1357,1340,1.27%,12,112/km²


In [387]:
# get exact data from table
country_population_df = tables[0]
# clean data and rename the columns 
country_population_df.columns = ['rank', 'country', 'population', 'population2', 'growthrate', 'countrysize', 'pop_den']
country_population_clean_df = country_population_df.loc[:,['country', 'population', 'growthrate', 'countrysize', 'pop_den']]
country_population_clean_df.loc[:,'pop_den']= country_population_clean_df.loc[:,'pop_den'].str[:-4]
country_population_clean_df.loc[:,'growthrate'] = country_population_clean_df.loc[:,'growthrate'].str[:-1]
country_population_clean_df.loc[:,'growthrate']=country_population_clean_df.loc[:,'growthrate'].astype(float)/100
country_population_clean_df['pop_den'] = country_population_clean_df['pop_den'].str.replace(',','')
country_population_clean_df.loc[:,'pop_den']=country_population_clean_df.loc[:,'pop_den'].astype(int)
country_population_clean_df



Unnamed: 0,country,population,growthrate,countrysize,pop_den
0,China,1439323776,0.0039,9706961,148
1,India,1380004385,0.0099,3287590,416
2,United States,331002651,0.0059,9372610,35
3,Indonesia,273523615,0.0107,1904569,142
4,Pakistan,220892340,0.0200,881912,246
...,...,...,...,...,...
227,Montserrat,4992,0.0006,102,49
228,Falkland Islands,3480,0.0305,12173,0
229,Niue,1626,0.0068,260,6
230,Tokelau,1357,0.0127,12,112


Unnamed: 0,country,population,growthrate,countrysize,pop_den
21,France,65273511,0.0022,551695,118


In [362]:
# Read the Happiness data from wikipedia
tables = pd.read_html('https://en.wikipedia.org/wiki/World_Happiness_Report#2019_World_Happiness_Report')

In [377]:
# get exact data from table
country_happiest_df = tables[4]
# clean data and rename the columns 
country_happiest_df = country_happiest.loc[:, ['Country or region','Score']]
country_happiest_df.columns = ['country', 'happiestScore']
country_happiest_df.loc[country_happiest_df.loc[:, "country"] == 'United States of America']
country_happiest_df.iloc[18,0] = 'United States'

Unnamed: 0,country,co2_emissions


In [378]:
# merging three dataframes 
populcation_data = pd.merge(gdp_per_capita_clean_df, country_population_clean_df, 
             how="left", 
             on = ["country","country"])
populcation_data = pd.merge(populcation_data, country_happiest_df, 
             how="left", 
             on = ["country","country"])
populcation_data.head()

Unnamed: 0,country,gdp_per_capita,co2_emissions,population,growthrate,countrysize,pop_den,happiestScore
0,Qatar,138910.0,97.787,2881053.0,0.0173,11586.0,244.0,6.374
1,Luxembourg,112045.0,9.54,625978.0,0.0166,2586.0,238.0,7.09
2,Singapore,105689.0,55.018,5850342.0,0.0079,710.0,8175.0,6.262
3,Ireland,86988.0,38.914,4937786.0,0.0113,70273.0,69.0,7.021
4,Brunei,85011.0,6.711,437479.0,0.0097,5765.0,75.0,


In [379]:
# drop empty field 
populcation_data = populcation_data.dropna()

In [380]:
# create json list
populcation_data_list = [];
for index, row in populcation_data.iterrows():
    populcation_dict = {
        "country":row["country"],
        "gdp_per_capita":row["gdp_per_capita"],
        "co2_emissions":row["co2_emissions"],
        "population":row["population"],
        "growthrate":row["growthrate"],
        "countrysize":row["countrysize"],
        "pop_den":row["pop_den"],
        "happiestScore":row["happiestScore"]
    }
    populcation_data_list.append(populcation_dict)
    
populcation_data_list    

[{'country': 'Qatar',
  'gdp_per_capita': 138910.0,
  'co2_emissions': 97.787,
  'population': 2881053.0,
  'growthrate': 0.0173,
  'countrysize': 11586.0,
  'pop_den': 244.0,
  'happiestScore': 6.374},
 {'country': 'Luxembourg',
  'gdp_per_capita': 112045.0,
  'co2_emissions': 9.54,
  'population': 625978.0,
  'growthrate': 0.0166,
  'countrysize': 2586.0,
  'pop_den': 238.0,
  'happiestScore': 7.09},
 {'country': 'Singapore',
  'gdp_per_capita': 105689.0,
  'co2_emissions': 55.018,
  'population': 5850342.0,
  'growthrate': 0.0079,
  'countrysize': 710.0,
  'pop_den': 8175.0,
  'happiestScore': 6.2620000000000005},
 {'country': 'Ireland',
  'gdp_per_capita': 86988.0,
  'co2_emissions': 38.914,
  'population': 4937786.0,
  'growthrate': 0.0113,
  'countrysize': 70273.0,
  'pop_den': 69.0,
  'happiestScore': 7.021},
 {'country': 'Norway',
  'gdp_per_capita': 79638.0,
  'co2_emissions': 52.492,
  'population': 5421241.0,
  'growthrate': 0.0079,
  'countrysize': 323802.0,
  'pop_den': 17

In [326]:
# The default port used by MongoDB is 27017
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
client = pymongo.MongoClient('mongodb://localhost:27017')
#define the database name
mydb = client["populcation_data_DB"]
#define the collection name
mycol = mydb["populcation_data"] 
mycol.insert_many(populcation_data_list)

<pymongo.results.InsertManyResult at 0x2137a7c5688>

Unnamed: 0,country,gdp_per_capita,co2_emissions,population,growthrate,countrysize,pop_den,happiestScore
