# World Electricity Production Data

This notebook has been used to extract API data from the world bank.

API electricity production data by source, country and year.  The electricity sources include:
* Electricity production from oil, gas and coal sources
* Renewable electricity output excluding Hydro
* Hydro Electricity production
* Nuclear electricity production
* Natrual Gas sources

https://datacatalog.worldbank.org/search?search_api_views_fulltext_op=AND&query=electricity&nid=&sort_by=search_api_relevance&q=search&page=0%2C0

In [1]:
# Python Dependencies
import requests
import json
import pandas as pd
import pymongo
# import numpy as np
# import matplotlib.pyplot as plt1

## Electricity production from oil, gas and coal sources

In [2]:
url = "https://api.worldbank.org/v2/en/country/all/indicator/EG.ELC.FOSL.ZS?format=json&per_page=20000&source=2"

# Get coal data
coal_response = requests.get(url)
coal_json = coal_response.json()


In [3]:
print(coal_json[1][1])

{'indicator': {'id': 'EG.ELC.FOSL.ZS', 'value': 'Electricity production from oil, gas and coal sources (% of total)'}, 'country': {'id': '1A', 'value': 'Arab World'}, 'countryiso3code': 'ARB', 'date': '2019', 'value': None, 'unit': '', 'obs_status': '', 'decimal': 1}


In [4]:
# Extract the header detail and push to the production type list and data frame
production_type = [{'id' : coal_json[1][1]['indicator']['id'], 'value' : coal_json[1][1]['indicator']['value']}]
print(production_type)

[{'id': 'EG.ELC.FOSL.ZS', 'value': 'Electricity production from oil, gas and coal sources (% of total)'}]


In [5]:
# useful for checking man page with options for json_normalize
pd.json_normalize?

[1;31mSignature:[0m
[0mpd[0m[1;33m.[0m[0mjson_normalize[0m[1;33m([0m[1;33m
[0m    [0mdata[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mDict[0m[1;33m,[0m [0mList[0m[1;33m[[0m[0mDict[0m[1;33m][0m[1;33m][0m[1;33m,[0m[1;33m
[0m    [0mrecord_path[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mList[0m[1;33m,[0m [0mNoneType[0m[1;33m][0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmeta[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mList[0m[1;33m[[0m[0mUnion[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mList[0m[1;33m[[0m[0mstr[0m[1;33m][0m[1;33m][0m[1;33m][0m[1;33m,[0m [0mNoneType[0m[1;33m][0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmeta_prefix[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mNoneType[0m[1;33m][0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrecord_prefix[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mstr[0m

In [6]:
coal_df = pd.json_normalize(coal_json[1])
coal_df.head()

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
0,ARB,2020,,,,1,EG.ELC.FOSL.ZS,"Electricity production from oil, gas and coal ...",1A,Arab World
1,ARB,2019,,,,1,EG.ELC.FOSL.ZS,"Electricity production from oil, gas and coal ...",1A,Arab World
2,ARB,2018,,,,1,EG.ELC.FOSL.ZS,"Electricity production from oil, gas and coal ...",1A,Arab World
3,ARB,2017,,,,1,EG.ELC.FOSL.ZS,"Electricity production from oil, gas and coal ...",1A,Arab World
4,ARB,2016,,,,1,EG.ELC.FOSL.ZS,"Electricity production from oil, gas and coal ...",1A,Arab World


## Renewables excluding Hydroelectric

In [7]:
# Excluding Hydro
url = "https://api.worldbank.org/v2/en/country/all/indicator/EG.ELC.RNWX.ZS?format=json&per_page=20000&source=2"
# Including Hydro
# url = "https://api.worldbank.org/v2/en/country/all/indicator/EG.ELC.RNEW.ZS?format=json&per_page=20000&source=2"

renewable_response = requests.get(url)
renewable_json = renewable_response.json()

In [8]:
print(renewable_json[1][1])

{'indicator': {'id': 'EG.ELC.RNWX.ZS', 'value': 'Electricity production from renewable sources, excluding hydroelectric (% of total)'}, 'country': {'id': '1A', 'value': 'Arab World'}, 'countryiso3code': 'ARB', 'date': '2019', 'value': None, 'unit': '', 'obs_status': '', 'decimal': 1}


In [9]:
# Extract the header detail and push to the production type list and data frame
production_type.append({'id' : renewable_json[1][1]['indicator']['id'], 'value' : renewable_json[1][1]['indicator']['value']})
print(production_type)

[{'id': 'EG.ELC.FOSL.ZS', 'value': 'Electricity production from oil, gas and coal sources (% of total)'}, {'id': 'EG.ELC.RNWX.ZS', 'value': 'Electricity production from renewable sources, excluding hydroelectric (% of total)'}]


In [10]:
renew_df = pd.json_normalize(renewable_json[1])
renew_df.head(10)

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
0,ARB,2020,,,,1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,...",1A,Arab World
1,ARB,2019,,,,1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,...",1A,Arab World
2,ARB,2018,,,,1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,...",1A,Arab World
3,ARB,2017,,,,1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,...",1A,Arab World
4,ARB,2016,,,,1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,...",1A,Arab World
5,ARB,2015,0.454259,,,1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,...",1A,Arab World
6,ARB,2014,0.418328,,,1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,...",1A,Arab World
7,ARB,2013,0.355385,,,1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,...",1A,Arab World
8,ARB,2012,0.253884,,,1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,...",1A,Arab World
9,ARB,2011,0.282034,,,1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,...",1A,Arab World


## Electricity production from nuclear sources

In [11]:
url = "https://api.worldbank.org/v2/en/country/all/indicator/EG.ELC.NUCL.ZS?format=json&per_page=20000&source=2"

nuclear_response = requests.get(url)
nuclear_json = nuclear_response.json()

In [12]:
print(nuclear_json[1][1])

{'indicator': {'id': 'EG.ELC.NUCL.ZS', 'value': 'Electricity production from nuclear sources (% of total)'}, 'country': {'id': '1A', 'value': 'Arab World'}, 'countryiso3code': 'ARB', 'date': '2019', 'value': None, 'unit': '', 'obs_status': '', 'decimal': 1}


In [13]:
# Extract the header detail and push to the production type list and data frame
production_type.append({'id' : nuclear_json[1][1]['indicator']['id'], 'value' : nuclear_json[1][1]['indicator']['value']})
print(production_type)

[{'id': 'EG.ELC.FOSL.ZS', 'value': 'Electricity production from oil, gas and coal sources (% of total)'}, {'id': 'EG.ELC.RNWX.ZS', 'value': 'Electricity production from renewable sources, excluding hydroelectric (% of total)'}, {'id': 'EG.ELC.NUCL.ZS', 'value': 'Electricity production from nuclear sources (% of total)'}]


In [14]:
nuclear_df = pd.json_normalize(nuclear_json[1])
nuclear_df.head(10)

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
0,ARB,2020,,,,1,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...,1A,Arab World
1,ARB,2019,,,,1,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...,1A,Arab World
2,ARB,2018,,,,1,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...,1A,Arab World
3,ARB,2017,,,,1,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...,1A,Arab World
4,ARB,2016,,,,1,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...,1A,Arab World
5,ARB,2015,,,,1,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...,1A,Arab World
6,ARB,2014,0.0,,,1,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...,1A,Arab World
7,ARB,2013,0.0,,,1,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...,1A,Arab World
8,ARB,2012,0.0,,,1,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...,1A,Arab World
9,ARB,2011,0.0,,,1,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...,1A,Arab World


## Electricity production from natural gas sources

In [15]:
url = "https://api.worldbank.org/v2/en/country/all/indicator/EG.ELC.NGAS.ZS?format=json&per_page=20000&source=2"

ngas_response = requests.get(url)
ngas_json = ngas_response.json()

In [16]:
print(ngas_json[1][1])

{'indicator': {'id': 'EG.ELC.NGAS.ZS', 'value': 'Electricity production from natural gas sources (% of total)'}, 'country': {'id': '1A', 'value': 'Arab World'}, 'countryiso3code': 'ARB', 'date': '2019', 'value': None, 'unit': '', 'obs_status': '', 'decimal': 1}


In [17]:
# Extract the header detail and push to the production type list and data frame
production_type.append({'id' : ngas_json[1][1]['indicator']['id'], 'value' : ngas_json[1][1]['indicator']['value']})
print(production_type)

[{'id': 'EG.ELC.FOSL.ZS', 'value': 'Electricity production from oil, gas and coal sources (% of total)'}, {'id': 'EG.ELC.RNWX.ZS', 'value': 'Electricity production from renewable sources, excluding hydroelectric (% of total)'}, {'id': 'EG.ELC.NUCL.ZS', 'value': 'Electricity production from nuclear sources (% of total)'}, {'id': 'EG.ELC.NGAS.ZS', 'value': 'Electricity production from natural gas sources (% of total)'}]


In [18]:
ngas_df = pd.json_normalize(ngas_json[1])
ngas_df.head(10)

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
0,ARB,2020,,,,1,EG.ELC.NGAS.ZS,Electricity production from natural gas source...,1A,Arab World
1,ARB,2019,,,,1,EG.ELC.NGAS.ZS,Electricity production from natural gas source...,1A,Arab World
2,ARB,2018,,,,1,EG.ELC.NGAS.ZS,Electricity production from natural gas source...,1A,Arab World
3,ARB,2017,,,,1,EG.ELC.NGAS.ZS,Electricity production from natural gas source...,1A,Arab World
4,ARB,2016,,,,1,EG.ELC.NGAS.ZS,Electricity production from natural gas source...,1A,Arab World
5,ARB,2015,64.071422,,,1,EG.ELC.NGAS.ZS,Electricity production from natural gas source...,1A,Arab World
6,ARB,2014,61.592526,,,1,EG.ELC.NGAS.ZS,Electricity production from natural gas source...,1A,Arab World
7,ARB,2013,63.035044,,,1,EG.ELC.NGAS.ZS,Electricity production from natural gas source...,1A,Arab World
8,ARB,2012,61.330974,,,1,EG.ELC.NGAS.ZS,Electricity production from natural gas source...,1A,Arab World
9,ARB,2011,61.77952,,,1,EG.ELC.NGAS.ZS,Electricity production from natural gas source...,1A,Arab World


## Electricity production from hydroelectric sources

In [19]:
url = "https://api.worldbank.org/v2/en/country/all/indicator/EG.ELC.HYRO.ZS?format=json&per_page=20000&source=2"

hydro_response = requests.get(url)
hydro_json = hydro_response.json()

In [20]:
print(hydro_json[1][1])

{'indicator': {'id': 'EG.ELC.HYRO.ZS', 'value': 'Electricity production from hydroelectric sources (% of total)'}, 'country': {'id': '1A', 'value': 'Arab World'}, 'countryiso3code': 'ARB', 'date': '2019', 'value': None, 'unit': '', 'obs_status': '', 'decimal': 1}


In [21]:
production_type.append({'id' : hydro_json[1][1]['indicator']['id'], 'value' : hydro_json[1][1]['indicator']['value']})
print(production_type)

[{'id': 'EG.ELC.FOSL.ZS', 'value': 'Electricity production from oil, gas and coal sources (% of total)'}, {'id': 'EG.ELC.RNWX.ZS', 'value': 'Electricity production from renewable sources, excluding hydroelectric (% of total)'}, {'id': 'EG.ELC.NUCL.ZS', 'value': 'Electricity production from nuclear sources (% of total)'}, {'id': 'EG.ELC.NGAS.ZS', 'value': 'Electricity production from natural gas sources (% of total)'}, {'id': 'EG.ELC.HYRO.ZS', 'value': 'Electricity production from hydroelectric sources (% of total)'}]


In [22]:
hydro_df = pd.json_normalize(hydro_json[1])
hydro_df.tail(10)

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
16094,ZWE,1969,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16095,ZWE,1968,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16096,ZWE,1967,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16097,ZWE,1966,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16098,ZWE,1965,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16099,ZWE,1964,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16100,ZWE,1963,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16101,ZWE,1962,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16102,ZWE,1961,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16103,ZWE,1960,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe


In [23]:
print(len(hydro_df))

16104


## Data Cleansing and Database Insert (T)

In [24]:
# Concatenate all DataFrames into a single dataframe
# Header Data
production_type_df = pd.DataFrame(production_type)
production_type_df = production_type_df.rename(columns={"id":"source_id", "value":"source_name"})
production_type_df

Unnamed: 0,source_id,source_name
0,EG.ELC.FOSL.ZS,"Electricity production from oil, gas and coal ..."
1,EG.ELC.RNWX.ZS,"Electricity production from renewable sources,..."
2,EG.ELC.NUCL.ZS,Electricity production from nuclear sources (%...
3,EG.ELC.NGAS.ZS,Electricity production from natural gas source...
4,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...


In [25]:
# Results Data
frames = [coal_df, renew_df, nuclear_df, ngas_df, hydro_df]
results_df = pd.concat(frames)

print(len(results_df))

80520


In [26]:
results_df.tail()

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
16099,ZWE,1964,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16100,ZWE,1963,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16101,ZWE,1962,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16102,ZWE,1961,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe
16103,ZWE,1960,,,,1,EG.ELC.HYRO.ZS,Electricity production from hydroelectric sour...,ZW,Zimbabwe


In [27]:
# Clean up results data
results_df = results_df[["country.value", "countryiso3code", "date", "indicator.id", "value"]]
results_df = results_df.rename(columns={"country.value":"country", "countryiso3code":"iso3code", "date":"year", "indicator.id":"source_id", "value":"percentage"})
results_df = results_df.dropna(how='any')
results_df.reset_index(inplace=True)
results_df

Unnamed: 0,index,country,iso3code,year,source_id,percentage
0,5,Arab World,ARB,2015,EG.ELC.FOSL.ZS,86.100437
1,6,Arab World,ARB,2014,EG.ELC.FOSL.ZS,86.243390
2,7,Arab World,ARB,2013,EG.ELC.FOSL.ZS,87.057507
3,8,Arab World,ARB,2012,EG.ELC.FOSL.ZS,86.745203
4,9,Arab World,ARB,2011,EG.ELC.FOSL.ZS,86.036528
...,...,...,...,...,...,...
39981,16088,Zimbabwe,ZWE,1975,EG.ELC.HYRO.ZS,86.665571
39982,16089,Zimbabwe,ZWE,1974,EG.ELC.HYRO.ZS,88.078619
39983,16090,Zimbabwe,ZWE,1973,EG.ELC.HYRO.ZS,67.420727
39984,16091,Zimbabwe,ZWE,1972,EG.ELC.HYRO.ZS,82.729162


In [28]:
print(len(results_df))

39986


In [29]:
# Insert into Mongo
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define the 'electric_vehicles' database in Mongo
db = client.electric_vehicles

# Drop the 'electricity_production_sources' collection if it exists
db['electricity_production_sources'].drop()

# Insert Production Types
# records = json.loads(production_type_df.T.to_json()).values()
# db.electricity_production_sources.insert_many(records)

data = production_type_df.to_dict(orient='records')  # Here's our added param..
db.electricity_production_sources.insert_many(data)

<pymongo.results.InsertManyResult at 0x23d21043d80>

In [30]:
# Insert Results
records = json.loads(results_df.T.to_json()).values()
db.electricity_production_values.insert_many(records)

<pymongo.results.InsertManyResult at 0x23d23e7a900>