In [181]:
import pymongo                      # Module for MongoDB access and connection
from pymongo import MongoClient     # python's client for MongoDB
import pandas as pd
import numpy as np
import sys
import warnings
warnings.filterwarnings('ignore')
from dotenv import load_dotenv
from pathlib import Path
from pymongo import MongoClient, ASCENDING
from pymongo.errors import DuplicateKeyError
import mysql.connector

## Uploading raw data into the MongoDB database

In [184]:
import os
# Display the absolute path of the current working directory.
os.path.abspath("")

'C:\\Users\\Aniket\\Documents\\GitHub\\Data-Analytics-And-Visualization\\notebooks\\dataset_02'

In [186]:
#dat = pd.read_csv("owid-energy-data.csv")
#dat = pd.read_csv("data/owid-energy-data.csv")

In [188]:
# Load any .env that python-dotenv discovers on its own (verbose mode).
load_dotenv(verbose=True)
# Explicit location of the project-level .env, two directories above the working directory.
env_path = Path('../..').joinpath('.env')

In [190]:
# Read the MySQL connection settings from the environment.
# The variables are looked up even when load_dotenv() reports that nothing was loaded,
# so values already exported in the process environment still work. Anything missing
# fails here with a clear message instead of surfacing later as a NameError.
load_dotenv(dotenv_path=env_path)
host = os.environ.get("my_host")
username = os.environ.get("my_user")
password = os.environ.get("my_password")

missing_settings = [
    name
    for name, value in (("my_host", host), ("my_user", username), ("my_password", password))
    if value is None
]
if missing_settings:
    # Report only the variable names, never their values.
    raise RuntimeError(
        f"Missing required setting(s) {missing_settings}: "
        f"checked {env_path} and the process environment."
    )

## Calling the data from database for pre-processing

In [193]:
# Connect to the MySQL database that stores the raw energy statistics.
conn = mysql.connector.connect(
    host=host,
    user=username,
    password=password,
    database="energy_database",
)
cursor = conn.cursor()
# Unfiltered query: every row and column of the raw table.
query = "SELECT * FROM raw_energy_stats"

In [195]:
# Run the current `query` over the open MySQL connection and load the result into a DataFrame.
data = pd.read_sql(sql=query, con=conn)

In [196]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0,iso_code,country,year,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
0,,ASEAN (Ember),2000,,,,,,,,...,0.0,,,,,,0.0,,0.0,
1,,ASEAN (Ember),2001,,,,,,,,...,0.0,,,,,,0.0,,0.0,
2,,ASEAN (Ember),2002,,,,,,,,...,0.0,,,,,,0.0,,0.0,
3,,ASEAN (Ember),2003,,,,,,,,...,0.0,,,,,,0.0,,0.0,
4,,ASEAN (Ember),2004,,,,,,,,...,0.0,,,,,,0.0,,0.0,


In [199]:
#data.drop('_id',axis=1,inplace=True)

## Data Pre-processing

In [202]:
# Group the raw rows by calendar year.
data_year = data.groupby(by="year")

In [204]:
# Number of rows recorded for each year.
data_year['year'].value_counts()

year
1900    118
1901    118
1902    118
1903    118
1904    118
       ... 
2019    274
2020    273
2021    273
2022    249
2023    155
Name: count, Length: 124, dtype: int64

#### We will keep only the records from the year 2000 onwards, because data for earlier years is very scarce owing to the lack of technology and record-keeping facilities at the time.
#### We will also exclude rows (countries) for which population or iso_code is not available.

In [207]:
# Keep rows from 2000 onwards that have both a population and an ISO code.
query = (
    "SELECT * FROM raw_energy_stats "
    "WHERE population IS NOT NULL AND iso_code IS NOT NULL "
    "AND year >= 2000"
)

In [209]:
# Execute the filtered query (the current value of `query`) on the open connection.
data_1 = pd.read_sql(sql=query, con=conn)

In [211]:
# Preview the filtered rows.
data_1.head()

Unnamed: 0,iso_code,country,year,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
0,AFG,Afghanistan,2000,19542986,11283790000.0,,,,,0.0,...,0.0,,,,,0.0,0.0,,0.0,
1,AFG,Afghanistan,2001,19688634,11021270000.0,,,,,0.0,...,0.0,,,,,0.0,0.0,,0.0,
2,AFG,Afghanistan,2002,21000258,18804870000.0,,,,,0.0,...,0.0,,,,,0.0,0.0,,0.0,
3,AFG,Afghanistan,2003,22645136,21074340000.0,,,,,0.0,...,0.0,,,,,0.0,0.0,,0.0,
4,AFG,Afghanistan,2004,23553554,22332570000.0,,,,,0.0,...,0.0,,,,,0.0,0.0,,0.0,


In [213]:
# (rows, columns) of the filtered table.
data_1.shape

(5118, 129)

In [215]:
# Count of missing values per column.
data_1.isna().sum()

iso_code                     0
country                      0
year                         0
population                   0
gdp                       1323
                          ... 
wind_elec_per_capita       156
wind_electricity           156
wind_energy_per_capita    3224
wind_share_elec            184
wind_share_energy         3499
Length: 129, dtype: int64

#### From the above output, it can be noticed that many features have a lot of missing values. We will filter the data and include only those features for which the percentage of missing values is less than 60%.

In [218]:
# Percentage of missing values per column, worst columns first.
missing_share = data_1.isna().sum() / len(data_1) * 100
missing_share.sort_values(ascending=False)

nuclear_cons_change_pct    85.853849
biofuel_cons_change_pct    79.933568
biofuel_share_energy       75.459164
biofuel_cons_change_twh    75.087925
biofuel_cons_per_capita    74.736225
                             ...    
per_capita_electricity      2.618210
year                        0.000000
population                  0.000000
country                     0.000000
iso_code                    0.000000
Length: 129, dtype: float64

In [220]:
# Percentage of missing values for every column.
null_counts = data_1.isna().sum()
miss_per = null_counts.div(len(data_1)).mul(100)
# Retain only the columns whose missing percentage is strictly below 60.
sel_col = miss_per.loc[miss_per.lt(60)]

In [222]:
# Keep only the columns that passed the missing-value threshold.
# An explicit copy is taken because data_2 is modified in place later in the notebook
# (its 'country' column is reassigned); without it pandas treats that assignment as a
# write to a slice of data_1.
data_2 = data_1[sel_col.index].copy()

In [224]:
# Preview the table after dropping the sparsely populated columns.
data_2.head()

Unnamed: 0,iso_code,country,year,population,gdp,biofuel_elec_per_capita,biofuel_electricity,biofuel_share_elec,carbon_intensity_elec,coal_elec_per_capita,...,primary_energy_consumption,renewables_elec_per_capita,renewables_electricity,renewables_share_elec,solar_elec_per_capita,solar_electricity,solar_share_elec,wind_elec_per_capita,wind_electricity,wind_share_elec
0,AFG,Afghanistan,2000,19542986,11283790000.0,0.0,0.0,0.0,250.0,0.0,...,5.914,15.862,0.31,64.583,0.0,0.0,0.0,0.0,0.0,0.0
1,AFG,Afghanistan,2001,19688634,11021270000.0,0.0,0.0,0.0,217.391,2.032,...,4.664,25.395,0.5,72.464,0.0,0.0,0.0,0.0,0.0,0.0
2,AFG,Afghanistan,2002,21000258,18804870000.0,0.0,0.0,0.0,169.014,1.905,...,4.428,26.666,0.56,78.873,0.0,0.0,0.0,0.0,0.0,0.0
3,AFG,Afghanistan,2003,22645136,21074340000.0,0.0,0.0,0.0,241.758,3.974,...,5.208,27.821,0.63,69.231,0.0,0.0,0.0,0.0,0.0,0.0
4,AFG,Afghanistan,2004,23553554,22332570000.0,0.0,0.0,0.0,227.848,2.547,...,4.81,23.776,0.56,70.886,0.0,0.0,0.0,0.0,0.0,0.0


In [226]:
# (rows, columns) after the column filter.
data_2.shape

(5118, 68)

In [228]:
# Names of the columns that survived the missing-value filter.
data_2.columns.to_list()

['iso_code',
 'country',
 'year',
 'population',
 'gdp',
 'biofuel_elec_per_capita',
 'biofuel_electricity',
 'biofuel_share_elec',
 'carbon_intensity_elec',
 'coal_elec_per_capita',
 'coal_electricity',
 'coal_prod_change_twh',
 'coal_prod_per_capita',
 'coal_production',
 'coal_share_elec',
 'electricity_demand',
 'electricity_generation',
 'energy_cons_change_pct',
 'energy_cons_change_twh',
 'energy_per_capita',
 'energy_per_gdp',
 'fossil_elec_per_capita',
 'fossil_electricity',
 'fossil_share_elec',
 'gas_elec_per_capita',
 'gas_electricity',
 'gas_prod_change_twh',
 'gas_prod_per_capita',
 'gas_production',
 'gas_share_elec',
 'greenhouse_gas_emissions',
 'hydro_elec_per_capita',
 'hydro_electricity',
 'hydro_share_elec',
 'low_carbon_elec_per_capita',
 'low_carbon_electricity',
 'low_carbon_share_elec',
 'net_elec_imports',
 'net_elec_imports_share_demand',
 'nuclear_cons_change_twh',
 'nuclear_consumption',
 'nuclear_elec_per_capita',
 'nuclear_electricity',
 'nuclear_energy_p

In [230]:
# Normalise country names: trim surrounding whitespace, then apply title case.
country_clean = data_2['country'].str.strip().str.title()
data_2['country'] = country_clean

In [232]:
# Missing values per remaining column.
data_2.isnull().sum()

iso_code                   0
country                    0
year                       0
population                 0
gdp                     1323
                        ... 
solar_electricity        139
solar_share_elec         162
wind_elec_per_capita     156
wind_electricity         156
wind_share_elec          184
Length: 68, dtype: int64

#### Selecting only the features that are important for analysis out of the remaining.

In [235]:
# Identifier columns plus the electricity-related statistics kept for the analysis.
selected_cols = [
    'iso_code', 'country', 'year', 'population',
    'biofuel_elec_per_capita', 'biofuel_electricity', 'biofuel_share_elec',
    'carbon_intensity_elec',
    'coal_elec_per_capita', 'coal_electricity', 'coal_share_elec',
    'electricity_demand', 'electricity_generation', 'per_capita_electricity',
    'fossil_elec_per_capita', 'fossil_electricity', 'fossil_share_elec',
    'gas_elec_per_capita', 'gas_electricity', 'gas_share_elec',
    'greenhouse_gas_emissions',
    'low_carbon_electricity', 'low_carbon_elec_per_capita', 'low_carbon_share_elec',
    'nuclear_elec_per_capita', 'nuclear_electricity', 'nuclear_share_elec',
    'oil_elec_per_capita', 'oil_electricity', 'oil_share_elec',
    'renewables_elec_per_capita', 'renewables_electricity', 'renewables_share_elec',
]
df = data_2[selected_cols]

In [237]:
# Preview the selected columns.
df.head()

Unnamed: 0,iso_code,country,year,population,biofuel_elec_per_capita,biofuel_electricity,biofuel_share_elec,carbon_intensity_elec,coal_elec_per_capita,coal_electricity,...,low_carbon_share_elec,nuclear_elec_per_capita,nuclear_electricity,nuclear_share_elec,oil_elec_per_capita,oil_electricity,oil_share_elec,renewables_elec_per_capita,renewables_electricity,renewables_share_elec
0,AFG,Afghanistan,2000,19542986,0.0,0.0,0.0,250.0,0.0,0.0,...,64.583,0.0,0.0,0.0,8.699,0.17,35.417,15.862,0.31,64.583
1,AFG,Afghanistan,2001,19688634,0.0,0.0,0.0,217.391,2.032,0.04,...,72.464,0.0,0.0,0.0,7.619,0.15,21.739,25.395,0.5,72.464
2,AFG,Afghanistan,2002,21000258,0.0,0.0,0.0,169.014,1.905,0.04,...,78.873,0.0,0.0,0.0,5.238,0.11,15.493,26.666,0.56,78.873
3,AFG,Afghanistan,2003,22645136,0.0,0.0,0.0,241.758,3.974,0.09,...,69.231,0.0,0.0,0.0,8.39,0.19,20.879,27.821,0.63,69.231
4,AFG,Afghanistan,2004,23553554,0.0,0.0,0.0,227.848,2.547,0.06,...,70.886,0.0,0.0,0.0,7.218,0.17,21.519,23.776,0.56,70.886


In [239]:
# Missing values per selected column, before any imputation.
df.isnull().sum()

iso_code                        0
country                         0
year                            0
population                      0
biofuel_elec_per_capita       179
biofuel_electricity           179
biofuel_share_elec            202
carbon_intensity_elec         178
coal_elec_per_capita          219
coal_electricity              219
coal_share_elec               242
electricity_demand            150
electricity_generation        134
per_capita_electricity        134
fossil_elec_per_capita        146
fossil_electricity            146
fossil_share_elec             174
gas_elec_per_capita           261
gas_electricity               261
gas_share_elec                284
greenhouse_gas_emissions      150
low_carbon_electricity        135
low_carbon_elec_per_capita    135
low_carbon_share_elec         163
nuclear_elec_per_capita       186
nuclear_electricity           186
nuclear_share_elec            265
oil_elec_per_capita           151
oil_electricity               151
oil_share_elec

In [241]:
# (rows, columns) of the selected-column table.
df.shape

(5118, 33)

#### Since the number of missing values in the remaining features is very small compared to the total number of rows, we can fill the null values by interpolating within each country's own series.

In [244]:
# One row per distinct (country, iso_code) pair.
iso_mapping = df.loc[:, ['country', 'iso_code']].drop_duplicates()

In [246]:
# Every column except the identifiers is treated as a statistic to impute.
id_cols = ['iso_code', 'country', 'year']
energy_cols = df.columns.difference(id_cols)
energy_cols

Index(['biofuel_elec_per_capita', 'biofuel_electricity', 'biofuel_share_elec',
       'carbon_intensity_elec', 'coal_elec_per_capita', 'coal_electricity',
       'coal_share_elec', 'electricity_demand', 'electricity_generation',
       'fossil_elec_per_capita', 'fossil_electricity', 'fossil_share_elec',
       'gas_elec_per_capita', 'gas_electricity', 'gas_share_elec',
       'greenhouse_gas_emissions', 'low_carbon_elec_per_capita',
       'low_carbon_electricity', 'low_carbon_share_elec',
       'nuclear_elec_per_capita', 'nuclear_electricity', 'nuclear_share_elec',
       'oil_elec_per_capita', 'oil_electricity', 'oil_share_elec',
       'per_capita_electricity', 'population', 'renewables_elec_per_capita',
       'renewables_electricity', 'renewables_share_elec'],
      dtype='object')

In [248]:
# Fill gaps inside each country's own series by interpolation.
# Values that interpolation cannot fill remain NaN.
df_final = df.copy()
by_country = df.groupby('country')
for stat in energy_cols:
    df_final[stat] = by_country[stat].transform(
        lambda series: series.fillna(series.interpolate())
    )

In [249]:
# Preview the table after the per-country fill.
df_final.head()

Unnamed: 0,iso_code,country,year,population,biofuel_elec_per_capita,biofuel_electricity,biofuel_share_elec,carbon_intensity_elec,coal_elec_per_capita,coal_electricity,...,low_carbon_share_elec,nuclear_elec_per_capita,nuclear_electricity,nuclear_share_elec,oil_elec_per_capita,oil_electricity,oil_share_elec,renewables_elec_per_capita,renewables_electricity,renewables_share_elec
0,AFG,Afghanistan,2000,19542986,0.0,0.0,0.0,250.0,0.0,0.0,...,64.583,0.0,0.0,0.0,8.699,0.17,35.417,15.862,0.31,64.583
1,AFG,Afghanistan,2001,19688634,0.0,0.0,0.0,217.391,2.032,0.04,...,72.464,0.0,0.0,0.0,7.619,0.15,21.739,25.395,0.5,72.464
2,AFG,Afghanistan,2002,21000258,0.0,0.0,0.0,169.014,1.905,0.04,...,78.873,0.0,0.0,0.0,5.238,0.11,15.493,26.666,0.56,78.873
3,AFG,Afghanistan,2003,22645136,0.0,0.0,0.0,241.758,3.974,0.09,...,69.231,0.0,0.0,0.0,8.39,0.19,20.879,27.821,0.63,69.231
4,AFG,Afghanistan,2004,23553554,0.0,0.0,0.0,227.848,2.547,0.06,...,70.886,0.0,0.0,0.0,7.218,0.17,21.519,23.776,0.56,70.886


In [250]:
# Missing values that remain after the per-country fill.
df_final.isnull().sum()

iso_code                        0
country                         0
year                            0
population                      0
biofuel_elec_per_capita       130
biofuel_electricity           130
biofuel_share_elec            153
carbon_intensity_elec         129
coal_elec_per_capita          174
coal_electricity              174
coal_share_elec               197
electricity_demand            101
electricity_generation        101
per_capita_electricity        101
fossil_elec_per_capita        101
fossil_electricity            101
fossil_share_elec             129
gas_elec_per_capita           216
gas_electricity               216
gas_share_elec                239
greenhouse_gas_emissions      101
low_carbon_electricity        101
low_carbon_elec_per_capita    101
low_carbon_share_elec         129
nuclear_elec_per_capita       173
nuclear_electricity           173
nuclear_share_elec            231
oil_elec_per_capita           106
oil_electricity               106
oil_share_elec

#### We will replace the remaining missing values with the yearly means.

In [255]:
# Fill whatever is still missing with the mean of the same year across all countries.
# The grouping must run on df_final, not on the original df: grouping df would start
# again from the un-imputed values and overwrite the per-country fill already stored
# in df_final.
for col in energy_cols:
    df_final[col] = df_final.groupby('year')[col].transform(lambda x: x.fillna(x.mean()))

In [257]:
# Confirm that no missing values remain after the yearly-mean fill.
df_final.isnull().sum()

iso_code                      0
country                       0
year                          0
population                    0
biofuel_elec_per_capita       0
biofuel_electricity           0
biofuel_share_elec            0
carbon_intensity_elec         0
coal_elec_per_capita          0
coal_electricity              0
coal_share_elec               0
electricity_demand            0
electricity_generation        0
per_capita_electricity        0
fossil_elec_per_capita        0
fossil_electricity            0
fossil_share_elec             0
gas_elec_per_capita           0
gas_electricity               0
gas_share_elec                0
greenhouse_gas_emissions      0
low_carbon_electricity        0
low_carbon_elec_per_capita    0
low_carbon_share_elec         0
nuclear_elec_per_capita       0
nuclear_electricity           0
nuclear_share_elec            0
oil_elec_per_capita           0
oil_electricity               0
oil_share_elec                0
renewables_elec_per_capita    0
renewabl

#### In the original data, we had data for continents as well which was dropped (except for Australia as it is both a country as well as a continent) as they had no iso_code mapping. 
#### So, now we will first create a separate dataframe and manually assign iso_codes for the continents Africa, Antarctica, Asia, Europe, North America, South America.

In [260]:
# Continent-level rows from 2000 onwards, taken from the raw table.
continent_names = ['Africa','Antarctica','Asia','Europe','North America','South America']
is_recent = data['year'] >= 2000
is_continent = data['country'].isin(continent_names)
data_cont = data[is_recent & is_continent]

#### Keeping only the selected columns as in the case of the data for countries.

In [263]:
# Keep exactly the columns selected for the country table `df`.
# Reusing df's column list avoids maintaining a second copy of the same 33 names
# that would have to be kept in sync by hand.
data_cont = data_cont[df.columns.to_list()]

In [265]:
# Group the continent rows by their ISO code.
grp = data_cont.groupby('iso_code')

In [267]:
# First rows of each group.
# NOTE(review): groupby excludes NaN keys by default, so rows without an iso_code
# presumably do not show up in this view - confirm before drawing conclusions from it.
grp.head()

Unnamed: 0,iso_code,country,year,population,biofuel_elec_per_capita,biofuel_electricity,biofuel_share_elec,carbon_intensity_elec,coal_elec_per_capita,coal_electricity,...,low_carbon_share_elec,nuclear_elec_per_capita,nuclear_electricity,nuclear_share_elec,oil_elec_per_capita,oil_electricity,oil_share_elec,renewables_elec_per_capita,renewables_electricity,renewables_share_elec
947,ATA,Antarctica,2000,,,,,,,,...,,,,,,,,,,
948,ATA,Antarctica,2001,,,,,,,,...,,,,,,,,,,
949,ATA,Antarctica,2002,,,,,,,,...,,,,,,,,,,
950,ATA,Antarctica,2003,,,,,,,,...,,,,,,,,,,
951,ATA,Antarctica,2004,,,,,,,,...,,,,,,,,,,


#### From the above results it is evident that negligible amount of data is available for the continent of 'Antarctica'. We can exclude it from our further analysis.

In [270]:
# Drop Antarctica.
# The mask is built from data_cont itself so that it matches the filtered frame row for
# row and does not depend on the separate `data` variable, which later cells rebind.
# .copy() makes the result an independent frame that can safely be modified in place.
data_cont = data_cont.loc[data_cont["country"] != 'Antarctica'].copy()

In [272]:
# Hand-assigned three-letter codes for the continents.
maps = {
    'Africa': 'AFR',
    'Asia': 'ASA',
    'Europe': 'EUR',
    'North America': 'NAE',
    'South America': 'SAM',
}
# Use the hand-assigned code only where iso_code is missing.
fallback_codes = data_cont['country'].map(maps)
data_cont['iso_code'] = data_cont['iso_code'].fillna(fallback_codes)

In [274]:
# Missing values per column together with the (rows, columns) shape of the continent table.
data_cont.isnull().sum(),data_cont.shape

(iso_code                      0
 country                       0
 year                          0
 population                    0
 biofuel_elec_per_capita       5
 biofuel_electricity           5
 biofuel_share_elec            5
 carbon_intensity_elec         5
 coal_elec_per_capita          5
 coal_electricity              5
 coal_share_elec               5
 electricity_demand            5
 electricity_generation        5
 per_capita_electricity        5
 fossil_elec_per_capita        5
 fossil_electricity            5
 fossil_share_elec             5
 gas_elec_per_capita           5
 gas_electricity               5
 gas_share_elec                5
 greenhouse_gas_emissions      5
 low_carbon_electricity        5
 low_carbon_elec_per_capita    5
 low_carbon_share_elec         5
 nuclear_elec_per_capita       5
 nuclear_electricity           5
 nuclear_share_elec            5
 oil_elec_per_capita           5
 oil_electricity               5
 oil_share_elec                5
 renewable

#### Out of the 120 rows, only 5 in each energy statistic are null values. Hence, we can fill them by interpolating within the series of that particular continent.

In [277]:
# One row per distinct (continent, iso_code) pair.
iso_mapping_cont = data_cont.loc[:, ['country', 'iso_code']].drop_duplicates()

In [279]:
# Show the continent-to-code mapping.
iso_mapping_cont

Unnamed: 0,country,iso_code
247,Africa,AFR
1304,Asia,ASA
6801,Europe,EUR
14130,North America,NAE
17823,South America,SAM


In [281]:
# Every column except the identifiers is treated as a statistic to impute.
id_cols_cont = ['iso_code', 'country', 'year']
energy_cols_cont = data_cont.columns.difference(id_cols_cont)
energy_cols_cont

Index(['biofuel_elec_per_capita', 'biofuel_electricity', 'biofuel_share_elec',
       'carbon_intensity_elec', 'coal_elec_per_capita', 'coal_electricity',
       'coal_share_elec', 'electricity_demand', 'electricity_generation',
       'fossil_elec_per_capita', 'fossil_electricity', 'fossil_share_elec',
       'gas_elec_per_capita', 'gas_electricity', 'gas_share_elec',
       'greenhouse_gas_emissions', 'low_carbon_elec_per_capita',
       'low_carbon_electricity', 'low_carbon_share_elec',
       'nuclear_elec_per_capita', 'nuclear_electricity', 'nuclear_share_elec',
       'oil_elec_per_capita', 'oil_electricity', 'oil_share_elec',
       'per_capita_electricity', 'population', 'renewables_elec_per_capita',
       'renewables_electricity', 'renewables_share_elec'],
      dtype='object')

In [283]:
# Fill gaps inside each continent's own series by interpolation.
data_cont_final = data_cont.copy()
by_continent = data_cont.groupby('country')
for stat in energy_cols_cont:
    data_cont_final[stat] = by_continent[stat].transform(
        lambda series: series.fillna(series.interpolate())
    )

In [285]:
# Confirm that the continent table has no missing values left.
data_cont_final.isnull().sum()

iso_code                      0
country                       0
year                          0
population                    0
biofuel_elec_per_capita       0
biofuel_electricity           0
biofuel_share_elec            0
carbon_intensity_elec         0
coal_elec_per_capita          0
coal_electricity              0
coal_share_elec               0
electricity_demand            0
electricity_generation        0
per_capita_electricity        0
fossil_elec_per_capita        0
fossil_electricity            0
fossil_share_elec             0
gas_elec_per_capita           0
gas_electricity               0
gas_share_elec                0
greenhouse_gas_emissions      0
low_carbon_electricity        0
low_carbon_elec_per_capita    0
low_carbon_share_elec         0
nuclear_elec_per_capita       0
nuclear_electricity           0
nuclear_share_elec            0
oil_elec_per_capita           0
oil_electricity               0
oil_share_elec                0
renewables_elec_per_capita    0
renewabl

#### Now, the required dataframes are ready. So, we will begin with adding the collections for the same in the Mongo database created earlier.

#### Creating a collection storing unique ISO_Code and Country combinations in MongoDB.

#### For this we will first merge the dataframes storing the unique values.

In [289]:
# Stack the country mapping and the continent mapping into one table.
frames = [iso_mapping, iso_mapping_cont]
df_iso = pd.concat(objs=frames, axis="index")

In [291]:
# Preview the combined name-to-code mapping.
df_iso.head()

Unnamed: 0,country,iso_code
0,Afghanistan,AFG
23,Albania,ALB
46,Algeria,DZA
70,American Samoa,ASM
93,Angola,AGO


In [293]:
# Load any .env that python-dotenv discovers on its own (verbose mode).
load_dotenv(verbose=True)
# Explicit location of the project-level .env, two directories above the working directory.
env_path = Path('../..').joinpath('.env')

In [295]:
# Read the MongoDB connection string from the environment.
# The variable is looked up even when load_dotenv() reports that nothing was loaded,
# so a value already exported in the process environment still works. A missing value
# stops the notebook here with a clear message instead of leaving `uri` undefined or
# passing None on to the client.
load_dotenv(dotenv_path=env_path)
uri = os.environ.get("mongo_uri_1")
if not uri:
    raise RuntimeError(
        f"'mongo_uri_1' is not set: checked {env_path} and the process environment."
    )

In [297]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
# The connection string is taken from the `uri` variable only.
# Credentials must never be written into the notebook, not even in a comment.
# Creating a client and connecting to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to check that the server is reachable; a failure is printed, not raised.
try:
    client.admin.command('ping')
    print("You successfully connected to MongoDB!")
except Exception as e:
    print(e)

You successfully connected to MongoDB!


In [299]:
# Handle to the database that will hold the pre-processed collections.
db = client["energy_database"]

In [301]:
def recreate_collection(db, collection_name):
    """Drop the named collection and hand back a handle to it.

    Parameters
    ----------
    db : database-like object supporting ``db[name]`` lookups.
    collection_name : name of the collection to reset.

    Returns
    -------
    The object obtained from ``db[collection_name]`` after the drop.
    """
    existing = db[collection_name]
    existing.drop()
    # Look the collection up again so the caller gets a handle obtained after the drop.
    return db[collection_name]

In [303]:
# Start from an empty "ISOCodes" collection.
collection = "ISOCodes"
collect = recreate_collection(db, collection_name=collection)

In [305]:
# One dictionary per row, ready to be inserted as documents.
dat_dict = df_iso.to_dict(orient="records")

In [307]:
# Insert all ISO mapping documents in one call; the call's return value is kept in dat_collection.
dat_collection = collect.insert_many(dat_dict)

#### Check if the data has been stored or not by retrieving some records.

In [310]:
# Retrieve the stored mapping with iso_code 'AFR' to confirm the insert worked.
# The cursor gets its own name so that the raw `data` DataFrame, which the continent
# cells read from, is not replaced by a cursor.
iso_check_cursor = collect.find({
    'iso_code':'AFR'
})
for doc in iso_check_cursor:
    print(doc)

{'_id': ObjectId('68014826740ab259f7807844'), 'country': 'Africa', 'iso_code': 'AFR'}


## Creating a database collection for year-wise country data

In [313]:
# Start from an empty "country_energy_stats" collection.
collection = "country_energy_stats"
collect = recreate_collection(db, collection_name=collection)

In [315]:
# One dictionary per country-year row, ready to be inserted as documents.
data_dict = df_final.to_dict(orient="records")

In [317]:
# Insert all country-year documents in one call; the call's return value is kept in data_collection.
data_collection = collect.insert_many(data_dict)

#### Now, we will check if the data has been stored or not by retrieving some records.

In [320]:
# Retrieve the stored document for iso_code 'IND' in 2000 to confirm the insert worked.
# The cursor gets its own name so that the raw `data` DataFrame is not replaced by a cursor.
country_check_cursor = collect.find({
    'iso_code':'IND',
    'year':2000
})
for doc in country_check_cursor:
    print(doc)

{'_id': ObjectId('6801482c740ab259f78080a8'), 'iso_code': 'IND', 'country': 'India', 'year': 2000, 'population': 1059633664, 'biofuel_elec_per_capita': 1.595, 'biofuel_electricity': 1.69, 'biofuel_share_elec': 0.296, 'carbon_intensity_elec': 740.037, 'coal_elec_per_capita': 368.269, 'coal_electricity': 390.23, 'coal_share_elec': 68.295, 'electricity_demand': 572.69, 'electricity_generation': 571.39, 'per_capita_electricity': 539.234, 'fossil_elec_per_capita': 448.598, 'fossil_electricity': 475.35, 'fossil_share_elec': 83.192, 'gas_elec_per_capita': 52.811, 'gas_electricity': 55.96, 'gas_share_elec': 9.794, 'greenhouse_gas_emissions': 422.85, 'low_carbon_electricity': 96.04, 'low_carbon_elec_per_capita': 90.635, 'low_carbon_share_elec': 16.808, 'nuclear_elec_per_capita': 14.883, 'nuclear_electricity': 15.77, 'nuclear_share_elec': 2.76, 'oil_elec_per_capita': 27.519, 'oil_electricity': 29.16, 'oil_share_elec': 5.103, 'renewables_elec_per_capita': 75.753, 'renewables_electricity': 80.27, 

#### So, the data has been successfully inserted into the created database and respective collection

#### Now, we will follow the same steps for continent year-wise data

In [323]:
# Start from an empty "continent_energy" collection.
collection = "continent_energy"
collect = recreate_collection(db, collection_name=collection)

In [325]:
# One dictionary per continent-year row, ready to be inserted as documents.
data_dict_1 = data_cont_final.to_dict(orient="records")

In [327]:
# Insert all continent-year documents in one call; the call's return value is kept in data_collection_1.
data_collection_1 = collect.insert_many(data_dict_1)

#### Now, we will check if the data has been stored or not by retrieving some records from the 2nd collection of continents and their energy stats.

In [330]:
# Retrieve the stored document for iso_code 'EUR' in 2005 to confirm the insert worked.
# The cursor gets its own name so that the filtered `data_1` DataFrame is not replaced by a cursor.
continent_check_cursor = collect.find({
    'iso_code':'EUR',
    'year':2005
})
for doc in continent_check_cursor:
    print(doc)

{'_id': ObjectId('68014837740ab259f7808c7c'), 'iso_code': 'EUR', 'country': 'Europe', 'year': 2005, 'population': 729987229.0, 'biofuel_elec_per_capita': 97.399, 'biofuel_electricity': 71.1, 'biofuel_share_elec': 1.502, 'carbon_intensity_elec': 416.359, 'coal_elec_per_capita': 1644.21, 'coal_electricity': 1200.25, 'coal_share_elec': 25.36, 'electricity_demand': 4730.06, 'electricity_generation': 4732.94, 'per_capita_electricity': 6483.59, 'fossil_elec_per_capita': 3593.34, 'fossil_electricity': 2623.09, 'fossil_share_elec': 55.422, 'gas_elec_per_capita': 1611.75, 'gas_electricity': 1176.56, 'gas_share_elec': 24.859, 'greenhouse_gas_emissions': 1970.6, 'low_carbon_electricity': 2109.85, 'low_carbon_elec_per_capita': 2890.26, 'low_carbon_share_elec': 44.578, 'nuclear_elec_per_capita': 1725.03, 'nuclear_electricity': 1259.25, 'nuclear_share_elec': 26.606, 'oil_elec_per_capita': 337.376, 'oil_electricity': 246.28, 'oil_share_elec': 5.204, 'renewables_elec_per_capita': 1165.23, 'renewables_

## Here, we have completed the process of creating a MongoDB database consisting of pre-processed data storing them in the respective collections.