In [57]:
#!pip install python-dotenv
#!pip install dagster papermill jupyter
#!pip install mysql-connector-python

In [58]:
import pandas as pd
import numpy as np
import sys
import warnings
warnings.filterwarnings('ignore')
from dotenv import load_dotenv
from pathlib import Path

## Uploading raw data into the MySQL database

In [60]:
import os
# Display the absolute path that the empty relative path resolves to, i.e. the
# directory the notebook is running from. The relative data paths used in the
# next cell are resolved against this directory.
os.path.abspath("")

'C:\\Users\\Asus\\Desktop\\Study\\Msc DA\\Analytics Programming and Data Visualisation\\Project\\Data-Analytics-And-Visualization\\notebooks\\dataset_02'

In [61]:
# Resolve where the dataset and the matching .env file live. The notebook can be
# started either from a directory that contains data/ directly or from two
# levels below it, so try the local location first and fall back otherwise.
file_path = "data/owid-energy-data.csv"
if not os.path.isfile(file_path):
    # Not found next to the notebook: use the copy two directories up.
    file_path = "../../data/owid-energy-data.csv"
    env_path = Path('../../') / '.env'
else:
    env_path = Path('.') / '.env'
    print(f"The file {file_path} exists")

# Load the raw CSV into a DataFrame; this raises if neither location exists.
dat = pd.read_csv(file_path)

In [62]:
import mysql.connector     # Python's module for connecting and accessing a MySQL database  

In [63]:
# First attempt to load a .env file without giving an explicit path, with
# verbose=True passed to python-dotenv. The return value is displayed as the
# cell output. The next cell loads again using the explicit env_path.
load_dotenv(verbose=True)

True

In [74]:
# Read the MySQL credentials (my_host, my_user, my_password) from the environment.
# The .env file at env_path is loaded first; if that load reports nothing, the
# original warning is still printed, but the variables are read anyway because
# they may already be present in the process environment.
if not load_dotenv(dotenv_path=env_path):
    print(".env file does not have required values!")

host = os.environ.get("my_host")
username = os.environ.get("my_user")
password = os.environ.get("my_password")

# Fail here with a clear message instead of letting the connection cell hit a
# NameError or silently reuse stale values from an earlier run of this cell.
_missing = [
    key
    for key, value in (("my_host", host), ("my_user", username), ("my_password", password))
    if value is None
]
if _missing:
    raise RuntimeError(f"Missing required environment variable(s): {', '.join(_missing)}")

In [76]:
# (Re)create an empty energy_database. No default schema is selected on this
# connection because the database may not exist yet.
# WARNING: DROP DATABASE discards everything stored by a previous run.
conn = mysql.connector.connect(host=host, user=username, password=password, database="")
try:
    cursor = conn.cursor()
    try:
        cursor.execute("DROP DATABASE IF EXISTS energy_database")
        cursor.execute("CREATE DATABASE IF NOT EXISTS energy_database")

        print("Database 'energy_database' created successfully!")
    finally:
        # Release the cursor even when one of the statements fails.
        cursor.close()
finally:
    # Always release the connection; the following cells open their own.
    conn.close()

Database 'energy_database' created successfully!


In [78]:
# Create the table that stores the raw, unprocessed energy data.
# The connection and cursor opened here are not closed in this cell: the insert
# cell further down passes `cursor` and `conn` to insert_energy_stats_bulk.
conn = mysql.connector.connect(host=host, user=username, password=password, database="energy_database")
cursor = conn.cursor()
# Schema of 'raw_energy_stats': iso_code and country are VARCHAR, year is
# INT NOT NULL, population and gdp are BIGINT, and every remaining column is FLOAT.
# CREATE TABLE IF NOT EXISTS makes re-running this cell a no-op once the table exists.
# NOTE(review): the unique key includes iso_code, which is nullable (the sample
# output later in this notebook shows rows without an iso_code). MySQL presumably
# treats NULLs in a unique index as distinct, so such rows would not be
# de-duplicated by INSERT IGNORE -- confirm against the MySQL documentation.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS raw_energy_stats (
        iso_code VARCHAR(10),
        country VARCHAR(255),
        year INT NOT NULL,
        population BIGINT,
        gdp BIGINT, biofuel_cons_change_pct FLOAT, biofuel_cons_change_twh FLOAT, biofuel_cons_per_capita FLOAT, biofuel_consumption FLOAT, 
        biofuel_elec_per_capita FLOAT, biofuel_electricity FLOAT, biofuel_share_elec FLOAT, biofuel_share_energy FLOAT, carbon_intensity_elec FLOAT, 
        coal_cons_change_pct FLOAT, coal_cons_change_twh FLOAT, coal_cons_per_capita FLOAT, coal_consumption FLOAT, coal_elec_per_capita FLOAT, coal_electricity FLOAT, 
        coal_prod_change_pct FLOAT, coal_prod_change_twh FLOAT, coal_prod_per_capita FLOAT, coal_production FLOAT, coal_share_elec FLOAT, coal_share_energy FLOAT, 
        electricity_demand FLOAT, electricity_generation FLOAT, electricity_share_energy FLOAT, energy_cons_change_pct FLOAT, 
        energy_cons_change_twh FLOAT, energy_per_capita FLOAT, energy_per_gdp FLOAT, fossil_cons_change_pct FLOAT, fossil_cons_change_twh FLOAT, 
        fossil_elec_per_capita FLOAT, fossil_electricity FLOAT, fossil_energy_per_capita FLOAT, fossil_fuel_consumption FLOAT, fossil_share_elec FLOAT, 
        fossil_share_energy FLOAT, gas_cons_change_pct FLOAT, gas_cons_change_twh FLOAT, gas_consumption FLOAT, gas_elec_per_capita FLOAT, gas_electricity FLOAT, 
        gas_energy_per_capita FLOAT, gas_prod_change_pct FLOAT, gas_prod_change_twh FLOAT, gas_prod_per_capita FLOAT, gas_production FLOAT, gas_share_elec FLOAT, gas_share_energy FLOAT, 
        greenhouse_gas_emissions FLOAT, hydro_cons_change_pct FLOAT, hydro_cons_change_twh FLOAT, hydro_consumption FLOAT, hydro_elec_per_capita FLOAT, hydro_electricity FLOAT, hydro_energy_per_capita FLOAT, hydro_share_elec FLOAT, hydro_share_energy FLOAT, 
        low_carbon_cons_change_pct FLOAT, low_carbon_cons_change_twh FLOAT, low_carbon_consumption FLOAT, low_carbon_elec_per_capita FLOAT, low_carbon_electricity FLOAT, low_carbon_energy_per_capita FLOAT, low_carbon_share_elec FLOAT, low_carbon_share_energy FLOAT, 
        net_elec_imports FLOAT, net_elec_imports_share_demand FLOAT, nuclear_cons_change_pct FLOAT, nuclear_cons_change_twh FLOAT, nuclear_consumption FLOAT, nuclear_elec_per_capita FLOAT, nuclear_electricity FLOAT, 
        nuclear_energy_per_capita FLOAT, nuclear_share_elec FLOAT, nuclear_share_energy FLOAT, oil_cons_change_pct FLOAT, 
        oil_cons_change_twh FLOAT, oil_consumption FLOAT, oil_elec_per_capita FLOAT, oil_electricity FLOAT, oil_energy_per_capita FLOAT, 
        oil_prod_change_pct FLOAT, oil_prod_change_twh FLOAT, oil_prod_per_capita FLOAT, oil_production FLOAT, oil_share_elec FLOAT, oil_share_energy FLOAT, 
        other_renewable_consumption FLOAT, other_renewable_electricity FLOAT, other_renewable_exc_biofuel_electricity FLOAT, other_renewables_cons_change_pct FLOAT, other_renewables_cons_change_twh FLOAT, other_renewables_elec_per_capita FLOAT, other_renewables_elec_per_capita_exc_biofuel FLOAT, other_renewables_energy_per_capita FLOAT, other_renewables_share_elec FLOAT, other_renewables_share_elec_exc_biofuel FLOAT, other_renewables_share_energy FLOAT, 
        per_capita_electricity FLOAT, primary_energy_consumption FLOAT, renewables_cons_change_pct FLOAT, renewables_cons_change_twh FLOAT, renewables_consumption FLOAT, renewables_elec_per_capita FLOAT, renewables_electricity FLOAT, 
        renewables_energy_per_capita FLOAT, renewables_share_elec FLOAT, renewables_share_energy FLOAT, solar_cons_change_pct FLOAT, solar_cons_change_twh FLOAT, solar_consumption FLOAT, solar_elec_per_capita FLOAT, solar_electricity FLOAT, solar_energy_per_capita FLOAT, solar_share_elec FLOAT, solar_share_energy FLOAT, wind_cons_change_pct FLOAT, wind_cons_change_twh FLOAT, wind_consumption FLOAT, wind_elec_per_capita FLOAT, wind_electricity FLOAT, wind_energy_per_capita FLOAT, wind_share_elec FLOAT, wind_share_energy FLOAT,
        UNIQUE KEY unique_record (iso_code, country, year)
    )
""")

In [79]:
# Creating a bulk function to insert the raw data
from tqdm import tqdm
def insert_energy_stats_bulk(df, cursor, conn, columns=None, batch_size=500):
    """Insert the selected columns of ``df`` into ``raw_energy_stats``, one row per statement.

    Rows are written with ``INSERT IGNORE``. The transaction is committed after
    every ``batch_size`` successfully executed rows and once more at the end.
    A row whose insert raises is reported on stdout and skipped, so the
    remaining rows are still loaded.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain every name listed in ``columns``.
    cursor : DB-API cursor
        Cursor used to execute the statements (``%s`` parameter style).
    conn : DB-API connection
        Connection the cursor belongs to; only ``commit()`` is called on it.
    columns : list of str, optional
        Columns to insert, in statement order. Defaults to the full list of
        energy columns defined below.
    batch_size : int, default 500
        Number of executed rows between intermediate commits. Must be >= 1.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    KeyError
        If ``df`` lacks one of the requested columns.
    """
    # A zero batch size used to surface as a misleading per-row
    # "Error inserting row" message (ZeroDivisionError swallowed by the except).
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Default energy columns if not specified
    if columns is None:
        columns = ['country', 'year', 'iso_code', 'population', 'gdp', 'biofuel_cons_change_pct', 'biofuel_cons_change_twh', 'biofuel_cons_per_capita', 'biofuel_consumption', 'biofuel_elec_per_capita', 'biofuel_electricity', 'biofuel_share_elec', 'biofuel_share_energy', 'carbon_intensity_elec', 'coal_cons_change_pct', 'coal_cons_change_twh', 'coal_cons_per_capita', 'coal_consumption', 'coal_elec_per_capita', 'coal_electricity', 'coal_prod_change_pct', 'coal_prod_change_twh', 'coal_prod_per_capita', 'coal_production', 'coal_share_elec', 'coal_share_energy', 'electricity_demand', 'electricity_generation', 'electricity_share_energy', 'energy_cons_change_pct', 'energy_cons_change_twh', 'energy_per_capita', 'energy_per_gdp', 'fossil_cons_change_pct', 'fossil_cons_change_twh', 'fossil_elec_per_capita', 'fossil_electricity', 'fossil_energy_per_capita', 'fossil_fuel_consumption', 'fossil_share_elec', 'fossil_share_energy', 'gas_cons_change_pct', 'gas_cons_change_twh', 'gas_consumption', 'gas_elec_per_capita', 'gas_electricity', 'gas_energy_per_capita', 'gas_prod_change_pct', 'gas_prod_change_twh', 'gas_prod_per_capita', 'gas_production', 'gas_share_elec', 'gas_share_energy', 'greenhouse_gas_emissions', 'hydro_cons_change_pct', 'hydro_cons_change_twh', 'hydro_consumption', 'hydro_elec_per_capita', 'hydro_electricity', 'hydro_energy_per_capita', 'hydro_share_elec', 'hydro_share_energy', 'low_carbon_cons_change_pct', 'low_carbon_cons_change_twh', 'low_carbon_consumption', 'low_carbon_elec_per_capita', 'low_carbon_electricity', 'low_carbon_energy_per_capita', 'low_carbon_share_elec', 'low_carbon_share_energy', 'net_elec_imports', 'net_elec_imports_share_demand', 'nuclear_cons_change_pct', 'nuclear_cons_change_twh', 'nuclear_consumption', 'nuclear_elec_per_capita', 'nuclear_electricity', 'nuclear_energy_per_capita', 'nuclear_share_elec', 'nuclear_share_energy', 'oil_cons_change_pct', 'oil_cons_change_twh', 'oil_consumption', 'oil_elec_per_capita', 'oil_electricity',
                   'oil_energy_per_capita', 'oil_prod_change_pct', 'oil_prod_change_twh', 'oil_prod_per_capita', 'oil_production', 'oil_share_elec', 'oil_share_energy', 'other_renewable_consumption', 'other_renewable_electricity', 'other_renewable_exc_biofuel_electricity', 'other_renewables_cons_change_pct', 'other_renewables_cons_change_twh', 'other_renewables_elec_per_capita', 'other_renewables_elec_per_capita_exc_biofuel', 'other_renewables_energy_per_capita', 'other_renewables_share_elec', 'other_renewables_share_elec_exc_biofuel', 'other_renewables_share_energy', 'per_capita_electricity', 'primary_energy_consumption', 'renewables_cons_change_pct', 'renewables_cons_change_twh', 'renewables_consumption', 'renewables_elec_per_capita', 'renewables_electricity', 'renewables_energy_per_capita', 'renewables_share_elec', 'renewables_share_energy', 'solar_cons_change_pct', 'solar_cons_change_twh', 'solar_consumption', 'solar_elec_per_capita', 'solar_electricity', 'solar_energy_per_capita', 'solar_share_elec', 'solar_share_energy', 'wind_cons_change_pct', 'wind_cons_change_twh', 'wind_consumption', 'wind_elec_per_capita', 'wind_electricity', 'wind_energy_per_capita', 'wind_share_elec', 'wind_share_energy']

    # Keep only the relevant columns and turn NaN into None so the driver
    # sends SQL NULL instead of a float NaN.
    energy_df = df[columns].copy()
    energy_df = energy_df.replace({np.nan: None})

    print("Inserting energy stats into database...")

    placeholders = ', '.join(['%s'] * len(columns))
    col_names = ', '.join(columns)
    # The statement text is identical for every row, so build it once.
    insert_sql = f"INSERT IGNORE INTO raw_energy_stats ({col_names}) VALUES ({placeholders})"

    executed = 0  # statements that ran without raising; drives the commit cadence
    inserted = 0  # rows the server reports as actually stored

    # itertuples yields plain tuples in column order, avoiding the per-row
    # Series construction of iterrows.
    for values in tqdm(energy_df.itertuples(index=False, name=None), total=len(energy_df)):
        try:
            cursor.execute(insert_sql, values)
        except Exception as e:
            # Best effort: report the failing row and carry on with the rest.
            print(f"Error inserting row: {e}")
            continue

        executed += 1
        # INSERT IGNORE reports 0 affected rows for a skipped duplicate. Fall
        # back to counting the row when the cursor cannot report a row count.
        affected = getattr(cursor, "rowcount", -1)
        inserted += affected if isinstance(affected, int) and affected >= 0 else 1

        # Commit outside the try block so a commit failure is not misreported
        # as a row-level insert error.
        if executed % batch_size == 0:
            conn.commit()

    conn.commit()
    print(f"Inserted successfully. Total rows inserted: {inserted}")

In [80]:
# Columns to load, listed explicitly and passed to insert_energy_stats_bulk.
sel_cols = ['iso_code', 'country', 'year', 'population', 'gdp', 'biofuel_cons_change_pct', 'biofuel_cons_change_twh', 'biofuel_cons_per_capita', 'biofuel_consumption', 'biofuel_elec_per_capita', 'biofuel_electricity', 'biofuel_share_elec', 'biofuel_share_energy', 'carbon_intensity_elec', 'coal_cons_change_pct', 'coal_cons_change_twh', 'coal_cons_per_capita', 'coal_consumption', 'coal_elec_per_capita', 'coal_electricity', 'coal_prod_change_pct', 'coal_prod_change_twh', 'coal_prod_per_capita', 'coal_production', 'coal_share_elec', 'coal_share_energy', 'electricity_demand', 'electricity_generation', 'electricity_share_energy', 'energy_cons_change_pct', 'energy_cons_change_twh', 'energy_per_capita', 'energy_per_gdp', 'fossil_cons_change_pct', 'fossil_cons_change_twh', 'fossil_elec_per_capita', 'fossil_electricity', 'fossil_energy_per_capita', 'fossil_fuel_consumption', 'fossil_share_elec', 'fossil_share_energy', 'gas_cons_change_pct', 'gas_cons_change_twh', 'gas_consumption', 'gas_elec_per_capita', 'gas_electricity', 'gas_energy_per_capita', 'gas_prod_change_pct', 'gas_prod_change_twh', 'gas_prod_per_capita', 'gas_production', 'gas_share_elec', 'gas_share_energy', 'greenhouse_gas_emissions', 'hydro_cons_change_pct', 'hydro_cons_change_twh', 'hydro_consumption', 'hydro_elec_per_capita', 'hydro_electricity', 'hydro_energy_per_capita', 'hydro_share_elec', 'hydro_share_energy', 'low_carbon_cons_change_pct', 'low_carbon_cons_change_twh', 'low_carbon_consumption', 'low_carbon_elec_per_capita', 'low_carbon_electricity', 'low_carbon_energy_per_capita', 'low_carbon_share_elec', 'low_carbon_share_energy', 'net_elec_imports', 'net_elec_imports_share_demand', 'nuclear_cons_change_pct', 'nuclear_cons_change_twh', 'nuclear_consumption', 'nuclear_elec_per_capita', 'nuclear_electricity', 'nuclear_energy_per_capita', 'nuclear_share_elec', 'nuclear_share_energy', 'oil_cons_change_pct', 'oil_cons_change_twh', 'oil_consumption', 'oil_elec_per_capita', 'oil_electricity',
'oil_energy_per_capita', 'oil_prod_change_pct', 'oil_prod_change_twh', 'oil_prod_per_capita', 'oil_production', 'oil_share_elec', 'oil_share_energy', 'other_renewable_consumption', 'other_renewable_electricity', 'other_renewable_exc_biofuel_electricity', 'other_renewables_cons_change_pct', 'other_renewables_cons_change_twh', 'other_renewables_elec_per_capita', 'other_renewables_elec_per_capita_exc_biofuel', 'other_renewables_energy_per_capita', 'other_renewables_share_elec', 'other_renewables_share_elec_exc_biofuel', 'other_renewables_share_energy', 'per_capita_electricity', 'primary_energy_consumption', 'renewables_cons_change_pct', 'renewables_cons_change_twh', 'renewables_consumption', 'renewables_elec_per_capita', 'renewables_electricity', 'renewables_energy_per_capita', 'renewables_share_elec', 'renewables_share_energy', 'solar_cons_change_pct', 'solar_cons_change_twh', 'solar_consumption', 'solar_elec_per_capita', 'solar_electricity', 'solar_energy_per_capita', 'solar_share_elec', 'solar_share_energy', 'wind_cons_change_pct', 'wind_cons_change_twh', 'wind_consumption', 'wind_elec_per_capita', 'wind_electricity', 'wind_energy_per_capita', 'wind_share_elec', 'wind_share_energy']

# Load the CSV rows using the cursor/connection opened in the table-creation
# cell, then release both even if the insert raises. The verification cells
# below open their own connection, so nothing later depends on these handles.
try:
    insert_energy_stats_bulk(dat, cursor, conn, columns=sel_cols)
finally:
    cursor.close()
    conn.close()

Inserting energy stats into database...


100%|██████████████████████████████████████████████████████████████████████████| 21812/21812 [00:18<00:00, 1191.06it/s]

Inserted successfully. Total rows inserted: 21812





### Next, we query the MySQL database to verify that the data has been stored correctly.

In [82]:
# Open a connection to energy_database for the read-back check.
conn = mysql.connector.connect(
    database="energy_database",
    host=host,
    user=username,
    password=password,
)

# Fetch every row and column of the raw table.
query = "SELECT * FROM raw_energy_stats"

In [83]:
# Read the whole raw table into a DataFrame, then release the connection:
# the remaining cells only inspect `data` and no longer need the database.
try:
    data = pd.read_sql(query, con=conn)
finally:
    conn.close()

In [84]:
# Preview the first rows read back from raw_energy_stats as a sanity check.
data.head()

Unnamed: 0,iso_code,country,year,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
0,,ASEAN (Ember),2000,,,,,,,,...,0.0,,,,,,0.0,,0.0,
1,,ASEAN (Ember),2001,,,,,,,,...,0.0,,,,,,0.0,,0.0,
2,,ASEAN (Ember),2002,,,,,,,,...,0.0,,,,,,0.0,,0.0,
3,,ASEAN (Ember),2003,,,,,,,,...,0.0,,,,,,0.0,,0.0,
4,,ASEAN (Ember),2004,,,,,,,,...,0.0,,,,,,0.0,,0.0,


In [85]:
# Show (rows, columns) of the frame read back from MySQL, to compare with the
# number of rows reported by the insert step.
data.shape

(21812, 129)

## With this we have finished creating the MySQL database and inserting the raw data. Also, we are able to query the database without any issues.

## We will now query the raw data in another .ipynb notebook, preprocess it using SQL queries, and store the result in MongoDB.