# Loading Dataset to Neo4j

In [1]:
# pip install neo4j
# pip install pandas py2neo
# pip install python-dotenv
# !pip install dagster papermill jupyter

In [2]:
import os

import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from neo4j.exceptions import Neo4jError,ServiceUnavailable,ClientError
from py2neo import Graph, Node, Relationship, Transaction
from dotenv import load_dotenv
from pathlib import Path
import psycopg2
from psycopg2 import sql

In [3]:
#Intergovernmental Panel on Climate Change - IPCC

In [4]:
# Read the "IPCC 2006" sheet of the EDGAR workbook, skipping its first 9 rows.
df = pd.read_excel(
    "data/IEA_EDGAR_CO2_1970_2023.xlsx",
    sheet_name="IPCC 2006",
    skiprows=9,
)

In [5]:
# df.columns

In [6]:
# df.head()

In [7]:
# def connectToDataBases():
#     try:
#         URI = "bolt://localhost:7687"
#         AUTH = ("neo4j", "asd123asd123")
        
#         driver = GraphDatabase.driver(URI, auth=AUTH)
#         return driver
#     except Neo4jError as e:
#         print(f"Error when connecting to DB - {e.code} - {e.message}")

In [8]:
def connectToDB():
    """Open a PostgreSQL connection to the ``CO2_Emission`` database.

    Connection settings are read from the ``pg_host``, ``pg_username``,
    ``pg_password`` and ``pg_port`` environment variables after loading the
    ``.env`` file located in the current working directory.

    Returns:
        A psycopg2 connection on success. ``None`` when the ``.env`` file
        could not be loaded, when a required variable is missing, or when the
        connection attempt fails (the reason is printed in the last two cases).
    """
    try:
        env_path = Path('.') / '.env'
        # A single load is enough; load_dotenv is falsy when the file is
        # missing or defines no variables.
        if not load_dotenv(dotenv_path=env_path, verbose=True):
            return None

        required = ("pg_host", "pg_username", "pg_password", "pg_port")
        settings = {name: os.environ.get(name) for name in required}
        missing = [name for name, value in settings.items() if value is None]
        if missing:
            # Name the missing keys instead of failing later with int(None).
            print(f"Missing required environment variables: {', '.join(missing)}")
            return None

        return psycopg2.connect(database="CO2_Emission",
                                host=settings["pg_host"],
                                user=settings["pg_username"],
                                password=settings["pg_password"],
                                port=int(settings["pg_port"]))
    except Exception as error:
        # Best-effort helper: report the problem and signal failure with None
        # (psycopg2.DatabaseError is already covered by Exception).
        print(error)
        return None

In [9]:
# conn = connectToDB()
# cursor = conn.cursor()

In [10]:
# Join every emission row with its country and IPCC category (LEFT JOINs keep
# emission rows that have no match). PostgreSQL compares with "=", not "==".
query = sql.SQL("""
SELECT c.*, e.*,i.* FROM emission e LEFT JOIN country c on e.country_id = c.country_id
LEFT JOIN ipcc i on e.ipcc_id = i.ipcc_id""")

In [11]:
# cursor.execute(query)
# all_data = cursor.fetchall()
# df_combined = pd.DataFrame(all_data)

## Data Pre-processing

In [12]:
#Aggregate Data to create tables

In [13]:
# Drop the Substance and fossil_bio columns.
df = df.drop(columns=["Substance", "fossil_bio"])

In [14]:
# Give the verbose source columns short names in a single rename pass.
df = df.rename(
    columns={
        "C_group_IM24_sh": "zone",
        "Country_code_A3": "code",
        "ipcc_code_2006_for_standard_report": "ipcc_code",
        "ipcc_code_2006_for_standard_report_name": "ipcc_name",
    }
)

In [15]:
# Split the wide table into three views: country attributes, IPCC category
# attributes, and whatever columns remain after removing both groups.
Country_columns = ["IPCC_annex", "zone", "code", "Name"]
Code_columns = ["ipcc_code", "ipcc_name"]
Country = df.loc[:, Country_columns]
Ipcc_code = df.loc[:, Code_columns]
Year = df.drop(columns=Country_columns + Code_columns)
for frame in (Country, Ipcc_code, Year):
    print(frame.columns)

Index(['IPCC_annex', 'zone', 'code', 'Name'], dtype='object')
Index(['ipcc_code', 'ipcc_name'], dtype='object')
Index(['Y_1970', 'Y_1971', 'Y_1972', 'Y_1973', 'Y_1974', 'Y_1975', 'Y_1976',
       'Y_1977', 'Y_1978', 'Y_1979', 'Y_1980', 'Y_1981', 'Y_1982', 'Y_1983',
       'Y_1984', 'Y_1985', 'Y_1986', 'Y_1987', 'Y_1988', 'Y_1989', 'Y_1990',
       'Y_1991', 'Y_1992', 'Y_1993', 'Y_1994', 'Y_1995', 'Y_1996', 'Y_1997',
       'Y_1998', 'Y_1999', 'Y_2000', 'Y_2001', 'Y_2002', 'Y_2003', 'Y_2004',
       'Y_2005', 'Y_2006', 'Y_2007', 'Y_2008', 'Y_2009', 'Y_2010', 'Y_2011',
       'Y_2012', 'Y_2013', 'Y_2014', 'Y_2015', 'Y_2016', 'Y_2017', 'Y_2018',
       'Y_2019', 'Y_2020', 'Y_2021', 'Y_2022', 'Y_2023'],
      dtype='object')


In [16]:
# Replace missing values with 0 (rebinding instead of mutating in place).
df = df.fillna(0)
# Duplicate removal is currently disabled:
# Country.drop_duplicates(subset=['Name'], keep='first', inplace=True)
# Ipcc_code.drop_duplicates(subset=['ipcc_code_2006_for_standard_report'], keep='first', inplace=True)

In [17]:
# Count, per column, how many entries are equal to 0.
(df == 0).sum()

IPCC_annex      0
zone            0
code            0
Name            0
ipcc_code       0
ipcc_name       0
Y_1970        751
Y_1971        746
Y_1972        746
Y_1973        736
Y_1974        712
Y_1975        702
Y_1976        695
Y_1977        713
Y_1978        652
Y_1979        654
Y_1980        651
Y_1981        634
Y_1982        652
Y_1983        644
Y_1984        640
Y_1985        636
Y_1986        640
Y_1987        636
Y_1988        641
Y_1989        631
Y_1990        541
Y_1991        539
Y_1992        529
Y_1993        517
Y_1994        509
Y_1995        504
Y_1996        479
Y_1997        463
Y_1998        481
Y_1999        497
Y_2000        492
Y_2001        488
Y_2002        498
Y_2003        491
Y_2004        418
Y_2005        398
Y_2006        394
Y_2007        412
Y_2008        406
Y_2009        394
Y_2010        372
Y_2011        372
Y_2012        354
Y_2013        351
Y_2014        341
Y_2015        332
Y_2016        327
Y_2017        306
Y_2018        314
Y_2019    

Remove the year columns before 2000. Our analysis focuses on the period 2000-2023.

In [18]:
# Keep only the Y_2000 .. Y_2023 columns.
year_list = [f"Y_{year}" for year in range(2000, 2024)]
Year = Year[year_list]

## Insert data into the DB

In [19]:
# Path of the .env file in the current working directory.
env_path = Path(".env")
# Load environment variables using python-dotenv's default lookup.
load_dotenv(verbose=True)

In [20]:
# Read the Neo4j connection settings from the .env file.
if load_dotenv(dotenv_path=env_path):
    host = os.environ.get("neo_host")
    username = os.environ.get("neo_username")
    password = os.environ.get("neo_password")
else:
    # Fail here: host/username/password would otherwise stay undefined and the
    # Graph(...) call that follows would die with an unrelated NameError.
    raise RuntimeError(".env file does not have required values!")

In [21]:
# Open the py2neo handle using the settings read from the environment.
neo4j_credentials = (username, password)
graph = Graph(host, auth=neo4j_credentials)

In [22]:
def checkIfNodesAvailable(query):
    """Run ``query`` on the module-level ``graph`` and report ``IsNodesPresent``.

    Args:
        query: Cypher text whose result exposes an ``IsNodesPresent`` column.

    Returns:
        The ``IsNodesPresent`` value of the first record, or ``False`` when
        the query returns no records at all.
    """
    records = graph.run(query).data()
    if not records:
        # No rows came back, so there is nothing to report as present;
        # indexing records[0] here would raise IndexError.
        return False
    return records[0]["IsNodesPresent"]

In [23]:
def createNodes(row):
    """Write one dataset row to the graph as country -> ipcc -> emission.

    The country node is merged on its ``code`` property under the ``country``
    label; a new ``ipcc`` node and a new ``emission`` node are created and
    linked with ``UNDER_IPCC_CODE`` and ``HAS_PRODUCED`` relationships. The
    emission node gets one property per column of the module-level ``Year``
    frame (with the ``Y_`` prefix stripped) plus their sum as
    ``total_emission``.

    Args:
        row: Mapping-like record providing ``code``, ``Name``, ``zone``,
            ``ipcc_code``, ``ipcc_name`` and every column named in ``Year``.

    Returns:
        None. Errors are printed and swallowed so the caller can continue
        with the next row.
    """
    try:
        country = Node("country", code=row["code"], name=row["Name"], zone=row["zone"])
        graph.merge(country, "country", "code")

        ipcc = Node("ipcc", ipcc_code=row["ipcc_code"], ipcc_name=row["ipcc_name"])
        under_ipcc = Relationship(country, "UNDER_IPCC_CODE", ipcc)
        graph.create(under_ipcc)

        emission = Node("emission")
        running_total = 0.0
        for column in Year.columns:
            value = row[column]
            running_total += value
            # Property key is the bare year, e.g. "Y_2005" -> "2005".
            emission[column.replace("Y_", "")] = value
        emission["total_emission"] = running_total

        has_produced = Relationship(ipcc, "HAS_PRODUCED", emission)
        graph.create(has_produced)
    except Neo4jError as err:
        print(f"Error - {err.code} - {err.message}")
        return
    except Exception as err:
        print(f"Unexpected Error - {str(err)}")
        return

In [24]:
# Cypher probe: looks at no more than one node and returns a single boolean
# column, IsNodesPresent, which is true when a node was matched.
check_query = """
OPTIONAL MATCH (n) WITH n LIMIT 1 
RETURN n IS NOT NULL AS IsNodesPresent
"""

In [25]:
# Seed the graph only when the probe reports no nodes, so re-running the
# notebook does not insert the dataset a second time. A truthiness test is
# used so that a None result is treated as "empty" rather than as "present".
if not checkIfNodesAvailable(check_query):
    for _, row in df.iterrows():
        createNodes(row)
else:
    print("Nodes are already created and available in the DB!")

Nodes are already created and available in the DB!
