# Loading Dataset to Neo4j

In [147]:
# pip install neo4j
# pip install pandas py2neo
# pip install python-dotenv
# !pip install dagster papermill jupyter

In [148]:
import os

import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from neo4j.exceptions import Neo4jError,ServiceUnavailable,ClientError
from py2neo import Graph, Node, Relationship, Transaction
from dotenv import load_dotenv
from pathlib import Path
import psycopg2
from psycopg2 import sql

In [149]:
def connectToDB():
    """Open a psycopg2 connection to the ``co2_emission`` PostgreSQL database.

    Connection settings are taken from the environment variables ``pg_host``,
    ``pg_username``, ``pg_password`` and ``pg_port`` after loading the
    ``.env`` file located at ``../../.env`` relative to the working directory.

    Returns:
        The connection object returned by ``psycopg2.connect``, or ``None``
        when the ``.env`` file could not be loaded, ``pg_port`` is not set,
        or the connection attempt fails (the error is printed, not raised).
    """
    try:
        # Kept from the original flow: a first load without an explicit path,
        # followed by the project-level .env whose result gates the connection.
        load_dotenv(verbose=True)
        env_path = Path('../../') / '.env'
        if not load_dotenv(dotenv_path=env_path):
            return None

        raw_port = os.environ.get("pg_port")
        if raw_port is None:
            # int(None) would only produce an opaque TypeError; say what is wrong.
            print("pg_port is not set in the environment; cannot connect to PostgreSQL")
            return None

        return psycopg2.connect(
            database="co2_emission",
            host=os.environ.get("pg_host"),
            user=os.environ.get("pg_username"),
            password=os.environ.get("pg_password"),
            port=int(raw_port),
        )
    except Exception as error:
        # psycopg2.DatabaseError derives from Exception, so one clause covers both.
        print(error)
        return None

In [150]:
# connectToDB() returns None when the .env file is missing or the connection
# fails; stop here with a clear message instead of an AttributeError on None.
conn = connectToDB()
if conn is None:
    raise RuntimeError(
        "Could not connect to PostgreSQL; check the pg_* settings in the .env file"
    )
cursor = conn.cursor()

In [151]:
# Fetch every emission row together with its country and IPCC category.
# LEFT JOINs keep emission rows that have no matching country / ipcc record.
# Because all columns of the three tables are selected, country_id and ipcc_id
# each appear twice in the result set.
query = """
SELECT c.*, e.*,i.* FROM emission e LEFT JOIN country c on e.country_id = c.country_id
LEFT JOIN ipcc i on e.ipcc_id = i.ipcc_id"""

In [152]:
# Run the join and load the complete result set into a DataFrame, labelling
# the columns with the names reported by the cursor metadata.
cursor.execute(query)
all_data = cursor.fetchall()
df_columns = [column_meta[0] for column_meta in cursor.description]
df_combined = pd.DataFrame(data=all_data, columns=df_columns)

In [153]:
# Preview the first five joined rows.
df_combined.head(n=5)

Unnamed: 0,country_id,name,code,zone,ipcc_annex,emission_id,country_id.1,ipcc_id,y_2000,y_2001,...,y_2017,y_2018,y_2019,y_2020,y_2021,y_2022,y_2023,ipcc_id.1,ipcc_code,ipcc_name
0,225,Aruba,ABW,Rest Central America,Non-Annex_I,1,225,8,143.5935666062,146.6161928686,...,223.4325491867,226.3412257045,268.0309848746,232.9232662866,253.0103102186,251.2783592288,263.5065696296,8,1.A.1.a,Main Activity Electricity and Heat Production
1,225,Aruba,ABW,Rest Central America,Non-Annex_I,2,225,11,6.6792409451,7.8339209034,...,33.276725739,28.199274569,29.3197993639,26.5624616304,34.2086327451,33.974461753,35.627795004,11,1.A.2,Manufacturing Industries and Construction
2,225,Aruba,ABW,Rest Central America,Non-Annex_I,3,225,23,15.0390362479,11.6839640871,...,19.6903132598,16.5848603275,25.4929408791,11.2167622617,12.4732368869,17.6151202385,20.6593659909,23,1.A.3.a,Civil Aviation
3,225,Aruba,ABW,Rest Central America,Non-Annex_I,4,225,15,80.9371557226,82.4069425714,...,135.4297287961,136.2249664849,165.1255208739,128.3983932191,143.1074358952,142.1278115272,149.0443195325,15,1.A.3.b_noRES,Road Transportation no resuspension
4,225,Aruba,ABW,Rest Central America,Non-Annex_I,5,225,14,2.1166644282,2.1578040911,...,9.7632527279,9.6157954463,11.3829934655,9.8661856769,10.6968109806,10.8763466736,12.1602195349,14,1.A.3.d,Water-borne Navigation


## Data Pre-processing

In [155]:
#Aggregate Data to create tables

In [156]:
# Surrogate key columns are not needed in the graph. Dropping by label removes
# every column carrying that name, so the duplicated country_id / ipcc_id
# columns produced by the join go away in one pass.
# errors="ignore" keeps the cell re-runnable: df_combined is rebound here, so a
# second execution would otherwise raise KeyError for the already-removed labels.
id_columns = ["country_id", "emission_id", "ipcc_id"]
df_combined = df_combined.drop(columns=id_columns, errors="ignore")

In [157]:
# Column groups that describe each entity inside the combined frame.
Country_columns = ["ipcc_annex", "zone", "code", "name"]
Code_columns = ["ipcc_code", "ipcc_name"]

# Country and IPCC views are plain column selections; whatever is left over
# after removing both groups is the per-year emission data.
Country = df_combined[Country_columns]
Ipcc_code = df_combined[Code_columns]
Year = df_combined.drop(columns=[*Country_columns, *Code_columns])

for entity_frame in (Country, Ipcc_code, Year):
    print(entity_frame.columns)

Index(['ipcc_annex', 'zone', 'code', 'name'], dtype='object')
Index(['ipcc_code', 'ipcc_name'], dtype='object')
Index(['y_2000', 'y_2001', 'y_2002', 'y_2003', 'y_2004', 'y_2005', 'y_2006',
       'y_2007', 'y_2008', 'y_2009', 'y_2010', 'y_2011', 'y_2012', 'y_2013',
       'y_2014', 'y_2015', 'y_2016', 'y_2017', 'y_2018', 'y_2019', 'y_2020',
       'y_2021', 'y_2022', 'y_2023'],
      dtype='object')


In [158]:
# Missing values: substitute 0 for every NaN/None in the combined frame.
df_combined = df_combined.fillna(value=0)
# Duplicate removal is currently disabled; the statements are kept for reference.
# Country.drop_duplicates(subset=['Name'], keep='first', inplace=True)
# Ipcc_code.drop_duplicates(subset=['ipcc_code_2006_for_standard_report'], keep='first', inplace=True)

In [159]:
# Count, per column, how many cells are exactly 0 (including the filled gaps).
(df_combined == 0).sum()

name            0
code            0
zone            0
ipcc_annex      0
y_2000        492
y_2001        488
y_2002        498
y_2003        491
y_2004        419
y_2005        398
y_2006        394
y_2007        412
y_2008        406
y_2009        394
y_2010        372
y_2011        372
y_2012        354
y_2013        351
y_2014        341
y_2015        332
y_2016        327
y_2017        306
y_2018        314
y_2019        313
y_2020        316
y_2021        315
y_2022        317
y_2023        319
ipcc_code       0
ipcc_name       0
dtype: int64

Remove the year columns before 2000. For our analysis, we focus on the period 2000–2023.

In [161]:
# Column labels y_2000 … y_2023 (range end is exclusive), then restrict the
# Year frame to exactly those columns, in chronological order.
year_list = ["y_" + str(year) for year in range(2000, 2024)]
Year = Year[year_list]

## Insert data into the DB

In [163]:
# Load environment variables, then point at the project-level .env file that
# holds the Neo4j settings.
load_dotenv(verbose=True)
env_path = Path('../../').joinpath('.env')

In [164]:
# Read the Neo4j connection settings from the environment (populated from the
# .env file when it can be loaded). The values are validated here so that a
# missing setting fails with a clear message instead of a NameError or a
# connection attempt with empty credentials in the next cell.
env_file_loaded = load_dotenv(dotenv_path=env_path)
host = os.environ.get("neo_host")
username = os.environ.get("neo_username")
password = os.environ.get("neo_password")

missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("neo_host", host),
        ("neo_username", username),
        ("neo_password", password),
    )
    if not setting_value
]
if missing_settings:
    if not env_file_loaded:
        print(".env file does not have required values!")
    raise RuntimeError(
        "Missing Neo4j settings: " + ", ".join(missing_settings)
        + f" (checked {env_path} and the process environment)"
    )

In [165]:
# Open the py2neo handle used by every Neo4j helper below.
neo4j_credentials = (username, password)
graph = Graph(host, auth=neo4j_credentials)

In [166]:
def checkIfNodesAvailable(query):
    """Run ``query`` and report whether the database already holds nodes.

    Args:
        query: Cypher text whose first result row exposes a column named
            ``IsNodesPresent``.

    Returns:
        The ``IsNodesPresent`` value of the first row, or ``False`` when the
        query produced no rows at all (previously this raised ``IndexError``).
    """
    records = graph.run(query).data()
    if not records:
        # No rows means nothing matched, so treat it as "no nodes present".
        return False
    return records[0]["IsNodesPresent"]

def deleteNodes(query):
    """Execute a destructive Cypher statement against the module-level graph.

    Errors are reported on stdout rather than raised, so a failed delete does
    not stop the notebook.

    Args:
        query: Cypher text to run (e.g. a ``DETACH DELETE`` statement).

    Returns:
        ``True`` when the statement ran without raising, ``False`` when an
        error was caught and printed. Callers that ignore the return value
        behave as before.
    """
    try:
        graph.run(query)
    except Neo4jError as e:
        print(f"Error - {e.code} - {e.message}")
        return False
    except Exception as e:
        print(f"Unexpected Error - {str(e)}")
        return False
    return True

In [167]:
def createNodes(row):
    """Write one emission record to Neo4j as country -> ipcc -> emission.

    The country node is merged on its ``code`` property (label ``country``);
    the ipcc and emission nodes are created anew for every row. The emission
    node carries one property per column of the module-level ``Year`` frame
    (the ``y_`` prefix stripped from the name) plus ``total_emission``, the
    sum of those yearly values.

    Both relationships are written with a single ``graph.create`` call and
    only after the emission values have been converted, so a bad value or a
    failed write cannot leave a country -> ipcc pair without its emission node.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing ``code``,
            ``name``, ``zone``, ``ipcc_code``, ``ipcc_name`` and every column
            listed in ``Year.columns``.

    Returns:
        None. Errors are printed and swallowed so the surrounding loop can
        continue with the next row.
    """
    try:
        country_node = Node("country", code=row["code"], name=row["name"], zone=row["zone"])
        graph.merge(country_node, "country", "code")

        ipcc_node = Node("ipcc", ipcc_code=row["ipcc_code"], ipcc_name=row["ipcc_name"])

        emission_node = Node("emission")
        total_emission = 0.0
        for year_column in Year.columns:
            yearly_value = float(row[year_column])  # convert once, use twice
            total_emission += yearly_value
            emission_node[year_column.replace("y_", "")] = yearly_value
        emission_node["total_emission"] = total_emission

        under_ipcc = Relationship(country_node, "UNDER_IPCC_CODE", ipcc_node)
        has_produced = Relationship(ipcc_node, "HAS_PRODUCED", emission_node)
        # Union of the two relationships -> one subgraph, one create operation.
        graph.create(under_ipcc | has_produced)
    except Neo4jError as e:
        print(f"Error - {e.code} - {e.message}")
        return
    except Exception as e:
        print(f"Unexpected Error - {str(e)}")
        return

In [168]:
# Returns a single boolean column, IsNodesPresent. OPTIONAL MATCH still yields
# one row (with n = null) when the database is empty, so the result is false
# for an empty graph and true as soon as any node exists.
check_query = """
OPTIONAL MATCH (n) WITH n LIMIT 1 
RETURN n IS NOT NULL AS IsNodesPresent
"""
# Removes every node in the database together with its relationships.
delete_query="""MATCH (n) DETACH DELETE n"""

In [169]:
# Rebuild the graph from df_combined. A database that already contains nodes
# is wiped first; either way every row is then inserted once.
database_is_empty = checkIfNodesAvailable(check_query) == False

if not database_is_empty:
    print("Nodes are already created and available in the DB!")
    deleteNodes(delete_query)

for _, emission_row in df_combined.iterrows():
    createNodes(emission_row)

if not database_is_empty:
    print("Nodes are deleted and created again")

Nodes are already created and available in the DB!
Nodes are deleted and created again
