# Loading Dataset to Neo4JS

In [39]:
# pip install neo4j
# pip install pandas py2neo

In [7]:
import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from neo4j.exceptions import Neo4jError,ServiceUnavailable,ClientError
from py2neo import Graph, Node, Relationship

In [41]:
#Intergovernmental Panel on Climate Change - IPCC

In [42]:
df = pd.read_excel('IEA_EDGAR_CO2_1970_2023.xlsx',sheet_name="IPCC 2006")

In [43]:
df.info

<bound method DataFrame.info of        IPCC_annex       C_group_IM24_sh Country_code_A3      Name  \
0     Non-Annex_I  Rest Central America             ABW     Aruba   
1     Non-Annex_I  Rest Central America             ABW     Aruba   
2     Non-Annex_I  Rest Central America             ABW     Aruba   
3     Non-Annex_I  Rest Central America             ABW     Aruba   
4     Non-Annex_I  Rest Central America             ABW     Aruba   
...           ...                   ...             ...       ...   
3523  Non-Annex_I       Southern_Africa             ZWE  Zimbabwe   
3524  Non-Annex_I       Southern_Africa             ZWE  Zimbabwe   
3525  Non-Annex_I       Southern_Africa             ZWE  Zimbabwe   
3526  Non-Annex_I       Southern_Africa             ZWE  Zimbabwe   
3527  Non-Annex_I       Southern_Africa             ZWE  Zimbabwe   

     ipcc_code_2006_for_standard_report  \
0                               1.A.1.a   
1                                 1.A.2   
2         

In [44]:
df.head()

Unnamed: 0,IPCC_annex,C_group_IM24_sh,Country_code_A3,Name,ipcc_code_2006_for_standard_report,ipcc_code_2006_for_standard_report_name,Substance,fossil_bio,Y_1970,Y_1971,...,Y_2014,Y_2015,Y_2016,Y_2017,Y_2018,Y_2019,Y_2020,Y_2021,Y_2022,Y_2023
0,Non-Annex_I,Rest Central America,ABW,Aruba,1.A.1.a,Main Activity Electricity and Heat Production,CO2,fossil,17.343706,19.877523,...,222.238026,230.102165,240.807934,223.432549,226.341226,268.030985,232.923266,253.01031,251.278359,263.50657
1,Non-Annex_I,Rest Central America,ABW,Aruba,1.A.2,Manufacturing Industries and Construction,CO2,fossil,0.193087,0.221296,...,22.208463,24.215383,21.728011,33.276726,28.199275,29.319799,26.562462,34.208633,33.974462,35.627795
2,Non-Annex_I,Rest Central America,ABW,Aruba,1.A.3.a,Civil Aviation,CO2,fossil,0.754306,0.864505,...,18.429354,18.900723,18.986393,19.690313,16.58486,25.492941,11.216762,12.473237,17.61512,20.659366
3,Non-Annex_I,Rest Central America,ABW,Aruba,1.A.3.b_noRES,Road Transportation no resuspension,CO2,fossil,1.04072,1.192764,...,126.415252,135.23878,140.177105,135.429729,136.224966,165.125521,128.398393,143.107436,142.127812,149.04432
4,Non-Annex_I,Rest Central America,ABW,Aruba,1.A.3.d,Water-borne Navigation,CO2,fossil,,,...,9.191439,9.637073,10.209681,9.763253,9.615795,11.382993,9.866186,10.696811,10.876347,12.16022


In [11]:
def connectToDataBases():
    try:
        URI = "bolt://localhost:7687"
        AUTH = ("neo4j", "asd123asd123")
        
        driver = GraphDatabase.driver(URI, auth=AUTH)
        return driver
    except Neo4jError as e:
        print(f"Error when connecting to DB - {e.code} - {e.message}")

## Data Pre-processing

In [47]:
#Aggregate Data to create tables

In [48]:
df = df.drop(["Substance","fossil_bio"],axis=1)

In [49]:
Country_columns = ["IPCC_annex","C_group_IM24_sh","Country_code_A3","Name"]
Code_columns = ["ipcc_code_2006_for_standard_report","ipcc_code_2006_for_standard_report_name"]
Country = df[Country_columns]
Ipcc_code = df.drop(Country_columns,axis=1)[Code_columns]
Year = df.drop(Country_columns + Code_columns,axis=1)
print(Country.columns)
print(Ipcc_code.columns)
print(Year.columns)

Index(['IPCC_annex', 'C_group_IM24_sh', 'Country_code_A3', 'Name'], dtype='object')
Index(['ipcc_code_2006_for_standard_report', 'ipcc_code_2006_for_standard_report_name'], dtype='object')
Index(['Y_1970', 'Y_1971', 'Y_1972', 'Y_1973', 'Y_1974', 'Y_1975', 'Y_1976',
       'Y_1977', 'Y_1978', 'Y_1979', 'Y_1980', 'Y_1981', 'Y_1982', 'Y_1983',
       'Y_1984', 'Y_1985', 'Y_1986', 'Y_1987', 'Y_1988', 'Y_1989', 'Y_1990',
       'Y_1991', 'Y_1992', 'Y_1993', 'Y_1994', 'Y_1995', 'Y_1996', 'Y_1997',
       'Y_1998', 'Y_1999', 'Y_2000', 'Y_2001', 'Y_2002', 'Y_2003', 'Y_2004',
       'Y_2005', 'Y_2006', 'Y_2007', 'Y_2008', 'Y_2009', 'Y_2010', 'Y_2011',
       'Y_2012', 'Y_2013', 'Y_2014', 'Y_2015', 'Y_2016', 'Y_2017', 'Y_2018',
       'Y_2019', 'Y_2020', 'Y_2021', 'Y_2022', 'Y_2023'],
      dtype='object')


In [50]:
# Handling missing values
df.fillna('Unknown',inplace=True)
#remove duplicate records
# Country.drop_duplicates(subset=['Name'], keep='first', inplace=True)
# Ipcc_code.drop_duplicates(subset=['ipcc_code_2006_for_standard_report'], keep='first', inplace=True)

  df.fillna('Unknown',inplace=True)


In [88]:
df.eq("Unknown").sum()

IPCC_annex                                   0
C_group_IM24_sh                              0
Country_code_A3                              0
Name                                         0
ipcc_code_2006_for_standard_report           0
ipcc_code_2006_for_standard_report_name      0
Y_1970                                     746
Y_1971                                     740
Y_1972                                     738
Y_1973                                     730
Y_1974                                     705
Y_1975                                     698
Y_1976                                     692
Y_1977                                     710
Y_1978                                     651
Y_1979                                     653
Y_1980                                     650
Y_1981                                     633
Y_1982                                     651
Y_1983                                     643
Y_1984                                     639
Y_1985       

In [51]:
def create_node(tx, label, properties):
    query = (
        f"CREATE (n:{label} $props) "
        "RETURN n"
    )
    result = tx.run(query, props=properties)
    return result.single()[0]

In [52]:
driver = connectToDataBases()

In [53]:
graph = Graph("bolt://localhost:7687",auth=("neo4j", "asd123asd123"))

In [110]:
def checkIfNodesAvailable(query):
    result = graph.run(query)
    record = result.data()
    has_nodes = record[0]["IsNodesPresent"]
    return has_nodes

In [9]:
def createNodes():
    try:
        for i,row in df.iterrows():
            country_node = Node("Country",code=row["Country_code_A3"],name=row["Name"],zone=row["C_group_IM24_sh"])
            graph.merge(country_node,"Country","code")
    
            emission_node = Node("Emission",ipcc_code = row["ipcc_code_2006_for_standard_report"],
                         ipcc_name = row["ipcc_code_2006_for_standard_report_name"])
    
            for year_column in [col for col in Year.columns]:
                emission_node[year_column] = row[year_column]
            graph.merge(emission_node,"Emission","ipcc_code")
    
            relation = Relationship(country_node,"HAS_PRODUCED",emission_node)
            graph.create(relation)
    except Neo4jError as e:
        print(f"Error - {e.code} - {e.message}")
    except Exception as e:
        print(f"Unexpected Error - {str(e)}")
        

In [228]:
check_query = """
OPTIONAL MATCH (n) WITH n LIMIT 1 
RETURN n IS NOT NULL AS IsNodesPresent
"""

In [146]:
if(checkIfNodesAvailable(check_query) == False):
    createNodes()
else:
    print("Nodes are already created and available in the DB!")

Nodes are already created and available in the DB!
