# Loading Dataset to PostgreSQL

In [4]:
# Dependency: psycopg2 (install once with `%pip install psycopg2` if it is missing).

In [42]:
import os

import numpy as np
import pandas as pd
import psycopg2

In [44]:
# Read the "IPCC 2006" sheet of the workbook into a DataFrame. The first
# 9 rows are skipped (presumably a preamble above the header row — TODO confirm).
df = pd.read_excel(
    'IEA_EDGAR_CO2_1970_2023.xlsx',
    sheet_name="IPCC 2006",
    skiprows=9,
)

In [45]:
# Shorten the verbose source column names to the names used by the rest of
# the notebook.
column_aliases = {
    "C_group_IM24_sh": "zone",
    "Country_code_A3": "code",
    "ipcc_code_2006_for_standard_report": "ipcc_code",
    "ipcc_code_2006_for_standard_report_name": "ipcc_name",
}
df = df.rename(columns=column_aliases)

# Replace every missing value in the frame with 0.
# NOTE(review): after this, later `pd.notna(...)` checks on these columns always
# pass, so missing measurements are loaded as 0 — confirm this is intended.
df.fillna(0,inplace=True)

In [46]:
# CREATE TABLE statements, keyed by table name. `emission` references both
# `country` and `ipcc`, so those two tables have to exist before it is created.
create_queries = {}

# One row per country: display name, 3-character code, zone and IPCC annex.
create_queries["country"] = """
    CREATE TABLE country(
        country_id SERIAL PRIMARY KEY,
        name VARCHAR(255),
        code VARCHAR(3),
        zone VARCHAR(25),
        ipcc_annex VARCHAR(25)
    )
    """

# One row per IPCC category (code + descriptive name).
create_queries["ipcc"] = """
    CREATE TABLE ipcc(
        ipcc_id SERIAL PRIMARY KEY,
        ipcc_code VARCHAR(15),
        ipcc_name VARCHAR(255)
    )
    """

# One row per (country, IPCC category, year); the UNIQUE constraint rejects
# duplicate measurements for the same combination.
create_queries["emission"] = """
    CREATE TABLE emission(
        emission_id SERIAL PRIMARY KEY,
        country_id INTEGER REFERENCES country(country_id),
        ipcc_id INTEGER REFERENCES ipcc(ipcc_id),
        year INTEGER NOT NULL,
        emission_rate NUMERIC(20,10),
        UNIQUE(country_id,ipcc_id,year)
    )
    """

In [50]:
# Template for a query that returns a single boolean row: does a table with the
# given name exist in the `public` schema? The `{table}` placeholder is filled
# in with str.format by the caller.
check_query = (
    "\n"
    "SELECT EXISTS (\n"
    "   SELECT 1 FROM information_schema.tables \n"
    "   WHERE table_schema ='public' \n"
    "   AND table_name ='{table}'\n"
    "   )"
)

In [52]:
# Tables in creation order: `emission` comes last because it references the
# other two.
table_list = [
    "country",
    "ipcc",
    "emission",
]

In [54]:
def connectToDB():
    """Open a psycopg2 connection to the CO2 emission database.

    Connection settings are read from the standard libpq environment
    variables (``PGDATABASE``, ``PGHOST``, ``PGUSER``, ``PGPASSWORD``,
    ``PGPORT``). Each one falls back to the value this notebook previously
    hard-coded, so existing setups keep working without any configuration.

    Returns:
        An open psycopg2 connection.

    Raises:
        psycopg2.Error: if the connection cannot be established. The error is
            printed and re-raised so the failure surfaces here rather than as
            an unrelated ``AttributeError`` on ``None`` in a later cell.
    """
    try:
        return psycopg2.connect(
            database=os.environ.get("PGDATABASE", "CO2_Emission"),
            host=os.environ.get("PGHOST", "localhost"),
            user=os.environ.get("PGUSER", "postgres"),
            # NOTE(review): legacy fallback password kept only for backward
            # compatibility; set PGPASSWORD and remove this default.
            password=os.environ.get("PGPASSWORD", "asd123"),
            port=os.environ.get("PGPORT", "5432"),
        )
    except psycopg2.Error as error:
        print(error)
        raise

In [56]:
# Module-level connection and cursor shared by the table-creation cell and the
# insert helpers defined further down.
conn = connectToDB()
cursor = conn.cursor()

In [58]:
# Create each missing table and load its data right after creation, then make
# sure the lookup indexes on `emission` exist. Tables that already exist are
# left untouched (no re-load).
# NOTE: the insert helpers and `data_collection` are defined in later cells;
# those cells must have been executed before this one.
try:
    for table in table_list:
        cursor.execute(check_query.format(table=table))
        (table_exist,) = cursor.fetchone()
        if table_exist:
            continue

        print("create")
        cursor.execute(create_queries[table])
        conn.commit()

        if table == "country":
            insertCountryData()
        elif table == "ipcc":
            insertIpccData()
        elif table == "emission":
            insertEmissionData()

    # add indexes
    index_statements = (
        "CREATE INDEX IF NOT EXISTS idx_country_year_emission ON emission(country_id,year)",
        "CREATE INDEX IF NOT EXISTS idx_ipcc_year_emission ON emission(ipcc_id,year)",
    )
    for index_statement in index_statements:
        cursor.execute(index_statement)
    conn.commit()
except (psycopg2.DatabaseError, Exception) as error:
    # Report the problem and discard whatever the failed transaction changed.
    print(error)
    conn.rollback()

In [60]:
# Staging area for the rows to be inserted:
#   "country"  - set of (name, zone, code, ipcc_annex) tuples
#   "ipcc"     - set of (ipcc_code, ipcc_name) tuples
#   "emission" - list of dicts, one per (dataframe row, year)
# Sets are used so that repeated country / IPCC combinations collapse to one.
data_collection = {
    "country":set(),
    "ipcc":set(),
    "emission":[]
}

# Range of years to load (inclusive); the matching columns are named "Y_<year>".
FIRST_YEAR = 2000
LAST_YEAR = 2023

# The year columns are identical for every row, so resolve them once up front
# instead of rebuilding the list (and copying the frame) on each iteration.
year_list = ["Y_" + str(year) for year in range(FIRST_YEAR, LAST_YEAR + 1)]
year_by_column = {column: int(column.replace("Y_", '')) for column in year_list}

for _, row in df.iterrows():
    #country data
    country_name = row["Name"]
    country_zone = row["zone"]
    country_code = row["code"]
    IPCC_annex = row["IPCC_annex"]
    data_collection["country"].add((country_name,country_zone,country_code,IPCC_annex))
    #ipcc data
    ipcc_code = row["ipcc_code"]
    ipcc_name = row["ipcc_name"]
    data_collection["ipcc"].add((ipcc_code,ipcc_name))
    #emission data
    for year_column, year in year_by_column.items():
        emission_rate = row[year_column]
        # Skip missing measurements.
        if pd.notna(emission_rate):
            data_collection["emission"].append({
                'country_code': country_code,
                'ipcc_code':ipcc_code,
                'ipcc_name':ipcc_name,
                'year':year,
                'emission_rate':emission_rate
            })
            

In [62]:
def insertCountryData():
    """Insert every collected country tuple into the ``country`` table.

    Reads ``data_collection["country"]`` (tuples ordered name, zone, code,
    ipcc_annex) and writes through the module-level ``cursor`` / ``conn``.
    Commits on success; on any error the message is printed and the
    transaction is rolled back. Nothing is re-raised.
    """
    country_sql = """
    INSERT INTO country(name,zone,code,ipcc_annex)
    VALUES (%s,%s,%s,%s)
    """
    try:
        country_rows = data_collection["country"]
        cursor.executemany(country_sql, country_rows)
        conn.commit()
    except (psycopg2.DatabaseError, Exception) as error:
        print(error)
        conn.rollback()

def insertIpccData():
    """Insert every collected IPCC category into the ``ipcc`` table.

    Reads ``data_collection["ipcc"]`` (tuples ordered ipcc_code, ipcc_name)
    and writes through the module-level ``cursor`` / ``conn``. Commits on
    success; on any error the message is printed and the transaction is
    rolled back. Nothing is re-raised.
    """
    ipcc_sql = """
    INSERT INTO ipcc(ipcc_code,ipcc_name)
    VALUES (%s,%s)
    """
    try:
        ipcc_rows = data_collection["ipcc"]
        cursor.executemany(ipcc_sql, ipcc_rows)
        conn.commit()
    except (psycopg2.DatabaseError, Exception) as error:
        print(error)
        conn.rollback()
        
def insertEmissionData(batch_size=1000):
    """Resolve foreign keys for the collected emission records and insert them.

    Each record in ``data_collection["emission"]`` carries a country code and
    an (ipcc_code, ipcc_name) pair. These are mapped to ``country_id`` and
    ``ipcc_id`` using lookup tables read once from the database, rather than
    two SELECTs per record. Records whose ids cannot be resolved are reported
    with "Id not found!" and skipped.

    The whole load is one transaction: it is committed only after every batch
    has been sent, so a failure never leaves the table partially populated.

    Args:
        batch_size: number of rows handed to each ``executemany`` call.

    On any error the message is printed and the transaction is rolled back.
    Nothing is re-raised, matching the other insert helpers.
    """
    insert_query = """
        INSERT INTO emission(country_id,ipcc_id,year,emission_rate)
        VALUES (%s,%s,%s,%s)
        """
    try:
        # Build the code -> id lookups. ORDER BY + setdefault keeps the lowest
        # id when several rows share the same key.
        cursor.execute("SELECT code, country_id FROM country ORDER BY country_id")
        country_ids = {}
        for code, country_id in cursor.fetchall():
            country_ids.setdefault(code, country_id)

        cursor.execute("SELECT ipcc_code, ipcc_name, ipcc_id FROM ipcc ORDER BY ipcc_id")
        ipcc_ids = {}
        for ipcc_code, ipcc_name, ipcc_id in cursor.fetchall():
            ipcc_ids.setdefault((ipcc_code, ipcc_name), ipcc_id)

        emission_rows = []
        for emission in data_collection["emission"]:
            country_id = country_ids.get(emission["country_code"])
            ipcc_id = ipcc_ids.get((emission["ipcc_code"], emission["ipcc_name"]))
            if country_id is None or ipcc_id is None:
                print("Id not found!")
                continue
            # Convert to plain int / float before handing the values to the driver.
            emission_rows.append((
                country_id,
                ipcc_id,
                int(emission["year"]),
                float(emission["emission_rate"]),
            ))

        for start in range(0, len(emission_rows), batch_size):
            cursor.executemany(insert_query, emission_rows[start:start + batch_size])
        conn.commit()
    except Exception as error:
        print(error)
        conn.rollback()
                