In [1]:
import pandas as pd
import os
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from dotenv import load_dotenv
from pathlib import Path 

In [2]:
# Initial load using python-dotenv's default .env lookup.
load_dotenv(verbose=True)

# Prefer the .env two directories up when it is a regular file; otherwise
# fall back to a .env in the current working directory.
env_path = (Path('../../') if os.path.isfile("../../.env") else Path('.')) / '.env'

In [3]:
### PostgreSQL Connection
def get_engine_url():
    """Build the SQLAlchemy URL for the PostgreSQL database from the environment.

    Loads the dotenv file at the module-level ``env_path`` and then reads the
    ``pg_drivername``, ``pg_database``, ``pg_username``, ``pg_password``,
    ``pg_host`` and ``pg_port`` environment variables.

    Returns:
        The object produced by ``URL.create`` on success, or ``None`` when
        ``load_dotenv`` reports that nothing was loaded or when any step
        raises (the error is printed instead of being propagated).
    """
    try:
        # load_dotenv returns a falsy value when it did not load anything.
        if not load_dotenv(dotenv_path=env_path):
            return None
        return URL.create(
            drivername=os.environ.get("pg_drivername"),
            database=os.environ.get("pg_database"),
            username=os.environ.get("pg_username"),
            password=os.environ.get("pg_password"),
            host=os.environ.get("pg_host"),
            port=os.environ.get("pg_port"),
        )
    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print(f"Error creating database URL: {e}")
        return None

# Bind `engine` up front so a failed attempt leaves None rather than an
# undefined name or a stale engine from an earlier run of this cell.
engine = None
url_obj = get_engine_url()
if url_obj:
    try:
        candidate_engine = create_engine(url_obj)
        # create_engine does not connect by itself; open and close one
        # connection so the success message reflects a real connection.
        with candidate_engine.connect():
            pass
        engine = candidate_engine
        print("Connection to the PostgreSQL established successfully.")
    except Exception as e:
        print(f"Connection to the PostgreSQL encountered an error: {e}")
else:
    print("Failed to construct the database URL.")

Connection to the PostgreSQL established successfully.


### Data Pre-processing

In [4]:
# SQL used to load a clean dataset from the `projectdata` table:
#   * SELECT DISTINCT drops rows that are exact duplicates of one another.
#   * WHERE NOT (... IS NULL OR ...) keeps only rows in which none of the
#     listed columns is NULL.
query = """
SELECT DISTINCT *
FROM projectdata
WHERE NOT (
  "Area" IS NULL OR
  "Year" IS NULL OR
  "Savanna fires" IS NULL OR
  "Forest fires" IS NULL OR
  "Crop Residues" IS NULL OR
  "Rice Cultivation" IS NULL OR
  "Drained organic soils (CO2)" IS NULL OR
  "Pesticides Manufacturing" IS NULL OR
  "Food Transport" IS NULL OR
  "Forestland" IS NULL OR
  "Net Forest conversion" IS NULL OR
  "Food Household Consumption" IS NULL OR
  "Food Retail" IS NULL OR
  "On-farm Electricity Use" IS NULL OR
  "Food Packaging" IS NULL OR
  "Agrifood Systems Waste Disposal" IS NULL OR
  "Food Processing" IS NULL OR
  "Fertilizers Manufacturing" IS NULL OR
  "IPPU" IS NULL OR
  "Manure applied to Soils" IS NULL OR
  "Manure left on Pasture" IS NULL OR
  "Manure Management" IS NULL OR
  "Fires in organic soils" IS NULL OR
  "Fires in humid tropical forests" IS NULL OR
  "On-farm energy use" IS NULL OR
  "Rural population" IS NULL OR
  "Urban population" IS NULL OR
  "Total Population - Male" IS NULL OR
  "Total Population - Female" IS NULL OR
  "total_emission" IS NULL OR
  "Average Temperature °C" IS NULL
);
"""

In [5]:
# Execute the cleaning query through the PostgreSQL engine and load the
# result set into a DataFrame.
df = pd.read_sql_query(sql=query, con=engine)

In [6]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4546 entries, 0 to 4545
Data columns (total 31 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Area                             4546 non-null   object 
 1   Year                             4546 non-null   int64  
 2   Savanna fires                    4546 non-null   float64
 3   Forest fires                     4546 non-null   float64
 4   Crop Residues                    4546 non-null   float64
 5   Rice Cultivation                 4546 non-null   float64
 6   Drained organic soils (CO2)      4546 non-null   float64
 7   Pesticides Manufacturing         4546 non-null   float64
 8   Food Transport                   4546 non-null   float64
 9   Forestland                       4546 non-null   float64
 10  Net Forest conversion            4546 non-null   float64
 11  Food Household Consumption       4546 non-null   float64
 12  Food Retail         

In [7]:
# Turn the pre-processed frame into a list with one dict per row, ready to be
# stored as documents in the MongoDB NoSQL database.
df_dict = df.to_dict(orient="records")

### Saving the pre-processed dataset to MongoDB (NoSQL)

In [8]:
# Read the MongoDB connection string from the environment. It must be a plain
# string: a trailing comma on this assignment would wrap it in a tuple.
uri = os.environ.get("mongo_uri_2")
if not uri:
    # Fail with a clear message instead of handing MongoClient an empty host.
    raise RuntimeError("Environment variable 'mongo_uri_2' is not set.")
# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best-effort check: report the failure and let the notebook continue.
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [9]:
# Handle to the "apdv" database on the connected MongoDB deployment.
mydb = client["apdv"]

In [10]:
# Names of the collections currently present in the database; used below to
# decide whether the dataset still needs to be inserted.
collist=mydb.list_collection_names()

In [11]:
# Display the collection names as the cell output.
collist

['preprocesseddata']

In [12]:
# Insert the dataset only when the collection is absent, so re-running the
# notebook does not insert the same documents again.
if "preprocesseddata" in collist:
    print("The collection exists.")
elif not df_dict:
    # insert_many rejects an empty document list, so skip the insert.
    print("No pre-processed records to insert.")
else:
    dataset = mydb.preprocesseddata
    dataset.insert_many(df_dict)

The collection exists.


In [13]:
# Close the MongoDB client now that the notebook no longer needs it.
client.close()