# Setup

After creating and activating a conda env with the required R packages:

```bash
# Create an environment named r_env containing R and a base set of R packages.
conda create --name r_env r r-base r-essentials r-dplyr r-ggplot2 r-r6
conda activate r_env
# Install the Jupyter R kernel, JupyterLab, devtools, and RPostgres from the conda-forge channel.
conda install -c conda-forge r-irkernel jupyterlab r-devtools r-rpostgres
```

I then opened a new terminal, started JupyterLab on a different port than my main JupyterLab server, and opened an R notebook.

In [3]:
library(R6)

In [20]:
library(DBI)
library(RPostgres)

In [10]:
# List the objects exported by the attached R6 package.
ls("package:R6")

In [11]:
# List every object in the R6 namespace, including internal and dot-prefixed names.
ls(getNamespace("R6"), all.names = TRUE)

In [14]:
# PGConnection: R6 wrapper that owns a single PostgreSQL connection.
#
# Fields:
#   conn  The DBI connection created by initialize().
#
# Methods:
#   initialize(dbname, host, port, user, password)
#       Opens the connection with RPostgres and stores it in `conn`.
#       Any error raised by dbConnect() propagates to the caller.
#   disconnect()
#       Closes the connection if it is still open; safe to call repeatedly.
#       Returns the object invisibly.
#   finalize()
#       Garbage-collection hook; delegates to disconnect() so a forgotten
#       connection is still released.
PGConnection <- R6Class(
    "PGConnection",
    public = list(
        conn = NULL,
        initialize = function(dbname, host, port, user, password) {
            # DBI functions are namespace-qualified so the class does not
            # depend on library(DBI) having been attached beforehand.
            self$conn <- DBI::dbConnect(
                RPostgres::Postgres(),
                dbname = dbname,
                host = host,
                port = port,
                user = user,
                password = password
            )
            message("Connection established.")
        },
        disconnect = function() {
            # Guard against a connection that was never opened or is already
            # closed, so repeated calls are harmless.
            if (!is.null(self$conn) && DBI::dbIsValid(self$conn)) {
                DBI::dbDisconnect(self$conn)
                message("Connection closed.")
            }
            invisible(self)
        },
        # NOTE(review): recent R6 releases recommend declaring finalize under
        # `private`; it is kept public here so existing callers are unaffected.
        finalize = function() {
            self$disconnect()
        }
    )
)

In [18]:
# Open the connection shared by the rest of the notebook.
# `password` is spelled out in full rather than relying on R's partial
# argument matching, which breaks as soon as another argument starting
# with the same prefix is added to initialize().
pg_conn <- PGConnection$new(
    dbname = "db_name",
    host = "localhost",
    user = "db_user",
    password = "db_password",
    port = "5432"
)

Connection established.



In [72]:
# DataCatalog: small helper for browsing the schemas and tables reachable
# through a PGConnection-like object.
#
# Fields:
#   pg_conn  Object whose `conn` element is the DBI connection to query.
#
# Methods:
#   initialize(pg_conn)
#       Stores the connection wrapper.
#   run_query(query, params = NULL)
#       Runs `query` and returns the result of dbGetQuery(). When `params`
#       is supplied it is bound to the statement's $1, $2, ... placeholders
#       instead of being pasted into the SQL text.
#   show_schema_names()
#       Returns one column, `schema_name`, listing the schemas whose names
#       do not start with "pg_".
#   show_table_names(schema)
#       Returns one column, `table_name`, listing the tables in `schema`.
DataCatalog <- R6Class(
    "DataCatalog",
    public = list(
        pg_conn = NULL,
        initialize = function(pg_conn) {
            self$pg_conn <- pg_conn
        },
        run_query = function(query, params = NULL) {
            conn <- self$pg_conn$conn
            # Keep the parameter-free path exactly as a plain dbGetQuery call.
            if (is.null(params)) {
                return(RPostgres::dbGetQuery(conn, query))
            }
            RPostgres::dbGetQuery(conn, query, params = params)
        },
        show_schema_names = function() {
            # An anchored regex is used instead of LIKE 'pg_%': in LIKE the
            # underscore is a single-character wildcard, so that pattern also
            # hides schemas such as "pgrouting".
            self$run_query(
                "SELECT nspname AS schema_name
                FROM pg_namespace
                WHERE nspname !~ '^pg_';"
            )
        },
        show_table_names = function(schema) {
            stopifnot(length(schema) == 1L)
            # Bind the schema name as a parameter so quotes or other SQL in
            # the value cannot alter the statement.
            self$run_query(
                "SELECT tablename AS table_name
                 FROM pg_tables
                 WHERE schemaname = $1",
                params = list(as.character(schema))
            )
        }
    )
)

In [73]:
# Build the catalog helper on top of the open connection wrapper.
catalog <- DataCatalog$new(pg_conn = pg_conn)

In [74]:
# List the schemas whose names do not match the 'pg_%' pattern.
catalog$show_schema_names()

schema_name
<chr>
public
information_schema
topology
tiger
tiger_data
clean
data_raw
feature
dwh
report


In [75]:
# List the tables that live in the data_raw schema.
catalog$show_table_names(schema="data_raw")

table_name
<chr>
temp_chicago_affordable_rental_housing
temp_chicago_food_inspections
chicago_food_inspections
cook_county_neighborhood_boundaries
chicago_cta_train_stations
nyc_parcel_sales
chicago_affordable_rental_housing
chicago_bike_paths
temp_cook_county_sao_case_diversion_data
chicago_cta_bus_stops


In [67]:
# Same table listing as show_table_names(), run directly through run_query().
schema <- "data_raw"
catalog$run_query(
    paste0(
    "SELECT tablename AS table_name
     FROM pg_tables
     WHERE schemaname = '", schema, "';"
    )
)

table_name
<chr>
temp_chicago_affordable_rental_housing
temp_chicago_food_inspections
chicago_food_inspections
cook_county_neighborhood_boundaries
chicago_cta_train_stations
nyc_parcel_sales
chicago_affordable_rental_housing
chicago_bike_paths
temp_cook_county_sao_case_diversion_data
chicago_cta_bus_stops


In [38]:
# NOTE: this call fails. ls() expects an environment (or a search-path
# position) as its first argument, but size_in_mb is a plain number.
# R also spells the logical constant TRUE, not True.
ls(size_in_mb, all.names=True)

ERROR: Error in as.environment(pos): invalid 'pos' argument


In [49]:
# Pull the whole parcel-sales table through the catalog helper.
df <- catalog$run_query("SELECT * FROM data_raw.cook_county_parcel_sales")

In [25]:
# Same query as above, issued directly against the underlying connection.
result <- RPostgres::dbGetQuery(
    conn = pg_conn$conn,
    statement = "SELECT * FROM data_raw.cook_county_parcel_sales"
)

In [50]:
# Report the number of rows and columns in df.
dim(df)

In [51]:
# Measure the in-memory size of df and convert bytes to mebibytes.
size_in_bytes <- as.numeric(object.size(df))
size_in_mb <- size_in_bytes / 1024^2

In [40]:
# Show the size computed above, in MB.
print(size_in_mb)

[1] 560.7479


In [42]:
# Format the size as text; %f prints six decimal places by default.
sprintf("%f MB", size_in_mb)

In [12]:
# ?R6