In [16]:
from urllib.parse import quote

import yaml
from sqlalchemy import create_engine, inspect

class DatabaseConnector:
    """Builds a SQLAlchemy engine from credentials stored in a YAML file.

    The engine (or ``None`` when it could not be built) is exposed as the
    ``engine`` attribute. Failures are reported with ``print`` and signalled
    by ``None`` return values rather than by raising.
    """

    # Keys the credentials mapping must contain to build the connection URL.
    _REQUIRED_CRED_KEYS = (
        'RDS_HOST',
        'RDS_USER',
        'RDS_PASSWORD',
        'RDS_PORT',
        'RDS_DATABASE',
    )

    def __init__(self, yaml_file_path):
        """Remember the credentials path and build the engine from it.

        Args:
            yaml_file_path: Path to the YAML file holding the credentials.
        """
        self.yaml_file_path = yaml_file_path
        # Initialize the database engine
        self.engine = self.init_db_engine(yaml_file_path)

    def read_db_creds(self, yaml_file_path):
        """Load the credentials file with ``yaml.safe_load``.

        Args:
            yaml_file_path: Path to the YAML file to read.

        Returns:
            Whatever ``yaml.safe_load`` produced for the file, or ``None``
            (after printing the error) if the file could not be opened or
            parsed.
        """
        try:
            with open(yaml_file_path, 'r') as yaml_file:
                return yaml.safe_load(yaml_file)
        except Exception as e:
            print(f"Error reading YAML file: {e}")
            return None

    def init_db_engine(self, yaml_file_path=None):
        """Create a SQLAlchemy engine for the PostgreSQL database in the credentials.

        Args:
            yaml_file_path: Credentials file to read. When omitted (``None``)
                the path stored on the instance by ``__init__`` is used.

        Returns:
            The object returned by ``create_engine``, or ``None`` (after
            printing a message) when the credentials are unreadable or
            incomplete, or when ``create_engine`` raises.
        """
        # Honour the argument; fall back to the stored path only when none is given.
        if yaml_file_path is None:
            yaml_file_path = self.yaml_file_path
        creds = self.read_db_creds(yaml_file_path)

        if not creds:
            print("Failed to initialize the database engine.")
            return None

        # Validate up front so an incomplete file yields None instead of an
        # uncaught KeyError/TypeError escaping from the constructor.
        if isinstance(creds, dict):
            missing = [key for key in self._REQUIRED_CRED_KEYS if key not in creds]
        else:
            missing = list(self._REQUIRED_CRED_KEYS)
        if missing:
            print(f"Failed to initialize the database engine: missing credentials {missing}")
            return None

        # Database connection details
        DATABASE_TYPE = 'postgresql'
        DBAPI = 'psycopg2'
        ENDPOINT = creds['RDS_HOST']
        # Percent-encode user and password so characters such as '@', '/' or
        # ':' cannot be mistaken for URL delimiters.
        USER = quote(str(creds['RDS_USER']), safe='')
        PASSWORD = quote(str(creds['RDS_PASSWORD']), safe='')
        PORT = creds['RDS_PORT']
        DATABASE = creds['RDS_DATABASE']

        try:
            # Initialize and return the SQLAlchemy engine
            return create_engine(
                f"{DATABASE_TYPE}+{DBAPI}://{USER}:{PASSWORD}@{ENDPOINT}:{PORT}/{DATABASE}"
            )
        except Exception as e:
            print(f"Failed to initialise the database engine: {e}")
            return None

    def list_db_tables(self):
        """Return the table names reported by a SQLAlchemy inspector.

        Returns:
            The list from ``inspector.get_table_names()``, or ``None`` (after
            printing the error) if inspection fails, e.g. because
            ``self.engine`` is ``None``.
        """
        try:
            # Create a SQLAlchemy inspector object
            inspector = inspect(self.engine)

            # Get a list of all table names in the database
            return inspector.get_table_names()
        except Exception as e:
            print(f"Error listing database tables: {e}")
            return None

# Example usage: build a connector from the credentials file and print the
# name of every table the database reports.
yaml_file_path = 'db_creds.yaml'
db_connector = DatabaseConnector(yaml_file_path)

if db_connector.engine is None:
    print("Failed to initialize the database engine.")
else:
    tables = db_connector.list_db_tables()
    # list_db_tables() returns None on error; an empty list is a valid result
    # for a database with no tables and must not be reported as a failure.
    if tables is None:
        print("Failed to list database tables.")
    elif not tables:
        print("The database contains no tables.")
    else:
        print("Tables in the Database:")
        for table in tables:
            print(table)


Tables in the Database:
legacy_store_details
legacy_users
orders_table


In [17]:
import pandas as pd
from database_utils import DatabaseConnector

class DataExtractor:
    """Reads database tables into pandas DataFrames via a connector's engine."""

    def __init__(self, db_connector):
        """Store the connector whose ``engine`` attribute is used for queries.

        Args:
            db_connector: Object exposing an ``engine`` attribute.
        """
        self.db_connector = db_connector

    def read_rds_table(self, table_name, index_col='index'):
        """Load every row of ``table_name`` into a DataFrame.

        Args:
            table_name: Name of the table to select from. It is interpolated
                directly into the SQL text, so it must come from a trusted
                source (for example the connector's table listing).
            index_col: Column to use as the DataFrame index. Defaults to
                ``'index'``; pass ``None`` to keep the default integer index.

        Returns:
            The resulting DataFrame, or ``None`` (after printing the error) if
            the query or the index assignment fails.
        """
        try:
            engine = self.db_connector.engine
            with engine.execution_options(isolation_level='AUTOCOMMIT').connect() as conn:
                # Run the query on the connection opened above; passing the
                # engine instead would leave `conn` unused and make pandas
                # open a second connection without these execution options.
                df = pd.read_sql(f"SELECT * FROM {table_name}", conn)

            if index_col is not None:
                df = df.set_index(index_col)
            return df

        except Exception as e:
            print(f"Error reading table {table_name}: {e}")
            return None

# Pull the legacy user records through the connector created earlier in the
# notebook; read_rds_table returns None if the read fails.
table_name = "legacy_users"
user_df = DataExtractor(
    db_connector
).read_rds_table(
    table_name=table_name
)

In [19]:
# Summarise the columns, non-null counts and dtypes of the extracted frame.
# (A bare `user_df` before this call displayed nothing, since only a cell's
# final expression is rendered, so it has been dropped.)
user_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15320 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   first_name     15320 non-null  object
 1   last_name      15320 non-null  object
 2   date_of_birth  15320 non-null  object
 3   company        15320 non-null  object
 4   email_address  15320 non-null  object
 5   address        15320 non-null  object
 6   country        15320 non-null  object
 7   country_code   15320 non-null  object
 8   phone_number   15320 non-null  object
 9   join_date      15320 non-null  object
 10  user_uuid      15320 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


In [23]:
# Store `country` as a categorical column, then tally how often each distinct
# value occurs so unexpected entries stand out.
user_df["country"] = user_df.country.astype("category")
user_df.country.value_counts()

country
United Kingdom    9371
Germany           4708
United States     1205
NULL                21
PNRMPSYR1J           1
XN9NGL5C0B           1
XGI7FM0VBJ           1
T4WBZSW0XI           1
S0E37H52ON           1
RQRB7RMTAD           1
3518UD5CE8           1
50KUU3PQUF           1
I7G4DMDZOZ           1
GMRBOMI0O1           1
EWE3U0DZIV           1
AJ1ENKS3QL           1
7ZNO5EBALT           1
5EFAFD0JLI           1
YOTSVPRBQ7           1
Name: count, dtype: int64