In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import Integer, Text, String, Float
#import psycopg2

loaded_files_and_count = {}

In [2]:
def build_DB_URI(db_type, db_lib, user_id, password, db_name,  db_location='localhost', port='5432' ):
    '''
        A method which generates a DB_URI for SQL-Alchemey. Assumption that this will be
        used with Postgresql, however written to be generic.

        arg:

        db_type     --> the type of database, e.g 'postgres', 'mysql'

        db_lib      --> the appropriate sql-alchemy plughin for 
                        db_type, e.g 'psycopg2' or 'pymysql'

        user_id     --> the user name for the database, who has 
                        appropriate permissions

        password    --> the password for the db-user-id.
        db_name     --> the name of the db, e.g. 'esomeprazole'
        db_location --> the address / URL for the database. DEFAULT = localhost
        port        --> the port for the database. DEFAULT = 5432
        
        returns:
        db_URI     --> The URI for SQL-Alchemy of the form:
                       postgres+psycop2://user_id:password@db_location:5432/db_name

    '''
    
    db_URI = db_type+'+'+db_lib+'://'+user_id+':'+password+'@'+db_location+':'+port+'/'+db_name

    return db_URI

====

In [29]:
def load_csv_file_as_df(data_file_path, file_name):
    
    df = pd.read_csv(data_file_path+file_name)
    new_columns = [column.replace(' ', '_').lower() for column in df]
    df.columns = new_columns
    return df

====

In [4]:
def get_db_cols(df):
    '''
    A method which converts the col-dtype from Pandas/Numpy 
    to the SQLAlchemy equivelent. 
    args:
    
    df ---> A pandas DataFrame
    
    
    returns:
    
    db_cols --> a dictionary with column-name as key and SQL-Alchemy
                data type as values.
    
    '''
    
    col_info = dict(df.dtypes)
    db_cols = {}
    for k in col_info:

        if col_info[k] == 'object':
            db_cols[k] = String

        elif col_info[k] == 'int64':
            db_cols[k] = Integer

        elif col_info[k] == 'float64':
            db_cols[k] = Float  
        else:
            print('Unaccounted for type:')
            print(k, col_info[k])
            return None
    return db_cols

====

In [27]:
def get_file_names(data_file_path, file_name_pattern):

    all_file_list = os.listdir(data_file_path)

    all_file_list.sort()
    
    file_list = []

    for f in all_file_list:
        
        if file_name_pattern in f:
            file_list.append(f)
       
    return  file_list       

In [30]:
def load_csv_data_to_db(filename_pattern_and_tablename_dict, data_file_path):
    '''
    
    
    '''
    
    
    print('outside first for')
    # 1. iterate through list of patterns, to load all file-types into the database.
    for pattern in filename_pattern_and_tablename_dict.keys():
                      
        # 2. Get the list of files from the data-folder:
        data_file_list = get_file_names(data_file_path, pattern)
       
        
        table_name = filename_pattern_and_tablename_dict[pattern]
                
        # 3. load data into a data frame
        file_counter = 1
        print('in first loop, outside second.. ')
        for data_file in data_file_list:
            print('top of second loop.')
            df = load_csv_file_as_df(data_file_path, data_file)

            # Get the columns data types from the data frame and convert 
            # to SQL-Alchemy friend types.

            db_cols = get_db_cols(df)
            
            if db_cols == None:
                
                print(data_file, f)
                return None

            if file_counter == 1:
                
                df.to_sql(table_name,
                                   db_engine,
                                   if_exists='replace',
                                   schema='public',
                                   index=False,
                                   chunksize=1000,
                                   dtype=db_cols)            
                print('here, counter 1')
                break
                
            if file_counter > 1:
                
                df.to_sql(table_name,
                                   db_engine,
                                   if_exists='append',
                                   schema='public',
                                   index=False,
                                   chunksize=1000,
                                   dtype=db_cols)   
                
            file_counter += 1

====

In [7]:
db_type = 'postgres'
db_lib = 'psycopg2'
user_id = 'bhima'
password= ''
db_name = 'openfda'

db_URI = build_DB_URI(db_type, db_lib, user_id, password, db_name)
db_engine = create_engine(db_URI, echo=True)
db_engine.connect()
connection= db_engine.connect()

2020-05-05 21:24:33,586 INFO sqlalchemy.engine.base.Engine select version()
2020-05-05 21:24:33,586 INFO sqlalchemy.engine.base.Engine {}
2020-05-05 21:24:33,589 INFO sqlalchemy.engine.base.Engine select current_schema()
2020-05-05 21:24:33,589 INFO sqlalchemy.engine.base.Engine {}
2020-05-05 21:24:33,592 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-05-05 21:24:33,594 INFO sqlalchemy.engine.base.Engine {}
2020-05-05 21:24:33,595 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-05-05 21:24:33,596 INFO sqlalchemy.engine.base.Engine {}
2020-05-05 21:24:33,597 INFO sqlalchemy.engine.base.Engine show standard_conforming_strings
2020-05-05 21:24:33,598 INFO sqlalchemy.engine.base.Engine {}


In [31]:
filename_pattern_and_tablename_dict = {'drug.csv':'drugs', 'patient.csv':'patients', \
                      'reaction.csv':'reactions', 'open_fda.csv':'open_fda'}

#location of the data files:
data_file_path = '../Data/csv/'

load_csv_data_to_db(filename_pattern_and_tablename_dict, data_file_path)

outside first for
in first loop, outside second.. 
top of second loop.


NameError: name 'drugs_df' is not defined