# Data Systems: Project 1a
## Author: Joseph Bannon



In [1]:
import pandas as pd
from IPython.display import display
import os
import numpy as np
from sqlalchemy import create_engine
import requests
import json


The main function chains together the steps implemented by the other functions (get, modify, summarize, write); see each function below for more detail.

In [31]:
def main():
    """Run one interactive ETL session: get data, modify it, summarise it, write it.

    The session ends early (after printing the exit message) when either
    getData() or modifyData() hands back None.
    """
    print("Welcome to the data systems ETL progam \n Note: the paratheses contain the strings the user should input",flush=True)

    frame = getData()
    if frame is not None:
        frame = modifyData(frame)

    # Only summarise and write when both earlier stages produced a dataframe.
    if frame is not None:
        summary(frame)
        writeData(frame)

    print("\nexiting")

### 1) Get Data
This function is responsible for getting data from a source specified by the user: a CSV file, a JSON file, a table in a SQL database, or an API at a URL. A separate helper function handles each data type and the specific way of connecting to that kind of source. Each helper converts the data into a pandas dataframe for later manipulation by the user. Relative file paths are resolved against the current working directory.

In [25]:
def getData():
    """Prompt the user for a data source and load it into a pandas DataFrame.

    Supported sources are 'csv', 'json', 'sqldb' (via connection string or
    credentials) and 'api'. The prompt repeats until the user picks a listed
    option or types 'exit'. Typing 'exit' at the sqldb sub-prompt goes back
    to the source prompt.

    Returns:
        The DataFrame produced by the matching loader, or None when the user
        typed 'exit' or the loader did not produce a dataframe.
    """
    while True:
        print("")
        print("GET DATA")
        fileType = input("What is the file type of the target (options: csv, json, sqldb, api or exit): ").strip().lower()

        if fileType == "exit":
            return None

        if fileType == "csv":
            # Paths and URLs are case-sensitive, so strip whitespace only.
            fileLocation = input("What is the location of the target: ").strip()
            pd_df = getCSV(fileLocation)

        elif fileType == "json":
            fileLocation = input("What is the location of the target: ").strip()
            pd_df = getJSON(fileLocation)

        elif fileType == "sqldb":
            str_cred = input("Connection String (str) or Credentials (cred)? ").strip().lower()

            if str_cred == "str":
                conn_str = input("What is the connection string of the target datbase: ")
                table = input("Table Name: ")
                pd_df = get_SQL_str(conn_str, table)

            elif str_cred == "cred":
                user_id = input("User Id: ")
                pwd = input("Password: ")
                host_name = input("Host Name: ")
                db_name = input("Database Name: ")
                table = input("Table Name: ")
                pd_df = get_SQL_cred(user_id, pwd, host_name, db_name, table)

            elif str_cred == "exit":
                # Back out of the sqldb sub-menu and ask for the source again.
                continue

            else:
                print("Not a listed choice(options: str, cred), try again")
                continue

        elif fileType == "api":
            fileLocation = input("What is the url of the target api: ").strip()
            # The loader's result must be kept; it is the dataframe we return.
            pd_df = get_api_response(fileLocation)

        else:
            print("Not a listed choice(options: csv, json, sqldb, api or exit), try again")
            continue

        # Only claim success when a dataframe actually came back.
        if pd_df is not None:
            print("Data collection sucessful")
        return pd_df
    
#https://universities.hipolabs.com/search?name=middle

CSV file input to pandas df (url or local file).

In [4]:
def getCSV(fileLocation):
    """Read a CSV source into a pandas DataFrame.

    Args:
        fileLocation: anything pandas.read_csv accepts (local path, URL,
            or file-like object).

    Returns:
        The loaded DataFrame. If reading fails, the user is sent back to the
        getData() prompt and whatever that retry returns is returned here.
    """
    try:
        return pd.read_csv(fileLocation)
    except FileNotFoundError:
        print("File not found, try again")
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the program.
        print("Error, try again")
    # Propagate the retry's result instead of discarding it.
    return getData()


JSON file input to pandas df (url or local file).

In [5]:
def getJSON(fileLocation):
    """Read a JSON source into a pandas DataFrame.

    Args:
        fileLocation: anything pandas.read_json accepts (local path, URL,
            or file-like object).

    Returns:
        The loaded DataFrame. If reading fails, the user is sent back to the
        getData() prompt and whatever that retry returns is returned here.
    """
    try:
        return pd.read_json(fileLocation)
    except FileNotFoundError:
        print("File not found, try again")
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the program.
        print("Error, try again")
    # Propagate the retry's result instead of discarding it.
    return getData()

Takes the database credentials and the name of the table to load, and turns the table into a pandas dataframe.

In [6]:
# Example arguments for a local MySQL server: host_name="localhost",
# db_name="northwind". Never store real credentials in the notebook.

def get_SQL_cred(user_id, pwd, host_name, db_name, table):
    """Load one table from a MySQL database into a pandas DataFrame.

    The connection URL is built as
    mysql+pymysql://<user>:<password>@<host>/<database>.

    Args:
        user_id: database user name.
        pwd: database password.
        host_name: server host name (optionally with ":port").
        db_name: database (schema) that holds the table.
        table: name of the table to read.

    Returns:
        The table as a DataFrame. If connecting or reading fails, the user is
        sent back to the getData() prompt and that retry's result is returned.
    """
    from urllib.parse import quote_plus

    try:
        # Characters such as '@', ':' or '/' in the user name or password
        # would corrupt the URL unless they are percent-encoded.
        conn_str = f"mysql+pymysql://{quote_plus(user_id)}:{quote_plus(pwd)}@{host_name}/{db_name}"
        sqlEngine = create_engine(conn_str, pool_recycle=3600)
        # NOTE(review): identifiers cannot be bound as query parameters, so the
        # database and table names typed by the user are interpolated directly.
        sql_query = f"SELECT * FROM {db_name}.{table}"
        # The context manager closes the connection even when read_sql raises.
        with sqlEngine.connect() as connection:
            return pd.read_sql(sql_query, connection)
    except Exception:
        print("Connection error, try again")
    # Propagate the retry's result instead of discarding it.
    return getData()

Takes the connection string and the name of the table to load, and turns the table into a pandas dataframe.

In [7]:
def get_SQL_str(conn_str, table):
    """Load one table into a pandas DataFrame using a SQLAlchemy connection string.

    Args:
        conn_str: SQLAlchemy database URL, e.g.
            mysql+pymysql://user:password@host/database.
        table: name of the table to read.

    Returns:
        The table as a DataFrame. If connecting or reading fails, the user is
        sent back to the getData() prompt and that retry's result is returned.
    """
    try:
        sqlEngine = create_engine(conn_str, pool_recycle=3600)
        # The table is resolved against the database named in the URL, so it
        # is not re-qualified here. Slicing the database name out of the URL
        # text breaks as soon as the URL carries query parameters such as
        # "?charset=utf8mb4".
        # NOTE(review): identifiers cannot be bound as query parameters, so the
        # table name typed by the user is interpolated directly.
        sql_query = f"SELECT * FROM {table}"
        # The context manager closes the connection even when read_sql raises.
        with sqlEngine.connect() as connection:
            return pd.read_sql(sql_query, connection)
    except Exception:
        print("Connection error, try again")
    # Propagate the retry's result instead of discarding it.
    return getData()

Takes the HTTP URL of an API and turns the JSON response into a pandas dataframe.

In [8]:
def get_api_response(fileLocation):
    """Fetch JSON from an HTTP API and convert it into a pandas DataFrame.

    Args:
        fileLocation: URL of the API endpoint to GET.

    Returns:
        The response body as a DataFrame. If the request fails, the status is
        not 200, or the body cannot be converted, the user is sent back to the
        getData() prompt and that retry's result is returned.
    """
    try:
        # Without a timeout, requests waits indefinitely on a silent server.
        response = requests.get(fileLocation, timeout=30)
    except requests.RequestException:
        print("API request unsuccessful, try again")
        return getData()

    if response.status_code != 200:
        print("API request unsuccessful, try again")
        # Return here so the failed response is never parsed below.
        return getData()

    try:
        json_file = json.loads(response.text)
        return pd.DataFrame.from_dict(json_file)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; the DataFrame constructor
        # raises ValueError/TypeError for payloads it cannot tabulate.
        print("Json conversion unsuccesful, try again")
    return getData()

### 2) Modify Data

This function takes a dataframe (pd_df) from the getData() function. It can remove columns by name ('remove_column'), add a column with a default value and type specified by the user ('add_column'), or make no change ('none'). When adding a column, the user can create a primary key by entering '\_primaryKey' as the default value, which fills the column with unique integer key values. The user can also fill the column with NULL values by entering '\_NA'. Another feature is adding a column from an expression by typing 'add_column_exp', which uses Python's compile and eval functions. The user enters an expression involving other columns; it is evaluated as code, and the program uses the result to create a new column based on the data in the existing columns. For example, "pd_df\['longitude'\]\*2" results in each row of the new column being double the corresponding row of the column "longitude". Typing 'head' calls the head function on the dataframe. Typing 'none' exits the function, returning the modified dataframe.

In [27]:
def modifyData(pd_df):
    """Interactively edit a dataframe until the user answers 'none'.

    Each pass prompts for one action:

    * remove_column  - drop one or more columns by name
    * add_column     - insert a column filled with a default value
    * add_column_exp - insert a column computed from a Python expression
    * head           - display the first rows
    * none           - finish and return the dataframe

    Args:
        pd_df: the pandas DataFrame to modify. Added columns are inserted
            into this object in place; removing columns produces a new
            DataFrame, which is carried forward.

    Returns:
        The modified DataFrame.
    """
    # A loop (rather than re-calling modifyData and dropping its result) keeps
    # every change and guarantees the final dataframe reaches the caller.
    while True:
        print("")
        print("MODIFY DATA")
        print("Note: add a column as an expression of other columns by typing 'add_column_exp'")
        columnType = input("How would you like to change the dataframe (options: remove_column, add_column, head, add_column_exp, none): ").strip().lower()

        if columnType == "none":
            return pd_df

        if columnType == "remove_column":
            pd_df = _remove_columns(pd_df)
        elif columnType == "add_column":
            _add_default_column(pd_df)
        elif columnType == "add_column_exp":
            _add_expression_column(pd_df)
        elif columnType == "head":
            display(pd_df.head())
        else:
            print("Not a listed choice (options: remove_column, add_column, head, add_column_exp, none), try again")


def _remove_columns(pd_df):
    """Prompt for a comma-separated list of columns and return pd_df without them."""
    print("Here is a list of columns: ")
    print(*(pd_df.columns), sep = ", ")
    answer = input("Enter the list of columns to remove: ")
    # Accept "a, b" as well as "a,b"; ignore empty entries.
    rm_col = [name.strip() for name in answer.split(",") if name.strip()]
    try:
        trimmed = pd_df.drop(rm_col, axis=1)
    except KeyError:
        # drop raises KeyError when a requested column does not exist.
        print("removing unsuccesful")
        return pd_df
    print("removing succesful")
    return trimmed


def _parse_bool(text):
    """Convert a typed true/false word into a bool; raise ValueError otherwise."""
    word = text.strip().lower()
    if word in ("true", "t", "yes", "y", "1"):
        return True
    if word in ("false", "f", "no", "n", "0"):
        return False
    raise ValueError(f"not a boolean value: {text!r}")


def _add_default_column(pd_df):
    """Prompt for a column name and default value and insert the column in place."""
    # bool() on a non-empty string is always True, so booleans need a parser.
    converters = {"int": int, "bool": _parse_bool, "double": float, "string": str}

    colName = input("Column name: ")
    defaultValue = input("What default value should be used (type '_primaryKey' for primary key, '_NA' for N/A): ")
    try:
        if defaultValue == "_primaryKey":
            # One unique integer per row, placed as the first column.
            pd_df.insert(0,colName,range(pd_df.shape[0]))
        elif defaultValue == "_NA":
            pd_df.insert(0,colName,np.nan)
        else:
            typeD = input("What data type should it be (int, string, bool, double): ").strip().lower()
            if typeD not in converters:
                # Stop here; an unknown type must not fall through to insert.
                print("invalid choice")
                return
            index = int(input("What position should the column be: "))
            pd_df.insert(index,colName,converters[typeD](defaultValue))
    except (ValueError, IndexError, TypeError):
        # ValueError: bad conversion or duplicate column name;
        # IndexError: position outside the existing column range.
        print("adding column unsuccessful")
        return
    print("Sucessfully added")


def _add_expression_column(pd_df):
    """Prompt for a Python expression over pd_df and insert its value as a new column."""
    try:
        colName = input("Column name to add: ")
        index = int(input("What position should the column be: "))
        print("Access a column by typing: pd_df['COLUMNNAME']")
        print("Replace COLUMNNAME with the name, must include '' still, must use 'pd_df' to access the dataframe")
        print("ex: pd_df['longitude']*2,")
        print("results in each row in the new column being the double of the row in the column 'longitude'")
        exp = input("What expression are you using: ")
        # SECURITY: eval() executes whatever the user types with this process's
        # full privileges. Acceptable only while the person at the keyboard is
        # trusted; never feed it text from a file, request or other user.
        code = compile(exp, "<string>", "eval")
        # Evaluated in this frame so the expression can refer to 'pd_df'.
        column = eval(code)
        pd_df.insert(index,colName,column)
    except Exception:
        # A user expression can raise anything, so the catch is deliberately broad.
        print("invalid expression, try again")
        return
    print("Sucessfully added")

### 3) Summary

The summary function gives a summary of several attributes of the dataframe. These attributes include the number of rows, number of columns, columns with N/A values and a list of all columns and their data types.

In [10]:
def summary(pd_df):
    """Print an overview of the dataframe.

    Reports the row count, the column count, the names of columns that
    contain at least one NA value and each column's dtype, then displays
    the first rows.
    """
    print(" ")
    print('Summary of data collected')

    row_total = pd_df.shape[0]
    column_total = pd_df.shape[1]
    print('Number of rows are: ', row_total)
    print('Number of columns are: ', column_total)

    print('Columns containing NA:')
    has_missing = pd_df.isna().any()
    names_with_missing = pd_df.columns[has_missing].tolist()
    print(", ".join(str(name) for name in names_with_missing))

    print('Columns and data types:')
    print(pd_df.dtypes)

    preview = pd_df.head()
    display(preview)



### 4) writeData

The writeData function takes the dataframe produced by modifyData and lets the user write it to csv, json, or sqldb. For csv and json, the user can specify a name for the created file. The program creates the file in the current working directory of this Jupyter notebook. For adding a table to a SQL database (sqldb), the user can connect to the database using a connection string (str) or user credentials (cred). The user also supplies the name of the table to be created in the database (the table is replaced if the name already exists).

In [11]:
def writeData(pd_df):
    """Prompt for an output target and write the dataframe to it.

    Supported targets are 'csv', 'json' and 'sqldb' (via connection string
    or credentials). The prompt repeats until a listed option is chosen.
    Typing 'exit' at the sqldb sub-prompt goes back to the target prompt.

    Args:
        pd_df: the pandas DataFrame to write.
    """
    while True:
        fileType = input("What is the file type of what you want to write (options: csv, json, sqldb): ").strip().lower()

        if fileType in ("csv", "json"):
            # A file name is only needed for file targets. File names are
            # case-sensitive on many systems, so strip whitespace only.
            fileName = input("What is the name of the file: ").strip()
            if fileType == "csv":
                writeCSV(pd_df, fileName)
            else:
                writeJSON(pd_df, fileName)
            break

        if fileType == "sqldb":
            str_cred = input("Connection String (str) or Credentials (cred)? ").strip().lower()

            if str_cred == "str":
                conn_str = input("What is the connection string of the target datbase: ")
                table = input("Table Name: ")
                write_SQL_str(pd_df, conn_str, table)
                break

            if str_cred == "cred":
                user_id = input("User Id: ")
                pwd = input("Password: ")
                host_name = input("Host Name: ")
                db_name = input("Database Name: ")
                table = input("Table Name to be created: ")
                # write_SQL_cred takes a fileName as its second parameter but
                # does not use it; omitting it raised a TypeError before.
                write_SQL_cred(pd_df, None, user_id, pwd, host_name, db_name, table)
                break

            if str_cred != "exit":
                print("Not a listed choice (options: str, cred), try again")
            # 'exit' or an unknown answer: ask for the target again.
            continue

        print("Not a listed choice (options: csv, json, sqldb), try again")

    print("Data Writing sucessful")

CSV file output from pandas df (creates file at current working directory).

In [12]:
def writeCSV(pd_df, fileName):
    """Write the dataframe to a comma-separated file, including its index.

    Args:
        pd_df: the pandas DataFrame to write.
        fileName: path of the file to create.

    On failure the user is sent back to the writeData() prompt.
    """
    try:
        pd_df.to_csv(fileName, index=True, sep=',')
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the program
        # instead of restarting the prompt.
        print("Error, try again")
        writeData(pd_df)

JSON file output from pandas df (creates file at current working directory).

In [13]:
def writeJSON(pd_df, fileName):
    """Write the dataframe to a JSON file using pandas' default orientation.

    Args:
        pd_df: the pandas DataFrame to write.
        fileName: path of the file to create.

    On failure the user is sent back to the writeData() prompt.
    """
    try:
        pd_df.to_json(fileName)
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the program
        # instead of restarting the prompt.
        print("Error, try again")
        writeData(pd_df)

Takes the connection string and the name of the table to create in the SQL database; creates the table in the specified database, replacing it if it already exists.

In [14]:
def write_SQL_str(pd_df, conn_str, table):
    """Write the dataframe to a SQL table using a SQLAlchemy connection string.

    The table is replaced if it already exists. Afterwards the user may pick
    one of the dataframe's columns to become the table's primary key.

    Args:
        pd_df: the pandas DataFrame to write.
        conn_str: SQLAlchemy database URL, e.g.
            mysql+pymysql://user:password@host/database.
        table: name of the table to create.

    On failure the user is sent back to the writeData() prompt.
    """
    from sqlalchemy import text

    try:
        sqlEngine = create_engine(conn_str, pool_recycle=3600)
        # engine.begin() commits on success and always releases the
        # connection, including when to_sql or the ALTER statement raises.
        with sqlEngine.begin() as connection:
            pd_df.to_sql(table, con=connection, index=False, if_exists='replace')
            str_cred = input("Add key? (y/n)").strip().lower()
            if str_cred == "y":
                print("List of columns: ")
                print(*(pd_df.columns.tolist()), sep = ", " )
                answer = input("What is the column name?").strip()
                # Accept only a real column (matched case-insensitively), so
                # arbitrary text never reaches the DDL statement.
                matches = [col for col in pd_df.columns if str(col).lower() == answer.lower()]
                if matches:
                    pk_column = matches[0]
                    # Engine.execute() no longer exists in SQLAlchemy 2.0;
                    # run the statement on the connection as a text() clause.
                    connection.execute(text(f"ALTER TABLE {table} ADD PRIMARY KEY ({pk_column});"))
                else:
                    print("Not a listed column, no key added")
    except Exception:
        print("Error, try again")
        writeData(pd_df)


Takes the database credentials and the name of the table to create in the SQL database; creates the table in the specified database, replacing it if it already exists.

In [15]:
def write_SQL_cred(pd_df, fileName,  user_id, pwd, host_name, db_name, table):
    """Write the dataframe to a MySQL table using user credentials.

    The connection URL is built as
    mysql+pymysql://<user>:<password>@<host>/<database>. The table is
    replaced if it already exists. Afterwards the user may pick one of the
    dataframe's columns to become the table's primary key.

    Args:
        pd_df: the pandas DataFrame to write.
        fileName: unused; kept so the signature stays unchanged.
        user_id: database user name.
        pwd: database password.
        host_name: server host name (optionally with ":port").
        db_name: database (schema) in which to create the table.
        table: name of the table to create.

    On failure the user is sent back to the writeData() prompt.
    """
    from urllib.parse import quote_plus

    from sqlalchemy import text

    try:
        # Characters such as '@', ':' or '/' in the user name or password
        # would corrupt the URL unless they are percent-encoded.
        conn_str = f"mysql+pymysql://{quote_plus(user_id)}:{quote_plus(pwd)}@{host_name}/{db_name}"
        sqlEngine = create_engine(conn_str, pool_recycle=3600)
        # engine.begin() commits on success and always releases the
        # connection, including when to_sql or the ALTER statement raises.
        with sqlEngine.begin() as connection:
            pd_df.to_sql(table, con=connection, index=False, if_exists='replace')
            str_cred = input("Add key? (y/n)").strip().lower()
            if str_cred == "y":
                print("List of columns: ")
                print(*(pd_df.columns.tolist()), sep = ", " )
                answer = input("What is the column name: ").strip()
                # Accept only a real column (matched case-insensitively), so
                # arbitrary text never reaches the DDL statement.
                matches = [col for col in pd_df.columns if str(col).lower() == answer.lower()]
                if matches:
                    pk_column = matches[0]
                    # Engine.execute() no longer exists in SQLAlchemy 2.0;
                    # run the statement on the connection as a text() clause.
                    connection.execute(text(f"ALTER TABLE {table} ADD PRIMARY KEY ({pk_column});"))
                else:
                    print("Not a listed column, no key added")
    except Exception:
        print("Error, try again")
        writeData(pd_df)



In [None]:
# Start the interactive ETL session; main() prompts for input until it prints "exiting".
main()

Welcome to the data systems ETL progam 
 Note: the paratheses contain the strings the user should input

GET DATA


In [28]:
# Scratch cell: load housing.csv (path relative to the working directory).
pd_df = pd.read_csv("housing.csv")
# Bare expression, so the notebook shows the 'ocean_proximity' column as the cell output.
pd_df['ocean_proximity']

0        NEAR BAY
1        NEAR BAY
2        NEAR BAY
3        NEAR BAY
4        NEAR BAY
           ...   
20635      INLAND
20636      INLAND
20637      INLAND
20638      INLAND
20639      INLAND
Name: ocean_proximity, Length: 20640, dtype: object