# Import Python Packages

In [None]:
# Import Python packages 
import datetime
import pandas as pd
import os
import glob
import psycopg2

# Connection Set up 

## DB Params

In [None]:
# Connection settings. Each value can be supplied through an environment
# variable of the same name so credentials do not have to be written into the
# notebook; when the variable is unset the previous hard-coded default is used.
DB_ENDPOINT = os.environ.get("DB_ENDPOINT", "")   # server host name / address
DEFAULT_DB = 'postgres'                           # database used for the first connection
DB = os.environ.get("DB", '')                     # project database that gets (re)created
DB_USER = os.environ.get("DB_USER", '')
DB_PASSWORD = os.environ.get("DB_PASSWORD", '')
DB_PORT = os.environ.get("DB_PORT", '5432')       # kept as a string, as before

## Create a connection to Default Database

In [None]:
# Open a session on the default database; a psycopg2 error is printed rather
# than raised.
try:
    conn = psycopg2.connect(
        host=DB_ENDPOINT,
        port=DB_PORT,
        dbname=DEFAULT_DB,
        user=DB_USER,
        password=DB_PASSWORD,
    )
    print("Connection established")
    # Autocommit: every statement is committed on its own, without an
    # explicit conn.commit().
    conn.set_session(autocommit=True)
    print("Connection is in Auto Commit")
except psycopg2.Error as e:
    print("Error: Could not make connection to the Default Postgres database")
    print(e)

## Create Cursor

In [None]:
# Obtain a cursor on the open connection; on a driver error print the message
# followed by the error itself.
try:
    cursor = conn.cursor()
except psycopg2.Error as cursor_error:
    print("Error: Could not get cursor to the Database", cursor_error, sep="\n")

## Create New DB and reset connection 

In [None]:
# psycopg2.sql is a submodule that `import psycopg2` does not load by itself.
from psycopg2 import sql

# A database name is an SQL identifier, not a value: it cannot be bound with a
# %s placeholder (that would render it as a quoted string literal).
# sql.Identifier quotes it safely as an identifier instead.
try:
    query = sql.SQL("DROP DATABASE IF EXISTS {}").format(sql.Identifier(DB))
    cursor.execute(query)
    query = sql.SQL("CREATE DATABASE {}").format(sql.Identifier(DB))
    cursor.execute(query)
except psycopg2.Error as e:
    print("Error: Issue creating DataBase")
    print(e)

### Reset connection to the Desired DB

In [None]:
# Release the session on the default database before switching databases.
cursor.close()
conn.close()

# Reconnect, this time to the project database named by DB.
try:
    conn = psycopg2.connect(
        host=DB_ENDPOINT,
        port=DB_PORT,
        dbname=DB,
        user=DB_USER,
        password=DB_PASSWORD,
    )
    print("Connection established")
    # Autocommit: every statement is committed on its own.
    conn.set_session(autocommit=True)
    print("Connection is in Auto Commit")
except psycopg2.Error as e:
    print(f"Error: Could not make connection to the {DB} database")
    print(e)

# Fresh cursor bound to the new connection.
try:
    cursor = conn.cursor()
except psycopg2.Error as e:
    print("Error: Could not get cursor to the Database")
    print(e)

# Clean Data, Drop Inconsistent Data, Drop Unused Data

In [None]:
# Display floats with thousands separators and two decimals, e.g. 2.15
pd.set_option('display.float_format', '{:,.2f}'.format)

# Read each source file into its own DataFrame.
data_frame_riders = pd.read_csv(filepath_or_buffer='./data/riders.csv')
data_frame_payments = pd.read_csv(filepath_or_buffer='./data/payments.csv')
data_frame_stations = pd.read_csv(filepath_or_buffer='./data/stations.csv')
data_frame_trips = pd.read_csv(filepath_or_buffer='./data/trips.csv')

## Explore the Data & Check for Consistency

### Riders CSV 

In [None]:
# Riders at a glance: dimensions and column labels, then one randomly drawn row.
print(data_frame_riders.shape, data_frame_riders.columns, sep="\n")
print(data_frame_riders.sample(n=1))

In [None]:
# Missing cells in the riders frame: whether any exist, then how many.
print(f'Missing values for data set?: {data_frame_riders.isna().values.any()}')
print(f'Missing values for data set?: {data_frame_riders.isna().values.sum()}')
# Row dropping is disabled below, so the next two reports repeat the ones above.
#data_frame_riders.dropna(inplace=True)
print(f'Missing values for data set?: {data_frame_riders.isna().values.any()}')
print(f'Missing values for data set?: {data_frame_riders.isna().values.sum()}')
# Fully duplicated rows (reported only, nothing is removed): any, then count.
print(f'duplicates values for data set?: {data_frame_riders.duplicated().values.any()}')
print(f'duplicates values for data set?: {data_frame_riders.duplicated().values.sum()}')
# duplicated must be called; the bare attribute only prints the bound method.
print(data_frame_riders.duplicated())

#### Check data after cleaning

In [None]:
# Re-inspect riders: dimensions and column labels, then one randomly drawn row.
print(data_frame_riders.shape, data_frame_riders.columns, sep="\n")
print(data_frame_riders.sample(n=1))

#### Change Columns type to the desired format

In [None]:
# Summary statistics as produced by describe()'s defaults.
print(data_frame_riders.describe())
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line.
data_frame_riders.info()

In [None]:
# data_frame["itemInSession"] = pd.to_numeric(data_frame["itemInSession"])
# data_frame["length"] = pd.to_numeric(data_frame["length"])
# data_frame["sessionId"] = pd.to_numeric(data_frame["sessionId"])
# data_frame["userId"] = pd.to_numeric(data_frame["userId"],downcast='integer' )

#### See the new data types and if the data is ready

In [None]:
#print(data_frame.describe())
#print(data_frame.info())

### Payments CSV 

In [None]:
# Payments at a glance: dimensions and column labels, then one randomly drawn row.
print(data_frame_payments.shape, data_frame_payments.columns, sep="\n")
print(data_frame_payments.sample(n=1))

In [None]:
# Missing cells in the payments frame: whether any exist, then how many.
print(f'Missing values for data set?: {data_frame_payments.isna().values.any()}')
print(f'Missing values for data set?: {data_frame_payments.isna().values.sum()}')
# Row dropping is disabled below, so the next two reports repeat the ones above.
#data_frame_payments.dropna(inplace=True)
print(f'Missing values for data set?: {data_frame_payments.isna().values.any()}')
print(f'Missing values for data set?: {data_frame_payments.isna().values.sum()}')
# Fully duplicated rows (reported only, nothing is removed): any, then count.
print(f'duplicates values for data set?: {data_frame_payments.duplicated().values.any()}')
print(f'duplicates values for data set?: {data_frame_payments.duplicated().values.sum()}')
# duplicated must be called; the bare attribute only prints the bound method.
print(data_frame_payments.duplicated())

#### Check data after cleaning

In [None]:
# Re-inspect payments: dimensions and column labels, then one randomly drawn row.
print(data_frame_payments.shape, data_frame_payments.columns, sep="\n")
print(data_frame_payments.sample(n=1))

#### Change Columns type to the desired format

In [None]:
# Summary statistics as produced by describe()'s defaults.
print(data_frame_payments.describe())
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line.
data_frame_payments.info()

### Stations CSV 

In [None]:
# Stations at a glance: dimensions and column labels, then one randomly drawn row.
print(data_frame_stations.shape, data_frame_stations.columns, sep="\n")
print(data_frame_stations.sample(n=1))

In [None]:
# Missing cells in the stations frame: whether any exist, then how many.
print(f'Missing values for data set?: {data_frame_stations.isna().values.any()}')
print(f'Missing values for data set?: {data_frame_stations.isna().values.sum()}')
# Row dropping is disabled below, so the next two reports repeat the ones above.
#data_frame_stations.dropna(inplace=True)
print(f'Missing values for data set?: {data_frame_stations.isna().values.any()}')
print(f'Missing values for data set?: {data_frame_stations.isna().values.sum()}')
# Fully duplicated rows (reported only, nothing is removed): any, then count.
print(f'duplicates values for data set?: {data_frame_stations.duplicated().values.any()}')
print(f'duplicates values for data set?: {data_frame_stations.duplicated().values.sum()}')
# duplicated must be called; the bare attribute only prints the bound method.
print(data_frame_stations.duplicated())

#### Check data after cleaning

In [None]:
# Re-inspect stations: dimensions and column labels, then one randomly drawn row.
print(data_frame_stations.shape, data_frame_stations.columns, sep="\n")
print(data_frame_stations.sample(n=1))

#### Change Columns type to the desired format

In [None]:
# Summary statistics as produced by describe()'s defaults.
print(data_frame_stations.describe())
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line.
data_frame_stations.info()

### Trips CSV 

In [None]:
# Trips at a glance: dimensions and column labels, then one randomly drawn row.
print(data_frame_trips.shape, data_frame_trips.columns, sep="\n")
print(data_frame_trips.sample(n=1))

In [None]:
# Missing cells in the trips frame: whether any exist, then how many.
print(f'Missing values for data set?: {data_frame_trips.isna().values.any()}')
print(f'Missing values for data set?: {data_frame_trips.isna().values.sum()}')
# Row dropping is disabled below, so the next two reports repeat the ones above.
#data_frame_trips.dropna(inplace=True)
print(f'Missing values for data set?: {data_frame_trips.isna().values.any()}')
print(f'Missing values for data set?: {data_frame_trips.isna().values.sum()}')
# Fully duplicated rows (reported only, nothing is removed): any, then count.
print(f'duplicates values for data set?: {data_frame_trips.duplicated().values.any()}')
print(f'duplicates values for data set?: {data_frame_trips.duplicated().values.sum()}')
# duplicated must be called; the bare attribute only prints the bound method.
print(data_frame_trips.duplicated())

#### Check data after cleaning

In [None]:
# Re-inspect trips: dimensions and column labels, then one randomly drawn row.
print(data_frame_trips.shape, data_frame_trips.columns, sep="\n")
print(data_frame_trips.sample(n=1))

#### Change Columns type to the desired format

In [None]:
# Summary statistics as produced by describe()'s defaults.
print(data_frame_trips.describe())
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line.
data_frame_trips.info()

# Populating Cleaned Data 

## Automation Functions

In [None]:
# Helper functions
def table_recreate(cursor, tableName: str, tableFields: str):
    """Drop ``tableName`` if it exists, then create it again from ``tableFields``.

    Both strings are spliced into the SQL text as-is (no quoting or escaping),
    so pass only trusted, hard-coded values.

    Args:
        cursor: Open database cursor used to execute both statements.
        tableName: Name of the table to drop and re-create.
        tableFields: Parenthesised column/constraint list appended after
            ``CREATE TABLE <tableName> ``.

    Example:
        table_recreate(cursor, "riders",
                       "(rider_id INTEGER PRIMARY KEY, first VARCHAR(50))")

    A ``psycopg2.Error`` raised by either statement is printed, not re-raised.
    """
    try:
        cursor.execute(f"DROP TABLE IF EXISTS {tableName};")
        cursor.execute(f"CREATE TABLE {tableName} {tableFields}")
        print(f"Finished creating table {tableName}")
    except psycopg2.Error as e:
        print(f"Error: Couldn't recreate the table: {tableName}, something went wrong")
        print(e)

def populate_table(c, filename, tablename):
    """Bulk-load a comma-separated text file into ``tablename``.

    The file is streamed through the cursor's ``copy_from`` with ``,`` as the
    field separator and the empty string as the NULL marker, then committed.

    Args:
        c: An open psycopg2 cursor, or a connection. When a connection is
            given, a temporary cursor is opened for the load and closed
            afterwards; a caller-supplied cursor is never closed here.
        filename: Path of the file to load.
        tablename: Name of the destination table.

    Errors from opening the file propagate to the caller. Any error during
    the load or commit is printed and the transaction is rolled back; the
    "Finished" message is printed only on success.

    NOTE(review): ``copy_from`` splits on the separator only; confirm the
    input files contain no quoted fields with embedded commas.
    """
    # A cursor exposes its connection as ``.connection``; anything without
    # that attribute is treated as the connection itself.
    connection = getattr(c, "connection", c)
    with open(filename, 'r') as source_file:
        db_cursor = connection.cursor() if connection is c else c
        try:
            db_cursor.copy_from(source_file, tablename, sep=",", null="")
            connection.commit()
        except Exception as error:
            # Broad on purpose: the load is best-effort and reports any failure.
            print("Error: %s" % error)
            connection.rollback()
        else:
            print("Finished populating {0}".format(tablename))
        finally:
            if db_cursor is not c:
                db_cursor.close()


def generate_header(signal_header_dict: dict) -> pd.DataFrame:
    """Build a one-row DataFrame from a mapping of header attributes.

    Each key becomes a column and each value is wrapped in a single-element
    list, i.e. the frame is built from ``{key: [value], ...}``.
    """
    header_dict = {key: [value] for key, value in signal_header_dict.items()}
    return pd.DataFrame(header_dict)


## Create Tables & Insert data

### Riders Table 

#### Create

In [None]:
# The table name must be a string; the bare name `riders` is undefined here.
TABLE = 'riders'
# Column definitions appended after "CREATE TABLE riders ".
FIELDS = """(rider_id INTEGER PRIMARY KEY,
                    first VARCHAR(50),
                    last VARCHAR(50), 
                    address VARCHAR(100), 
                    birthday DATE, 
                    account_start_date DATE, 
                    account_end_date DATE, 
                    is_member BOOLEAN);"""
table_recreate(cursor = cursor, tableName = TABLE, tableFields = FIELDS )

#### Populate