Origin
  [OriginID] Integer not null primary key
  [Origin] Text not null
Destination
  [DestinationID] integer not null Primary key
  [Destination] Text not null
Trips
  [TripID] INTEGER NOT NULL PRIMARY KEY
  [pickup_datetime] TEXT NOT NULL
  [trip_distance] FLOAT NOT NULL
  [trip_duration] TEXT NOT NULL
  [OriginID] INTEGER NOT NULL REFERENCES Origin(OriginID)
  [DestinationID] INTEGER NOT NULL REFERENCES Destination(DestinationID))

In [1]:
### Utility Functions
from IPython.display import display, HTML
import pandas as pd
import sqlite3
from sqlite3 import Error

def create_connection(db_file, delete_db=False):
    import os
    if delete_db and os.path.exists(db_file):
        os.remove(db_file)

    conn = None
    try:
        conn = sqlite3.connect(db_file)
        conn.execute("PRAGMA foreign_keys = 1")
    except Error as e:
        print(e)

    return conn


def create_table(conn, create_table_sql, drop_table_name=None):
    
    if drop_table_name: # You can optionally pass drop_table_name to drop the table. 
        try:
            c = conn.cursor()
            c.execute("""DROP TABLE IF EXISTS %s""" % (drop_table_name))
        except Error as e:
            print(e)
    
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except Error as e:
        print(e)
        
def execute_sql_statement(sql_statement, conn):
    cur = conn.cursor()
    cur.execute(sql_statement)

    rows = cur.fetchall()

    return rows


In [2]:
def create_origin_table(data_filename, normalized_database_filename):
    conn = create_connection(normalized_database_filename)
    
    create_table_origin = '''CREATE TABLE IF NOT EXISTS Origin ( 
                         [OriginID] Integer not null primary key,
                         [Origin] Text not null);
                         ''' 
    
    row_count = 0
    origins = []
    with open(data_filename) as file:
        
        for line in file:
            if row_count ==0:
                columns = (line.strip().split(","))
                row_count+=1
            else:
                origins.append(line.strip().split(",")[1].strip())
                row_count+=1
        
        origins_unique = sorted(set(origins))
        origins_final = list(set(zip(range(1,len(origins_unique)+1),origins_unique)))
        origins_final=sorted(origins_final)
        
    with conn:
        create_table(conn, create_table_origin)
        insert_Origin(conn,origins_final)
        
def insert_Origin(conn,values):
        
        sql_insert_origin = """INSERT INTO Origin(OriginID,Origin)
                                VALUES(?,?)"""
        curr = conn.cursor()
        curr.executemany(sql_insert_origin,values)
        return curr.lastrowid

In [3]:
def create_destination_table(data_filename, normalized_database_filename):
    conn = create_connection(normalized_database_filename)
    
    create_table_destinations = '''CREATE TABLE IF NOT EXISTS Destination ( 
                                 [DestinationID] Integer not null primary key,
                                 [Destination] Text not null);
                         ''' 
    
    row_count = 0
    destinations = []

    with open(data_filename) as file:
        
        for line in file:
            if row_count ==0:
                columns = (line.strip().split(","))
                row_count+=1
            else:
                destinations.append(line.strip().split(",")[2].strip())
                row_count+=1
        
        destinations_unique = sorted(set(destinations))
        destinations_final = list(set(zip(range(1,len(destinations_unique)+1),destinations_unique)))
        destinations_final=sorted(destinations_final)
        
    with conn:
        create_table(conn, create_table_destinations)
        insert_destinations(conn,destinations_final)
        
def insert_destinations(conn,values):
        
        sql_insert_destinations = """INSERT INTO Destination(DestinationID,Destination)
                                VALUES(?,?)"""
        curr = conn.cursor()
        curr.executemany(sql_insert_destinations,values)
        return curr.lastrowid

In [4]:
def origin_to_originid_dictionary(normalized_database_filename,conn):
    
    
    # YOUR CODE HERE
    origins = execute_sql_statement("SELECT OriginID,Origin FROM Origin",conn)
    origin_dict = dict()
    for item in origins:
        origin_dict[item[1]] = item[0]
        
    return origin_dict
        

In [5]:
def destination_to_destinationid_dictionary(normalized_database_filename,conn):
    
    
    # YOUR CODE HERE
    destination = execute_sql_statement("SELECT DestinationID,Destination FROM Destination",conn)
    destination_dict = dict()
    for item in destination:
        destination_dict[item[1]] = item[0]
        
    return destination_dict

In [6]:
def create_trips_table(data_filename, normalized_database_filename):
    conn = create_connection(normalized_database_filename)
    
    create_table_trips = '''CREATE TABLE IF NOT EXISTS Trips( 
                                 [TripID] INTEGER NOT NULL PRIMARY KEY,
                                 [pickup_datetime] TEXT NOT NULL,
                                 [trip_distance] FLOAT NOT NULL,
                                 [trip_duration] TEXT NOT NULL,
                                 [OriginID] INTEGER NOT NULL REFERENCES Origin(OriginID),
                                 [DestinationID] INTEGER NOT NULL REFERENCES Destination(DestinationID));
                         ''' 
    
    row_count = 0
    trips = []
    destination_dict = destination_to_destinationid_dictionary(normalized_database_filename,conn)
    origin_dict = origin_to_originid_dictionary(normalized_database_filename,conn)
    with open(data_filename) as file:
        
        for line in file:
            if row_count ==0:
                columns = (line.strip().split(","))
                row_count+=1
            else:
                a = line.strip().split(',')
                values = (a[3],a[4],a[5],origin_dict[a[1]],destination_dict[a[2]])
                trips.append(values)
                row_count+=1
        
        
    with conn:
        create_table(conn, create_table_trips)
        insert_Trip(conn,trips)
        

def insert_Trip(conn, values):
    sql = '''INSERT INTO Trips(pickup_datetime, trip_distance,trip_duration,OriginID,DestinationID)
          VALUES(?,?,?,?,?) '''
    cur = conn.cursor()
    cur.executemany(sql, values)
    return cur.lastrowid

In [8]:
data_filename = "uber_nyc_data.csv"
normalized_database_filename = 'normalized1.db'
create_origin_table(data_filename, normalized_database_filename)
create_destination_table(data_filename, normalized_database_filename)
create_trips_table(data_filename, normalized_database_filename)

In [9]:
                
def convertDateTime(non_normalized_db_filename):
    """
    Function that takes a string in the format yyyy-mm-dd hh:mm:ss, and
    returns the same as a datetime object.
    """
    conn = create_connection(non_normalized_db_filename)
 
    sql_statement = "SELECT TripID,pickup_datetime from Trips"
    df = pd.read_sql_query(sql_statement, conn)
    df['pickup_dt'] = df['pickup_datetime'].astype('datetime64[ns]')
    df['date'] = df['pickup_dt'].dt.date
    df['year'] = df['pickup_dt'].dt.year
    df['month'] = df['pickup_dt'].dt.month
    df['day'] = df['pickup_dt'].dt.day
    df['hour'] = df['pickup_dt'].dt.hour 
    df['weekday'] = df['pickup_dt'].dt.dayofweek

    return df.head()

In [10]:
def ConvertDurationToMinutes(time_str):
    mins_split = time_str.split(':')
    val = int(mins_split[0])*60 + int(mins_split[1]) + int(mins_split[2])/60.0
    return val

In [11]:
                
def convertTripDuration(non_normalized_db_filename):
    """
    Function that takes a string in the format yyyy-mm-dd hh:mm:ss, and
    returns the same as a datetime object.
    """
    conn = create_connection(non_normalized_db_filename)
 
    sql_statement = "SELECT TripID,trip_duration from Trips"
    df = pd.read_sql_query(sql_statement, conn)
    df['trip_duration'].fillna(value=0, inplace=True)
    df['trip_duration'] = df['trip_duration'].replace('NULL',0)
    df['duration_in_mins']=df['trip_duration'].apply(ConvertDurationToMinutes)
    return df.head()

In [12]:
convertDateTime("normalized1.db")
convertTripDuration("normalized1.db")

AttributeError: 'int' object has no attribute 'split'