# ETL Processes

In [None]:
import os
import glob
import psycopg2
from datetime import datetime
import pandas as pd
from sql_queries import *

In [None]:

"""
    PLEASE INSERT YOUR POSTGRESQL USERNAME, PASSWORD AND DATABASE BELOW
"""

In [None]:
# Open the database connection (replace the *** placeholders first) and create
# the cursor used by the cells below to execute statements.
dsn = "host=127.0.0.1 dbname=*** user=*** password=***"
conn = psycopg2.connect(dsn)
cur = conn.cursor()

In [None]:
def get_files(filepath):
    """Return the absolute paths of all ``*.json`` entries under *filepath*.

    The directory tree rooted at *filepath* is walked recursively and, for
    every directory visited, the entries matching ``*.json`` are collected.

    Args:
        filepath: Root directory to search.

    Returns:
        A list of absolute path strings. The list is empty when nothing
        matches or when *filepath* does not exist.
    """
    all_files = []
    for root, _dirs, _files in os.walk(filepath):
        # Escape the directory part so glob metacharacters in a folder name
        # (e.g. "run[1]") are matched literally instead of as a pattern.
        pattern = os.path.join(glob.escape(root), '*.json')
        all_files.extend(os.path.abspath(match) for match in glob.glob(pattern))

    return all_files

# Process `song_data`

In [None]:
# Root directory that is searched (via get_files) for song JSON files.
song_files = 'data/song_data'

In [None]:
# NOTE: despite the singular name, `filepath` holds the list of paths returned by get_files.
filepath = get_files(song_files)

In [None]:
# Load only the first song file that was found; lines=True reads one JSON record per line.
df = pd.read_json(filepath[0], lines=True)
df.head()

## #1: `songs` Table
#### Extract Data for Songs Table

In [None]:
# Take the first row of the song frame, restricted to the columns that are
# passed to the song insert statement, as a plain list.
song_columns = ["song_id", "title", "artist_id", "year", "duration"]
first_song_row = df[song_columns].values[0]
song_data = list(first_song_row)
song_data

#### Insert Record into Song Table

In [None]:
# Run the song insert statement with the extracted row values, then commit.
# song_table_insert is not defined in this notebook; presumably it comes from
# the `sql_queries` star import — verify there.
cur.execute(song_table_insert, song_data)
conn.commit()

## #2: `artists` Table
#### Extract Data for Artists Table

In [None]:
# Take the first row of the song frame, restricted to the artist columns that
# are passed to the artist insert statement, as a plain list.
artist_columns = [
    "artist_id",
    "artist_name",
    "artist_location",
    "artist_latitude",
    "artist_longitude",
]
first_artist_row = df[artist_columns].values[0]
artist_data = list(first_artist_row)
artist_data

#### Insert Record into Artist Table

In [None]:
# Run the artist insert statement with the extracted row values, then commit.
# artist_table_insert is not defined in this notebook; presumably it comes from
# the `sql_queries` star import — verify there.
cur.execute(artist_table_insert, artist_data)
conn.commit()

In [None]:
# Extra commit: when the cells run in order, nothing is executed between the
# previous cell's commit and this one.
conn.commit()

# Process `log_data`

In [None]:
# Root directory that is searched (via get_files) for log JSON files.
log_files = 'data/log_data'

In [None]:
# Rebinds `filepath` to the list of log file paths (it previously held the song file paths).
filepath = get_files(log_files)

In [None]:
# Load a single log file (the second path in the list); lines=True reads one
# JSON record per line. This rebinds `df`, replacing the song frame.
df = pd.read_json(filepath[1], lines=True)
df.head()

## #3: `time` Table
#### Extract Data for Time Table

In [None]:
# Keep only the rows whose page is "NextSong". The explicit copy makes the
# result an independent frame, so the columns assigned to it later do not
# trigger pandas' SettingWithCopyWarning.
df = df[df["page"] == "NextSong"].copy()
df.head()

In [None]:
# Convert `ts` (treated as epoch milliseconds) to naive UTC timestamps at
# whole-second resolution. The floor division drops the millisecond part, as
# before; the conversion itself is vectorized and no longer relies on
# datetime.utcfromtimestamp, which is deprecated since Python 3.12.
t = pd.to_datetime(df['ts'] // 1000, unit='s')
t.head()

In [None]:
# Attach the timestamp and its calendar components to the log frame.
df['start_time'] = t
# These components are plain attributes of each timestamp value.
for part in ('hour', 'day', 'week', 'month', 'year'):
    df[part] = t.apply(lambda stamp, name=part: getattr(stamp, name))
# The weekday name comes from a method call rather than an attribute.
df['weekday'] = t.apply(lambda stamp: stamp.day_name())

In [None]:
# Columns of the time frame, in the order they are passed to the insert statement.
column_labels = [
    "start_time",
    "hour",
    "day",
    "week",
    "month",
    "year",
    "weekday",
]
time_df = df.loc[:, column_labels]

time_df.head()

#### Insert Records into Time Table

In [None]:
# Insert every row of the time frame. The commit is issued once after the
# loop instead of once per row, so the load is a single transaction rather
# than one transaction per record.
for i, row in time_df.iterrows():
    cur.execute(time_table_insert, list(row))
conn.commit()

## #4: `users` Table
#### Extract Data for Users Table

In [None]:
# Select the user columns from the log frame and rename the camelCase ones to
# the snake_case names used for the users table.
user_cols = ["userId", "firstName",
             "lastName", "gender", "level"]
user_df = df[user_cols].rename(columns={
    "userId": "user_id",
    "firstName": "first_name",
    "lastName": "last_name"
    })
user_df["user_id"] = user_df["user_id"].astype(int)
# Preview the frame that was just built (previously this showed `df`).
user_df.head()

#### Insert Records into Users Table

In [None]:
# Insert every row of the user frame. Each row is converted to a plain list
# (as the time insert cell does) so the driver receives an ordinary positional
# sequence instead of a label-indexed pandas Series. The commit is issued once
# after the loop rather than once per row.
for i, row in user_df.iterrows():
    cur.execute(user_table_insert, list(row))
conn.commit()

## #5: `songplays` Table
#### Extract Data for Songplays Table
#### Insert Records into Songplays Table

In [None]:
# For every log row: look up the matching song/artist ids, then insert one
# songplay record. The commit is issued once after the loop.
for index, row in df.iterrows():

    # get songid and artistid from song and artist tables
    # NOTE(review): the lookup query is built with str.format, and apostrophes
    # are stripped from the title/artist so the generated SQL stays valid. That
    # means values containing an apostrophe are looked up without it. Passing
    # the values as query parameters would be safer, but that requires changing
    # the placeholder style of `song_select` where it is defined.
    lookup_query = song_select.format(
        row.song.replace("'", ""), row.artist.replace("'", ""), row.length
    )
    cur.execute(lookup_query)
    results = cur.fetchone()

    # No matching row -> store NULLs for both ids.
    songid, artistid = results if results else (None, None)

    # insert songplay record
    songplay_data = (row.ts, row.start_time, int(row.userId), row.level, songid, artistid,
                     row.sessionId, row.location, row.userAgent)
    cur.execute(songplay_table_insert, songplay_data)

conn.commit()

# Close Connection to Sparkify Database

In [None]:
# Close the database connection that was opened near the top of the notebook.
conn.close()