In [6]:
import json
import glob
import pandas as pd

log_paths = glob.glob('./log_data/2018/11/*.json')


In [12]:
df = pd.read_json(log_paths[0], lines=True)
df.tail()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
197,,Logged In,Lily,F,0,Burns,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1540621000000.0,349,,200,1541629595796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",32
198,STRATOVARIUS,Logged In,Lily,F,1,Burns,350.74567,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1540621000000.0,349,Twilight Time,200,1541629614796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",32
199,The Mantles,Logged In,Lily,F,2,Burns,226.53342,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1540621000000.0,349,Don't Lie,200,1541629964796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",32
200,Tub Ring,Logged In,Kaleb,M,0,Cook,233.69098,free,"Yuba City, CA",PUT,NextSong,1540680000000.0,213,Invalid,200,1541632356796,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,54
201,Jack Johnson,Logged In,Kaleb,M,1,Cook,173.42649,free,"Yuba City, CA",PUT,NextSong,1540680000000.0,213,Wrong Turn,200,1541632589796,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,54


In [9]:
sample_track_df = pd.read_json(
    './song_data/A/A/A/TRAAAAW128F429D538.json', lines=True
)

sample_track_df.head()

Unnamed: 0,num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
0,1,ARD7TVE1187B99BFB1,,,California - LA,Casual,SOMZWCG12A8C13C480,I Didn't Mean To,218.93179,0


# SQL Queries

In [4]:
import psycopg2 as pg2
import json


try:
    with open('./db_credentials.json') as f:
        data = json.load(f)
        USER = data['username']
        PASS = data['password']
        NAME = data['name']
except Exception as e:
    print(e)

try:
    conn = pg2.connect(
        host='localhost',
        database=NAME,
        user=USER,
        password=PASS,
    )
    conn.set_session(autocommit=True)
    cur = conn.cursor()
except Exception as e:
    print(e)


# DROP and CREATE

In [12]:
drop_songplays = "DROP TABLE IF EXISTS songplays"

create_songplays = """
CREATE TABLE IF NOT EXISTS songplays (
    songplay_id SERIAL PRIMARY KEY,
    start_time TIMESTAMP,
    user_id INT,
    level VARCHAR(10),
    song_id VARCHAR(20),
    artist_id VARCHAR(20),
    session_id INT,
    location VARCHAR(50),
    user_agent VARCHAR(150)
)
"""

drop_users = "DROP TABLE IF EXISTS users"

create_users = """
CREATE TABLE IF NOT EXISTS users (
    user_id INT PRIMARY KEY,
    first_name VARCHAR(50),
    last_name VARCHAR(50),
    gender CHAR(1),
    level VARCHAR(10)
)
"""

drop_songs = "DROP TABLE IF EXISTS songs"

create_songs = """
CREATE TABLE IF NOT EXISTS songs (
    song_id VARCHAR(20) PRIMARY KEY,
    title VARCHAR(100),
    artist_id VARCHAR(20) NOT NULL,
    year INT,
    duration FLOAT(5)
)
"""

drop_artists = "DROP TABLE IF EXISTS artists"

create_artists = """
CREATE TABLE IF NOT EXISTS artists (
    artist_id VARCHAR(20) PRIMARY KEY,
    name VARCHAR(100),
    location VARCHAR(100),
    latitude FLOAT(5),
    longitude FLOAT(5)
)
"""

drop_time = "DROP TABLE IF EXISTS time"

create_time = """
CREATE TABLE IF NOT EXISTS time (
    start_time TIMESTAMP PRIMARY KEY,
    hour INT,
    day INT,
    week SMALLINT,
    month SMALLINT,
    year SMALLINT,
    weekday SMALLINT
)
"""

drop_queries = [drop_songplays, drop_songs, drop_artists, drop_time, drop_users]
create_queries = [create_songplays, create_songs, create_artists, create_time, create_users]

# Run DROP and CREATE

In [13]:
for query in drop_queries:
    "DROP TABLE IF EXISTS ..."
    try:
        cur.execute(query)
    except Exception as e:
        print(e)

for query in create_queries:
    "CREATE TABLE IF NOT EXISTS ..."
    try:
        cur.execute(query)
    except Exception as e:
        print(e)

# INSERT INTO

In [15]:
insert_songplays = ("""
INSERT INTO songplays (songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT(songplay_id) DO NOTHING;
""")

insert_users = ("""
INSERT INTO users (user_id, first_name, last_name, gender, level)
VALUES (%s, %s, %s, %s, %s) ON CONFLICT (user_id) DO UPDATE SET level = EXCLUDED.level;
""")

insert_songs = ("""
INSERT INTO songs (song_id, title, artist_id, year, duration)
VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING;
""")

insert_artists = ("""
INSERT INTO artists (artist_id, name, location, lattitude, longitude)
VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING;
""")


insert_time = ("""
INSERT INTO time (start_time, hour, day, week, month, year, weekday)
VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING;
""")

INSERT = {
    'songplays': insert_songplays,
    'users': insert_users,
    'time': insert_time,
    'artists': insert_artists,
    'sogns': insert_songs,
}

# Test Creation

In [14]:
test_songplays = "SELECT * FROM songplays"

test_users = "SELECT * FROM users"

test_artists = "SELECT * FROM artists"

test_songs = "SELECT * FROM songs"

test_time = "SELECT * FROM time"


test_queries = [test_songplays, test_users, test_artists, test_songs, test_time]
table_names = ['test_songplays', 'test_users', 'test_artists', 'test_songs', 'test_time']

for table_name, query in zip(table_names, test_queries):
    try:
        cur.execute(query)
        row = cur.fetchone()

        print('Table:', table_name)
        if not row:
            print('No data yet.\n')
        while row:
            print(row)
            row = cur.fetchone()
    except Exception as e:
        print(e)
    

Table: test_songplays
No data yet.

Table: test_users
No data yet.

Table: test_artists
No data yet.

Table: test_songs
No data yet.

Table: test_time
No data yet.



# SELECT specific song

In [16]:
select_song = """
SELECT songs.song_id, artists.artist_id FROM songs
JOIN artists ON songs.artist_id = artists.artist_id
WHERE songs.title = %s
AND artists.name = %s
AND songs.duration = %s
"""