# ETL processes

In [1]:
import os
import glob
import psycopg2
import pandas as pd

In [14]:
from sql import create_table_queries

In [2]:
def get_files(filepath):
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root,'*.json'))
        for f in files :
            all_files.append(os.path.abspath(f))
    
    return all_files

## Setup DB connection and tables

In [35]:
conn = psycopg2.connect("host=postgres-db dbname=udacity user=udacity password=udacity")
cur = conn.cursor()

In [36]:
for query in create_table_queries:
    cur.execute(query)
conn.commit()

In [37]:
cur.execute("""SELECT table_name FROM information_schema.tables
       WHERE table_schema = 'public'""")
for table in cur.fetchall():
    print(table)

('artists',)
('songs',)
('time',)
('songplays',)
('users',)


## Process `song_data`

First we set a list of all song data filepaths, and inspect one of the json file.

In [3]:
song_files = get_files('./data/song_data')

In [5]:
sample_file_path = song_files[0]
df = pd.DataFrame(pd.read_json(sample_file_path, typ='series', convert_dates=False))
df.values

array([[1],
       ['AR7G5I41187FB4CE6C'],
       [None],
       [None],
       ['London, England'],
       ['Adam Ant'],
       ['SONHOTT12A8C13493C'],
       ['Something Girls'],
       [233.40363],
       [1982]], dtype=object)

Then we create a combined dataframe with all the log data

In [81]:
dfs = []
for file in song_files:
    dfs.append(pd.DataFrame([pd.read_json(file, typ='series', convert_dates=False)]))

Then we set artist_id as index and deduplicate, create a list of tuples to be inserted

In [90]:
artist_data_list = []
result = pd.concat(dfs)
result = result.reset_index(drop=True)
result = result.set_index('artist_id',  drop=False)
result = result[~result.index.duplicated(keep='first')]
for value in result.values:
        num_songs, artist_id, artist_latitude, artist_longitude, artist_location, artist_name, song_id, title, duration, year = value
        artist_data = (artist_id, artist_name, artist_location, artist_latitude, artist_longitude)
        artist_data_list.append(artist_data)

In [91]:
conn = psycopg2.connect("host=postgres-db dbname=udacity user=udacity password=udacity")
cur = conn.cursor()
args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s)", x).decode("utf-8") for x in artist_data_list)
cur.execute("INSERT INTO artists VALUES " + args_str)
conn.commit()

In [94]:
conn = psycopg2.connect("host=postgres-db dbname=udacity user=udacity password=udacity")
cur = conn.cursor()
cur.execute("SELECT * FROM artists")
artist_records = cur.fetchall()
artist_records

[('AR7G5I41187FB4CE6C',
  'Adam Ant',
  'London, England',
  Decimal('NaN'),
  Decimal('NaN')),
 ('AR8ZCNI1187B9A069B',
  'Planet P Project',
  '',
  Decimal('NaN'),
  Decimal('NaN')),
 ('ARXR32B1187FB57099', 'Gob', '', Decimal('NaN'), Decimal('NaN')),
 ('AR10USD1187B99F3F1',
  'Tweeterfriendly Music',
  'Burlington, Ontario, Canada',
  Decimal('NaN'),
  Decimal('NaN')),
 ('ARMJAGH1187FB546F3',
  'The Box Tops',
  'Memphis, TN',
  Decimal('35.149680'),
  Decimal('-90.048920')),
 ('ARD7TVE1187B99BFB1',
  'Casual',
  'California - LA',
  Decimal('NaN'),
  Decimal('NaN')),
 ('ARKRRTF1187B9984DA',
  'Sonora Santanera',
  '',
  Decimal('NaN'),
  Decimal('NaN')),
 ('ARNTLGG11E2835DDB9', 'Clp', '', Decimal('NaN'), Decimal('NaN')),
 ('ARKFYS91187B98E58F',
  'Jeff And Sheri Easter',
  '',
  Decimal('NaN'),
  Decimal('NaN')),
 ('ARD0S291187B9B7BF5', 'Rated R', 'Ohio', Decimal('NaN'), Decimal('NaN')),
 ('ARH4Z031187B9A71F2',
  'Faye Adams',
  'Newark, NJ',
  Decimal('40.731970'),
  Decimal('-74.1

Similarly, we get all the song data. But this time we don't need to deduplicate.

In [95]:
song_data_list = []
result = pd.concat(dfs)
for value in result.values:
        num_songs, artist_id, artist_latitude, artist_longitude, artist_location, artist_name, song_id, title, duration, year = value
        song_data = (song_id, title, artist_id, year, duration)
        song_data_list.append(song_data)

In [96]:
conn = psycopg2.connect("host=postgres-db dbname=udacity user=udacity password=udacity")
cur = conn.cursor()
args_str = ','.join(cur.mogrify("(%s,%s,%s,%s,%s)", x).decode("utf-8") for x in song_data_list)
cur.execute("INSERT INTO songs VALUES " + args_str)
conn.commit()

In [97]:
conn = psycopg2.connect("host=postgres-db dbname=udacity user=udacity password=udacity")
cur = conn.cursor()
cur.execute("SELECT * FROM songs")
song_records = cur.fetchall()
song_records

[('SONHOTT12A8C13493C',
  'Something Girls',
  'AR7G5I41187FB4CE6C',
  1982,
  233.40363),
 ('SOIAZJW12AB01853F1', 'Pink World', 'AR8ZCNI1187B9A069B', 1984, 269.81832),
 ('SOFSOCN12A8C143F5D',
  'Face the Ashes',
  'ARXR32B1187FB57099',
  2007,
  209.60608),
 ('SOHKNRJ12A6701D1F8', 'Drop of Rain', 'AR10USD1187B99F3F1', 0, 189.57016),
 ('SOCIWDW12A8C13D406', 'Soul Deep', 'ARMJAGH1187FB546F3', 1969, 148.03546),
 ('SOMZWCG12A8C13C480',
  "I Didn't Mean To",
  'ARD7TVE1187B99BFB1',
  0,
  218.93179),
 ('SOXVLOJ12AB0189215', 'Amor De Cabaret', 'ARKRRTF1187B9984DA', 0, 177.47546),
 ('SOUDSGM12AC9618304',
  'Insatiable (Instrumental Version)',
  'ARNTLGG11E2835DDB9',
  0,
  266.39628),
 ('SOYMRWW12A6D4FAB14',
  'The Moon And I (Ordinary Day Album Version)',
  'ARKFYS91187B98E58F',
  0,
  267.7024),
 ('SOMJBYD12A6D4F8557',
  'Keepin It Real (Skit)',
  'ARD0S291187B9B7BF5',
  0,
  114.78159),
 ('SOVYKGO12AB0187199',
  'Crazy Mixed Up World',
  'ARH4Z031187B9A71F2',
  1961,
  156.39465),
 ('SOGN

## Process log data