In [10]:
import os
import glob
import pandas as pd

In [11]:
def get_files(filepath):
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root,'*.json'))
        for f in files :
            all_files.append(os.path.abspath(f))
    
    return all_files

# Process `song_data`
In the initial phase of the ETL process, the `song_data` dataset is utilized to populate the `songs` and `artists` dimensional tables.

In [12]:
song_files = get_files('data/song_data')

In [13]:
song_files[0]

'/home/div/Data-Engineering-projects/postgres_ETL/data/song_data/A/B/B/TRABBBV128F42967D7.json'

In [14]:
df = pd.read_json(song_files[0], lines=True)
df.head()

Unnamed: 0,num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
0,1,AR7SMBG1187B9B9066,,,,Los Manolos,SOBCOSW12A8C13D398,Rumba De Barcelona,218.38322,0


# 1. `songs` Table

Extract the necessary data for the `songs` table by selecting the columns `song_id`, `title`, `artist_id`, `year`, and `duration`. Use `df.values` to retrieve only the values from these columns in the DataFrame.

In [15]:
song_data = list(df[['song_id', 'title', 'artist_id', 'year', 'duration']].values[0])
song_data

['SOBCOSW12A8C13D398',
 'Rumba De Barcelona',
 'AR7SMBG1187B9B9066',
 0,
 218.38322]

In [16]:
songs_df = df[['song_id', 'title', 'artist_id', 'year', 'duration']]
songs_df.head()

Unnamed: 0,song_id,title,artist_id,year,duration
0,SOBCOSW12A8C13D398,Rumba De Barcelona,AR7SMBG1187B9B9066,0,218.38322


# 2. `artists` Table

Extract the data required for the `artists` table by selecting the columns `artist_id`, `artist_name`, and `artist_location`. Use `df.values` to obtain only the values from these specified columns in the DataFrame. Additionally, select only the first record from the DataFrame by indexing appropriately.

In [17]:
artist_data = list(df[['artist_id', 'artist_name', 'artist_location',
                       'artist_latitude', 'artist_longitude']].values[0])
artist_data

['AR7SMBG1187B9B9066', 'Los Manolos', '', nan, nan]

In [18]:
artists_df = df[['artist_id', 'artist_name', 'artist_location',
                       'artist_latitude', 'artist_longitude']]
artists_df.head()

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude
0,AR7SMBG1187B9B9066,Los Manolos,,,


# Process `log_data`

In this section, the ETL process is applied to the second dataset, `log_data`, to populate the `time` and `users` dimensional tables, as well as the `songplays` fact table.

The ETL is executed on a single log file, and a single record is inserted into each table. Utilize the `get_files` function detailed earlier to retrieve a list of all JSON log files from the `data/log_data` directory.

In [19]:
log_files = get_files('data/log_data')

In [20]:
log_files[0]

'/home/div/Data-Engineering-projects/postgres_ETL/data/log_data/2018/11/2018-11-23-events.json'

In [21]:
df = pd.read_json(log_files[0], lines=True)
df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Great Lake Swimmers,Logged In,Kevin,M,0,Arellano,215.11791,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540007000000.0,815,Your Rocky Spine,200,1542931645796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66
1,Soziedad Alkoholika,Logged In,Kevin,M,1,Arellano,204.7473,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540007000000.0,815,Va Bien,200,1542931860796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66
2,Franz Ferdinand,Logged In,Kevin,M,2,Arellano,172.01587,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540007000000.0,815,Eleanor Put Your Boots On,200,1542932064796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66
3,Modest Mouse,Logged In,Kevin,M,3,Arellano,209.52771,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540007000000.0,815,Float On,200,1542932236796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66
4,Adam Lambert,Logged In,Kevin,M,4,Arellano,266.44853,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540007000000.0,815,Aftermath,200,1542932445796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66


# 3. `time` table

To prepare data for the `time` table, begin by filtering the records to include only those with the `NextSong` action. Then, convert the `ts` column, which contains timestamps, to datetime format. From this datetime-converted `ts` column, extract the following attributes: `timestamp`, `hour`, `day`, `week`, `month`, `year`, and `weekday`. These attributes will be used to populate the `time` table.

In [22]:
df = df[df['page'] == 'NextSong']
df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Great Lake Swimmers,Logged In,Kevin,M,0,Arellano,215.11791,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540007000000.0,815,Your Rocky Spine,200,1542931645796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66
1,Soziedad Alkoholika,Logged In,Kevin,M,1,Arellano,204.7473,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540007000000.0,815,Va Bien,200,1542931860796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66
2,Franz Ferdinand,Logged In,Kevin,M,2,Arellano,172.01587,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540007000000.0,815,Eleanor Put Your Boots On,200,1542932064796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66
3,Modest Mouse,Logged In,Kevin,M,3,Arellano,209.52771,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540007000000.0,815,Float On,200,1542932236796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66
4,Adam Lambert,Logged In,Kevin,M,4,Arellano,266.44853,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540007000000.0,815,Aftermath,200,1542932445796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66


In [23]:
t = pd.to_datetime(df['ts'])
t.head()

0   1970-01-01 00:25:42.931645796
1   1970-01-01 00:25:42.931860796
2   1970-01-01 00:25:42.932064796
3   1970-01-01 00:25:42.932236796
4   1970-01-01 00:25:42.932445796
Name: ts, dtype: datetime64[ns]

In [24]:
time_data = [(tt.value, tt.hour, tt.day, tt.week, tt.month, tt.year, tt.weekday()) for tt in t]
column_labels = ('timestamp', 'hour', 'day', 'week', 'month', 'year', 'weekday')

In [25]:
time_df = pd.DataFrame(data=time_data, columns=column_labels)
time_df.head()

Unnamed: 0,timestamp,hour,day,week,month,year,weekday
0,1542931645796,0,1,1,1,1970,3
1,1542931860796,0,1,1,1,1970,3
2,1542932064796,0,1,1,1,1970,3
3,1542932236796,0,1,1,1,1970,3
4,1542932445796,0,1,1,1,1970,3


# 4. `users` table

To compile data for the `users` table, select the following columns from the dataset: `userId`, `firstName`, `lastName`, `gender`, and `level`. These selected attributes will be used to populate the `users` table.

In [26]:
user_df = df[['userId', 'firstName', 'lastName', 'gender', 'level']]
user_df.head()

Unnamed: 0,userId,firstName,lastName,gender,level
0,66,Kevin,Arellano,M,free
1,66,Kevin,Arellano,M,free
2,66,Kevin,Arellano,M,free
3,66,Kevin,Arellano,M,free
4,66,Kevin,Arellano,M,free


# 5. `songsplay` Table

The construction of the `songplays` table is complex as it requires integrating data from the `songs`, `artists`, and original log files. The absence of direct song and artist IDs in the log files necessitates querying the `songs` and `artists` tables to match song title, artist name, and duration for the correct IDs.


In [28]:
df = df.merge(songs_df, how='left', left_on=['song', 'artist', 'length'], right_on=['title', 'artist_id', 'duration'])

In [29]:
df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,...,song,status,ts,userAgent,userId,song_id,title,artist_id,year,duration
0,Great Lake Swimmers,Logged In,Kevin,M,0,Arellano,215.11791,free,"Harrisburg-Carlisle, PA",PUT,...,Your Rocky Spine,200,1542931645796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66,,,,,
1,Soziedad Alkoholika,Logged In,Kevin,M,1,Arellano,204.7473,free,"Harrisburg-Carlisle, PA",PUT,...,Va Bien,200,1542931860796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66,,,,,
2,Franz Ferdinand,Logged In,Kevin,M,2,Arellano,172.01587,free,"Harrisburg-Carlisle, PA",PUT,...,Eleanor Put Your Boots On,200,1542932064796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66,,,,,
3,Modest Mouse,Logged In,Kevin,M,3,Arellano,209.52771,free,"Harrisburg-Carlisle, PA",PUT,...,Float On,200,1542932236796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66,,,,,
4,Adam Lambert,Logged In,Kevin,M,4,Arellano,266.44853,free,"Harrisburg-Carlisle, PA",PUT,...,Aftermath,200,1542932445796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66,,,,,


In [32]:
# Creating songplays DataFrame
# Including all relevant fields and possibly adding a 'songplay_id'
df['songplay_id'] = range(1, len(df) + 1)  # Generating songplay IDs
songplays_df = df[['songplay_id', 'ts', 'userId', 'level', 'song_id', 'artist_id', 'sessionId', 'location', 'userAgent']].copy()

# Rename columns to match the expected format for songplays table
songplays_df.rename(columns={
    'ts': 'start_time',
    'userId': 'user_id',
    'sessionId': 'session_id',
    'userAgent': 'user_agent'
}, inplace=True)

# Convert timestamp to datetime
songplays_df.loc[:, 'start_time'] = pd.to_datetime(songplays_df['start_time'], unit='ms')

# Show the head of the DataFrame to verify
songplays_df.head()

Unnamed: 0,songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
0,1,2018-11-23 00:07:25.796,66,free,,,815,"Harrisburg-Carlisle, PA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
1,2,2018-11-23 00:11:00.796,66,free,,,815,"Harrisburg-Carlisle, PA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
2,3,2018-11-23 00:14:24.796,66,free,,,815,"Harrisburg-Carlisle, PA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
3,4,2018-11-23 00:17:16.796,66,free,,,815,"Harrisburg-Carlisle, PA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
4,5,2018-11-23 00:20:45.796,66,free,,,815,"Harrisburg-Carlisle, PA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
