## Spotify data ETL project

### Imports

In [37]:
import pandas as pd
import sqlalchemy
from sqlalchemy.orm import sessionmaker
import requests
import json
import datetime
from datetime import datetime, timedelta
import sqlite3

In [38]:
# constants
import os  # imported in this cell so the environment lookup below is self-contained

# Database URL pointing at a local SQLite file.
DB_LOCATION = 'sqlite:///spoty_songs.sqlite'
USER_ID = 'Hristiyan'
# Bearer token sent in the Authorization header of the Spotify request.
# A value in the SPOTY_TOKEN environment variable takes precedence, so a fresh
# token can be supplied without editing this file.
# NOTE(review): the literal fallback is a credential committed to source; it is
# kept only for backward compatibility and should be revoked and removed.
SPOTY_TOKEN = os.environ.get('SPOTY_TOKEN') or 'BQB2F97pqY6fOqjRb-qCi7yTrKLFApCfNPh4sByClMlRbIA3K9pUQ6v8YnrCDB-fb_3n43o6UR8HT8vO2ZT6_HfkOyxrSoTIxINqE2VAGVOFOAYcQ8kx6sFHtcZUYMLgRlYVw0kxIbiKhI6tumXNTOe_2EfB4owPZ8zewk0pS50Gaw2P9WtZfFkPB3KM-U8LQtViRw'

In [53]:
# create data validation function
def data_validation(df: pd.DataFrame) -> bool:
    """Check the downloaded songs before they move on to the load step.

    Args:
        df: Dataframe with at least the ``played_at`` and ``timestamp``
            columns; ``timestamp`` values are parsed as ``YYYY-MM-DD`` strings.

    Returns:
        False (after printing a notice) when ``df`` is empty, True when every
        check passes.

    Raises:
        Exception: If ``played_at`` contains duplicates, if any cell is null,
            or if any ``timestamp`` is not yesterday's date.
    """
    # Nothing downloaded: report it and let the caller stop early
    if df.empty:
        print('No songs downloaded. Finishing execution')
        return False

    # Primary Key Check: played_at has to identify each row uniquely
    played_at = pd.Series(df['played_at'])
    if not played_at.is_unique:
        raise Exception('Primary Key Check is violated')

    # Check for nulls anywhere in the frame
    has_nulls = df.isnull().values.any()
    if has_nulls:
        raise Exception('Null values found')

    # Midnight at the start of yesterday (naive local time), the only date accepted
    yesterday = (datetime.now() - timedelta(days=1)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )

    # Stops at the first row whose date differs from yesterday
    if any(
        datetime.strptime(day, '%Y-%m-%d') != yesterday
        for day in df['timestamp'].tolist()
    ):
        raise Exception('At least one of the returned songs does not come from within the last 24 hours')

    return True

In [39]:
# Only build the request headers when this code runs as the main program
if __name__ == "__main__":
    # JSON in both directions, authenticated with the bearer token
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {SPOTY_TOKEN}",
    }

In [40]:
# Unix timestamp in milliseconds for the moment 24 hours before now
today = datetime.now()
yesterday = today - timedelta(hours=24)
# whole seconds first, then scaled to milliseconds
yesterday_unix_timestamp = 1000 * int(yesterday.timestamp())

In [41]:
# request the data from the API
# The `after` cursor goes through `params` so requests builds the query string.
# A timeout is set because requests otherwise waits indefinitely for a response.
# NOTE(review): no `limit` is sent, so the API's default page size applies —
# confirm against the Spotify docs whether a larger page is wanted.
r = requests.get(
    "https://api.spotify.com/v1/me/player/recently-played",
    params={"after": yesterday_unix_timestamp},
    headers=headers,
    timeout=30,
)
# Fail here with the HTTP status (e.g. a rejected token) instead of later
# with a KeyError when the error payload has no "items" key.
r.raise_for_status()

In [42]:
# parse the JSON body of the response into Python objects
data = r.json()

In [44]:
# Column accumulators for the dataframe: one independent, initially empty list per field
song_names, artist_names, played_at_list, timestamps = [], [], [], []

In [45]:
# Walk the returned play records and fill the column lists in step
for song in data["items"]:
    track = song["track"]
    song_names.append(track["name"])
    # name of the first artist listed on the track's album
    artist_names.append(track["album"]["artists"][0]["name"])
    played_at = song["played_at"]
    played_at_list.append(played_at)
    # first 10 characters only: the part later parsed as a YYYY-MM-DD date
    timestamps.append(played_at[:10])

In [49]:
# Map each dataframe column name to the list of values collected for it
song_dict = dict(
    song_name=song_names,
    artist_name=artist_names,
    played_at=played_at_list,
    timestamp=timestamps,
)

In [50]:
# Build the dataframe from the column dictionary, with an explicit column order
songs_df = pd.DataFrame(
    data=song_dict,
    columns=["song_name", "artist_name", "played_at", "timestamp"],
)

In [None]:
# validate the data using the data validation function
# data_validation returns False for an empty dataframe and raises when a check
# fails, so the message below is printed only when every check has passed
if data_validation(songs_df):
    print('Data valid, proceed to Load stage')