In [1]:
# require to wrap all our import libraries within the spotify function 
from airflow.decorators import dag, task
from airflow import DAG
from datetime import datetime
from datetime import timedelta
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago 
import pathlib
import pandas as pd 
import sqlalchemy 
import configparser as ConfigParser
import spotipy
from spotipy.oauth2 import SpotifyOAuth

In [2]:
sqlalchemy.__version__

'1.4.27'

In [4]:
def check_if_valid_data(df:pd.DataFrame):
       # check if dataframe is empty, it means there were no songs listened to
       if df.empty:
           print('No songs downloaded. Finishing execution')
           return False 
       # Primary key check 
       if pd.Series(df['played_at_list']).is_unique:       # this check helps us understand that there are no duplicate rows in our database
           pass 
       else: 
           raise Exception("Primary Key check if violated")  # here pipeline fail, maybe can send email to me
       yesterday = datetime.now() - timedelta(days = 1)
       # just check yesterday's date at 0 hour, 0 minute, 0 second and 0 microseconds
       yesterday = yesterday.replace(hour = 0,minute = 0,second =0,microsecond =0 )
       # timestamps = df['timestamps'].tolist()
       df = df[df['timestamps']== yesterday.strftime("%Y-%m-%d")]
       songs_json= df.to_json()
       # for timestamp in timestamps:
       #     # strptime --> converts string to time
       #      # if we catch records that are not yesterday, we want the pipeline to raise exception 
       #     if datetime.strptime(timestamp,"%Y-%m-%d")!= yesterday:
       #         raise Exception("At least one of the songs does not come within last 24 hours")
       return songs_json

In [5]:
def update_database(DB_LOCATION,json_data):
        # start database
        # if value:

        songs_table = pd.read_json(json_data)

        engine = sqlalchemy.create_engine(DB_LOCATION)
        # conn = sqlite3.connect('james_played_tracks.sqlite')
        # cursor = conn.cursor()            
        sql_query = """
        CREATE TABLE IF NOT EXISTS james_played_tracks(
            played_at_list VARCHAR(200), 
            timestamps VARCHAR(200),
            artist_name VARCHAR(200), 
            song_names VARCHAR(200),
            CONSTRAINT primary_key_constraint PRIMARY KEY (played_at_list)
        )
        """
        engine.execute(sql_query)
        print("Opened database successfully")  
        # update database 
        try: 
            songs_table.to_sql(name = 'james_played_tracks',con = engine, if_exists= 'append',index = False)
            print("New songs populated in database!")
        except: 
            print("Data already exists in database")
        # conn.close()
        print("database closed successfully")

## Spotipy documentation: 

https://spotipy.readthedocs.io/en/master/#

In [6]:
redirect_uri = 'https://localhost:8888/callback/'
scope= "user-read-recently-played"

In [7]:
current_directory = !pwd

In [8]:
# CURRENT_PATH_DIR= pathlib.Path(!pwd).absolute()
GRANDPARENT_PATH = pathlib.Path(current_directory[0])
CONF_PATH = GRANDPARENT_PATH.joinpath("configuration/config.ini")
cf_parser= ConfigParser.ConfigParser()
cf_parser.read(CONF_PATH)


['/home/maxong/spotify_airflow/configuration/config.ini']

In [9]:
    # TOKEN = cf_parser.get('spotify_details','token')
WEBSITE = cf_parser.get('spotify_details','website')
CLIENT_ID = cf_parser.get('spotify_details','client_id')
CLIENT_SECRET = cf_parser.get('spotify_details','client_secret')
REDIRECT_URL = cf_parser.get('spotify_details','redirect_url')
SCOPE = cf_parser.get('spotify_details','scope')
DB_LOCATION = cf_parser.get('database','db_location_postgres')

In [10]:
config_dictionary = {
'website':WEBSITE,
'client_id':CLIENT_ID,
'client_secret':CLIENT_SECRET,
'redirect_url':REDIRECT_URL,
'scope':SCOPE,
'db_location':DB_LOCATION
}

## Following this stackoverflow to auto refresh the token 

https://stackoverflow.com/questions/48883731/refresh-token-spotipy#:~:text=So%20it%20will%20be%20refreshed,access%20token%20%2F%20refresh%20token%20previously.


Direct link to spotipy utils to follow https://github.com/spotipy-dev/spotipy/blob/master/spotipy/util.py


In [11]:
def create_spotify_api_details(config_dictionary):
    auth_manager=SpotifyOAuth(scope=config_dictionary['scope'],
                                client_id =config_dictionary['client_id'] ,
                                client_secret = config_dictionary['client_secret'],
                                redirect_uri = config_dictionary['redirect_url'])
    spotify = spotipy.Spotify(auth_manager=auth_manager)
    return auth_manager,spotify

   

In [12]:
def refresh_spotify_api_details(auth_manager, spotify,config_dictionary):
   token_info = auth_manager.cache_handler.get_cached_token()
   if auth_manager.is_token_expired(token_info):
       auth_manager, spotify = create_spotify_api_details(config_dictionary)
   return auth_manager, spotify

In [18]:
today = datetime.now().replace(hour = 0,second = 0,minute =0,microsecond=0)
# because everyday we want to see the songs we've listed to for the 
# previous 24 hrs
yesterday = today - timedelta(days =2)
# unix timestamp in miliseconds, that's why need to * 1000
yesterday_unix_timestamp = int(yesterday.timestamp()) * 1000

In [19]:
yesterday

datetime.datetime(2022, 11, 12, 0, 0)

In [20]:
auth_manager,spotify = create_spotify_api_details(config_dictionary)

In [21]:
auth_manager, spotify = refresh_spotify_api_details(auth_manager, spotify,config_dictionary)
data = spotify.current_user_recently_played(after =yesterday_unix_timestamp)

In [23]:
while True:
        auth_manager, spotify = refresh_spotify_api_details(auth_manager, spotify,config_dictionary)
        data = spotify.current_user_recently_played(after =yesterday_unix_timestamp)

        try:
            # if python not equals to zero 
            if data['items'] != []:
                artist_name = []
                song_names = []
                played_at_list = []
                timestamps = []         
                for i in data['items']:
                    if i['played_at'][:10]!= today.strftime("%Y-%m-%d"):
                        song_names.append(i['track']['name'])
                        played_at_list.append(i['played_at'])
                        timestamps.append(i['played_at'][:10])
                        artist_name.append(i['track']['artists'][0]['name'])            
                # data is in dataframe format now
                songs_table = pd.DataFrame([played_at_list,timestamps,artist_name,song_names]).T
                songs_table.columns = ['played_at_list','timestamps','artist_name','song_names']            

                # update database
                # update_database(DB_LOCATION,songs_table)
                print(songs_table)

                # once you updated the database, break out of the loop
                break
            # if there are no data that day, don't even open up the database, skip that day    
            else:
                print("No songs played yesterday")
                # if there are no songs, break out 
                break 
        except:
            print("Error with database or spotify data returned")

              played_at_list  timestamps     artist_name  \
0   2022-11-12T03:44:05.921Z  2022-11-12      Fujii Kaze   
1   2022-11-12T01:18:36.814Z  2022-11-12     OneRepublic   
2   2022-11-12T01:16:07.587Z  2022-11-12  Meghan Trainor   
3   2022-11-12T01:13:52.587Z  2022-11-12    Taylor Swift   
4   2022-11-12T01:10:29.739Z  2022-11-12            Joji   
5   2022-11-12T01:06:35.985Z  2022-11-12    Taylor Swift   
6   2022-11-12T01:03:40.843Z  2022-11-12     LE SSERAFIM   
7   2022-11-12T01:00:36.274Z  2022-11-12            JVKE   
8   2022-11-12T00:57:00.197Z  2022-11-12       Sam Smith   
9   2022-11-12T00:54:22.960Z  2022-11-12            Joji   
10  2022-11-12T00:50:50.702Z  2022-11-12    Taylor Swift   
11  2022-11-12T00:41:36.784Z  2022-11-12       Sam Smith   
12  2022-11-12T00:38:33.435Z  2022-11-12       Sam Smith   
13  2022-11-12T00:35:41.912Z  2022-11-12       Sam Smith   
14  2022-11-12T00:32:30.448Z  2022-11-12       Sam Smith   
15  2022-11-12T00:29:42.321Z  2022-11-12

In [24]:
songs_table

Unnamed: 0,played_at_list,timestamps,artist_name,song_names
0,2022-11-12T03:44:05.921Z,2022-11-12,Fujii Kaze,死ぬのがいいわ
1,2022-11-12T01:18:36.814Z,2022-11-12,OneRepublic,I Ain't Worried
2,2022-11-12T01:16:07.587Z,2022-11-12,Meghan Trainor,Made You Look
3,2022-11-12T01:13:52.587Z,2022-11-12,Taylor Swift,Lavender Haze
4,2022-11-12T01:10:29.739Z,2022-11-12,Joji,Glimpse of Us
5,2022-11-12T01:06:35.985Z,2022-11-12,Taylor Swift,Midnight Rain
6,2022-11-12T01:03:40.843Z,2022-11-12,LE SSERAFIM,ANTIFRAGILE
7,2022-11-12T01:00:36.274Z,2022-11-12,JVKE,golden hour
8,2022-11-12T00:57:00.197Z,2022-11-12,Sam Smith,Unholy (feat. Kim Petras)
9,2022-11-12T00:54:22.960Z,2022-11-12,Joji,Die For You


In [31]:
songs_json= check_if_valid_data(songs_table)

No songs downloaded. Finishing execution


In [32]:
songs_table = pd.read_json(songs_json)


ValueError: Invalid file path or buffer object type: <class 'bool'>

In [25]:
songs_table

Unnamed: 0,played_at_list,timestamps,artist_name,song_names
0,2022-11-12T03:44:05.921Z,2022-11-12,Fujii Kaze,死ぬのがいいわ
1,2022-11-12T01:18:36.814Z,2022-11-12,OneRepublic,I Ain't Worried
2,2022-11-12T01:16:07.587Z,2022-11-12,Meghan Trainor,Made You Look
3,2022-11-12T01:13:52.587Z,2022-11-12,Taylor Swift,Lavender Haze
4,2022-11-12T01:10:29.739Z,2022-11-12,Joji,Glimpse of Us
5,2022-11-12T01:06:35.985Z,2022-11-12,Taylor Swift,Midnight Rain
6,2022-11-12T01:03:40.843Z,2022-11-12,LE SSERAFIM,ANTIFRAGILE
7,2022-11-12T01:00:36.274Z,2022-11-12,JVKE,golden hour
8,2022-11-12T00:57:00.197Z,2022-11-12,Sam Smith,Unholy (feat. Kim Petras)
9,2022-11-12T00:54:22.960Z,2022-11-12,Joji,Die For You
