In [8]:
# require to wrap all our import libraries within the spotify function 
from airflow.decorators import dag, task
from airflow import DAG
from datetime import datetime
from datetime import timedelta
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago 
import pathlib
import pandas as pd 
import sqlalchemy 
import configparser as ConfigParser
import spotipy
from spotipy.oauth2 import SpotifyOAuth

In [39]:
def check_if_valid_data(df:pd.DataFrame) -> bool:
        # check if dataframe is empty, it means there were no songs listened to
        if df.empty:
            print('No songs downloaded. Finishing execution')
            return False 

        # Primary key check 
        if pd.Series(df['played_at_list']).is_unique:       # this check helps us understand that there are no duplicate rows in our database
            pass 
        else: 
            raise Exception("Primary Key check if violated")  # here pipeline fail, maybe can send email to me
        yesterday = datetime.now() - timedelta(days = 1)
        # just check yesterday's date at 0 hour, 0 minute, 0 second and 0 microseconds
        yesterday = yesterday.replace(hour = 0,minute = 0,second =0,microsecond =0 )
        timestamps = df['timestamps'].tolist()

        for timestamp in timestamps:
            # strptime --> converts string to time
             # if we catch records that are not yesterday, we want the pipeline to raise exception 
            if datetime.strptime(timestamp,"%Y-%m-%d")!= yesterday:
                raise Exception("At least one of the songs does not come within last 24 hours")
        return True

## Spotipy documentation: 

https://spotipy.readthedocs.io/en/master/#

In [11]:
client_id = 'bf76bf609f1a4be6b656c55e4d77abd4'
client_secret = '8949a39342a74eddb3405b7a4f747852'

In [12]:
redirect_uri = 'https://localhost:8888/callback/'
scope= "user-read-recently-played"

In [14]:
current_directory = !pwd

In [15]:
# CURRENT_PATH_DIR= pathlib.Path(!pwd).absolute()
GRANDPARENT_PATH = pathlib.Path(current_directory[0])
CONF_PATH = GRANDPARENT_PATH.joinpath("configuration/config.ini")
cf_parser= ConfigParser.ConfigParser()
cf_parser.read(CONF_PATH)


['/home/maxong/spotify_airflow/configuration/config.ini']

In [16]:
    # TOKEN = cf_parser.get('spotify_details','token')
WEBSITE = cf_parser.get('spotify_details','website')
CLIENT_ID = cf_parser.get('spotify_details','client_id')
CLIENT_SECRET = cf_parser.get('spotify_details','client_secret')
REDIRECT_URL = cf_parser.get('spotify_details','redirect_url')
SCOPE = cf_parser.get('spotify_details','scope')
DB_LOCATION = cf_parser.get('database','db_location_postgres')

In [17]:
config_dictionary = {
'website':WEBSITE,
'client_id':CLIENT_ID,
'client_secret':CLIENT_SECRET,
'redirect_url':REDIRECT_URL,
'scope':SCOPE,
'db_location':DB_LOCATION
}

## Following this stackoverflow to auto refresh the token 

https://stackoverflow.com/questions/48883731/refresh-token-spotipy#:~:text=So%20it%20will%20be%20refreshed,access%20token%20%2F%20refresh%20token%20previously.

In [19]:
def create_spotify_api_details(config_dictionary):
    auth_manager=SpotifyOAuth(scope=config_dictionary['scope'],
                                client_id =config_dictionary['client_id'] ,
                                client_secret = config_dictionary['client_secret'],
                                redirect_uri = config_dictionary['redirect_url'])
    spotify = spotipy.Spotify(auth_manager=auth_manager)
    return auth_manager,spotify

   

In [20]:
def refresh_spotify_api_details(auth_manager, spotify,config_dictionary):
   token_info = auth_manager.cache_handler.get_cached_token()
   if auth_manager.is_token_expired(token_info):
       auth_manager, spotify = create_spotify(config_dictionary)
   return auth_manager, spotify

In [21]:
today = datetime.now().replace(hour = 0,second = 0,minute =0,microsecond=0)

# because everyday we want to see the songs we've listed to for the 
# previous 24 hrs
yesterday = today - timedelta(days =1)
# unix timestamp in miliseconds, that's why need to * 1000
yesterday_unix_timestamp = int(yesterday.timestamp()) * 1000

In [24]:
auth_manager, spotify = create_spotify_api_details(config_dictionary)


In [25]:
auth_manager, spotify = refresh_spotify_api_details(auth_manager, spotify,config_dictionary)
data = spotify.current_user_recently_played(after =yesterday_unix_timestamp)

In [27]:
data

{'items': [{'track': {'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/6sFIWsNpZYqfjUpaCgueju'},
       'href': 'https://api.spotify.com/v1/artists/6sFIWsNpZYqfjUpaCgueju',
       'id': '6sFIWsNpZYqfjUpaCgueju',
       'name': 'Carly Rae Jepsen',
       'type': 'artist',
       'uri': 'spotify:artist:6sFIWsNpZYqfjUpaCgueju'}],
     'available_markets': ['AD',
      'AE',
      'AG',
      'AL',
      'AM',
      'AO',
      'AR',
      'AT',
      'AU',
      'AZ',
      'BA',
      'BB',
      'BD',
      'BE',
      'BF',
      'BG',
      'BH',
      'BI',
      'BJ',
      'BN',
      'BO',
      'BR',
      'BS',
      'BT',
      'BW',
      'BZ',
      'CD',
      'CG',
      'CH',
      'CI',
      'CL',
      'CM',
      'CO',
      'CR',
      'CV',
      'CW',
      'CY',
      'CZ',
      'DE',
      'DJ',
      'DK',
      'DM',
      'DO',
      'DZ',
      'EC',
      'EE',
      'EG',
      'ES',
      'FI'

In [26]:
check_if_valid_data()

<function __main__.check_if_valid_data(df: pandas.core.frame.DataFrame) -> bool>

In [36]:
data

{'items': [{'track': {'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/6qqNVTkY8uBg9cP3Jd7DAH'},
       'href': 'https://api.spotify.com/v1/artists/6qqNVTkY8uBg9cP3Jd7DAH',
       'id': '6qqNVTkY8uBg9cP3Jd7DAH',
       'name': 'Billie Eilish',
       'type': 'artist',
       'uri': 'spotify:artist:6qqNVTkY8uBg9cP3Jd7DAH'}],
     'available_markets': ['AD',
      'AE',
      'AG',
      'AL',
      'AM',
      'AO',
      'AR',
      'AT',
      'AU',
      'AZ',
      'BA',
      'BB',
      'BD',
      'BE',
      'BF',
      'BG',
      'BH',
      'BI',
      'BJ',
      'BN',
      'BO',
      'BR',
      'BS',
      'BT',
      'BW',
      'BY',
      'BZ',
      'CA',
      'CD',
      'CG',
      'CH',
      'CI',
      'CL',
      'CM',
      'CO',
      'CR',
      'CV',
      'CW',
      'CY',
      'CZ',
      'DE',
      'DJ',
      'DK',
      'DM',
      'DO',
      'DZ',
      'EC',
      'EE',
      'EG',
 

In [31]:
while True:
        auth_manager, spotify = refresh_spotify_api_details(auth_manager, spotify,config_dictionary)
        data = spotify.current_user_recently_played(after =yesterday_unix_timestamp)

        try:
            # if python not equals to zero 
            if data['items'] != []:
                artist_name = []
                song_names = []
                played_at_list = []
                timestamps = []         
                for i in data['items']:
                    song_names.append(i['track']['name'])
                    played_at_list.append(i['played_at'])
                    timestamps.append(i['played_at'][:10])
                    artist_name.append(i['track']['artists'][0]['name'])            
                # data is in dataframe format now
                songs_table = pd.DataFrame([played_at_list,timestamps,artist_name,song_names]).T
                songs_table.columns = ['played_at_list','timestamps','artist_name','song_names']            

                # update database
                # update_database(DB_LOCATION,songs_table)
                print(songs_table)

                # once you updated the database, break out of the loop
                break
            # if there are no data that day, don't even open up the database, skip that day    
            else:
                print("No songs played yesterday")
                # if there are no songs, break out 
                break 
        except:
            print("Error with database or spotify data returned")

              played_at_list  timestamps       artist_name  \
0   2022-10-23T05:50:09.033Z  2022-10-23  Carly Rae Jepsen   
1   2022-10-23T05:46:22.881Z  2022-10-23  Carly Rae Jepsen   
2   2022-10-23T05:43:07.506Z  2022-10-23  Carly Rae Jepsen   
3   2022-10-23T05:40:38.071Z  2022-10-23  Carly Rae Jepsen   
4   2022-10-23T05:38:21.829Z  2022-10-23  Carly Rae Jepsen   
5   2022-10-23T05:35:22.231Z  2022-10-23  Carly Rae Jepsen   
6   2022-10-23T05:32:29.026Z  2022-10-23  Carly Rae Jepsen   
7   2022-10-23T05:29:59.050Z  2022-10-23  Carly Rae Jepsen   
8   2022-10-23T05:26:17.228Z  2022-10-23      Taylor Swift   
9   2022-10-23T05:23:03.004Z  2022-10-23      Taylor Swift   
10  2022-10-23T05:18:16.258Z  2022-10-23      Taylor Swift   
11  2022-10-23T05:17:42.048Z  2022-10-23      Taylor Swift   
12  2022-10-23T05:14:48.013Z  2022-10-23      Taylor Swift   
13  2022-10-23T05:11:07.006Z  2022-10-23      Taylor Swift   

                                song_names  
0                       

In [40]:
check_if_valid_data(songs_table)

True

In [19]:
import sqlalchemy

In [20]:
engine = sqlalchemy.create_engine('postgresql://dev:pandaburp94*@localhost:5433/postgres')


In [21]:
    sql_query = """
    CREATE TABLE IF NOT EXISTS james_played_tracks(
        played_at_list VARCHAR(200), 
        timestamps VARCHAR(200),
        artist_name VARCHAR(200), 
        song_names VARCHAR(200),
        CONSTRAINT primary_key_constraint PRIMARY KEY (played_at_list)
    )
    """
    engine.execute(sql_query)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fa98f2fcd30>

In [22]:
songs_table.to_sql(name = 'james_played_tracks',con = engine, if_exists= 'append',index = False)


12

In [None]:
s