# Sprint 02 | ETL - Database Connection

## Dependencies

In [1]:
!pip install python-dotenv

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install sqlalchemy

Defaulting to user installation because normal site-packages is not writeable


In [3]:
!pip install pymysql

Defaulting to user installation because normal site-packages is not writeable


In [4]:
!pip install mysql-connector-python

Defaulting to user installation because normal site-packages is not writeable


In [5]:
# Library imports
import pandas as pd
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine
import pymysql

In [6]:
# Set pandas settings
pd.set_option('display.max_columns', None)

## Get latest data file from `data - Sprint 02` folder in directory

In [7]:
# Specify the path to the 'data' subfolder
data_folder = 'data - Sprint 02'

# Get a list of all CSV files in the 'data' subfolder
csv_files = [os.path.join(data_folder, file) for file in os.listdir(data_folder) if file.endswith('.csv') and file.startswith('bulk_data_')]

# Check if any CSV files exist in the 'data' subfolder
if not csv_files:
    raise FileNotFoundError("No CSV files found in the 'data' subfolder with the specified prefix.")

# Sort CSV files by modification time (most recent first)
csv_files.sort(key=lambda x: os.path.getmtime(x), reverse=True)

# Select the most recent CSV file
latest_csv_file = csv_files[0]

# Read in the most recent CSV file
final_df = pd.read_csv(latest_csv_file)

# Print the name of the file being read
print(f"Reading DataFrame from CSV file: {latest_csv_file}")

final_df.head()

Reading DataFrame from CSV file: data - Sprint 02/bulk_data_20240421_024706.csv


Unnamed: 0,track_id,track_name,track_album,track_release_date,track_popularity,track_duration_sec,track_explicit,track_all_artists,artist_id,artist_name,artist_genres,artist_followers,artist_popularity,playlist_id,playlist_name,playlist_description,playlist_followers,playlist_snapshot_id,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,playlist_date
0,2oTFuT1tKLHlTJsnBSJJiN,"FTCU (feat. Travis Scott, Chris Brown & Sexyy ...",FTCU (SLEEZEMIX),2024-04-19,0,239.667,True,"['Nicki Minaj', 'Travis Scott', 'Chris Brown',...",0hCNtLu0JehylgoiP8L4Gh,Nicki Minaj,"['hip pop', 'pop', 'queens hip hop', 'rap']",30886319,87,37i9dQZF1DX6GwdWRQMQpq,Feelin' Myself,The hip-hop playlist that's a whole mood. Art ...,3112899,MTcxMzQ5OTIwMCwwMDAwMDAwMGMzNTcwOGMzOGE4MDk0ZT...,,,,,,,,,,,,04-21-2024
1,7mOC6phfrfz3jfigg1aGsE,Wanna Be (with Megan Thee Stallion),Ehhthang Ehhthang,2024-04-05,79,156.444,True,"['GloRilla', 'Megan Thee Stallion']",2qoQgPAilErOKCwE2Y8wOG,GloRilla,['southern hip hop'],645278,73,37i9dQZF1DX6GwdWRQMQpq,Feelin' Myself,The hip-hop playlist that's a whole mood. Art ...,3112899,MTcxMzQ5OTIwMCwwMDAwMDAwMGMzNTcwOGMzOGE4MDk0ZT...,0.918,0.616,1.0,-5.202,0.355,0.0222,0.0,0.135,0.37,135.054,4.0,04-21-2024
2,4XqBLxDaEdlqkHlSlb1Rzq,Enough (Miami),Enough (Miami),2024-03-15,72,158.727,True,['Cardi B'],4kYSro6naA4h99UJvo89HB,Cardi B,"['pop', 'rap']",23606695,81,37i9dQZF1DX6GwdWRQMQpq,Feelin' Myself,The hip-hop playlist that's a whole mood. Art ...,3112899,MTcxMzQ5OTIwMCwwMDAwMDAwMGMzNTcwOGMzOGE4MDk0ZT...,0.926,0.791,5.0,-4.553,0.273,0.00255,0.0,0.0772,0.473,139.166,5.0,04-21-2024
3,2MjXWroB9wlTG2kqv3avfS,Get It Sexyy,Get It Sexyy,2024-03-15,84,148.551,True,['Sexyy Red'],3DbwFQlvLxRSi2uX8mf81A,Sexyy Red,['trap queen'],788904,77,37i9dQZF1DX6GwdWRQMQpq,Feelin' Myself,The hip-hop playlist that's a whole mood. Art ...,3112899,MTcxMzQ5OTIwMCwwMDAwMDAwMGMzNTcwOGMzOGE4MDk0ZT...,0.806,0.737,0.0,-4.709,0.287,0.0983,0.0,0.051,0.536,145.031,4.0,04-21-2024
4,3Vlt0DKYBK0h3Vf92nywhp,OKLOSER,Scarlet 2 CLAUDE,2024-04-05,78,169.066,True,['Doja Cat'],5cj0lLjcoR7YOSnhnX0Po5,Doja Cat,"['dance pop', 'pop']",29478325,87,37i9dQZF1DX6GwdWRQMQpq,Feelin' Myself,The hip-hop playlist that's a whole mood. Art ...,3112899,MTcxMzQ5OTIwMCwwMDAwMDAwMGMzNTcwOGMzOGE4MDk0ZT...,0.898,0.674,1.0,-8.373,0.303,0.0255,0.0,0.0835,0.883,152.046,4.0,04-21-2024


## Connect to database

In [8]:
# Load environment variables from .env file
load_dotenv('config.env')

# Get database credentials from environment variables
username = os.environ.get('RDS_USERNAME')
password = os.environ.get('RDS_PASSWORD')
host = os.environ.get('RDS_HOST')
port = os.environ.get('RDS_PORT')
database_name = os.environ.get('RDS_DATABASE2')

# Construct the database URL
url = f"mysql+pymysql://{username}:{password}@{host}:{port}/{database_name}"

In [9]:
def fetch_data_from_database():
    # Create the SQLAlchemy engine using the connection from connect_to_database()
    engine = create_engine(url)
    
    # Fetch data from tables
    try:
        with engine.connect() as conn:
            # Fetch data from dim_track table
            dim_track_df = pd.read_sql_table("dim_track", conn)
            # Fetch data from dim_artist table
            dim_artist_df = pd.read_sql_table("dim_artist", conn)
            # Fetch data from fact_playlist table
            fact_playlist_df = pd.read_sql_table("fact_playlist", conn)
            # Fetch data from intermediate_table table
            intermediate_table_df = pd.read_sql_table("intermediate_table", conn)
    except Exception as e:
        print(f"Error fetching data: {e}")
        return None, None, None, None

    return dim_track_df, dim_artist_df, fact_playlist_df, intermediate_table_df

In [10]:
dim_track_df, dim_artist_df, fact_playlist_df, intermediate_table_df = fetch_data_from_database()

In [11]:
dim_track_df.head()

Unnamed: 0,track_id,track_name,track_album,track_release_date,track_popularity,track_duration_sec,track_explicit,track_all_artists,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,01BjQ7kJDzs7JEVSF1ZHHL,Karibu,Karibu,2024-03-29,23,242.5,0,['WITH U'],0.702,0.809,4,-9.457,0.0398,0.244,0.264,0.108,0.261,120.021,6
1,01TnMXIy7mJJQ7E8uy937N,Von dutch,Von dutch,2024-02-29,78,164.284,1,['Charli XCX'],0.706,0.698,9,-5.679,0.0537,0.00298,0.00375,0.116,0.648,130.013,6
2,01TyFEZu6mHbffsVfxgrFn,Show Me How,Show Me How,2018-02-28,39,215.11,0,['Men I Trust'],0.791,0.267,4,-11.997,0.0415,0.674,0.534,0.102,0.544,165.99,6
3,027ielmg4mNR1p1Ds3Mvgh,Não Vou Namorar,Não Vou Namorar,2024-01-19,78,162.461,1,"['DJ Ws da Igrejinha', 'DJ JOÃO PEREIRA', 'Mc ...",0.761,0.724,4,-0.681,0.325,0.818,9.5e-05,0.107,0.514,85.899,7
4,02DTkcgh0BpYrEYPO05Zex,Greatest Gift (feat. Lila Iké),falling or flying,2023-09-29,60,192.8,0,"['Jorja Smith', 'Lila Iké']",0.633,0.535,6,-6.577,0.189,0.0878,1.3e-05,0.0999,0.718,170.013,6


In [12]:
dim_artist_df.head()

Unnamed: 0,artist_id,artist_name,artist_genres,artist_followers,artist_popularity
0,00FQb4jTyendYWaN8pK0wa,Lana Del Rey,"['art pop', 'pop']",34155145,91
1,00x1fYSGhdqScXBRpSj3DW,Olivia Dean,"['pop soul', 'uk pop']",335904,65
2,01DTVE3KmoPogPZaOvMqO8,Sarkodie,"['afrobeats', 'afropop', 'azonto', 'ghanaian h...",596280,55
3,01XYiBYaoMJcNhPokrg0l0,STAYC,"['k-pop', 'k-pop girl group']",1801839,63
4,02DWGcShQivFepRvGJ7xhB,Adriatique,"['deep disco house', 'melodic house', 'melodic...",259713,58


In [13]:
fact_playlist_df.head()

Unnamed: 0,playlist_id,playlist_name,playlist_description,playlist_followers,AVG_track_release_date_by_playlist,AVG_track_popularity_by_playlist,AVG_track_duration_sec_by_playlist,RATIO_track_explicit_by_playlist,AVG_danceability_by_playlist,AVG_energy_by_playlist,MODE_key_by_playlist,AVG_loudness_by_playlist,AVG_speechiness_by_playlist,AVG_accousticness_by_playlist,AVG_instrumentalness_by_playlist,AVG_liveness_by_playlist,AVG_valence_by_playlist,AVG_tempo_by_playlist,AVG_artist_followers_by_playlist,AVG_artist_popularity_by_playlist
0,37i9dQZEVXbKuaTI1Z1Afx,Viral 50 - USA,Your daily update of the most viral tracks rig...,365991,NaT,79.48,181.245,0.42,0.68396,0.65046,,-6.12648,0.113584,,0.005662,0.18456,0.52066,132.019,2596260.0,72.42
1,37i9dQZEVXbLiRSasKsNU9,Viral 50 - Global,Your daily update of the most viral tracks rig...,1769513,NaT,79.36,198.576,0.32,0.68662,0.6759,,-5.68212,0.12251,,0.00296,0.196626,0.58662,123.731,1723380.0,69.2
2,37i9dQZEVXbLRQDuF5jeBp,Top 50 - USA,Your daily update of the most played tracks ri...,3377106,NaT,89.48,184.404,0.52,0.6613,0.6222,,-6.42558,0.082504,,0.003916,0.155642,0.48974,132.257,13008800.0,84.68
3,37i9dQZEVXbMDoHDwVN2tF,Top 50 - Global,Your daily update of the most played tracks ri...,17077756,NaT,90.96,189.094,0.42,0.66072,0.65024,,-6.34908,0.084378,,0.009312,0.157258,0.51234,125.733,20288600.0,85.24
4,37i9dQZF1DWSqBruwoIXkA,sad hour,you don't get to tell me about sad! Cover: Ben...,1757867,NaT,75.23,203.302,0.14,0.51726,0.405751,,-8.9754,0.05113,,0.029451,0.141135,0.295881,117.45,17376400.0,77.06


In [14]:
intermediate_table_df.head()

Unnamed: 0,track_id,artist_id,playlist_id,playlist_snapshot_id,playlist_date
0,01BjQ7kJDzs7JEVSF1ZHHL,2VhoJf6VdIoPnifnThy9UV,37i9dQZF1DX8tZsk68tuDw,MTcxMzQ5OTIwMCwwMDAwMDAwMGZiYjNmMDA3OTc2ZjQwMT...,2024-04-20
1,01TnMXIy7mJJQ7E8uy937N,25uiPmTg16RbhZWAqwLBy5,37i9dQZF1DWUa8ZRTfalHk,MTcxMzQ5OTIwMCwwMDAwMDAwMDdmYWY4OTI2MDkyMDhhND...,2024-04-20
2,01TnMXIy7mJJQ7E8uy937N,25uiPmTg16RbhZWAqwLBy5,37i9dQZF1DX4dyzvuaRJ0n,MTcxMzQ5OTI2MCwwMDAwMDAwMDliNmU2ZWJkZjZhYmQ3Zm...,2024-04-20
3,01TyFEZu6mHbffsVfxgrFn,3zmfs9cQwzJl575W1ZYXeT,37i9dQZF1DX0MLFaUdXnjA,MTcxMzQ5OTIwMCwwMDAwMDAwMDc1ODcwYmUzZmJiYTNkNT...,2024-04-20
4,027ielmg4mNR1p1Ds3Mvgh,0NB5wv4kn6A919CLHUKRmk,37i9dQZEVXbLiRSasKsNU9,NzY2OTA4MDAwLDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMD...,2024-04-20


## Data transformations

In [15]:
def fact_table_pre_aggregates(df):
    # Perform aggregation operations on a DataFrame containing playlist-related data
    # Group by playlist_id and calculate various aggregate metrics
    return df.groupby('playlist_id').agg(
        playlist_name=('playlist_name', 'last'),
        playlist_description=('playlist_description', 'last'),
        playlist_followers=('playlist_followers', 'last'),
        #playlist_date=('playlist_date', 'first'),
        #AVG_track_release_date_by_playlist=('track_release_date', 'mean'),
        AVG_track_popularity_by_playlist=('track_popularity', 'mean'),
        AVG_track_duration_sec_by_playlist=('track_duration_sec', 'mean'),
        RATIO_track_explicit_by_playlist=('track_explicit', lambda x: x.sum() / len(x)),
        AVG_danceability_by_playlist=('danceability', 'mean'),
        AVG_energy_by_playlist=('energy', 'mean'),
        AVG_loudness_by_playlist=('loudness', 'mean'),
        AVG_speechiness_by_playlist=('speechiness', 'mean'),
        AVG_acousticness_by_playlist=('acousticness', 'mean'),
        AVG_instrumentalness_by_playlist=('instrumentalness', 'mean'),
        AVG_liveness_by_playlist=('liveness', 'mean'),
        AVG_valence_by_playlist=('valence', 'mean'),
        AVG_tempo_by_playlist=('tempo', 'mean'),
        AVG_artist_followers_by_playlist=('artist_followers', 'mean'),
        AVG_artist_popularity_by_playlist=('artist_popularity', 'mean')
    ).reset_index()

In [16]:
def fact_table_aggregates(df):
    # Perform aggregation operations on a DataFrame containing playlist-related data
    # Group by playlist_id and calculate various aggregate metrics
    return df.groupby('playlist_id').agg(
        playlist_name=('playlist_name', 'last'),
        playlist_description=('playlist_description', 'last'),
        playlist_followers=('playlist_followers', 'last'),
        #playlist_date=('playlist_date', 'first'),
        #AVG_track_release_date_by_playlist=('track_release_date', 'mean'),
        AVG_track_popularity_by_playlist=('AVG_track_popularity_by_playlist', 'mean'),
        AVG_track_duration_sec_by_playlist=('AVG_track_duration_sec_by_playlist', 'mean'),
        RATIO_track_explicit_by_playlist=('RATIO_track_explicit_by_playlist', 'mean'),
        AVG_danceability_by_playlist=('AVG_danceability_by_playlist', 'mean'),
        AVG_energy_by_playlist=('AVG_energy_by_playlist', 'mean'),
        AVG_loudness_by_playlist=('AVG_loudness_by_playlist', 'mean'),
        AVG_speechiness_by_playlist=('AVG_speechiness_by_playlist', 'mean'),
        AVG_accousticness_by_playlist=('AVG_accousticness_by_playlist', 'mean'),
        AVG_instrumentalness_by_playlist=('AVG_instrumentalness_by_playlist', 'mean'),
        AVG_liveness_by_playlist=('AVG_liveness_by_playlist', 'mean'),
        AVG_valence_by_playlist=('AVG_valence_by_playlist', 'mean'),
        AVG_tempo_by_playlist=('AVG_tempo_by_playlist', 'mean'),
        AVG_artist_followers_by_playlist=('AVG_artist_followers_by_playlist', 'mean'),
        AVG_artist_popularity_by_playlist=('AVG_artist_popularity_by_playlist', 'mean')
    ).reset_index()

In [17]:
def ingest_new_data(dim_track_df, dim_artist_df, fact_playlist_df, intermediate_table_df, new_data_df):
    # Initialize DataFrames if they are None
    if dim_track_df is None:
        dim_track_df = pd.DataFrame(columns=[
            'track_id', 'track_name', 'track_album', 'track_release_date', 'track_popularity',
            'track_duration_sec', 'track_explicit', 'track_all_artists', 'danceability',
            'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
            'liveness', 'valence', 'tempo', 'time_signature', 'playlist_date'
        ])
    if dim_artist_df is None:
        dim_artist_df = pd.DataFrame(columns=[
            'artist_id', 'artist_name', 'artist_genres', 
            'artist_followers', 'artist_popularity', 'playlist_date'
        ])
    if fact_playlist_df is None:
        fact_playlist_df = pd.DataFrame(columns=[
            'playlist_id', 'playlist_name', 'playlist_description','playlist_followers',
            'AVG_track_popularity_by_playlist', 'AVG_track_duration_sec_by_playlist',
            'RATIO_track_explicit_by_playlist', 'AVG_danceability_by_playlist',
            'AVG_energy_by_playlist','AVG_loudness_by_playlist', 'AVG_speechiness_by_playlist',
            'AVG_accousticness_by_playlist','AVG_instrumentalness_by_playlist',
            'AVG_liveness_by_playlist','AVG_valence_by_playlist','AVG_tempo_by_playlist',
            'AVG_artist_followers_by_playlist','AVG_artist_popularity_by_playlist',
        ])
        fact_playlist_df = fact_table_aggregates(new_data_df)
    
    if intermediate_table_df is None:
        intermediate_table_df = pd.DataFrame(columns=[
            'track_id', 'artist_id', 'playlist_id', 
            'playlist_snapshot_id', 'playlist_date'
        ])
    
    # Append new data to the existing DataFrames
    if not new_data_df.empty:
        
        # Append and convert playlist_date to datetime data type for dim_track_df
        dim_track_df = dim_track_df.append(new_data_df[[
            'track_id', 'track_name', 'track_album', 'track_release_date', 'track_popularity',
            'track_duration_sec', 'track_explicit', 'track_all_artists', 'danceability',
            'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
            'liveness', 'valence', 'tempo', 'time_signature', 'playlist_date']])
        dim_track_df['playlist_date'] = pd.to_datetime(dim_track_df['playlist_date'])
        # Check if the DataFrame is not empty
        if not dim_track_df.empty:
            # Sort the DataFrame by playlist_date within each group of track_id
            dim_track_df = dim_track_df.sort_values(by=['track_id', 'playlist_date'], ascending=[True, False])
            # Select the first row of each group (corresponding to the most recent playlist_date)
            dim_track_df = dim_track_df.groupby('track_id').first().reset_index()
    
        # Append and convert playlist_date to datetime data type for dim_artist_df
        dim_artist_df = dim_artist_df.append(new_data_df[[
            'artist_id', 'artist_name', 'artist_genres', 
            'artist_followers', 'artist_popularity', 'playlist_date']])
        dim_artist_df['playlist_date'] = pd.to_datetime(dim_artist_df['playlist_date'])
        # Check if the DataFrame is not empty
        if not dim_artist_df.empty:
            # Sort the DataFrame by playlist_date within each group of artist_id
            dim_artist_df = dim_artist_df.sort_values(by=['artist_id', 'playlist_date'], ascending=[True, False])
            # Select the first row of each group (corresponding to the most recent playlist_date)
            dim_artist_df = dim_artist_df.groupby('artist_id').first().reset_index()
            
        # Append new data to fact_playlist_df
        aggregated_new_data_df = fact_table_pre_aggregates(new_data_df)
        fact_playlist_df = pd.concat([fact_playlist_df, aggregated_new_data_df], ignore_index=True)
        fact_playlist_df = fact_table_aggregates(fact_playlist_df)
        fact_playlist_df = fact_playlist_df.drop_duplicates(subset=['playlist_id']).reset_index(drop=True)

        # Append and convert playlist_date to datetime data type for intermediate_table_df
        intermediate_table_df = intermediate_table_df.append(new_data_df[[
            'track_id', 'artist_id', 'playlist_id', 'playlist_snapshot_id', 'playlist_date'
            ]]).drop_duplicates(subset=['track_id', 'playlist_date', 'playlist_id']).reset_index(drop=True)
        intermediate_table_df['playlist_date'] = pd.to_datetime(intermediate_table_df['playlist_date'])
    
    dim_track_df.drop(columns=['playlist_date'], inplace=True)
    dim_track_df.drop_duplicates(subset=['track_id'])
    dim_artist_df.drop(columns=['playlist_date'], inplace=True)
    dim_artist_df.drop_duplicates(subset=['artist_id'])
    intermediate_table_df.drop_duplicates(subset=['track_id', 'playlist_date', 'playlist_id'], inplace=True)
    
    return dim_track_df, dim_artist_df, fact_playlist_df, intermediate_table_df

In [18]:
dim_track_df, dim_artist_df, fact_playlist_df, intermediate_table_df = ingest_new_data(dim_track_df, dim_artist_df, fact_playlist_df, intermediate_table_df, final_df)

In [19]:
duplicate_rows = dim_track_df[dim_track_df.duplicated(subset=['track_id'], keep=False)]
print(duplicate_rows)

dim_track_df.head()

Empty DataFrame
Columns: [track_id, track_name, track_album, track_release_date, track_popularity, track_duration_sec, track_explicit, track_all_artists, danceability, energy, key, loudness, speechiness, acousticness, instrumentalness, liveness, valence, tempo, time_signature]
Index: []


Unnamed: 0,track_id,track_name,track_album,track_release_date,track_popularity,track_duration_sec,track_explicit,track_all_artists,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,00WnaHagi3KLsrbs20OFVt,Sloppy Seconds (Ick Pt. 2),Sloppy Seconds (Ick Pt. 2),2023-11-22,71,162.071,True,['Lay Bankz'],0.878,0.967,7.0,-4.481,0.274,0.191,0.0,0.237,0.735,139.921,4.0
1,01BjQ7kJDzs7JEVSF1ZHHL,Karibu,Karibu,2024-03-29 00:00:00,23,242.5,0,['WITH U'],0.702,0.809,4.0,-9.457,0.0398,0.244,0.264,0.108,0.261,120.021,6.0
2,01TnMXIy7mJJQ7E8uy937N,Von dutch,Von dutch,2024-02-29 00:00:00,78,164.284,1,['Charli XCX'],0.706,0.698,9.0,-5.679,0.0537,0.00298,0.00375,0.116,0.648,130.013,6.0
3,01TyFEZu6mHbffsVfxgrFn,Show Me How,Show Me How,2018-02-28 00:00:00,39,215.11,0,['Men I Trust'],0.791,0.267,4.0,-11.997,0.0415,0.674,0.534,0.102,0.544,165.99,6.0
4,027ielmg4mNR1p1Ds3Mvgh,Não Vou Namorar,Não Vou Namorar,2024-01-19 00:00:00,78,162.461,1,"['DJ Ws da Igrejinha', 'DJ JOÃO PEREIRA', 'Mc ...",0.761,0.724,4.0,-0.681,0.325,0.818,9.5e-05,0.107,0.514,85.899,7.0


In [20]:
duplicate_rows = dim_artist_df[dim_artist_df.duplicated(subset=['artist_id'], keep=False)]
print(duplicate_rows)

dim_artist_df.head()

Empty DataFrame
Columns: [artist_id, artist_name, artist_genres, artist_followers, artist_popularity]
Index: []


Unnamed: 0,artist_id,artist_name,artist_genres,artist_followers,artist_popularity
0,00FQb4jTyendYWaN8pK0wa,Lana Del Rey,"['art pop', 'pop']",34155145,91
1,00x1fYSGhdqScXBRpSj3DW,Olivia Dean,"['pop soul', 'uk pop']",335904,65
2,01DTVE3KmoPogPZaOvMqO8,Sarkodie,"['afrobeats', 'afropop', 'azonto', 'ghanaian h...",596280,55
3,01XYiBYaoMJcNhPokrg0l0,STAYC,"['k-pop', 'k-pop girl group']",1801839,63
4,01aC2ikO4Xgb2LUpf9JfKp,Gary Clark Jr.,"['electric blues', 'modern blues rock', 'rock'...",916092,59


In [21]:
duplicate_rows = fact_playlist_df[fact_playlist_df.duplicated(subset=['playlist_id'], keep=False)]
print(duplicate_rows)

fact_playlist_df.head()

Empty DataFrame
Columns: [playlist_id, playlist_name, playlist_description, playlist_followers, AVG_track_popularity_by_playlist, AVG_track_duration_sec_by_playlist, RATIO_track_explicit_by_playlist, AVG_danceability_by_playlist, AVG_energy_by_playlist, AVG_loudness_by_playlist, AVG_speechiness_by_playlist, AVG_accousticness_by_playlist, AVG_instrumentalness_by_playlist, AVG_liveness_by_playlist, AVG_valence_by_playlist, AVG_tempo_by_playlist, AVG_artist_followers_by_playlist, AVG_artist_popularity_by_playlist]
Index: []


Unnamed: 0,playlist_id,playlist_name,playlist_description,playlist_followers,AVG_track_popularity_by_playlist,AVG_track_duration_sec_by_playlist,RATIO_track_explicit_by_playlist,AVG_danceability_by_playlist,AVG_energy_by_playlist,AVG_loudness_by_playlist,AVG_speechiness_by_playlist,AVG_accousticness_by_playlist,AVG_instrumentalness_by_playlist,AVG_liveness_by_playlist,AVG_valence_by_playlist,AVG_tempo_by_playlist,AVG_artist_followers_by_playlist,AVG_artist_popularity_by_playlist
0,37i9dQZEVXbKuaTI1Z1Afx,Viral 50 - USA,Your daily update of the most viral tracks rig...,365991,79.48,181.245,0.42,0.68396,0.65046,-6.12648,0.113584,,0.005662,0.18456,0.52066,132.019,2596260.0,72.42
1,37i9dQZEVXbLRQDuF5jeBp,Top 50 - USA,Your daily update of the most played tracks ri...,3377106,89.48,184.404,0.52,0.6613,0.6222,-6.42558,0.082504,,0.003916,0.155642,0.48974,132.257,13008800.0,84.68
2,37i9dQZEVXbLiRSasKsNU9,Viral 50 - Global,Your daily update of the most viral tracks rig...,1769513,79.36,198.576,0.32,0.68662,0.6759,-5.68212,0.12251,,0.00296,0.196626,0.58662,123.731,1723380.0,69.2
3,37i9dQZEVXbMDoHDwVN2tF,Top 50 - Global,Your daily update of the most played tracks ri...,17077756,90.96,189.094,0.42,0.66072,0.65024,-6.34908,0.084378,,0.009312,0.157258,0.51234,125.733,20288600.0,85.24
4,37i9dQZF1DWSqBruwoIXkA,sad hour,you don't get to tell me about sad! Cover: Ben...,1757867,75.23,203.302,0.14,0.51726,0.405751,-8.9754,0.05113,,0.029451,0.141135,0.295881,117.45,17376400.0,77.06


In [22]:
duplicate_rows = intermediate_table_df[intermediate_table_df.duplicated(subset=['track_id', 'playlist_id', 'playlist_date'], keep=False)]
print(duplicate_rows)

intermediate_table_df.head()

Empty DataFrame
Columns: [track_id, artist_id, playlist_id, playlist_snapshot_id, playlist_date]
Index: []


Unnamed: 0,track_id,artist_id,playlist_id,playlist_snapshot_id,playlist_date
0,01BjQ7kJDzs7JEVSF1ZHHL,2VhoJf6VdIoPnifnThy9UV,37i9dQZF1DX8tZsk68tuDw,MTcxMzQ5OTIwMCwwMDAwMDAwMGZiYjNmMDA3OTc2ZjQwMT...,2024-04-20
1,01TnMXIy7mJJQ7E8uy937N,25uiPmTg16RbhZWAqwLBy5,37i9dQZF1DWUa8ZRTfalHk,MTcxMzQ5OTIwMCwwMDAwMDAwMDdmYWY4OTI2MDkyMDhhND...,2024-04-20
2,01TnMXIy7mJJQ7E8uy937N,25uiPmTg16RbhZWAqwLBy5,37i9dQZF1DX4dyzvuaRJ0n,MTcxMzQ5OTI2MCwwMDAwMDAwMDliNmU2ZWJkZjZhYmQ3Zm...,2024-04-20
3,01TyFEZu6mHbffsVfxgrFn,3zmfs9cQwzJl575W1ZYXeT,37i9dQZF1DX0MLFaUdXnjA,MTcxMzQ5OTIwMCwwMDAwMDAwMDc1ODcwYmUzZmJiYTNkNT...,2024-04-20
4,027ielmg4mNR1p1Ds3Mvgh,0NB5wv4kn6A919CLHUKRmk,37i9dQZEVXbLiRSasKsNU9,NzY2OTA4MDAwLDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMD...,2024-04-20


## Ingest new data to database

In [23]:
# Function to re-ingest the updated DataFrames into MySQL database

# Connect to the database
engine = create_engine(url)
    
# Disable foreign key constraints
with engine.connect() as connection:
    connection.execute("SET foreign_key_checks = 0")
    
# Clear existing data from tables
with engine.connect() as connection:
    connection.execute("TRUNCATE TABLE dim_track")
    connection.execute("TRUNCATE TABLE dim_artist")
    connection.execute("TRUNCATE TABLE fact_playlist")
    connection.execute("TRUNCATE TABLE intermediate_table")
    
# Re-enable foreign key constraints
with engine.connect() as connection:
    connection.execute("SET foreign_key_checks = 1")
    
# Re-ingest updated data into tables
dim_track_df.to_sql('dim_track', con=engine, if_exists='append', index=False)
dim_artist_df.to_sql('dim_artist', con=engine, if_exists='append', index=False)
fact_playlist_df.to_sql('fact_playlist', con=engine, if_exists='append', index=False)
intermediate_table_df.to_sql('intermediate_table', con=engine, if_exists='append', index=False)

## Tests

In [24]:
'''
test_df = final_df.copy()
test2_df = final_df.copy()
# Convert 'playlist_date' column to datetime format
test_df['playlist_date'] = pd.to_datetime(test_df['playlist_date'], errors='coerce')
# Add 30 days to 'playlist_date' column
test_df['playlist_date'] = test_df['playlist_date'] + pd.Timedelta(days=30)
test_df['playlist_followers'] = 666
test_df['playlist_description'] = 'penepenepenepenepenepenepene'
fact_playlist_df = pd.concat([final_df, test_df], ignore_index=True)

test2_df['playlist_date'] = pd.to_datetime(test2_df['playlist_date'], errors='coerce')
# Add 30 days to 'playlist_date' column
test2_df['playlist_date'] = test2_df['playlist_date'] + pd.Timedelta(days=500)
test2_df['playlist_followers'] = 333
test2_df['playlist_description'] = 'sexo anal'
fact_playlist_df_oof = pd.concat([fact_playlist_df, test2_df], ignore_index=True)

final_df.groupby('playlist_id').agg(
        playlist_name=('playlist_name', 'last'),
        playlist_description=('playlist_description', 'last'),
        playlist_followers=('playlist_followers', 'last'),
        #playlist_date=('playlist_date', 'first'),
        #AVG_track_release_date_by_playlist=('track_release_date', 'mean'),
        AVG_track_popularity_by_playlist=('track_popularity', 'mean'),
        AVG_track_duration_sec_by_playlist=('track_duration_sec', 'mean'),
        RATIO_track_explicit_by_playlist=('track_explicit', lambda x: x.sum() / len(x)),  # Calculate ratio of True to False
        AVG_danceability_by_playlist=('danceability', 'mean'),
        AVG_energy_by_playlist=('energy', 'mean'),
        AVG_loudness_by_playlist=('loudness', 'mean'),
        AVG_speechiness_by_playlist=('speechiness', 'mean'),
        AVG_acousticness_by_playlist=('acousticness', 'mean'),  # Fixed typo
        AVG_instrumentalness_by_playlist=('instrumentalness', 'mean'),
        AVG_liveness_by_playlist=('liveness', 'mean'),
        AVG_valence_by_playlist=('valence', 'mean'),
        AVG_tempo_by_playlist=('tempo', 'mean'),
        AVG_artist_followers_by_playlist=('artist_followers', 'mean'),
        AVG_artist_popularity_by_playlist=('artist_popularity', 'mean')
    ).reset_index()
'''

"\ntest_df = final_df.copy()\ntest2_df = final_df.copy()\n# Convert 'playlist_date' column to datetime format\ntest_df['playlist_date'] = pd.to_datetime(test_df['playlist_date'], errors='coerce')\n# Add 30 days to 'playlist_date' column\ntest_df['playlist_date'] = test_df['playlist_date'] + pd.Timedelta(days=30)\ntest_df['playlist_followers'] = 666\ntest_df['playlist_description'] = 'penepenepenepenepenepenepene'\nfact_playlist_df = pd.concat([final_df, test_df], ignore_index=True)\n\ntest2_df['playlist_date'] = pd.to_datetime(test2_df['playlist_date'], errors='coerce')\n# Add 30 days to 'playlist_date' column\ntest2_df['playlist_date'] = test2_df['playlist_date'] + pd.Timedelta(days=500)\ntest2_df['playlist_followers'] = 333\ntest2_df['playlist_description'] = 'sexo anal'\nfact_playlist_df_oof = pd.concat([fact_playlist_df, test2_df], ignore_index=True)\n\nfinal_df.groupby('playlist_id').agg(\n        playlist_name=('playlist_name', 'last'),\n        playlist_description=('playlist_de