In [1]:
#Imports

In [2]:
import pandas as pd 
import json
from typing import List
import os
from os import listdir
import matplotlib.pyplot as plt
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import plotly.express as px
import plotly.io as pio
from datetime import datetime
import pytz 
from collections import Counter

from utilis import (
    get_streamings,
    minsec_to_seconds,
    format_time,
    generate_top_10_songs_by_year,
    save_cache,
    search_track_id,
    get_audio_features
)


In [None]:
# Title: Load and Clean Raw Spotify Streaming Data

# Description:
# Loads raw Spotify streaming history from local files, 
# converts timestamps and extracts year and month, 
# drops irrelevant or podcast-specific columns, 
# removes entries missing any required track, artist, album, or track-URI metadata, 
# filters out rows where any of those required fields is a blank string, 
# and adds a formatted listening duration column in MM:SS format.

In [7]:
# Directory that get_streamings() reads the raw streaming-history export from.
spotify_data_directory = 'data'

# Fields that must be present and non-blank for a row to be kept.
required_metadata = [
    'master_metadata_track_name',
    'master_metadata_album_artist_name',
    'master_metadata_album_album_name',
    'spotify_track_uri'
]

# Columns removed before analysis; any that are absent are silently skipped
# (see errors='ignore' below).
drop_columns = [
    'spotify_episode_uri', 'reason_start', 'reason_end', 'shuffle',
    'skipped', 'offline', 'offline_timestamp', 'incognito_mode',
    'episode_name', 'platform', 'episode_show_name', 'conn_country', 'ip_addr',
    'audiobook_title', 'audiobook_uri', 'audiobook_chapter_uri', 'audiobook_chapter_title'
]


def _has_required_metadata(df: pd.DataFrame) -> pd.Series:
    """Return a boolean mask of rows whose required fields are all non-blank.

    A field counts as blank when it is empty after stripping whitespace.
    Built as an explicit mask instead of a generated ``DataFrame.query``
    string so the result does not depend on which eval engine is installed.
    """
    mask = pd.Series(True, index=df.index)
    for column in required_metadata:
        mask &= df[column].str.strip().ne('')
    return mask


def _format_listening_length(ms_played: pd.Series) -> pd.Series:
    """Format millisecond durations as ``M:SS`` strings.

    Missing durations are rendered as ``"0:00"``.
    """
    ms = ms_played.fillna(0)
    minutes = (ms // 60000).astype('int64').astype(str)
    seconds = ((ms % 60000) // 1000).astype('int64').astype(str).str.zfill(2)
    return minutes + ':' + seconds


# Load the raw records once so the untouched frame stays available for inspection.
streaming_raw = pd.DataFrame(get_streamings(spotify_data_directory))
if streaming_raw.empty:
    # Fail with a clear message rather than an opaque KeyError on 'ts' below.
    raise ValueError(
        f"No streaming records were loaded from '{spotify_data_directory}'."
    )

# Clean the streaming data in a single chain; every step returns a new frame,
# so no defensive .copy() is needed before adding columns.
streaming_data = (
    streaming_raw
    .assign(
        ts=lambda df: pd.to_datetime(df['ts']),
        year=lambda df: df['ts'].dt.year,
        month=lambda df: df['ts'].dt.strftime('%B')
    )
    .drop(columns=drop_columns, errors='ignore')
    .dropna(subset=required_metadata)
    .loc[_has_required_metadata]
    # Listening length as M:SS, appended as the last column.
    .assign(listening_length=lambda df: _format_listening_length(df['ms_played']))
)

# View result
print(streaming_data.head())


                         ts  ms_played master_metadata_track_name  \
0 2017-03-11 02:59:24+00:00     254693                   Red Tide   
1 2017-03-11 02:59:32+00:00       6196            Red Tide Rising   
2 2017-03-11 02:59:45+00:00      10499            Red Tide Rising   
3 2017-03-11 03:01:38+00:00      19294                   Red Tide   
4 2017-03-11 03:03:03+00:00      85600                   Red Tide   

  master_metadata_album_artist_name master_metadata_album_album_name  \
0                      The Growlers               Are You In Or Out?   
1                     Orange Goblin          A Eulogy for the Damned   
2                     Orange Goblin          A Eulogy for the Damned   
3                      The Growlers               Are You In Or Out?   
4                      The Growlers               Are You In Or Out?   

                      spotify_track_uri  year  month listening_length  
0  spotify:track:5o7taLCeXkjmrmOd9YR9iS  2017  March             4:14  
1  spoti

In [18]:
# Save streaming_data as CSV

In [None]:
# Save the cleaned streaming data to CSV.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails with OSError.
streaming_data_csv_path = 'wrapped_data/streaming_data.csv'
os.makedirs(os.path.dirname(streaming_data_csv_path), exist_ok=True)
streaming_data.to_csv(streaming_data_csv_path, index=False)

# Print the earliest timestamp in the dataset
print(streaming_data['ts'].min())

# Print the latest timestamp in the dataset
print(streaming_data['ts'].max())


2017-03-11 02:59:24+00:00
2025-03-20 23:47:08+00:00
