# Import Data

In [1]:
import pandas as pd

tracks = pd.read_csv("data/tracks.csv")
artists = pd.read_csv("data/artists.csv")

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
artists.info()
#1,162,095 artists


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1162095 entries, 0 to 1162094
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   id          1162095 non-null  object 
 1   followers   1162084 non-null  float64
 2   genres      1162095 non-null  object 
 3   name        1162095 non-null  object 
 4   popularity  1162095 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 44.3+ MB


In [4]:
#create a set of artists from tracks dataframe 
artists_on_track_df = []
def get_artists(artists):
    ids = [x.strip() for x in artists.strip('[]').replace("'", '').split(',')]
    for id in ids:
        artists_on_track_df.append(id)

for _ in tracks['id_artists'].values:
    get_artists(_)


In [5]:
len(artists_on_track_df)

757170

In [6]:
artists_ids_set = set(artists_on_track_df)
#757,170 
len(artists_ids_set)
#98,504 unique artist ids

98504

In [7]:
#pull out artist data that's relevant to our tracks dataframe 
artists_subset = artists[artists['id'].isin(artists_ids_set)]

In [8]:
artists_subset.info()
#not all of our artist ids are in the artist dataframe, so maybe we won't use. 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81776 entries, 137 to 1162092
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          81776 non-null  object 
 1   followers   81775 non-null  float64
 2   genres      81776 non-null  object 
 3   name        81776 non-null  object 
 4   popularity  81776 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 3.7+ MB


# Data Cleaning 

In [9]:
tracks[tracks['name'].isna()]['id_artists'].value_counts()
#all missing values are from the same artist id...
#['0LyfQWJT6nXafLPZqxe9Of']
#since this id isn't in our artists dataframe referring to the Spotify API tells us this means "various artist" we can drop these from our dataframe
tracks.drop(tracks[tracks['name'].isna()].index, axis=0, inplace=True)

In [10]:
# Convert length in milliseconds to length in minutes and seconds
tracks['length_minutes'] = tracks['duration_ms'] // 60000
tracks['length_seconds'] = (tracks['duration_ms'] % 60000) // 1000

# Format the length as minutes:seconds
tracks['length_formatted'] = tracks.apply(lambda row: f"{row['length_minutes']}:{row['length_seconds']:02d}", axis=1)




In [11]:
tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,length_minutes,length_seconds,length_formatted
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,2,6,2:06
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,1,38,1:38
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,3,1,3:01
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,2,56,2:56
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,2,43,2:43


In [12]:
tracks.drop(['length_minutes', 'length_seconds'], axis=1, inplace=True)

In [13]:
#clean up artists and id_artists columns
def clean_artists(artists):
    ids = [x.strip() for x in artists.strip('[]').replace("'", '').split(',')]
    return ids
    
tracks['id_artists'] = tracks['id_artists'].apply(clean_artists)   
tracks['artists'] = tracks['artists'].apply(clean_artists)

In [14]:
#create release_year and decade columns
tracks['release_year'] = tracks['release_date'].apply(lambda x: x[:4])

def make_decade(year):
    decade = year[:3]
    decade = decade + '0'
    return decade

tracks['decade'] = tracks['release_year'].apply(make_decade)

tracks.drop('release_date', axis=1, inplace=True)

In [24]:
tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,length_formatted,release_year,decade,artist_name_1,artist_name_2,artist_name_3
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,[Uli],[45tIt06XoI0Iio4LBEVpls],0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,2:06,1922,1920,Uli,,
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,[Fernando Pessoa],[14jtPCOoNZwquk5wd9DxrY],0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,1:38,1922,1920,Fernando Pessoa,,
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,[Ignacio Corsini],[5LiOoJbxVSAMkBS2fUm3X2],0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,3:01,1922,1920,Ignacio Corsini,,
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,[Ignacio Corsini],[5LiOoJbxVSAMkBS2fUm3X2],0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,2:56,1922,1920,Ignacio Corsini,,
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,[Dick Haymes],[3BiJGZsyX9sJchTqcSA7Su],0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,2:43,1922,1920,Dick Haymes,,


In [17]:
#separate artists into different categories: 
tracks['artist_name_1'] = tracks['artists'].apply(lambda x: x[0])

def get_artist_2(artists):
    if len(artists) <2:
        artist = None
    else:
        artist = artists[1]
    return artist

tracks['artist_name_2'] = tracks['artists'].apply(get_artist_2)


In [23]:
def get_artist_3(artists):
    if len(artists) <3:
        artist = None
    else:
        artist = artists[2]
    return artist

tracks['artist_name_3'] = tracks['artists'].apply(get_artist_3)

In [28]:
tracks['artist_name_3'].isna().sum() / len(tracks)
#only 6% of songs have more than two artists, so we may consider dropping this column....

0.9425316356433078

In [29]:
tracks.to_csv('data/master_track_data.csv')