# Import Data

In [1]:
import ast

import pandas as pd


# Load the raw artist and track tables; paths are relative to the notebook's
# working directory.
artists = pd.read_csv("data/artists.csv")
tracks = pd.read_csv("data/tracks.csv")

In [2]:
# Summarise column dtypes and non-null counts for the artists table.
artists.info()
# 1,162,095 artists in total

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1162095 entries, 0 to 1162094
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   id          1162095 non-null  object 
 1   followers   1162084 non-null  float64
 2   genres      1162095 non-null  object 
 3   name        1162095 non-null  object 
 4   popularity  1162095 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 44.3+ MB


In [3]:
# Collect every artist id referenced by the tracks dataframe (duplicates are
# kept here; a later cell de-duplicates them with set()).
artists_on_track_df = []


def get_artists(artists):
    """Append the artist ids encoded in one ``id_artists`` cell to ``artists_on_track_df``.

    Parameters
    ----------
    artists : str
        String form of a list of ids, e.g. ``"['id1', 'id2']"``.

    Returns
    -------
    None
        The ids are added to the module-level ``artists_on_track_df`` list.

    Notes
    -----
    Empty tokens (produced by an empty list such as ``"[]"``) are skipped so
    that ``''`` is never recorded as an artist id.
    """
    tokens = artists.strip('[]').replace("'", '').split(',')
    # Extend in place: other cells read the module-level list directly.
    artists_on_track_df.extend(
        token.strip() for token in tokens if token.strip()
    )

# Feed every track's id_artists cell to get_artists. A descriptive loop name is
# used instead of `_`, which IPython reserves for the previous cell's output.
for track_artist_ids in tracks['id_artists'].values:
    get_artists(track_artist_ids)


In [4]:
# De-duplicate the collected artist ids.
# Figures noted by the author from a run: 757,170 and 98,504 unique artist ids.
artists_ids_set = set(artists_on_track_df)

In [5]:
# Keep only the artist rows whose id appears on at least one track.
artists_subset = artists.loc[artists['id'].isin(artists_ids_set)]

In [6]:
# Preview the first 30 rows of the filtered artists table.
artists_subset.head(30)

Unnamed: 0,id,followers,genres,name,popularity
137,7DLDYPMRJ0NFPcBTlg04DE,187.0,[],Guru Dutt,2
138,1OCPhFtvkZDLUJJkrJfD2G,155.0,[],The De Castro Sisters,8
139,4PiJnql6Z3yQ1okaLjPHpD,12.0,[],Orchestra Sinfonica,9
140,7eXSIjGrfSvF1xDDIJwBxC,0.0,[],Teddy Stewart Orchestra,16
149,17OXC3oLAPMOQwqmzcsoxo,735.0,[],Karen Peck,22
151,1pCXaoB8RV6mkcasIuqrsT,907.0,[],Stephen Hill,27
153,7frYUe4C7A42uZqCzD34Y4,53636.0,"['desi pop', 'punjabi hip hop', 'punjabi pop']",Sultaan,53
154,6acbdy69rtlv8m9EW31MYl,72684.0,"['afro dancehall', 'afropop', 'azontobeats', '...",Phyno,51
155,72578usTM6Cj5qWsi471Nc,248568.0,"['filmi', 'indian folk', 'indian rock', 'kanna...",Raghu Dixit,52
157,6iv4lysB1yHXoZJ2gfqTdh,786.0,['indian fusion'],Shashwat Singh,56


In [7]:
# Look up the artist id shared by the unnamed tracks. The artists table stores
# bare ids, so compare against the id itself rather than its list-string form
# "['...']", which can never equal any value in the `id` column.
artists[artists['id'] == '0LyfQWJT6nXafLPZqxe9Of']

Unnamed: 0,id,followers,genres,name,popularity


Since many artists have an empty `genres` list (the column has no nulls, but a lot of its values are `[]`), we may not use this dataframe at all.

# Data Cleaning 

In [8]:
# Which artist ids do the tracks with a missing name belong to?
tracks.loc[tracks['name'].isna(), 'id_artists'].value_counts()
# Author's finding: all of them share one artist id, ['0LyfQWJT6nXafLPZqxe9Of'].
# That id isn't in the artists dataframe; per the author's reading of the
# Spotify API it denotes "various artists", so these rows are dropped.
tracks.drop(index=tracks.loc[tracks['name'].isna()].index, inplace=True)

In [9]:
# Convert length in milliseconds to whole minutes and leftover seconds.
# Both helper columns are still created because a later cell drops them by name.
tracks['length_minutes'] = tracks['duration_ms'] // 60000
tracks['length_seconds'] = (tracks['duration_ms'] % 60000) // 1000

# Format the length as minutes:seconds, with seconds zero-padded to two digits.
# Vectorised string operations replace the row-wise apply(axis=1), which ran a
# Python lambda once per row of the whole frame.
tracks['length_formatted'] = (
    tracks['length_minutes'].astype(str)
    + ':'
    + tracks['length_seconds'].astype(str).str.zfill(2)
)




In [10]:
# The minute/second helper columns are no longer needed once the formatted
# length exists.
tracks.drop(columns=['length_minutes', 'length_seconds'], inplace=True)

In [11]:
# Clean up the artists and id_artists columns.
def clean_artists(artists):
    """Parse the string form of a list into a real list of stripped strings.

    Parameters
    ----------
    artists : str | list | tuple
        Usually text such as ``"['id1', 'id2']"``. A list or tuple is also
        accepted and returned cleaned, so re-running a cell that has already
        converted a column does not raise.

    Returns
    -------
    list[str]
        One entry per element. ``"[]"`` yields an empty list.

    Notes
    -----
    The text is parsed as a Python literal first, so names containing
    apostrophes or commas (e.g. ``"Destiny's Child"``) come through intact.
    Text that is not a valid literal falls back to the original
    strip-and-split approach.
    """
    if isinstance(artists, (list, tuple)):
        return [str(item).strip() for item in artists]

    try:
        parsed = ast.literal_eval(artists)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item).strip() for item in parsed]

    # Not a valid list literal: split on commas after removing brackets/quotes.
    return [x.strip() for x in artists.strip('[]').replace("'", '').split(',')]
    
# Replace each id_artists cell with the list produced by clean_artists.
tracks['id_artists'] = tracks['id_artists'].map(clean_artists)

In [12]:
# Store the parsed artist names in a new column; the raw `artists` column is
# left untouched here.
tracks['artists_cleaned'] = tracks['artists'].map(clean_artists)

In [13]:
# Take the first four characters of release_date as the year (assumes the date
# strings begin with the year; TODO confirm). The vectorised .str accessor
# avoids a per-row Python lambda and leaves missing dates as NaN instead of
# raising.
tracks['release_year'] = tracks['release_date'].str[:4]

In [14]:
# release_year has been derived, so the full date column is removed.
tracks.drop(columns=['release_date'], inplace=True)

In [15]:
# artists_cleaned supersedes the raw artists column.
tracks.drop(columns=['artists'], inplace=True)

In [16]:
# Show every column when a dataframe is displayed (no column truncation).
pd.options.display.max_columns = None

In [17]:
# Grab one sample release_year (row label 586644) to try make_decade on.
testtest = tracks.loc[586644, 'release_year']

In [18]:
def make_decade(year):
    """Return the decade label for a year string.

    The first three characters of ``year`` are kept and ``'0'`` is appended,
    so ``'2019'`` becomes ``'2010'``.
    """
    return year[:3] + '0'

# Quick check of make_decade on the sample year picked above.
make_decade(testtest)

'2010'

In [19]:

# Derive a decade label for every track from its release year.
tracks['decade'] = tracks['release_year'].map(make_decade)

In [20]:
# Persist the cleaned tracks table. The dataframe index is written as the
# first CSV column (pandas default, index=True).
tracks.to_csv(path_or_buf='master_track_data.csv')

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,length_formatted,artists_cleaned,release_year,decade
586642,44tvGaqisGSy4Qy45vd0CF,Thousand Pieces (FYHYM2019),1,83477,0,"[22y1A8uijPBYxDPOrMWZPb, 78JtIs2G2CpfOZadEs1sU...",0.393,0.997,1,-6.145,0,0.145,0.000239,0.00268,0.756,0.0639,137.938,4,1:23,"[A & Z, Claudiu Adam, Clara Yates]",2020,2020
586643,1932nBdtEgMpDGkpl93cS4,Saving Angel (FYHYM2019),1,125216,0,"[4f0a5IgkYFHFts5Z9N9SDX, 51GkQKgac6wqdicVA2DvPu]",0.458,0.997,6,-6.124,0,0.194,0.000898,0.872,0.46,0.0393,137.996,4,2:05,"[RAM, Stine Grove]",2020,2020
586644,4cB00WOFuQFLoDpnydcx8c,7UP,62,203555,0,[0xu4jAQQv7ZAqvFGdc9HgP],0.616,0.469,0,-7.38,1,0.027,0.0167,0.0,0.111,0.233,135.035,4,3:23,[Boy In Space],2019,2010
586645,44r4zta6P9flkhKaVnbsvG,Freaks,70,174800,0,[14Y3trk7LaslSFTk1G35rx],0.674,0.568,1,-6.356,1,0.0408,0.116,0.0,0.165,0.64,166.107,4,2:54,[Jordan Clarke],2019,2010
586646,0F2muCxmWhi7NfY9poM0KZ,至少我還記得 - 天堂的微笑插曲,44,304800,0,[5fEQLwq1BWWQNR8GzhOIvi],0.269,0.368,10,-10.436,1,0.0339,0.724,3e-06,0.111,0.337,168.803,4,5:04,[Eric Chou],2019,2010
586647,0mDsJ6a7qY3DWbCAHancPj,King - Acoustic,2,243987,0,[5vBSrE1xujD2FXYRarbAXc],0.451,0.195,2,-8.335,1,0.044,0.958,3e-06,0.104,0.286,93.706,4,4:03,[Years & Years],2020,2020
586648,1PFGNasip0OTl24ZHPT6iy,你一定要幸福,49,290636,0,[5luSIDrs6oR8T858ZQYb1B],0.467,0.458,3,-6.598,1,0.0277,0.76,0.0,0.133,0.34,137.911,4,4:50,[开开],2020,2020
586649,0MmaEacabpK8Yp3Mdeo5uY,下雨天,50,265846,0,[5VGgFE9nPgMfEnYiPT5J2B],0.528,0.673,4,-3.639,1,0.0314,0.143,0.0,0.0989,0.297,130.066,4,4:25,[芝麻],2020,2020
586650,1uviKYHZuM4uINK33F7sCt,Fix It to Break It,70,198799,0,[7okSU80WTrn4LXlyXYbX3P],0.493,0.461,2,-8.524,1,0.0456,0.845,0.0,0.115,0.35,51.414,4,3:18,[Clinton Kane],2020,2020
586651,7s5bBky7QncZeb8XmceSOH,最好是,49,272771,0,[7Dx7RhX0mFuXhCOUgB01uM],0.596,0.487,7,-7.485,1,0.0309,0.654,0.0,0.0895,0.277,99.003,4,4:32,[JJ Lin],2020,2020
