In [325]:
import warnings
# Silence all warnings for the rest of the notebook (kept from the original setup).
warnings.filterwarnings("ignore")
import pandas
# Every later cell refers to pandas through the conventional `pd` alias, so it
# must be bound here; without it the first `pd.read_csv` raises NameError on a
# fresh kernel.
import pandas as pd

# Read in data

In [326]:
# Load the two raw tables from disk; the two reads are independent of each other.
artists = pd.read_csv("data/raw/artists.csv")
tracks = pd.read_csv("data/raw/tracks.csv")

# Merge tracks and artists dataframes

In [327]:
# Reformat: drop the first two and last two characters of every id_artists
# value (presumably a "['<id>']" wrapper -- confirm against the raw file) so
# the remaining text can be matched against the artists table.
tracks['id_artists'] = [raw_id[2:-2] for raw_id in tracks['id_artists']]

# Merge: rename the artist columns so the key matches `tracks` and the artist
# popularity does not collide with the track popularity, discard the columns
# that are not carried over, then join on the shared key (default inner join).
artists = (
    artists
    .rename(columns={'id': 'id_artists', 'popularity': 'artists_popularity'})
    .drop(columns=['genres', 'name'])
)
tracks = pd.merge(tracks, artists, on='id_artists')

# Remove unnecessary columns

In [328]:
# The join key and the id / name columns are not needed past this point.
tracks = tracks.drop(columns=['id', 'id_artists', 'artists', 'name'])

# Reformat release date to only include the year

In [329]:
# Parse release_date, append its calendar year as release_year, and then
# discard the original date column.
tracks = (
    tracks
    .assign(release_year=pd.DatetimeIndex(tracks['release_date']).year)
    .drop(columns=['release_date'])
)

# Filter out songs released before Spotify launched (2008), as well as spoken-word tracks

In [330]:
# Keep a row only if it satisfies both conditions: released in 2008 or later,
# and a speechiness score strictly below 0.66.
tracks = tracks.loc[
    (tracks['release_year'] >= 2008) & (tracks['speechiness'] < 0.66)
]

In [331]:
# Preview the first rows of the merged and filtered frame.
tracks.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,followers,artists_popularity,release_year
3470,9,189894,0,0.563,0.152,0,-14.428,0,0.0401,0.939,0.67,0.093,0.363,122.871,4,696099.0,63,2014
3471,9,154852,0,0.66,0.257,5,-17.578,0,0.0469,0.972,0.879,0.11,0.908,128.896,4,696099.0,63,2014
3472,9,173739,0,0.797,0.426,0,-11.107,0,0.128,0.957,0.887,0.343,0.965,130.456,4,696099.0,63,2014
3473,6,193171,0,0.705,0.0886,0,-15.026,1,0.0705,0.973,0.192,0.112,0.683,86.795,4,696099.0,63,2014
3474,6,188403,0,0.619,0.0275,3,-22.171,1,0.0439,0.956,0.405,0.121,0.319,88.447,4,696099.0,63,2014


# Normalize the data

In [332]:
def normalize(df, ignore_column):
    """Min-max scale every column of ``df`` except ``ignore_column``.

    Each scaled column is mapped to ``(x - min) / (max - min)``, so its
    smallest value becomes 0.0 and its largest becomes 1.0. A column whose
    values are all identical has a zero range; it is mapped to 0.0 instead of
    the NaN that a 0/0 division would produce. Missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is not modified.
    ignore_column : label or list of labels
        Column(s) to leave untouched, passed to ``df.columns.drop``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the scaled columns replaced.

    Raises
    ------
    KeyError
        If ``ignore_column`` is not a column of ``df``.
    """
    result = df.copy()
    for feature_name in df.columns.drop(ignore_column):
        column = df[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value
        if value_range == 0:
            # Constant column: x - min is already 0 for every present value,
            # so skip the division that would turn the column into NaN.
            result[feature_name] = (column - min_value).astype(float)
        else:
            result[feature_name] = (column - min_value) / value_range
    return result

# Rescale every column except `popularity`, which keeps its original values.
tracks = normalize(df=tracks, ignore_column='popularity')

In [333]:
# Preview the first rows again, now that normalize() has been applied.
tracks.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,followers,artists_popularity,release_year
3470,9,0.03679,0.0,0.571574,0.152,0.0,0.686011,0.0,0.061221,0.942771,0.67,0.093,0.363,0.558253,0.8,0.008823,0.63,0.461538
3471,9,0.029766,0.0,0.670051,0.257,0.454545,0.625559,0.0,0.071603,0.975904,0.879,0.11,0.908,0.585627,0.8,0.008823,0.63,0.461538
3472,9,0.033551,0.0,0.809137,0.426,0.0,0.749746,0.0,0.19542,0.960843,0.887,0.343,0.965,0.592715,0.8,0.008823,0.63,0.461538
3473,6,0.037447,0.0,0.715736,0.0886,0.0,0.674535,1.0,0.107634,0.976908,0.192,0.112,0.683,0.394345,0.8,0.008823,0.63,0.461538
3474,6,0.036491,0.0,0.628426,0.0275,0.272727,0.537413,1.0,0.067023,0.959839,0.405,0.121,0.319,0.401851,0.8,0.008823,0.63,0.461538


# Write processed tracks dataframe to CSV

In [323]:
# Persist the processed frame; index=False leaves the row labels out of the file.
tracks.to_csv(path_or_buf="data/processed_tracks.csv", index=False)