# Pre-processing of Songs data

Some pre-processing and basic feature engineering for Songs data.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Quick-and-dirty feature engineering

In [2]:
# Read the raw songs table from disk, then print a structural summary
# (column names, dtypes, memory footprint) as a quick sanity check.
songs = pd.read_csv('data/songs.csv')
songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296320 entries, 0 to 2296319
Data columns (total 7 columns):
song_id        object
song_length    int64
genre_ids      object
artist_name    object
composer       object
lyricist       object
language       float64
dtypes: float64(1), int64(1), object(5)
memory usage: 122.6+ MB


In [3]:
# Missing composer / lyricist credits fall back to the performing artist.
songs['composer'] = songs['composer'].fillna(songs['artist_name'])
songs['lyricist'] = songs['lyricist'].fillna(songs['artist_name'])

# Bin song_length into three ordinal classes in a single pass over the raw values:
#   0 -> song_length <  200000
#   1 -> 200000 <= song_length < 250000
#   2 -> song_length >= 250000
# Binning from one snapshot avoids chained in-place masks, where each later mask
# would be evaluated on a column that already mixes raw lengths and class codes.
# NOTE: this overwrites song_length, so the cell is not idempotent. Re-running it
# on already-binned data maps every row to class 0; reload `songs` first.
SONG_LENGTH_BIN_EDGES = [200000, 250000]
songs['song_length'] = np.digitize(
    songs['song_length'].to_numpy(), bins=SONG_LENGTH_BIN_EDGES
).astype('int64')

# Manual patch for rows whose language is missing. The original note describes this
# as a single badly written entry, but the mask applies to every row with a null
# language. The lyricist must be patched before language is filled, hence one mask.
# NOTE(review): 31 is presumably the intended language code for that entry - confirm.
missing_language = songs['language'].isnull()
songs.loc[missing_language, 'lyricist'] = songs['artist_name']
songs.loc[missing_language, 'language'] = 31

In [4]:
# Frequency of every distinct genre_ids value, most common first, shown as a dict.
genre_id_counts = songs['genre_ids'].value_counts()
genre_id_counts.to_dict()

{'465': 567911,
 '958': 176349,
 '2022': 168870,
 '1609': 166457,
 '2122': 139938,
 '1259': 101485,
 '921': 67578,
 '1152': 48730,
 '359': 43601,
 '786': 42999,
 '726': 34969,
 '139': 34876,
 '1011': 34013,
 '940': 33490,
 '1572|275': 24338,
 '1955': 20980,
 '691': 19472,
 '139|125|109': 17613,
 '873': 17542,
 '437': 17212,
 '947': 17106,
 '388': 16776,
 '458': 15438,
 '444': 14859,
 '1616': 14188,
 '242': 13756,
 '451': 13283,
 '880': 13059,
 '423': 11814,
 '829': 11759,
 '2130': 11053,
 '1138': 10770,
 '1180': 10702,
 '1616|2058': 9283,
 '893': 7527,
 '1152|947': 7355,
 '864|857|850|843': 6842,
 '409': 6519,
 '2072': 4987,
 '430': 4863,
 '698': 4845,
 '545': 4513,
 '94': 4503,
 '940|1152': 4259,
 '798': 4207,
 '1609|2107': 4144,
 '940|388': 3997,
 '2086|374': 3972,
 '2079': 3923,
 '352|1995': 3920,
 '822': 3653,
 '465|2022': 3482,
 '786|2122': 3259,
 '786|947': 2863,
 '1633': 2717,
 '381|2086|374': 2647,
 '1145': 2428,
 '388|940': 2406,
 '465|798': 2370,
 '1616|2072': 2356,
 '118': 2

In [5]:
# Merge sparsely populated genre_ids values into a single 'Misc' bucket.
limit = 10  # a genre_ids value seen at most this many times counts as rare
genre_counts = songs['genre_ids'].value_counts()  # occurrences per distinct value
rare_genres = genre_counts[genre_counts <= limit].index  # values to be merged

# One vectorised membership test relabels every rare value at once, instead of
# re-scanning the whole column once per rare genre.
songs.loc[songs['genre_ids'].isin(rare_genres), 'genre_ids'] = 'Misc'
del genre_counts, rare_genres  # clear temp obj

# Uniformise the column to strings.
# NOTE(review): with object-dtype columns a missing genre_ids presumably becomes
# the literal string 'nan' here - confirm that is the intended encoding.
songs['genre_ids'] = songs['genre_ids'].astype(str)

In [6]:
# Replace the placeholder artist "Various Artists" with a more informative credit
# whenever one is available: the composer is tried first, then the lyricist.
placeholder = 'Various Artists'

# Rows that carry the placeholder but have a composer that is not the placeholder.
use_composer = songs['artist_name'].eq(placeholder) & songs['composer'].ne(placeholder)
songs.loc[use_composer, 'artist_name'] = songs['composer']

# The mask is rebuilt from the updated column, so rows already fixed by the
# composer pass are skipped and only the remaining placeholders are touched.
use_lyricist = songs['artist_name'].eq(placeholder) & songs['lyricist'].ne(placeholder)
songs.loc[use_lyricist, 'artist_name'] = songs['lyricist']