In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Import mlc_data.csv
mlc_df = pd.read_csv("../data/mlc_data.csv")
mlc_df.head()

Unnamed: 0,If,Country Code,Registrant Code,Year of Reference,Usage Period,Streaming Platform (DSP),Streams,Recording Artist,Recording Label,Recording Title,Release Title,Recording Duration (Seconds),"Songwriter's Listed (1 = ""Yes"", 0 = ""No"")"
0,USUG12101043,US,UG1,21.0,,AudioMack,10175,Future,,FUTURE FT DEJ LOAF HEY THERE PROD BY DDS,#unknown#,181,0
1,USRC12100543,US,RC1,21.0,,SoundCloud,8597,LUCKY3RD,LUCKY3RD,Keep It Cool LUCKY3RD,Keep It Cool LUCKY3RD,133,0
2,USSM12102263,US,SM1,21.0,,SoundCloud,261280,LUCKY3RD,LUCKY3RD,Life Goes On LUCKY3RD,Life Goes On LUCKY3RD,171,0
3,USLD91731547,US,LD9,17.0,2/1/2021,Trebel,5,Bachata & Merengue Mix,Orchard,No dudes de mi- Merengue & Bachata Mix,Mega Mix 2010,1250,0
4,USAT22007048,US,AT2,20.0,,AudioMack,62105,Foolio,,WHEN I SEE YOU REMIX,#unknown#,187,0


## Clean up dataset

In [3]:
# Abi: Rename column If to ISRC -- ORIGINAL 'If ' column HAS A SPACE after it!!
mlc_df.columns = ['ISRC', 'Country', 'Registrant', 'Release_Year', 'Period', 'DSP', 'Streams', 'Artist', 'Label', 'Song', 'Album', 'Duration', 'Songwriter?']
mlc_df.head()

Unnamed: 0,ISRC,Country,Registrant,Release_Year,Period,DSP,Streams,Artist,Label,Song,Album,Duration,Songwriter?
0,USUG12101043,US,UG1,21.0,,AudioMack,10175,Future,,FUTURE FT DEJ LOAF HEY THERE PROD BY DDS,#unknown#,181,0
1,USRC12100543,US,RC1,21.0,,SoundCloud,8597,LUCKY3RD,LUCKY3RD,Keep It Cool LUCKY3RD,Keep It Cool LUCKY3RD,133,0
2,USSM12102263,US,SM1,21.0,,SoundCloud,261280,LUCKY3RD,LUCKY3RD,Life Goes On LUCKY3RD,Life Goes On LUCKY3RD,171,0
3,USLD91731547,US,LD9,17.0,2/1/2021,Trebel,5,Bachata & Merengue Mix,Orchard,No dudes de mi- Merengue & Bachata Mix,Mega Mix 2010,1250,0
4,USAT22007048,US,AT2,20.0,,AudioMack,62105,Foolio,,WHEN I SEE YOU REMIX,#unknown#,187,0


In [4]:
# Abi: Change all Stream values to numeric without the comma
mlc_df["Streams"] = mlc_df["Streams"].str.replace(",", "")
mlc_df["Streams"] = mlc_df["Streams"].str.replace(" ", "")
mlc_df[["Streams"]] = mlc_df[["Streams"]].apply(pd.to_numeric)
mlc_df.head()

Unnamed: 0,ISRC,Country,Registrant,Release_Year,Period,DSP,Streams,Artist,Label,Song,Album,Duration,Songwriter?
0,USUG12101043,US,UG1,21.0,,AudioMack,10175,Future,,FUTURE FT DEJ LOAF HEY THERE PROD BY DDS,#unknown#,181,0
1,USRC12100543,US,RC1,21.0,,SoundCloud,8597,LUCKY3RD,LUCKY3RD,Keep It Cool LUCKY3RD,Keep It Cool LUCKY3RD,133,0
2,USSM12102263,US,SM1,21.0,,SoundCloud,261280,LUCKY3RD,LUCKY3RD,Life Goes On LUCKY3RD,Life Goes On LUCKY3RD,171,0
3,USLD91731547,US,LD9,17.0,2/1/2021,Trebel,5,Bachata & Merengue Mix,Orchard,No dudes de mi- Merengue & Bachata Mix,Mega Mix 2010,1250,0
4,USAT22007048,US,AT2,20.0,,AudioMack,62105,Foolio,,WHEN I SEE YOU REMIX,#unknown#,187,0


In [5]:
# Inspecting mlc_df after data cleaning
print(mlc_df.shape) # 100,000 rows and 13 columns
print(mlc_df.dtypes)

(100000, 13)
ISRC             object
Country          object
Registrant       object
Release_Year    float64
Period           object
DSP              object
Streams           int64
Artist           object
Label            object
Song             object
Album            object
Duration          int64
Songwriter?       int64
dtype: object


In [13]:
# Rudy: More info
mlc_df.describe()

Unnamed: 0,Release_Year,Streams,Duration,Songwriter?
count,98239.0,100000.0,100000.0,100000.0
mean,19.671933,56265.5,1016.89369,0.54161
std,12.63887,262027.6,15565.692133,0.498268
min,0.0,1.0,0.0,0.0
25%,17.0,6807.0,149.0,0.0
50%,19.0,16772.5,190.0,1.0
75%,20.0,39903.0,236.0,1.0
max,99.0,21456170.0,818738.0,1.0


## Exploring the dataset

In [15]:
# Most popular streaming platform by number of records in the dataframe
DSP_counts = mlc_df['DSP'].value_counts()
print(DSP_counts)

Spotify               32268
Apple                 22200
Amazon                14438
Pandora               13777
Tidal                  3521
YouTube                2752
SoundCloud             2122
GTL                    1090
Melodyv                 829
Trebel                  817
iHeart Radio            707
AudioMack               550
NugsNet                 316
LiveXLive               200
Qoboz                   104
Midwest Tape            102
Deezer                   51
Anghami                  46
Sonos                    23
Recisio                  22
Smithsonian              21
Ultimate Guitar          19
PowerMusic                8
Wolfgangs                 4
Fan Label                 4
MixCloud                  4
Pacemaker                 3
Classical Archives        2
MonkingMe                 1
Name: DSP, dtype: int64


In [8]:
# Patrick: Most popular streaming platform by number of records in the dataframe with non-US (foreign) country code
# foreign = mlc_df.loc[mlc_df['Country']!= 'US']
# foreign['DSP'].value_counts()

## How likely is each DSP to provide songwriter info?

In [11]:
# Exploring Songwriter? values -- there are more recordings that have songwriter info provided than not
mlc_df['Songwriter?'].value_counts()

1    54161
0    45839
Name: Songwriter?, dtype: int64

In [27]:
# Likelihood of each DSP to provide songwriter info
sw_ratio = mlc_df.groupby('DSP')['Songwriter?'].mean()
sw_df = sw_ratio.sort_values(ascending=False)
sw_df = sw_df.astype(object)
print(sw_df)

DSP
Classical Archives         1.0
Qoboz                 0.903846
Deezer                0.862745
Spotify               0.854376
Apple                 0.808514
Recisio               0.545455
Amazon                 0.26174
Pandora               0.244828
Melodyv               0.114596
LiveXLive                 0.07
GTL                    0.02844
Tidal                 0.003692
Smithsonian                0.0
Ultimate Guitar            0.0
Trebel                     0.0
Wolfgangs                  0.0
SoundCloud                 0.0
Sonos                      0.0
YouTube                    0.0
Pacemaker                  0.0
PowerMusic                 0.0
Anghami                    0.0
NugsNet                    0.0
MonkingMe                  0.0
MixCloud                   0.0
Midwest Tape               0.0
Fan Label                  0.0
AudioMack                  0.0
iHeart Radio               0.0
Name: Songwriter?, dtype: object


## Exploring US only data

In [23]:
# Subset mlc_df to make a US only df
US_df = mlc_df.loc[mlc_df['Country']== 'US']
US_df.head()

Unnamed: 0,ISRC,Country,Registrant,Release_Year,Period,DSP,Streams,Artist,Label,Song,Album,Duration,Songwriter?
0,USUG12101043,US,UG1,21.0,,AudioMack,10175,Future,,FUTURE FT DEJ LOAF HEY THERE PROD BY DDS,#unknown#,181,0
1,USRC12100543,US,RC1,21.0,,SoundCloud,8597,LUCKY3RD,LUCKY3RD,Keep It Cool LUCKY3RD,Keep It Cool LUCKY3RD,133,0
2,USSM12102263,US,SM1,21.0,,SoundCloud,261280,LUCKY3RD,LUCKY3RD,Life Goes On LUCKY3RD,Life Goes On LUCKY3RD,171,0
3,USLD91731547,US,LD9,17.0,2/1/2021,Trebel,5,Bachata & Merengue Mix,Orchard,No dudes de mi- Merengue & Bachata Mix,Mega Mix 2010,1250,0
4,USAT22007048,US,AT2,20.0,,AudioMack,62105,Foolio,,WHEN I SEE YOU REMIX,#unknown#,187,0


In [26]:
# Top 5 streamed songs in US
US_df.sort_values('Streams', ascending=False).head()

Unnamed: 0,ISRC,Country,Registrant,Release_Year,Period,DSP,Streams,Artist,Label,Song,Album,Duration,Songwriter?
46643,USAT21901833,US,AT2,19.0,4/1/2021,Spotify,21456166,"Doja Cat,SZA",Kemosabe Records/RCA Records,Kiss Me More (feat. SZA),Kiss Me More (feat. SZA),209,1
24773,USUG12003339,US,UG1,20.0,1/1/2021,Spotify,17109896,Olivia Rodrigo,Olivia Rodrigo PS,drivers license,drivers license,243,1
27720,USUM72104713,US,UM7,21.0,,Apple,15747471,DJ Khaled,Epic/We The Best,EVERY CHANCE I GET (feat. Lil Baby & Lil Durk),KHALED KHALED,237,1
66711,USKBB2100008,US,KBB,21.0,4/1/2021,SoundCloud,9072859,Polo G,Columbia,Polo G - RAPSTAR,Polo G - RAPSTAR,165,0
69799,USCGJ1409314,US,CGJ,14.0,3/1/2021,Apple,8667375,Lil Baby,Quality Control Music/Motown Records,Real As It Gets (feat. EST Gee),Real As It Gets (feat. EST Gee) - Single,193,1


In [28]:
# Patrick: Most popular US streaming platform by number of records
US_DSP = mlc_df.loc[mlc_df['Country']== 'US']
US_DSP['DSP'].value_counts()

Spotify               13614
Apple                  9389
Amazon                 5919
Pandora                5646
Tidal                  1425
YouTube                1095
SoundCloud              857
GTL                     438
Melodyv                 342
Trebel                  333
iHeart Radio            284
AudioMack               234
NugsNet                 142
LiveXLive                84
Qoboz                    55
Midwest Tape             39
Deezer                   29
Anghami                  24
Recisio                  13
Smithsonian              13
Ultimate Guitar          10
Sonos                     9
Wolfgangs                 2
PowerMusic                2
Pacemaker                 1
MixCloud                  1
Fan Label                 1
Classical Archives        1
Name: DSP, dtype: int64

## Exploring foreign (non-US) data

In [29]:
# Subset mlc_df to make a non-US df
foreign_df = mlc_df.loc[mlc_df['Country']== 'US']
foreign_df.head()

Unnamed: 0,ISRC,Country,Registrant,Release_Year,Period,DSP,Streams,Artist,Label,Song,Album,Duration,Songwriter?
0,USUG12101043,US,UG1,21.0,,AudioMack,10175,Future,,FUTURE FT DEJ LOAF HEY THERE PROD BY DDS,#unknown#,181,0
1,USRC12100543,US,RC1,21.0,,SoundCloud,8597,LUCKY3RD,LUCKY3RD,Keep It Cool LUCKY3RD,Keep It Cool LUCKY3RD,133,0
2,USSM12102263,US,SM1,21.0,,SoundCloud,261280,LUCKY3RD,LUCKY3RD,Life Goes On LUCKY3RD,Life Goes On LUCKY3RD,171,0
3,USLD91731547,US,LD9,17.0,2/1/2021,Trebel,5,Bachata & Merengue Mix,Orchard,No dudes de mi- Merengue & Bachata Mix,Mega Mix 2010,1250,0
4,USAT22007048,US,AT2,20.0,,AudioMack,62105,Foolio,,WHEN I SEE YOU REMIX,#unknown#,187,0


In [30]:
# Top 5 foreign (non-US) streamed songs
foreign_df.sort_values('Streams', ascending=False).head()

Unnamed: 0,ISRC,Country,Registrant,Release_Year,Period,DSP,Streams,Artist,Label,Song,Album,Duration,Songwriter?
46643,USAT21901833,US,AT2,19.0,4/1/2021,Spotify,21456166,"Doja Cat,SZA",Kemosabe Records/RCA Records,Kiss Me More (feat. SZA),Kiss Me More (feat. SZA),209,1
24773,USUG12003339,US,UG1,20.0,1/1/2021,Spotify,17109896,Olivia Rodrigo,Olivia Rodrigo PS,drivers license,drivers license,243,1
27720,USUM72104713,US,UM7,21.0,,Apple,15747471,DJ Khaled,Epic/We The Best,EVERY CHANCE I GET (feat. Lil Baby & Lil Durk),KHALED KHALED,237,1
66711,USKBB2100008,US,KBB,21.0,4/1/2021,SoundCloud,9072859,Polo G,Columbia,Polo G - RAPSTAR,Polo G - RAPSTAR,165,0
69799,USCGJ1409314,US,CGJ,14.0,3/1/2021,Apple,8667375,Lil Baby,Quality Control Music/Motown Records,Real As It Gets (feat. EST Gee),Real As It Gets (feat. EST Gee) - Single,193,1


In [31]:
# Most popular foreign (non-US) streaming platform by number of records
foreign_DSP = mlc_df.loc[mlc_df['Country']!= 'US']
foreign_DSP['DSP'].value_counts()

Spotify               18654
Apple                 12811
Amazon                 8519
Pandora                8131
Tidal                  2096
YouTube                1657
SoundCloud             1265
GTL                     652
Melodyv                 487
Trebel                  484
iHeart Radio            423
AudioMack               316
NugsNet                 174
LiveXLive               116
Midwest Tape             63
Qoboz                    49
Anghami                  22
Deezer                   22
Sonos                    14
Recisio                   9
Ultimate Guitar           9
Smithsonian               8
PowerMusic                6
Fan Label                 3
MixCloud                  3
Wolfgangs                 2
Pacemaker                 2
MonkingMe                 1
Classical Archives        1
Name: DSP, dtype: int64