In [1]:
import pandas as pd
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re

#### Download and first look at the data

In [2]:
mlc=pd.read_csv('../data/Matching_Sample.csv')

In [3]:
mlc.head(2)

Unnamed: 0,If,Country Code,Registrant Code,Year of Reference,Usage Period,Streaming Platform (DSP),Streams,Recording Artist,Recording Label,Recording Title,Release Title,Recording Duration (Seconds),"Songwriter's Listed (1 = ""Yes"", 0 = ""No"")"
0,USUG12101043,US,UG1,21.0,3/1/2021,Apple,16216932,Drake,OVO,Lemon Pepper Freestyle (feat. Rick Ross),Scary Hours 2,383,1
1,USRC12100543,US,RC1,21.0,4/1/2021,Spotify,21456166,"Doja Cat,SZA",Kemosabe Records/RCA Records,Kiss Me More (feat. SZA),Kiss Me More (feat. SZA),209,1


#### Checking the tail end for bad data

In [4]:
mlc.tail()

Unnamed: 0,If,Country Code,Registrant Code,Year of Reference,Usage Period,Streaming Platform (DSP),Streams,Recording Artist,Recording Label,Recording Title,Release Title,Recording Duration (Seconds),"Songwriter's Listed (1 = ""Yes"", 0 = ""No"")"
99995,QZK6F2019397,QZ,K6F,20.0,3/1/2021,Spotify,1987400,Penelope Scott,Tesla's Pigeon,Rät,Public Void,195,1
99996,QZK6F2019397,QZ,K6F,20.0,3/1/2021,Spotify,1987400,Penelope Scott,Tesla's Pigeon,Rät,Public Void,195,1
99997,SE6HN1926755,SE,6HN,19.0,5/1/2021,Spotify,1380829,awfultune,awfultune,I Met Sarah in the Bathroom,I Met Sarah in the Bathroom,122,1
99998,SEYOK1669274,SE,YOK,16.0,5/1/2021,Spotify,1531690,Vacations,Nettwerk Records,Young,Vibes,190,1
99999,QZK6F2019397,QZ,K6F,20.0,5/1/2021,Spotify,1768669,Penelope Scott,Tesla's Pigeon,Rät,Public Void,195,1


In [5]:
mlc.columns = ['ISRC', 'Country', 'Registrant', 'Release_Year', 'Period', 'DSP', 'Streams', 'Artist', 'Label', 'Song', 'Album', 'Duration', 'Songwriter?']
mlc

Unnamed: 0,ISRC,Country,Registrant,Release_Year,Period,DSP,Streams,Artist,Label,Song,Album,Duration,Songwriter?
0,USUG12101043,US,UG1,21.0,3/1/2021,Apple,16216932,Drake,OVO,Lemon Pepper Freestyle (feat. Rick Ross),Scary Hours 2,383,1
1,USRC12100543,US,RC1,21.0,4/1/2021,Spotify,21456166,"Doja Cat,SZA",Kemosabe Records/RCA Records,Kiss Me More (feat. SZA),Kiss Me More (feat. SZA),209,1
2,USSM12102263,US,SM1,21.0,,Apple,15747471,DJ Khaled,Epic/We The Best,EVERY CHANCE I GET (feat. Lil Baby & Lil Durk),KHALED KHALED,237,1
3,USLD91731547,US,LD9,17.0,4/1/2021,Apple,17259260,Rod Wave,Alamo Records,Tombstone,SoulFly,160,1
4,USAT22007048,US,AT2,20.0,3/1/2021,Apple,8789577,Pooh Shiesty,Atlantic Records,Back In Blood (Feat. Lil Durk),Back in Blood (feat. Lil Durk) - Single,184,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,QZK6F2019397,QZ,K6F,20.0,3/1/2021,Spotify,1987400,Penelope Scott,Tesla's Pigeon,Rät,Public Void,195,1
99996,QZK6F2019397,QZ,K6F,20.0,3/1/2021,Spotify,1987400,Penelope Scott,Tesla's Pigeon,Rät,Public Void,195,1
99997,SE6HN1926755,SE,6HN,19.0,5/1/2021,Spotify,1380829,awfultune,awfultune,I Met Sarah in the Bathroom,I Met Sarah in the Bathroom,122,1
99998,SEYOK1669274,SE,YOK,16.0,5/1/2021,Spotify,1531690,Vacations,Nettwerk Records,Young,Vibes,190,1


###### Exploratory phase

#### Foreign producers seem to get the most exposure on Spotify, Apple, and Amazon...What about domestic?

In [6]:
domestic = mlc.loc[mlc['Country']== 'US']

domestic['DSP'].value_counts()

Apple           10093
Spotify          8434
Pandora          7585
Amazon           7505
Tidal            1895
YouTube          1328
SoundCloud       1026
Trebel            574
GTL               433
iHeart Radio      383
Melodyv           307
AudioMack         232
LiveXLive         105
Midwest Tape       74
Deezer             39
Qoboz              25
Smithsonian        21
Sonos              19
Wolfgangs           4
Fan Label           2
MixCloud            1
Name: DSP, dtype: int64

#### Apple, Spotify, and Pandora made the top 3 for domestic, but it was close.
who was listening to classical music?

In [7]:
mlc.loc[mlc['DSP']=='Classical Archives']
#Long French names, but these are really long songs

Unnamed: 0,ISRC,Country,Registrant,Release_Year,Period,DSP,Streams,Artist,Label,Song,Album,Duration,Songwriter?
71086,DEB339350501,DE,B33,93.0,4/1/2021,Classical Archives,12,"Tõnu Kaljuste, Estonian Philharmonic Chamber C...",ECM New Series,"Te Deum, for double chorus, strings, prepared ...",Pärt: Te Deum,1723,1
87615,NLA508532103,NL,A50,85.0,2/1/2021,Classical Archives,8,Beaux Arts Trio,Decca Music Group Ltd.,"Trio élégiaque, for piano and strings in D-, O...",Rachmaninov: Piano Trios Nos.1 & 2,1195,1


In [8]:
#What is the longest song?
longest_song = max(mlc.Duration)
longest_song

818738

In [9]:
mlc.Label.value_counts()

Atlantic Records                  1384
Lofi Records                      1232
Warner Records                     995
Columbia                           843
Universal                          818
                                  ... 
Silence and Sounds                   1
Classic Music Company                1
FreestyleRs// PlanSecret Music       1
Eddie Vedder/Republic World          1
God Over Money Records               1
Name: Label, Length: 11778, dtype: int64

In [10]:
mlc.loc[mlc['Duration']==mlc.Duration.max()]


Unnamed: 0,ISRC,Country,Registrant,Release_Year,Period,DSP,Streams,Artist,Label,Song,Album,Duration,Songwriter?
79752,QMCE32000650,QM,CE3,20.0,2/1/2021,Trebel,6464,Megan Thee Stallion,Warner,Cry Baby (feat. DaBaby),Good News,818738,0


#### Megan Thee Stallion has a 9 day long song. That is impressive.

In [11]:
mlc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ISRC          98240 non-null   object 
 1   Country       98303 non-null   object 
 2   Registrant    98303 non-null   object 
 3   Release_Year  98239 non-null   float64
 4   Period        91898 non-null   object 
 5   DSP           96001 non-null   object 
 6   Streams       100000 non-null  object 
 7   Artist        100000 non-null  object 
 8   Label         98992 non-null   object 
 9   Song          100000 non-null  object 
 10  Album         99931 non-null   object 
 11  Duration      100000 non-null  int64  
 12  Songwriter?   100000 non-null  int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 9.9+ MB


In [12]:
mlc['Streams']=mlc['Streams'].str.replace(" ", ",")
mlc['Streams']=mlc['Streams'].str.replace(",", "")


In [13]:
mlc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ISRC          98240 non-null   object 
 1   Country       98303 non-null   object 
 2   Registrant    98303 non-null   object 
 3   Release_Year  98239 non-null   float64
 4   Period        91898 non-null   object 
 5   DSP           96001 non-null   object 
 6   Streams       100000 non-null  object 
 7   Artist        100000 non-null  object 
 8   Label         98992 non-null   object 
 9   Song          100000 non-null  object 
 10  Album         99931 non-null   object 
 11  Duration      100000 non-null  int64  
 12  Songwriter?   100000 non-null  int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 9.9+ MB


In [14]:
mlc['Streams']=mlc['Streams'].astype(int)

In [15]:
mlc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ISRC          98240 non-null   object 
 1   Country       98303 non-null   object 
 2   Registrant    98303 non-null   object 
 3   Release_Year  98239 non-null   float64
 4   Period        91898 non-null   object 
 5   DSP           96001 non-null   object 
 6   Streams       100000 non-null  int32  
 7   Artist        100000 non-null  object 
 8   Label         98992 non-null   object 
 9   Song          100000 non-null  object 
 10  Album         99931 non-null   object 
 11  Duration      100000 non-null  int64  
 12  Songwriter?   100000 non-null  int64  
dtypes: float64(1), int32(1), int64(2), object(9)
memory usage: 9.5+ MB


In [16]:
# sh{}
# good{}
# long{}

# for index in mlc:
#     if Duration <= 30:
#         sh{}+1
#     elif Duration <30 or mlc.Duration > 1800:
#         good{}+1
#     else:
#         long{}+1

In [17]:
mlc["Songwriter?"] = pd.to_numeric(mlc["Songwriter?"]*100)
sw_ratio = mlc.groupby('DSP')['Songwriter?'].mean()
sw_ratio = sw_ratio.sort_values(ascending=False)
print(sw_ratio)

#Jasmine Drumright came up with this code to look into who has songwriter info

DSP
Classical Archives    100.000000
Qoboz                  90.384615
Deezer                 86.274510
Spotify                85.437585
Apple                  80.851351
Recisio                54.545455
Amazon                 26.173985
Pandora                24.482834
Melodyv                11.459590
LiveXLive               7.000000
GTL                     2.844037
Tidal                   0.369213
Smithsonian             0.000000
Ultimate Guitar         0.000000
Trebel                  0.000000
Wolfgangs               0.000000
SoundCloud              0.000000
Sonos                   0.000000
YouTube                 0.000000
Pacemaker               0.000000
PowerMusic              0.000000
Anghami                 0.000000
NugsNet                 0.000000
MonkingMe               0.000000
MixCloud                0.000000
Midwest Tape            0.000000
Fan Label               0.000000
AudioMack               0.000000
iHeart Radio            0.000000
Name: Songwriter?, dtype: float64


In [18]:
mlc.head()

Unnamed: 0,ISRC,Country,Registrant,Release_Year,Period,DSP,Streams,Artist,Label,Song,Album,Duration,Songwriter?
0,USUG12101043,US,UG1,21.0,3/1/2021,Apple,16216932,Drake,OVO,Lemon Pepper Freestyle (feat. Rick Ross),Scary Hours 2,383,100
1,USRC12100543,US,RC1,21.0,4/1/2021,Spotify,21456166,"Doja Cat,SZA",Kemosabe Records/RCA Records,Kiss Me More (feat. SZA),Kiss Me More (feat. SZA),209,100
2,USSM12102263,US,SM1,21.0,,Apple,15747471,DJ Khaled,Epic/We The Best,EVERY CHANCE I GET (feat. Lil Baby & Lil Durk),KHALED KHALED,237,100
3,USLD91731547,US,LD9,17.0,4/1/2021,Apple,17259260,Rod Wave,Alamo Records,Tombstone,SoulFly,160,100
4,USAT22007048,US,AT2,20.0,3/1/2021,Apple,8789577,Pooh Shiesty,Atlantic Records,Back In Blood (Feat. Lil Durk),Back in Blood (feat. Lil Durk) - Single,184,100


In [19]:
DJs=mlc.loc[mlc.Artist.str.contains('DJ')]

In [20]:
DJs.value_counts()
#That's too much info

ISRC          Country  Registrant  Release_Year  Period    DSP      Streams  Artist                       Label                                                Song                                      Album                                                                                             Duration  Songwriter?
FR10S1847070  FR       10S         18.0          3/1/2021  GTL      1        DJ Brytos                    Brytos Entertainment Under License to CD RUN Africa  Classic Hip Hop Trap                      Classic Hip Hop Trap                                                                              4769      0              20
FR6V81680808  FR       6V8         16.0          3/1/2021  GTL      1        DJ Just Dizle;DJ Spinna      Big Lynden Radio                                     Dearly Departed, Pt. 2 (The Rip Mixtape)  Dearly Departed (The Rip Mixtape) (Dj Just Dizle & Dj Spinna Présent)                             5225      0              11
FR10S1847071  FR       10

In [21]:
DJs.Artist.unique()

array(['DJ Khaled',
       'DJ Scheme,Ski Mask The Slump God,Danny Towers,Lil Yachty',
       'DJ Chose,Megan Thee Stallion',
       'DJ Khaled,Bryson Tiller,Meek Mill,H.E.R.',
       'DJ Khaled,Big Sean,Rick Ross,A Boogie Wit da Hoodie,Diddy',
       'DJ Khaled,Lil Baby,Lil Durk',
       'DJ Scheme,Cordae,Ski Mask The Slump God,Take A Daytrip',
       'Juice WRLD, DJ Scheme', 'DJ Snake,Selena Gomez',
       'Brent Faiyaz,DJ Dahi,Tyler, The Creator',
       'Brent Faiyaz & DJ Dahi Feat. Tyler The Creator',
       'Kaito Shoma,DJ Paul,Kingpin Skinny Pimp',
       'DJ Khaled,Bryson Tiller,Roddy Ricch,Lil Baby',
       'DJ Pharris,Chance the Rapper,Wiz Khalifa,Rockie Fresh',
       'DJ Chose|Megan Thee Stallion',
       'DJ Khaled,Nas,JAY-Z,James Fauntleroy',
       'Fat Joe, DJ Khaled & Amorphous', 'DJ Scheme,Juice WRLD',
       'DJ Khaled feat. Justin Bieber & 21 Savage',
       'DJ Khaled feat. Nas, JAY-Z & James Fauntleroy',
       'Bad Bunny|Mambo Kingz|Prince Royce|DJ Luian|J Balvin

In [34]:
DJs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 593 entries, 2 to 97693
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ISRC          585 non-null    object 
 1   Country       585 non-null    object 
 2   Registrant    585 non-null    object 
 3   Release_Year  585 non-null    float64
 4   Period        482 non-null    object 
 5   DSP           538 non-null    object 
 6   Streams       593 non-null    int32  
 7   Artist        593 non-null    object 
 8   Label         580 non-null    object 
 9   Song          593 non-null    object 
 10  Album         593 non-null    object 
 11  Duration      593 non-null    int64  
 12  Songwriter?   593 non-null    int64  
dtypes: float64(1), int32(1), int64(2), object(9)
memory usage: 62.5+ KB


In [37]:
DJs.Artist.astype(str)

2                                                DJ Khaled
28                                               DJ Khaled
38                                               DJ Khaled
105                                              DJ Khaled
175      DJ Scheme,Ski Mask The Slump God,Danny Towers,...
                               ...                        
96752                                    Phyno, DJ Kaywise
96753                                    Phyno, DJ Kaywise
96974                             Yelawolf,DJ Muggs,Caskey
97217                  Yelawolf,DJ Muggs,Struggle Jennings
97693                                    Yelawolf,DJ Muggs
Name: Artist, Length: 593, dtype: object

In [40]:
#regex=r"DJ\s\w+"

DJs = re.findall('DJ\s\w+', str(DJs.Artist))
#GOT IT TO WORK!!!!

In [44]:
DJs

['DJ Khaled',
 'DJ Khaled',
 'DJ Khaled',
 'DJ Khaled',
 'DJ Scheme',
 'DJ Kaywise',
 'DJ Kaywise',
 'DJ Muggs',
 'DJ Muggs',
 'DJ Muggs']

In [None]:
sw_ratio = mlc.groupby('DSP')['Songwriter?'].mean()
count_o_correctsub=sw_ratio.sort_values(ascending=False)

In [None]:
rows_of_DSP=mlc.DSP.value_counts()

In [None]:
#Group think composition
streams_per_DSP=mlc.groupby('DSP')['Streams'].sum()
streams_per_DSP=streams_per_DSP.sort_values(ascending=False)
print(streams_per_DSP)

#### Turn the findings into a dataframe

In [None]:
songwriter_df=pd.DataFrame(sw_ratio)
songwriter_df.columns =['SW_avg']
print(songwriter_df)

In [None]:
songwriter_df.reset_index(inplace=True)
songwriter_df = songwriter_df.rename(columns = {'index':"DSP"})

In [None]:
print(songwriter_df)

In [None]:
streams_df=pd.DataFrame(streams_per_DSP)

In [None]:
streams_df.reset_index(inplace=True)
streams_df = streams_df.rename(columns = {'index': "DSP"})
print(streams_df)

In [None]:
row_o_dsp=pd.DataFrame(rows_of_DSP)
row_o_dsp.reset_index(inplace=True)
row_o_dsp = row_o_dsp.rename(columns = {'index': "DSP", 'DSP': "songs_per_DSP"})
print(row_o_dsp)

In [None]:
DSP_df1 = pd.merge(songwriter_df, streams_df, on=['DSP'], how='inner')
print(DSP_df1)

In [None]:
DSP_df2 = pd.merge(DSP_df1, row_o_dsp, on=['DSP'], how='inner')
print(DSP_df2)

In [None]:
DSP_df2 = DSP_df2.sort_values(by=['Streams'], ascending=False)
print(DSP_df2)

In [None]:
pd.reset_option('display.float_format')
print(DSP_df2)

In [None]:
DSP_df2.info()

In [None]:
top_7=DSP_df2.head(7)
print(top_7)

In [None]:
ax = sns.barplot(x="DSP", y="Streams", data=top_7, palette=sns.color_palette("Blues_r", 10), order=top_7['DSP'])
plt.xticks(rotation = 50)
sns.set(rc = {'figure.figsize':(15,5)});

In [None]:
scatter= sns.barplot(data=top_7, x="DSP", y="SW_avg", palette=sns.color_palette("GnBu", 10))

In [None]:
# violin = sns.violinplot(data=mlc, x = 'DSP', y='Songwriter?', 
#                         inner='quart', linewidth=1,
#                        palette={0:'b', 1:"r"}, split=True)
#sns.despine(left=True)


In [None]:
# Plot top 7 by Songs_Per_DSP
plt.figure(figsize=(12, 6))
plt.bar(top_7.DSP, top_7.songs_per_DSP, color=['c'], edgecolor='gold')
plt.ylabel('Number of Songs', fontsize=16)
plt.xticks(rotation = 50, fontsize=16)
plt.yticks([5000, 10000, 15000, 20000, 25000, 30000, 35000], fontsize=16)
plt.title('Top 7 DSPs by Number of Songs Streamed', fontsize=20);


In [None]:
top_7

In [None]:
top_7DSP=top_7.sort_values(by=['songs_per_DSP'], ascending=False)
top_7DSP

In [None]:
#need to remind myself what the mlc contained
mlc.head(1)

In [None]:
mlc.Label.value_counts()

In [None]:
streams_per_Label=mlc.groupby('Label')['Streams'].sum()
streams_per_Label=streams_per_Label.sort_values(ascending=False)
print(streams_per_Label)

In [None]:
Label_ratio = mlc.groupby('Label')['Songwriter?'].mean()
Label_ratio = Label_ratio.sort_values(ascending=False)

In [None]:
Label_df = pd.merge(streams_per_Label, Label_ratio, on=['Label'], how='inner')
print(Label_df)

In [None]:
Label_df=Label_df.rename(columns={'Songwriter?':'SW_avg'})

In [None]:
Label_df.reset_index(inplace=True)
Label_df = Label_df.rename(columns = {'index': "Label"})
Label_df.head(10)

In [None]:
radiohead = Label_df.head(7)
print(radiohead)

In [None]:
# Plot the responses for different events and regions
plt.gcf().set_size_inches(15,7) # Rudy: Make it bigger
sns.barplot(x="Label", y="Streams", data=radiohead, palette=['c','orange','g','r','m','b','pink'])
plt.title('Record Label by Number of Streams', fontsize=20)
plt.xticks(rotation=50, fontsize=16)
plt.xlabel('', fontsize=0)
plt.ylabel('Number of Streams', fontsize=16)
plt.yticks(fontsize=16);


In [None]:
Top_7_Labels = Label_df.sort_values(by='SW_avg', ascending=False)
Top_7_Labels = Top_7_Labels[Top_7_Labels['Label'].isin(['Atlantic Records', 'Columbia', '300 Entertainment','Lofi Records', 'Alamo (Geffen Records)','Alamo Records', 'Taylor Swift'])]
Top_7_Labels

In [None]:
sns.set_theme(style="darkgrid")
sns.set(font_scale=1)
sns.barplot(x="Label", y="SW_avg", data=Top_7_Labels, palette=['pink','orange','r','b','c','m','g'], edgecolor='black')
plt.title('Recording Label by % of Songwriter Info Provided', fontsize=20)
plt.xticks(rotation=50, fontsize=16)
plt.xlabel('', fontsize=0)
plt.ylabel('% of Songwriter Info Provided', fontsize=16)
plt.yticks(fontsize=16)
plt.gcf().set_size_inches(15,7)

In [None]:
plt.pie(Top_7_Labels.Streams, labels = Top_7_Labels.Label, autopct='%.0f%%')
plt.show()

In [None]:
label_song_count = pd.DataFrame(mlc.Label.value_counts())
label_song_count.reset_index(inplace=True)
songs_by_label = label_song_count.rename(columns = {'index': "Label", 'Label':'Num_of_songs'})
songs_by_label

In [None]:
Label_df=pd.merge(Label_df, songs_by_label, on=['Label'], how='inner')
Label_df

In [None]:
Top_7_lsong = Label_df.sort_values(by='Num_of_songs', ascending=False)
Top_7_lsong= Top_7_lsong[Top_7_lsong['Label'].isin(['Atlantic Records', 'Columbia', '300 Entertainment','Lofi Records', 'Alamo (Geffen Records)','Alamo Records', 'Taylor Swift'])]
Top_7_lsong

In [None]:

plt.hlines(y=Top_7_lsong['Label'], xmin=0, xmax=Top_7_lsong['SW_avg'],linewidth=3, alpha =0.8)
plt.scatter(Top_7_lsong['Num_of_songs'], s=80, alpha=1)
plt.title("Percent of submitted songwriter", fontsize=20, x=0.5,y=1.02)
plt.xlabel('Lable', fontsize=20)
plt.ylabel('',fontsize=20)
plt.yticks(fontsize=15)
plt.xticks(fontsize=15)
plt.grid();

#### LET'S LOOK AT FOREIGN AND DOMESTIC OPTIONS

In [None]:
foreign=mlc.loc[mlc['Country']!= 'US']
foreign.head()

In [None]:
foreign['Streams'].value_counts()

In [None]:
foreign_ratio = foreign.groupby('Country')['Songwriter?'].mean()
foreign_ratio

In [None]:
foreign_ratio=pd.DataFrame(foreign_ratio)

In [None]:
foreign_ratio.reset_index(inplace=True)
foreign_ratio = foreign_ratio.rename(columns = {'index': "Country", 'Songwriter?':'SW_avg'})
print(foreign_ratio)
#foreign_ratio = foreign_ratio.rename(columns={})

In [None]:
foreign.info()

In [None]:
foreign.Streams.astype(int)
#foreign['Streams']=foreign['Streams'].str.replace(",", "")

In [None]:
foreign_stream=foreign.groupby('Country')['Streams'].mean()
foreign_stream=foreign_stream.sort_values(ascending=False)
foreign_stream

In [None]:
foreign_stream.reset_index(inplace=True)
foreign_stream = foreign_ratio.rename(columns = {'index': "Country"})
foreign_stream