In [210]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.style as style
%matplotlib inline
import seaborn as sns
from scipy import stats
import re

import warnings
warnings.filterwarnings('ignore')

plt.rc('xtick', labelsize = 10)
plt.rc('ytick', labelsize = 10)
plt.rc('axes', labelsize = 12, labelpad = 5)

In [211]:
df_songs = pd.read_csv('drake_songs.csv', index_col='Unnamed: 0')
df_wiki = pd.read_csv('drake_wiki.csv', index_col='Unnamed: 0')

## Data Cleaning

#### Songs Tables

In [212]:
df_songs.head()

Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id
0,0 to 100 / The Catch Up,#FYM12,2014-06-01,[],"['Chester Hansen', 'Vinylz', 'Ging', 'Boi-1da'...","['Chester Hansen', 'Ging', 'Nineteen85', 'Drak...",156640,914226.0
1,10 Bands,If You’re Reading This It’s Too Late,2015-02-13,[],"['Ging', 'Sevn Thomas', 'Boi-1da']","['Sevn Thomas', 'Ging', 'Boi-1da', 'Quentin Mi...",703738,119674.0
2,1Xtra Freestyle,Tim Westwood I Freestyles,2010-06-11,[],['Tim Westwood'],['Drake'],421444,883007.0
3,2,<single>,,[],[],[],2457033,
4,2011 Juno Awards In Toronto,<single>,2011-03-26,[],[],[],214614,


In [213]:
df_songs.dtypes

title                object
album                object
release_date         object
featured_artists     object
producer_artists     object
writer_artists       object
genius_track_id       int64
genius_album_id     float64
dtype: object

In [214]:
for column in ['featured_artists', 'producer_artists', 'writer_artists']:
    df_songs[column] = df_songs[column].apply(lambda x: x.strip('[]').replace("'",""))
    df_songs[column] = df_songs[column].apply(lambda x: x.split(','))

commercial_releases = [
    'So Far Gone',
    'Thank Me Later',
    'Take Care',
    'Care Package',
    'Nothing Was the Same',
    "If You're Reading This It's Too Late",
    'Views',
    'What a Time To Be Alive',
    'More Life',
    'Scary Hours',
    'Scorpion',
    'The Best in the World Pack',
    'Dark Lane Demo Tapes',
    'Scary Hours 2',
    'Certified Lover Boy',
    'Honestly, Nevermind',
    'Her Loss'
]

df_songs_commercial = df_songs[(df_songs['album'].isin(commercial_releases))]

In [215]:
df_songs.sample(5)

Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id
72,Can’t Take a Joke,Scorpion,2018-06-29,[],[ModMaxx],"[ModMaxx, Drake]",3807748,420582.0
106,Doing His Thing,<single>,,[],[Terrel “T. Slack”],"[Terrel “T. Slack”, Drake]",57394,
188,Greatness,Young Sweet Jones,2010-06-21,[Bishop Brigante],[Rene Snaz Hill],"[Bishop Brigante, Drake]",3001,881461.0
524,The Ride,Take Care,2011-11-15,[The Weeknd],"[Doc McKinney, The Weeknd]","[Drake, The Weeknd, Doc McKinney, Anthony P...",55694,10787.0
506,Thank Me Now,Thank Me Later,2010-06-15,[],[Timbaland],"[Timbaland, Drake]",518,2630.0


#### Wiki Table

In [216]:
df_wiki.columns = [x.replace('[','').replace(']','') for x in df_wiki.columns]
df_wiki.columns = [re.sub(r'[0-9]','',x) for x in df_wiki.columns]
df_wiki.columns = [x.lower() for x in df_wiki.columns]

df_wiki_obj = df_wiki.select_dtypes('object')
df_wiki[df_wiki_obj.columns] = df_wiki_obj.apply(lambda x: x.str.strip())
df_wiki = df_wiki.replace(df_wiki['can'].iloc[8],np.NaN)
df_wiki.fillna(0, inplace=True)

country_cols = ['can','aus','fra','ire','nz','swe','uk','us','usr&b/hh','usrap']
df_wiki[country_cols] = df_wiki[country_cols].astype('int')

In [217]:
df_wiki['sales'].iloc[0]

'CAN: 176,000[27] UK: 219,053[28] US: 1,830,000[29]'

In [218]:
df_wiki['sales'].iloc[0].split(':')

['CAN', ' 176,000[27] UK', ' 219,053[28] US', ' 1,830,000[29]']

# EDA

**What artists are featured the most on Drake songs?**

In [219]:
featured_artists = {}
for list in df_songs_commercial['featured_artists']:
    if len(list) > 1:
        for name in list:
            name = name.strip()
            if name in featured_artists:
                featured_artists[name.strip()] += 1
            else:
                featured_artists[name.strip()] = 1
        

In [220]:
df_featured_artists = pd.DataFrame.from_dict(featured_artists, orient='index')
df_featured_artists.reset_index(inplace=True)
df_featured_artists.rename({
    'index' : 'artist',
    0 : 'count'
}, axis = 1, inplace = True)

In [221]:
df_featured_artists.sort_values('count', ascending=False).head(10)

Unnamed: 0,artist,count
22,Lil Wayne,6
2,2 Chainz,2
20,Young Thug,2
0,Ty Dolla $ign,1
15,Project Pat,1
27,Future,1
26,Bun B,1
25,Santigold,1
24,André 3000,1
23,Tyga,1


**What producer has produced the most Drake songs?**

In [222]:
producer_artists = {}
for list in df_songs_commercial['producer_artists']:
    if len(list) > 1:
        for name in list:
            name = name.strip()
            if name in producer_artists:
                producer_artists[name.strip()] += 1
            else:
                producer_artists[name.strip()] = 1

df_producer_artists = pd.DataFrame.from_dict(producer_artists, orient='index')
df_producer_artists.reset_index(inplace=True)
df_producer_artists.rename({
    'index' : 'artist',
    0 : 'count'
}, axis = 1, inplace = True)

In [223]:
df_producer_artists.sort_values('count', ascending=False).head(10)

Unnamed: 0,artist,count
14,40,68
6,Boi-1da,26
44,Nineteen85,11
64,OZ,8
26,T-Minus,8
41,Noel Cadastre,8
8,Maneesh,7
4,Vinylz,7
2,Allen Ritter,7
60,No I.D.,6


**Who has the most writing credits on Drake songs?**

In [224]:
writer_artists = {}
for list in df_songs_commercial['writer_artists']:
    if len(list) > 1:
        for name in list:
            name = name.strip()
            if name in writer_artists:
                writer_artists[name.strip()] += 1
            else:
                writer_artists[name.strip()] = 1

df_writer_artists = pd.DataFrame.from_dict(writer_artists, orient='index')
df_writer_artists.reset_index(inplace=True)
df_writer_artists.rename({
    'index' : 'artist',
    0 : 'count'
}, axis = 1, inplace = True)

In [225]:
df_writer_artists.sort_values('count', ascending=False).head(10)

Unnamed: 0,artist,count
3,Drake,200
5,40,102
21,Boi-1da,30
49,Anthony Palman,21
116,Kenza Samir,14
173,Nineteen85,13
71,Lil Wayne,13
26,Maneesh,11
99,Noel Cadastre,11
73,T-Minus,10


We can see that Drake is credited on 200 songs (commercial album releases) but what percentage of his entire discography is that?

In [226]:
per_writing = np.round(df_writer_artists[df_writer_artists['artist'] == 'Drake']['count'].values[0] / df_songs_commercial.shape[0] * 100, 2)
print(f'Drake has writing credits on {per_writing}% of his commercially released songs')
print('...')
print('*Data does not account for ghost writers*')

Drake has writing credits on 95.24% of his commercially released songs
...
*Data does not account for ghost writers*
