# DATA DICTIONARY

1. `id`: Unique identifier or ID for each artist.
2. `followers`: The number of followers or fans an artist has.
3. `genres`: The music genres associated with the artist.
4. `name`: The name of the artist.
5. `popularity`: A measure of the artist's popularity.


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
artists=pd.read_csv('artists.csv')

In [3]:
artists.sort_values('popularity',ascending=False).head()

Unnamed: 0,id,followers,genres,name,popularity
144481,1uNFoZAHBGtllmzznpCI3s,44606973.0,"['canadian pop', 'pop', 'post-teen pop']",Justin Bieber,100
115489,4q3ewBCX7sLwd24euuV69X,32244734.0,"['latin', 'reggaeton', 'trap latino']",Bad Bunny,98
126338,06HL4z0CvFAxyc27GXpf02,38869193.0,"['pop', 'post-teen pop']",Taylor Swift,98
313676,3TVXtAsR1Inumwj472S9r4,54416812.0,"['canadian hip hop', 'canadian pop', 'hip hop'...",Drake,98
144484,3Nrfpe0tUJi4K4DXYWgMUX,31623813.0,"['k-pop', 'k-pop boy group']",BTS,96


In [4]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104349 entries, 0 to 1104348
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   id          1104349 non-null  object 
 1   followers   1104336 non-null  float64
 2   genres      1104349 non-null  object 
 3   name        1104346 non-null  object 
 4   popularity  1104349 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 42.1+ MB


# cleaning

# missing

In [5]:
artists.genres=artists.genres.str.replace(r"[\[''\"\"\]]","",regex=True).str.strip()

In [6]:
artists.isnull().sum()

id             0
followers     13
genres         0
name           3
popularity     0
dtype: int64

In [7]:
# artists
artists.dropna(subset='name',inplace=True)

In [8]:
 # followers
artists.dropna(subset='followers',inplace=True)

In [10]:
# genres
artists=artists[artists.genres!='']
artists.set_index(np.arange(len(artists)))

Unnamed: 0,id,followers,genres,name,popularity
0,0VLMVnVbJyJ4oyZs2L3Yl2,71.0,carnaval cadiz,Las Viudas De Los Bisabuelos,6
1,0dt23bs4w8zx154C5xdVyl,63.0,carnaval cadiz,Los De Capuchinos,5
2,0pGhoB99qpEJEsBQxgaskQ,64.0,carnaval cadiz,Los “Pofesionales”,7
3,3HDrX2OtSuXLW5dLR85uN3,53.0,carnaval cadiz,Los Que No Paran De Rajar,6
4,22mLrN5fkppmuUPsHx6i2G,59.0,"classical harp, harp",Vera Dulova,3
...,...,...,...,...,...
298604,1q9C5XlekzXbRLIuLCDTre,90087.0,"social media pop, teen pop",Brent Rivera,33
298605,4fh2BIKYPFvXFsQLhaeVJp,309.0,la indie,Lone Kodiak,20
298606,7akMsd2vb4xowNTehv3gsY,774.0,indie rockism,The Str!ke,0
298607,35m7AJrUCtHYHyIUhCzmgi,205.0,indie rockism,Hunter Fraser,6


In [11]:
artists.isnull().sum()

id            0
followers     0
genres        0
name          0
popularity    0
dtype: int64

# add new column [main_genre] + cleaning

In [12]:
genre_list = ['brazilian', 'bollywood', 'sufi','filmi','pop', 'rock', 'classic', 'r&b', 'edm', 'rap', 'hip hop','electronic', 
                 'country', 'jazz',  'indie', 'folk',  'metal', 'latin',  'cumbia', 'instrumental','gospel', 'afrobeat',
                 'blues','acoustic','traditional','sufi', 'chinese']

def find_main_genre(entry):
    words = entry.split(', ')
    word_counts = {}
    for word in words:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
    
    if max(word_counts.values())>1:
        main_genre = max(word_counts, key=word_counts.get)
        return main_genre
    else:    
        for genre in genre_list:
            if genre in word:
                return genre
    

artists['main_genre'] = artists['genres'].apply(find_main_genre)


# columns

In [18]:
artists.main_genre.isnull().sum()
artists.dropna(subset='main_genre',inplace=True)
artists.drop('genres',axis='columns',inplace=True)

In [19]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
Index: 152693 entries, 141 to 1104345
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          152693 non-null  object 
 1   followers   152693 non-null  float64
 2   name        152693 non-null  object 
 3   popularity  152693 non-null  int64  
 4   main_genre  152693 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 7.0+ MB


# exporting cleaned dataset

In [15]:
artists.to_csv('artists_cleaned.csv',index=False)