In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import json
import os
import lyricsgenius as genius


In [2]:
artist_list = ["5-30", 
"9 Miller",
"Holly Hood",
"Mike El Nite",
"Papillon",
"SippinPurp",
"Sir Scratch (PRT)",
"Valas",
"Valete",
"Wet Bed Gang"]


In [3]:
load_dotenv()
geniusCreds = os.environ.get("geniusCreds")

# "(Acoustic)", "Rework", "(Live)", "(Demo)", "TBA", "FULL LENGTH", "Billboard", "BBC", "bootleg", 'فارسی', "Türkçe", "Aşk Şarkısı", "instrumental"

api = genius.Genius(geniusCreds, verbose =False, excluded_terms = ["Remix", "edit", "feat", "Paassatempo"], sleep_time=0.3, remove_section_headers = True, skip_non_songs = True)

In [4]:
def crawler(lst):
    for name in tqdm(lst):
        try:
            artist = api.search_artist(name, per_page=50)
            artist.save_lyrics()
            print(str(name) + " json saved")
        except:
            print("failed for", name)
            file = open(r"not-working.txt","a")
            file.write(name)
            file.close()

crawler(artist_list)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))

Wrote `Lyrics_530.json`
5-30 json saved
Wrote `Lyrics_9Miller.json`
9 Miller json saved
Wrote `Lyrics_HollyHood.json`
Holly Hood json saved
Wrote `Lyrics_MikeElNite.json`
Mike El Nite json saved
Wrote `Lyrics_Papillon.json`
Papillon json saved
Wrote `Lyrics_SippinPurpp.json`
SippinPurp json saved
Wrote `Lyrics_SirScratchPRT.json`
Sir Scratch (PRT) json saved
Wrote `Lyrics_Valas.json`
Valas json saved
Wrote `Lyrics_Valete.json`
Valete json saved
Wrote `Lyrics_WetBedGang.json`
Wet Bed Gang json saved



## Construção do conjunto de dados

In [5]:
#initailizing empty dataframe for concat each json info
maindata = pd.DataFrame()    

for filename in os.listdir():
    if filename.endswith(".json"): 
        with open(filename) as json_data:
           
            data = json.load(json_data)
            
            #getting artist name and release date
            songs = data.get("songs")
            songs_df = pd.DataFrame(songs)
             
            #getting artist name and release date
            songs_df["artist_name"] = data.get("name")
            songs_df['date'] = [i.get("release_date") for i in songs]

            #getting album name
            album_lst = [i.get("album") for i in songs]
            songs_df["album"] = [i.get("name") if i else "---" for i in album_lst]

            #getting featured artists
            featured_lists = [song.get("featured_artists") for song in songs]
            songs_df["featured_artists"] = [[featured_artist["name"] for featured_artist in featured_lst] for featured_lst in featured_lists]
         

            maindata = pd.concat([maindata, songs_df])
              
maindata = maindata[['artist_name', 'title', 'album', 'featured_artists', 'lyrics', 'date', 'url']]

   
#maindata.to_csv("teste.csv")
maindata

Unnamed: 0,artist_name,title,album,featured_artists,lyrics,date,url
0,Mike El Nite,Dr. Bayard,Inter-Missão,"[SippinPurpp, Fínix MG]",,2018-05-07,https://genius.com/Mike-el-nite-dr-bayard-lyrics
1,Mike El Nite,Mambo nº1,Rusga para Concerto em G Menor,[ProfJam],,2013-12-16,https://genius.com/Mike-el-nite-mambo-no1-lyrics
2,Mike El Nite,Só badalhocas,Vaporetto Titano,[Da Chick],Mike El Nite:\nSó badalhocas nesta festa\nCom...,2016-01-01,https://genius.com/Mike-el-nite-so-badalhocas-...
3,Mike El Nite,Horizontes,O Justiceiro,[],,2016-04-22,https://genius.com/Mike-el-nite-horizontes-lyrics
4,Mike El Nite,T.U.G.A.,O Justiceiro,[],"Hey, vive a vida como uma festa\nSob o vento d...",2016-04-22,https://genius.com/Mike-el-nite-tuga-lyrics
...,...,...,...,...,...,...,...
9,Valas,Raíz,Raízes de Pedra,[],,,https://genius.com/Valas-raiz-lyrics
10,Valas,Talvez,Check-In,[],,2018-06-01,https://genius.com/Valas-talvez-lyrics
11,Valas,Dia D,Check-In,[],,2018-06-01,https://genius.com/Valas-dia-d-lyrics
12,Valas,El Dorado,Check-In,[DJ Sims],,2018-06-01,https://genius.com/Valas-el-dorado-lyrics


In [6]:
#adding crazy amount of json files fetched to prevent loss of lyrics

maindata_batch = pd.read_csv("tudo_junto.csv", parse_dates=["date"])
new_maindata = pd.concat([maindata, maindata_batch])

In [12]:
check = new_maindata.drop_duplicates(subset=['artist_name', 'lyrics'])
#check.to_csv("letras_final.csv")
#check.isna().sum()
check["artist_name"].value_counts()

Phoenix RDC              69
Sam The Kid              58
Valete                   54
ProfJam                  49
Boss AC                  47
Bispo                    44
Xeg                      43
Jimmy P                  43
Regula                   41
Allen Halloween          38
Dealema                  38
LON3R JOHNY              37
Da Weasel                36
Dillaz                   35
NGA                      35
Piruka                   31
GROGNation               31
Alcool Club              30
Wet Bed Gang             27
Agir                     27
Slow J                   26
Conjunto Corona          26
Keso                     24
Mike El Nite             22
Plutonio                 22
Deejay Telio             20
Kappa Jotta              20
X-Tense                  18
Sir Scratch (PRT)        17
Papillon                 17
Força Suprema            17
Carlão (PacMan)          16
Cálculo                  15
Chico da Tina            15
Estraca                  14
Fínix MG            

In [16]:
check.dropna(subset=['lyrics'], inplace=True)

In [18]:
check["artist_name"].value_counts()

Phoenix RDC              68
Sam The Kid              57
Valete                   53
ProfJam                  48
Boss AC                  46
Bispo                    43
Xeg                      42
Jimmy P                  42
Regula                   40
Allen Halloween          37
Dealema                  37
LON3R JOHNY              36
Da Weasel                35
Dillaz                   34
NGA                      34
Piruka                   30
GROGNation               30
Alcool Club              29
Wet Bed Gang             26
Agir                     26
Slow J                   25
Conjunto Corona          25
Keso                     23
Mike El Nite             21
Plutonio                 21
Deejay Telio             19
Kappa Jotta              19
X-Tense                  17
Sir Scratch (PRT)        16
Papillon                 16
Força Suprema            16
Carlão (PacMan)          15
Cálculo                  14
Chico da Tina            14
Estraca                  13
Fínix MG            

In [19]:
check.to_csv("letras_final.csv")