In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import json
import os
import lyricsgenius as genius


In [2]:
artist_list = ["5-30",
"9 Miller",
"Agir",
"Alcool Club",
"Allen Halloween",
"Bispo",
"Boss AC",
"Cálculo",
"Carlão",
"Chico da Tina",
"Classe Crua",
"Conjunto Corona",
"Da Weasel",
"Dealema",
"Deejay Telio",
"Deezy (Dope Muzik)",
"Dillaz",
"Estraca",
"Fínix MG",
"Força Suprema",
"GROGNation",
"Gson",
"Harold",
"Holly Hood",
"Jimmy P",
"Kappa Jotta",
"Keso",
"LON3R JOHNY",
"Mike El Nite",
"NERVE",
"NGA",
"Papillon",
"Phoenix RDC",
"Plutónio",
"Piruka",
"Prodígio",
"ProfJam",
"ProfJam x benji price",
"Regula",
"Sam The Kid",
"SippinPurp",
"Sir Scratch (PRT)",
"Slow J",
"Tóy Tóy T-Rex",
"Vado Más Ki Ás",
"Valas",
"Valete",
"Waze",
"Wet Bed Gang",
"X-Tense",
"Xeg"]


In [3]:
load_dotenv()
geniusCreds = os.environ.get("geniusCreds")

# "(Acoustic)", "Rework", "(Live)", "(Demo)", "TBA", "FULL LENGTH", "Billboard", "BBC", "bootleg", 'فارسی', "Türkçe", "Aşk Şarkısı", "instrumental"

api = genius.Genius(geniusCreds, verbose =False, excluded_terms = ["Remix", "edit", "feat", "Paassatempo"], sleep_time=0.3, remove_section_headers = True, skip_non_songs = True)

In [4]:
def crawler(lst):
    for name in tqdm(lst):
        try:
            artist = api.search_artist(name, per_page=50)
            artist.save_lyrics()
            print(str(name) + " json saved")
        except:
            print("failed for", name)
            file = open(r"not-working.txt","a")
            file.write(name)
            file.close()

crawler(artist_list)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=51.0), HTML(value='')))

Wrote `Lyrics_530.json`
5-30 json saved
Wrote `Lyrics_9Miller.json`
9 Miller json saved
Wrote `Lyrics_Agir.json`
Agir json saved
Wrote `Lyrics_AlcoolClub.json`
Alcool Club json saved
Wrote `Lyrics_AllenHalloween.json`
Allen Halloween json saved
Wrote `Lyrics_Bispo.json`
Bispo json saved
Wrote `Lyrics_BossAC.json`
Boss AC json saved
Wrote `Lyrics_Cálculo.json`
Cálculo json saved
Wrote `Lyrics_CarlãoPacMan.json`
Carlão json saved
Wrote `Lyrics_ChicodaTina.json`
Chico da Tina json saved
Wrote `Lyrics_ClasseCrua.json`
Classe Crua json saved
Wrote `Lyrics_ConjuntoCorona.json`
Conjunto Corona json saved
Wrote `Lyrics_DaWeasel.json`
Da Weasel json saved
Wrote `Lyrics_Dealema.json`
Dealema json saved
Wrote `Lyrics_DeejayTelio.json`
Deejay Telio json saved
Wrote `Lyrics_DeezyDopeMuzik.json`
Deezy (Dope Muzik) json saved
Wrote `Lyrics_Dillaz.json`
Dillaz json saved
Wrote `Lyrics_Estraca.json`
Estraca json saved
Wrote `Lyrics_FínixMG.json`
Fínix MG json saved
Wrote `Lyrics_ForçaSuprema.json`
Forç

## Construção do conjunto de dados

In [3]:
#initailizing empty dataframe for concat each json info
maindata = pd.DataFrame()    

for filename in os.listdir():
    if filename.endswith(".json"): 
        with open(filename) as json_data:
           
            data = json.load(json_data)
            
            #getting artist name and release date
            songs = data.get("songs")
            songs_df = pd.DataFrame(songs)
             
            #getting artist name and release date
            songs_df["artist_name"] = data.get("name")
            songs_df['date'] = [i.get("release_date") for i in songs]

            #getting album name
            album_lst = [i.get("album") for i in songs]
            songs_df["album"] = [i.get("name") if i else "---" for i in album_lst]

            #getting featured artists
            featured_lists = [song.get("featured_artists") for song in songs]
            songs_df["featured_artists"] = [[featured_artist["name"] for featured_artist in featured_lst] for featured_lst in featured_lists]
         

            maindata = pd.concat([maindata, songs_df])
              
maindata = maindata[['artist_name', 'title', 'album', 'featured_artists', 'lyrics', 'date', 'url']]

   
#maindata.to_csv("teste.csv")
maindata

Unnamed: 0,artist_name,title,album,featured_artists,lyrics,date,url
0,Slow J,Às Vezes,The Art Of Slowing Down,[Nerve],Às vezes dói mas eu escondo\nDesde que eu apre...,2017-03-17,https://genius.com/Slow-j-as-vezes-lyrics
1,Slow J,Serenata,The Art Of Slowing Down,[],Eu escrevo serenatas de agonia\nDebaixo da jan...,2016-05-15,https://genius.com/Slow-j-serenata-lyrics
2,Slow J,Comida,The Art Of Slowing Down,[],"Eu 'tou tão longe da verdade pura, dura\nÀ pro...",2015-12-31,https://genius.com/Slow-j-comida-lyrics
3,Slow J,Cristalina,The Free Food Tape,[],,2015-11-19,https://genius.com/Slow-j-cristalina-lyrics
4,Slow J,Vida Boa,The Art Of Slowing Down,[],"Não quero uma boa vida, eu quero uma vida boa\...",2016-05-27,https://genius.com/Slow-j-vida-boa-lyrics
...,...,...,...,...,...,...,...
82,Bispo,Intro (Passo a Passo),Passo a Passo [Mixtape],[],,2013-08-23,https://genius.com/Bispo-intro-passo-a-passo-a...
83,Bispo,Saldos,Passo a Passo [Mixtape],[Miacra],,2013-08-23,https://genius.com/Bispo-saldos-lyrics
84,Bispo,Por Aí,Passo a Passo [Mixtape],[],Então qual é ó desaparecido?\nJá à bués que um...,2013-08-23,https://genius.com/Bispo-por-ai-lyrics
85,Bispo,Preto No Branco,Passo a Passo [Mixtape],[],"Rosto trancado, olho vidrado\nLágrimas escorre...",2013-08-23,https://genius.com/Bispo-preto-no-branco-lyrics


In [22]:
#adding crazy amount of json files fetched to prevent loss of lyrics

maindata_batch = pd.read_csv("tudo_junto.csv", parse_dates=["date"])
new_maindata = pd.concat([maindata, maindata_batch])

In [24]:
check = new_maindata.drop_duplicates(subset=['artist_name', 'lyrics'])
check.to_csv("letras_final.csv")