## Setup

In [11]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the (major, minor) numbers, not the raw version string: strings are
# compared character by character, so e.g. "0.9" >= "0.20" would wrongly pass.
_sklearn_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_version >= (0, 20), "Scikit-Learn >= 0.20 is required"

# Common imports
import numpy as np
import pandas as pd
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

## Get the data

In [21]:
import os
import requests
import zipfile
import pandas as pd


# Remote archive and the local paths it is saved / extracted to
url = "https://www.kaggle.com/api/v1/datasets/download/joebeachcapital/30000-spotify-songs"
zip_path = "spotify_songs.zip"
extract_path = "./datasets"

# Download the zip archive only when a valid copy is not already on disk, so
# re-running the notebook does not fetch it again. `is_zipfile` is False both
# when the file is missing and when it is not a readable zip archive.
if not zipfile.is_zipfile(zip_path):
    response = requests.get(url, timeout=60)
    # Fail here on an HTTP error instead of saving an error page as the
    # ".zip" and crashing later with a confusing BadZipFile.
    response.raise_for_status()
    with open(zip_path, "wb") as file:
        file.write(response.content)

os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into the extraction directory
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)

## Take a quick look at the dataset

In [None]:
# Read the extracted CSV into the `songs` DataFrame
csv_path = os.path.join(extract_path, "spotify_songs.csv")
songs = pd.read_csv(csv_path)

# Leave the preview as the cell's last expression so the notebook renders it
# as a rich table instead of plain printed text.
songs.head()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
songs.info()

In [None]:
# Remove pandas' cap on displayed rows (this setting applies globally, to
# every later output as well).
pd.set_option("display.max_rows", None)

# Count how many rows belong to each playlist genre
genre_column = songs["playlist_genre"]
genre_column.value_counts()

In [None]:
# Average track popularity within each playlist genre
popularity_by_genre = songs.groupby("playlist_genre")["track_popularity"]
genre_popularity = popularity_by_genre.mean()

# Rank the genres from highest to lowest average popularity
genre_popularity_sorted = genre_popularity.sort_values(ascending=False)

# Show the ranking
print(genre_popularity_sorted)


In [None]:
# Highest popularity score present in the dataset
popularity = songs["track_popularity"]
max_popularity = popularity.max()

# Keep every row whose popularity equals that maximum (ties included)
is_most_popular = popularity == max_popularity
most_popular_songs = songs[is_most_popular]

# Show the name and score of those songs
print(most_popular_songs[["track_name", "track_popularity"]])


In [None]:
# Rank all songs from most to least popular
songs_sorted = songs.sort_values("track_popularity", ascending=False)

# Show only the song name, popularity and artist for the first 20 rows
ranking_columns = ["track_name", "track_popularity", "track_artist"]
songs_sorted[ranking_columns].head(20)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
songs.describe()

In [None]:
import matplotlib.pyplot as plt

# Draw one 50-bin histogram per column that `DataFrame.hist` plots,
# on a single 20x15 figure.
hist_options = {"bins": 50, "figsize": (20, 15)}
songs.hist(**hist_options)
plt.show()

## Create a Test Set

In [15]:
# Seed NumPy's global random number generator so that code drawing from it
# produces the same output on every run of this notebook.
np.random.seed(42)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
split_parts = train_test_split(songs, test_size=0.2, random_state=42)
train_set, test_set = split_parts

# Number of rows in the training set
len(train_set)

In [None]:
# Number of rows held out in the test set
len(test_set)

## Criar uma cópia do conjunto de treino

In [18]:
# Work on a copy of the *training* set only, so that exploration and cleaning
# never look at the held-out test rows (copying the full `songs` frame here
# would leak the test set into the analysis).
songs_cp = train_set.copy()

## Retirar músicas duplicadas

In [None]:
# Drop repeated tracks, keeping the first row found for each track_id.
# (Deduplication is keyed on track_id, not on the track_name/track_artist pair.)
songs_cp_unique = songs_cp.drop_duplicates(subset="track_id")

# Order the remaining rows by track name, then artist, both descending
sort_keys = ["track_name", "track_artist"]
songs_cp_sorted = songs_cp_unique.sort_values(by=sort_keys, ascending=False)

# Preview the first 20 deduplicated songs
preview_columns = ["track_name", "track_popularity", "track_artist"]
songs_cp_sorted[preview_columns].head(20)

# To inspect a single track by id, e.g.:
#songs_cp.loc[songs_cp["track_id"] == "2XU0oxnq2qxCpomAAuJY8K"]


In [None]:
# The 20 most popular rows of the working copy (before deduplication)
popularity_ranked = songs_cp.sort_values(by="track_popularity", ascending=False)
popularity_ranked[["track_name", "track_popularity", "track_artist"]].head(20)

In [None]:
# Number of rows left after dropping duplicate track_ids
len(songs_cp_sorted)