# My Spotify streaming data

In [17]:
# Importando bibliotecas
import pandas as pd
import os
import numpy as np
from datetime import timedelta

### Carregando os datasets

In [6]:
# Path para a pasta
path = 'datasets/streaming_history/'

# Lista para armazenar todos os dataframes
dataframes = []

# Criando looping para importar os datasets
for filename in os.listdir(path):
    if filename.endswith('.json'):
        path_final = os.path.join(path, filename)
        df = pd.read_json(path_final)
        dataframes.append(df)

# Concatenando todos os dfs
streaming_data = pd.concat(dataframes, ignore_index=True)

# Mostrando head
streaming_data.head()

Unnamed: 0,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,...,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode
0,2017-06-17T07:45:29Z,ox2t5jgl3cz2w3gbmcne91ay7,"Android OS 5.1.1 API 22 (asus, ZB500KG)",170640,BR,,,Goya No Machiawase,Hello Sleepwalkers,Masked Monkey Awakening,...,,,,playbtn,,True,,0.0,1497689000000.0,False
1,2017-06-17T07:47:14Z,ox2t5jgl3cz2w3gbmcne91ay7,"Android OS 5.1.1 API 22 (asus, ZB500KG)",103057,BR,,,夏の夜 - mindless ver.,The SALOVERS,いざ、サラバーズ!,...,,,,trackdone,,True,,0.0,1497689000000.0,False
2,2017-06-17T07:50:35Z,ox2t5jgl3cz2w3gbmcne91ay7,"Android OS 5.1.1 API 22 (asus, ZB500KG)",202160,BR,,,CATCHY,SAKANAMON,cue,...,,,,fwdbtn,,True,,0.0,1497689000000.0,False
3,2017-06-17T07:51:11Z,ox2t5jgl3cz2w3gbmcne91ay7,"Android OS 5.1.1 API 22 (asus, ZB500KG)",34252,BR,,,忘却,unsuspected monogram,the mass,...,,,,trackdone,,True,,0.0,1497689000000.0,False
4,2017-06-17T07:53:01Z,ox2t5jgl3cz2w3gbmcne91ay7,"Android OS 5.1.1 API 22 (asus, ZB500KG)",19598,BR,,,Odoru Rollschach (Album MIX),Owarikara,Saihate Songs,...,,,,fwdbtn,,True,,0.0,1497689000000.0,False


## Analisando infos dos dados

In [7]:
streaming_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261182 entries, 0 to 261181
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   ts                                 261182 non-null  object 
 1   username                           261182 non-null  object 
 2   platform                           261182 non-null  object 
 3   ms_played                          261182 non-null  int64  
 4   conn_country                       261182 non-null  object 
 5   ip_addr_decrypted                  253340 non-null  object 
 6   user_agent_decrypted               252783 non-null  object 
 7   master_metadata_track_name         259312 non-null  object 
 8   master_metadata_album_artist_name  259312 non-null  object 
 9   master_metadata_album_album_name   259312 non-null  object 
 10  spotify_track_uri                  259312 non-null  object 
 11  episode_name                       260 

## Limpando e dropando colunas desnecessarias

In [8]:
# Index das colunas para drop
index_drop = [1,5,6,11,12,13,20,4]

# Lista das colunas para drop
names_drop = []

# Pegando a lista dos nomes das colunas para drop
for index in index_drop:
    names_drop.append(streaming_data.columns[index])

# Dropando colunas
streaming_data = streaming_data.drop(columns=names_drop)

# Mostrando df
print(streaming_data)

                          ts                                 platform  \
0       2017-06-17T07:45:29Z  Android OS 5.1.1 API 22 (asus, ZB500KG)   
1       2017-06-17T07:47:14Z  Android OS 5.1.1 API 22 (asus, ZB500KG)   
2       2017-06-17T07:50:35Z  Android OS 5.1.1 API 22 (asus, ZB500KG)   
3       2017-06-17T07:51:11Z  Android OS 5.1.1 API 22 (asus, ZB500KG)   
4       2017-06-17T07:53:01Z  Android OS 5.1.1 API 22 (asus, ZB500KG)   
...                      ...                                      ...   
261177  2023-10-22T16:34:32Z                                  android   
261178  2023-10-22T16:34:33Z                                  android   
261179  2023-10-22T16:34:34Z                                  android   
261180  2023-10-22T16:34:35Z                                  android   
261181  2023-10-22T16:34:38Z                                  android   

        ms_played    master_metadata_track_name  \
0          170640            Goya No Machiawase   
1          103057    

### Verificando null values

In [9]:
# Substituindo os valores Null para 'Desconhecido'
streaming_data['master_metadata_track_name'] = streaming_data['master_metadata_track_name'].fillna('Desconhecido')
streaming_data['master_metadata_album_artist_name'] = streaming_data['master_metadata_album_artist_name'].fillna('Desconhecido')
streaming_data['master_metadata_album_album_name'] = streaming_data['master_metadata_album_album_name'].fillna('Desconhecido')


# Convertendo os tipos dos dados

In [19]:
# Convertendo coluna de datetime
streaming_data['ts'] = pd.to_datetime(streaming_data['ts'])

# Mudando o formato da data e hora
streaming_data['ts'] = streaming_data['ts'].dt.strftime('%d-%m-%Y %H:%M:%S')

# Criando coluna de dia da semana
streaming_data['day_of_week'] = pd.to_datetime(streaming_data['ts']).dt.day_name()

# Calcular soma de tempo escutado por dia da semana
sum_duration_day = streaming_data.groupby('day_of_week')['ms_played'].sum()

# Convertendo ms para horas minutos e segundos
sum_duration_day = pd.to_timedelta(avg_duration, unit='ms')

print(avg_duration)

  streaming_data['ts'] = pd.to_datetime(streaming_data['ts'])
  streaming_data['day_of_week'] = pd.to_datetime(streaming_data['ts']).dt.day_name()


day_of_week
Friday      46 days 02:51:54.677000
Monday      46 days 20:29:35.566000
Saturday    44 days 07:15:12.672000
Sunday      42 days 23:26:17.462000
Thursday    47 days 17:01:58.231000
Tuesday     51 days 04:55:05.486000
Wednesday   51 days 16:49:28.681000
Name: ms_played, dtype: timedelta64[ns]


In [20]:
# Convertendo para datetime
streaming_data['ts'] = pd.to_datetime(streaming_data['ts'])

  streaming_data['ts'] = pd.to_datetime(streaming_data['ts'])


In [27]:
# Convert 'ts' column to datetime type
streaming_data['ts'] = pd.to_datetime(streaming_data['ts'])

# Calculating the sum of time played per year
sum_duration_year = streaming_data.groupby(streaming_data['ts'].dt.year)['ms_played'].sum()

# Convert ms to hours
sum_duration_year = sum_duration_year / (1000 * 3600)

print(sum_duration_year.mean())


1134.403681547619
