# Spotify data analysis: A retrospective

Every December since 2016, Spotify users have received their "Spotify Wrapped": a compilation of data about their activity on the platform over the past year (top artists, top songs, top genres, etc.). It gives them a deep dive into their most memorable listening moments of the year.

Objectives:
* JSON : Import several JSON files in an elegant way in Python
* SQL : 
  * Write SQL queries in Python
  * Demonstrate my ability to write SQL: window functions, joins, etc.
* PYTHON : Make a summary visual report in Python (hvplot)

In [10]:
# Import all relevant Python packages
import glob
import os
import pandas as pd
import json
import string
import sqlite3

## 1. Import the Spotify data (JSON files)

*Objective* : I received my Spotify data in different batches (8 JSON files). My goal is to import all of them in a single piece of code.
*Inspiration for the code below* : https://stackoverflow.com/questions/41857659/python-pandas-add-filename-column-csv

In [2]:
# Step 1: list every JSON file in the local folder that holds my Spotify data

path_to_json = 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/'  # local path to my JSON files folder
json_pattern = os.path.join(path_to_json, '*.json')  # glob pattern matching every JSON file
# glob.glob() returns its matches in arbitrary order, so sort them to make the
# import order (and therefore the row order of the combined data) reproducible.
globbed_files = sorted(glob.glob(json_pattern))
globbed_files  # overview

['C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData\\endsong_0.json',
 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData\\endsong_1.json',
 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData\\endsong_2.json',
 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData\\endsong_3.json',
 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData\\endsong_4.json',
 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData\\endsong_5.json',
 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData\\endsong_6.json',
 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData\\endvideo.json']

Comment: My Windows folder contains 8 different JSON files. 7 start with the name "endsong" and 1 is focused on videos (endvideo).

In [3]:
# Step 2: read every JSON file into its own DataFrame.
# Each frame gets a "filename" column recording which file its rows came from.
# All frames are collected in the "data" list.

data = []  # one DataFrame per JSON file
# The loop variable is named json_file (not json) so that it does not
# overwrite the json module imported at the top of the notebook.
for json_file in globbed_files:
    frame = pd.read_json(json_file)
    frame['filename'] = os.path.basename(json_file)
    data.append(frame)

In [5]:
# Step 3: stack the per-file frames into one table and export it as a CSV file
# ignore_index=True relabels the rows 0..n-1 instead of keeping each file's own row index
bigframe = pd.concat(objs=data, ignore_index=True)
bigframe.to_csv(path_or_buf="spotify_data.csv", sep=';')

  bigframe = pd.concat(data, ignore_index=True) #dont want pandas to try an align row indexes


In [14]:
# Step 4: strip private information (username, user agent, device, IP address)
spotify_data = pd.read_csv("spotify_data.csv", sep=';', index_col=0)

# Keep only the first 10 characters of "platform" as a coarser "device" column
spotify_data = spotify_data.assign(device=spotify_data["platform"].str.slice(stop=10))

# Drop the columns that reveal too much
to_delete = ['username', 'ip_addr_decrypted', 'user_agent_decrypted', 'platform']
spotify_data = spotify_data.drop(columns=to_delete)

# Final result
spotify_data.head(5)

Unnamed: 0,ts,ms_played,conn_country,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode,filename,device
0,2021-07-27T14:56:37Z,168168,FR,One Summer's Day,Smyang Piano,One Summer's Day,spotify:track:1MFASTOgDmN4AdjC6aPX3b,,,,trackdone,trackdone,False,,0.0,1627398000000.0,False,endsong_0.json,Windows 10
1,2021-12-06T19:56:36Z,6370,FR,ONE SHOT,B.A.P,ONE SHOT,spotify:track:3ECzwLt2OYNXjLMeTnYzp2,,,,clickrow,endplay,False,,0.0,1638821000000.0,False,endsong_0.json,Android OS
2,2020-11-25T11:05:19Z,184453,FR,BTD (Before The Dawn),INFINITE,Evolution,spotify:track:1IoCgKZwRgvMZzha9c52jM,,,,trackdone,trackdone,False,,0.0,1606302000000.0,False,endsong_0.json,Windows 10
3,2022-05-16T09:12:05Z,492,FR,El Dorado,Thomas Bergersen,SkyWorld,spotify:track:4o0sJFXHckKlBhhynZsSIg,,,,fwdbtn,fwdbtn,False,,0.0,1652692000000.0,False,endsong_0.json,Android OS
4,2022-03-26T20:24:49Z,355,FR,Euphoria,BTS,Love Yourself 結 'Answer',spotify:track:5YMXGBD6vcYP7IolemyLtK,,,,fwdbtn,fwdbtn,False,,0.0,1648326000000.0,False,endsong_0.json,Android OS


In [None]:
# Step 1: import the data
# To improve: a more elegant solution still needs to be put in place
endsong_paths = (
    'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/endsong_0.json',
    'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/endsong_1.json',
    'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/endsong_2.json',
    'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/endsong_3.json',
    'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/endsong_4.json',
    'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/endsong_5.json',
    'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/endsong_6.json',
)
# One DataFrame per file, read in the order listed above
df0, df1, df2, df3, df4, df5, df6 = map(pd.read_json, endsong_paths)

In [None]:
# Step 2: remove private information (username, user agent, IP address)
to_delete = ['username', 'ip_addr_decrypted', 'user_agent_decrypted']
# Drop the private columns from every batch in place, so df0..df6 themselves are modified
for batch_frame in (df0, df1, df2, df3, df4, df5, df6):
    batch_frame.drop(columns=to_delete, inplace=True)

## 2. Set up the SQL connection

In [None]:
# Open an in-memory SQLite database
cnx = sqlite3.connect(':memory:')

# Copy every dfx DataFrame into an SQL table carrying the same name
batch_tables = {
    'df0': df0,
    'df1': df1,
    'df2': df2,
    'df3': df3,
    'df4': df4,
    'df5': df5,
    'df6': df6,
}
for table_name, batch_frame in batch_tables.items():
    batch_frame.to_sql(name=table_name, con=cnx)

## 3. Glimpse of the data

In [None]:
# Sanity check: fetch and print the first 10 rows of table df0
glimpse_query = 'SELECT * FROM df0 LIMIT 10'
read_data = pd.read_sql(glimpse_query, cnx)
print(read_data)

## 4. Data transformation

In [None]:
# Union all -> 1 dataset only


Operations to do:
* Data-source column: df0, df1, etc.
* Combine the data -> UNION
* Transform some variables: milliseconds to seconds and minutes
* Rename the variables that have overly long names

In [None]:
# Number of rows and largest "ts" value in table df0
summary_query = 'select COUNT(*) AS nb_rows, MAX(ts) AS max_timestamp from df0'
read_data = pd.read_sql(summary_query, cnx)
print(read_data)

In [None]:
# Step 2: SQL connection
import sqlite3

# Open (or create) the SQLite database file
sql_connect = sqlite3.connect('spotify_data.db')

try:
    # Cursor used to execute SQL commands
    cursor = sql_connect.cursor()

    # Create the table only when it is missing, so that this cell can be re-run
    # and the SELECT below also works on a brand-new database file
    cursor.execute('''CREATE TABLE IF NOT EXISTS spotify_data (
        ts timestamp,
        username text)''')

    # Example insert (placeholder username: the real one is private information)
    #cursor.execute("INSERT INTO spotify_data VALUES ('2022-07-17T06:07:42Z', '<username>')")

    cursor.execute("SELECT * FROM spotify_data;")
    print(cursor.fetchone())  # first row, or None when the table is empty

    sql_connect.commit()
finally:
    # Always release the database file, even when a statement above fails
    sql_connect.close()

In [None]:
# Fetch every row of table df0.
# The query runs on cnx, the in-memory connection the df0 table was written to.
# The cursor of sql_connect cannot be used here: that connection is closed at
# the end of the cell that opens it, and nothing in this notebook writes a df0
# table to spotify_data.db. For the same reason there is nothing left to close.
query = "SELECT * FROM df0;"
results = cnx.execute(query).fetchall()

In [None]:

from sqlalchemy import create_engine

# Engine that manages the connections to the SQLite file data.db
engine = create_engine("sqlite:///data.db")

# Read the whole df0 table, selected by its table name
weather = pd.read_sql(sql="df0", con=engine)

In [None]:
# Shape of df as a (rows, columns) tuple.
# NOTE(review): in this notebook df is only assigned by the "Test 2" import cell further down — confirm that cell has been run first.
df.shape
#df.head(5)

In [None]:
# Test 2: list the JSON files found in my folder
import os
import json
import pandas as pd

path_to_json = 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
print(json_files)

# Then import every file. The frames are kept in a list and concatenated at the
# end: reassigning df inside a loop would keep only the last file that was read.
frames = [pd.read_json(os.path.join(path_to_json, file)) for file in json_files]
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df

In [None]:
# Empty DataFrame that only declares a set of Spotify column names
spotify_columns = [
    'ts',
    'username',
    'platform',
    'ms_played',
    'conn_country',
    'user_agent_decrypted',
    'master_metadata_track_name',
    'master_metadata_album_artist_name',
    'master_metadata_album_album_name',
    'spotify_track_uri',
    'episode_name',
    'episode_show_name',
    'spotify_episode_uri',
    'reason_start',
    'reason_end',
    'shuffle',
    'skipped',
    'offline',
    'offline_timestamp',
]
spotify_data = pd.DataFrame(columns=spotify_columns)
spotify_data.head(5)

In [None]:
# Attempt at importing every file of the folder
# First, retrieve the list of JSON files
path = 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData'
json_glob = path + "/*.json"
all_files = glob.glob(json_glob)

all_files

In [None]:
# Read every JSON file and stack the resulting frames into temp.
# pd.concat is a pandas function (a DataFrame has no .concat method), and
# building the list first means temp does not have to exist beforehand.
# lines=True is not passed: the main import cell of this notebook reads the
# same files as plain JSON documents with pd.read_json(path).
frames = [pd.read_json(file) for file in all_files]
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
temp = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Preview the first 5 rows of temp
temp.head(5)

In [None]:
path_to_json = 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/'

json_pattern = os.path.join(path_to_json, '*.json')
file_list = glob.glob(json_pattern)

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# appending inside a loop copies the accumulated rows on every iteration.
# Collect the frames and concatenate them once instead.
# lines=True is not passed: the main import cell of this notebook reads the
# same files as plain JSON documents with pd.read_json(path).
frames = [pd.read_json(file) for file in file_list]
# When no file is found temp is an empty DataFrame, as in the original loop
temp = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Inspiration: the same approach with CSV files as the source

import glob
import os
import pandas as pd

output_name = "Pandas_output2.csv"

# List the CSV files of the current folder in a reproducible order, leaving out
# this cell's own output so that a re-run does not read its previous result back in
globbed_files = sorted(
    name for name in glob.glob("*.csv") if os.path.basename(name) != output_name
)

data = []  # pd.concat takes a list of dataframes as an argument
# The loop variable is csv_file (not csv) so it does not hide the standard csv module name
for csv_file in globbed_files:
    frame = pd.read_csv(csv_file)
    frame['filename'] = os.path.basename(csv_file)
    data.append(frame)

if data:
    # ignore_index=True: do not let pandas try to align the row indexes
    bigframe = pd.concat(data, ignore_index=True)
    bigframe.to_csv(output_name)
else:
    # pd.concat raises ValueError on an empty list; there is nothing to export
    bigframe = pd.DataFrame()

In [None]:
# Load the exported CSV again (semicolon-separated, first column used as the index)
spotify_data = pd.read_csv(filepath_or_buffer="spotify_data.csv", index_col=0, sep=';')
# Preview the first 5 rows
spotify_data.head(n=5)

In [None]:
# Build a DataFrame from data and preview its first 10 rows
# NOTE(review): other cells of this notebook bind data to a list of DataFrames — confirm that pd.DataFrame(data), rather than pd.concat(data), is what is intended here
dataframe = pd.DataFrame(data=data)
dataframe.head(n=10)