# Spotify data analysis: A retrospective

Every December since 2016, Spotify users have received their "Spotify Wrapped": a compilation of data about their activity on the platform over the past year (top artists, top songs, top genres, etc.). It gives them a deep dive into their most memorable listening moments of the year.

Objectives:
* JSON : Import several JSON files in an elegant way in Python
* SQL : 
  * Write SQL queries in Python
  * Demonstrate my SQL skills: window functions, joins, etc.
* PYTHON : Make a summary visual report in Python (hvplot)

## 1. Import the Spotify data (JSON files)

In [None]:
# Step 0: Import all the relevant Python packages
import pandas as pd
import os
import json
import glob

In [2]:
# Step 1: Import the data
# The seven history files differ only by their numeric suffix, so the folder
# is written once and every file name is derived from its index. The
# resulting paths are identical to the ones that used to be spelled out.
data_dir = 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/'
df0, df1, df2, df3, df4, df5, df6 = (
    pd.read_json(f'{data_dir}endsong_{i}.json') for i in range(7)
)


## 2. Set up a SQL connection

In [4]:
# Trial: this works, but the raw list of tuples is hard to read
from sqlalchemy import create_engine, text

# 'sqlite://' with no path is an in-memory SQLite database (no file is written)
engine = create_engine('sqlite://', echo=False)
df0.to_sql('df0', con=engine)

# Engine.execute() was removed in SQLAlchemy 2.0. Running the statement on an
# explicit connection, wrapped in text(), works on both 1.4 and 2.x and
# releases the connection as soon as the rows have been fetched.
with engine.connect() as conn:
    rows = conn.execute(text("SELECT * FROM df0")).fetchall()

# Last expression of the cell, so the notebook still displays the rows
rows


[(0, '2021-07-27T14:56:37Z', 'v7x27nfjb2dri60b7jzl159rl', 'Windows 10 (10.0.19042; x64)', 168168, 'FR', '195.36.154.135', 'unknown', "One Summer's Day", 'Smyang Piano', "One Summer's Day", 'spotify:track:1MFASTOgDmN4AdjC6aPX3b', None, None, None, 'trackdone', 'trackdone', 0, None, 0, 1627397626788, 0),
 (1, '2021-12-06T19:56:36Z', 'v7x27nfjb2dri60b7jzl159rl', 'Android OS 7.0 API 24 (samsung, SM-G920F)', 6370, 'FR', '88.170.227.77', 'unknown', 'ONE SHOT', 'B.A.P', 'ONE SHOT', 'spotify:track:3ECzwLt2OYNXjLMeTnYzp2', None, None, None, 'clickrow', 'endplay', 0, None, 0, 1638820701148, 0),
 (2, '2020-11-25T11:05:19Z', 'v7x27nfjb2dri60b7jzl159rl', 'Windows 10 (10.0.18363; x64)', 184453, 'FR', '212.195.100.232', 'unknown', 'BTD (Before The Dawn)', 'INFINITE', 'Evolution', 'spotify:track:1IoCgKZwRgvMZzha9c52jM', None, None, None, 'trackdone', 'trackdone', 0, None, 0, 1606302133920, 0),
 (3, '2022-05-16T09:12:05Z', 'v7x27nfjb2dri60b7jzl159rl', 'Android OS 12 API 31 (samsung, SM-G990B)', 492, 'F

In [9]:
import sqlite3

# Open an in-memory SQLite database through the standard-library driver
cnx = sqlite3.connect(':memory:')

# Write df0 to a table named df0, then read the whole table back with SQL
df0.to_sql(con=cnx, name='df0')
read_data = pd.read_sql(sql='select * from df0', con=cnx)

print(read_data)

       index                    ts                   username  \
0          0  2021-07-27T14:56:37Z  v7x27nfjb2dri60b7jzl159rl   
1          1  2021-12-06T19:56:36Z  v7x27nfjb2dri60b7jzl159rl   
2          2  2020-11-25T11:05:19Z  v7x27nfjb2dri60b7jzl159rl   
3          3  2022-05-16T09:12:05Z  v7x27nfjb2dri60b7jzl159rl   
4          4  2022-03-26T20:24:49Z  v7x27nfjb2dri60b7jzl159rl   
...      ...                   ...                        ...   
15846  15846  2022-09-09T16:20:01Z  v7x27nfjb2dri60b7jzl159rl   
15847  15847  2022-07-07T08:26:46Z  v7x27nfjb2dri60b7jzl159rl   
15848  15848  2021-11-28T19:13:57Z  v7x27nfjb2dri60b7jzl159rl   
15849  15849  2022-10-18T07:37:39Z  v7x27nfjb2dri60b7jzl159rl   
15850  15850  2020-11-27T08:49:16Z  v7x27nfjb2dri60b7jzl159rl   

                                        platform  ms_played conn_country  \
0                   Windows 10 (10.0.19042; x64)     168168           FR   
1      Android OS 7.0 API 24 (samsung, SM-G920F)       6370        

In [10]:
# Summary of the df0 table: number of rows and the largest ts value
read_data = pd.read_sql(
    sql='select COUNT(*) AS nb_rows, MAX(ts) AS max_timestamp from df0',
    con=cnx,
)
print(read_data)

   nb_rows         max_timestamp
0    15851  2022-11-24T21:27:24Z


In [None]:
# Step 2: SQL connection
import sqlite3

# Open (or create) the on-disk database file
sql_connect = sqlite3.connect('spotify_data.db')

try:
    # Cursor used to execute SQL commands
    cursor = sql_connect.cursor()

    # Create the table only when it is missing, so the SELECT below no longer
    # fails with "no such table" on a fresh database and the cell can be
    # re-run safely.
    cursor.execute('''CREATE TABLE IF NOT EXISTS spotify_data (
        ts timestamp,
        username text)''')

    # Rows can be added with a parameterized statement, e.g.
    # "INSERT INTO spotify_data VALUES (?, ?)" plus a (ts, username) tuple.

    # Show the first row, or None while the table is still empty
    cursor.execute("SELECT * FROM spotify_data;")
    print(cursor.fetchone())

    sql_connect.commit()
finally:
    # Always release the database file, even if a statement above raised
    sql_connect.close()

In [None]:
# Fetch every row of the df0 table.
# The query runs on cnx, the in-memory connection that df0 was written to.
# The cursor created in the Step 2 cell cannot be used here: its connection
# is closed at the end of that cell, and spotify_data.db has no df0 table.
query = "SELECT * FROM df0;"
results = cnx.execute(query).fetchall()

# Kept from the original cell; closing an already-closed sqlite3 connection
# is harmless
sql_connect.close()

In [None]:

from sqlalchemy import create_engine

# Engine backed by the SQLite file data.db
engine = create_engine("sqlite:///data.db")

# Giving read_sql a bare table name loads that whole table; the result is
# stored under the name `weather`
weather = pd.read_sql(sql="df0", con=engine)

In [None]:
# Dimensions of df as a (rows, columns) tuple
df.shape
#df.head(5)

In [None]:
# Test 2: list the JSON files found in my data folder
import os
import json
import pandas as pd

path_to_json = 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/'

# Sorted so the files (and therefore the rows) always come in the same order
json_files = sorted(
    pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')
)
print(json_files)

# Then read each file and stack them all into one DataFrame. Assigning to df
# inside a loop kept only the last file that was read.
frames = [pd.read_json(os.path.join(path_to_json, file)) for file in json_files]

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df

In [None]:
# Empty DataFrame (no rows) with one column per name listed below
_history_columns = [
    'ts',
    'username',
    'platform',
    'ms_played',
    'conn_country',
    'user_agent_decrypted',
    'master_metadata_track_name',
    'master_metadata_album_artist_name',
    'master_metadata_album_album_name',
    'spotify_track_uri',
    'episode_name',
    'episode_show_name',
    'spotify_episode_uri',
    'reason_start',
    'reason_end',
    'shuffle',
    'skipped',
    'offline',
    'offline_timestamp',
]
spotify_data = pd.DataFrame(columns=_history_columns)

# Display the (still empty) frame to check the column layout
spotify_data.head(5)

In [None]:
# Attempt at importing every file in the folder.
# First step: collect the path of each .json file it contains.
path = 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData'
all_files = glob.glob(f"{path}/*.json")

# Display the list of matching paths
all_files

In [None]:
# Read every file listed in all_files and stack them into one DataFrame.
# DataFrame objects have no .concat() method: concatenation is the
# module-level pd.concat(), which takes the whole list of frames at once and
# does not need temp to exist beforehand.
# lines=True is not used: the Step 1 cell loads these same files with a bare
# pd.read_json, so they are regular JSON documents, not one record per line.
frames = [pd.read_json(file) for file in all_files]

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
temp = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Preview the first 5 rows of temp
temp.head(5)

In [None]:
path_to_json = 'C:/Users/margo/Documents/Documents/Formation/Github/Spotify/MyData/'

# Every .json file in the data folder, sorted so the row order is reproducible
json_pattern = os.path.join(path_to_json, '*.json')
file_list = sorted(glob.glob(json_pattern))

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole accumulated frame on every iteration. Read all the files
# first, then concatenate once.
# lines=True is not used: the Step 1 cell loads these same files with a bare
# pd.read_json, so they are regular JSON documents, not one record per line.
frames = [pd.read_json(file) for file in file_list]

# pd.concat raises ValueError on an empty list, so keep an empty frame then
temp = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Preview the first rows of temp (head() defaults to 5 rows)
temp.head()

In [None]:
# Preview the first 5 rows of df
df.head(5)