# Data import

In [1]:
import pandas as pd
import os

DATA_FOLDER = './data_v2'
DATA_FILES = [os.path.join(DATA_FOLDER, file) for file in  os.listdir(DATA_FOLDER)]

datasets = {data: pd.read_json(data, lines=True) for data in DATA_FILES}

# Dataset analysys

## Null values

In [2]:
for dataset_name, dataset in datasets.items():
  null_dict = dataset.isnull().sum()
  print(f"DATASET {dataset_name}: ")
  print('---------------------------')
  for key, val in null_dict.items():
    print(f"{key:<15}: {val:<10}")
  print('===========================')
    

DATASET ./data_v2\artists.jsonl: 
---------------------------
id             : 0         
name           : 0         
genres         : 0         
DATASET ./data_v2\sessions.jsonl: 
---------------------------
timestamp      : 0         
user_id        : 0         
track_id       : 0         
event_type     : 0         
session_id     : 0         
DATASET ./data_v2\tracks.jsonl: 
---------------------------
id             : 0         
name           : 0         
popularity     : 0         
duration_ms    : 0         
explicit       : 0         
id_artist      : 0         
release_date   : 0         
danceability   : 0         
energy         : 0         
key            : 0         
mode           : 103719    
loudness       : 0         
speechiness    : 0         
acousticness   : 0         
instrumentalness: 0         
liveness       : 0         
valence        : 0         
tempo          : 0         
time_signature : 0         
DATASET ./data_v2\track_storage.jsonl: 
-----------------

## IDS = -1

In [3]:
for dataset_name, dataset in datasets.items():
  id_candidates = [key for key in dataset.keys() if 'id' in key]
  negative_ids = 0
  for id_candidate in id_candidates:
    negative_ids += len(dataset[dataset[id_candidate] == -1])
  print(f"DATASET {dataset_name}: ")
  print('---------------------------')
  print(f"Negative id\'s: {negative_ids:<10}")
  print('===========================')

DATASET ./data_v2\artists.jsonl: 
---------------------------
Negative id's: 0         
DATASET ./data_v2\sessions.jsonl: 
---------------------------
Negative id's: 0         
DATASET ./data_v2\tracks.jsonl: 
---------------------------
Negative id's: 0         
DATASET ./data_v2\track_storage.jsonl: 
---------------------------
Negative id's: 0         
DATASET ./data_v2\users.jsonl: 
---------------------------
Negative id's: 0         


## Wnioski

Wyraźnie widać, że wcześniejsze problemy z danymi już nie występują. Możemy zatem przejść do faktycznej analizy danych pod względem biznesowym

# Analiza danych

In [34]:
import pandas as pd
from dateutil import parser
from datetime import date
import matplotlib.pyplot as plt

In [35]:
artists = pd.read_json("data_v2/artists.jsonl", lines=True)
sessions = pd.read_json("data_v2/sessions.jsonl", lines=True)
track_storage = pd.read_json("data_v2/track_storage.jsonl", lines=True)
tracks = pd.read_json("data_v2/tracks.jsonl", lines=True)
users = pd.read_json("data_v2/users.jsonl", lines=True)

pd.set_option('mode.chained_assignment', None)

### Session event types

In [36]:
sessions['event_type'].unique()

array(['PLAY', 'SKIP', 'ADVERTISEMENT', 'LIKE', 'BUY_PREMIUM'],
      dtype=object)

Interesujące dla nas mogą być zdarzenia: 'PLAYED', 'LIKE' i 'SKIP'

In [44]:
sessions_play = sessions[sessions['event_type'] == 'PLAY']
sessions_like = sessions[sessions['event_type'] == 'LIKE']
sessions_skip = sessions[sessions['event_type'] == 'SKIP']

sessions_sorted_dict = {'play': sessions_play, 'like':sessions_like, 'skip': sessions_skip}

Timestamps

In [45]:
for event, session_data in sessions_sorted_dict.items():
  print(f"{event}: {session_data['timestamp'].min()} - {session_data['timestamp'].max()}")

play: 2023-01-09 07:53:36.153061 - 2023-04-10 01:50:41.712441
like: 2023-01-09 08:01:48.944061 - 2023-04-10 01:54:38.356441
skip: 2023-01-09 08:08:07.719061 - 2023-04-10 01:55:04.620441


Mamy do czynienia z danymi z przedziału 4 miesięcy - od 9.01 do 10.04

Połączenie danych z utworami i artystami + segregacja po miesiącu wydarzenia

In [60]:
def get_week(date):
  return (date.month - 1) * 4 + date.day % 7

for key, session_data in sessions_sorted_dict.items():
  merged = session_data.merge(tracks[['id', 'id_artist']], left_on='track_id', right_on='id', how='left')
  merged = merged.merge(artists[['id', 'name']], left_on='id_artist', right_on='id', how='left')
  merged['weeks_ordered'] = merged['timestamp'].apply(get_week)
  sessions_sorted_dict[key] = merged

KeyError: 'id_artist'