# Analiza atrybutów

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [3]:
DIR_DATA = 'data/v2'

In [4]:
artists = pd.read_json(path_or_buf=f'{DIR_DATA}/artists.jsonl', lines=True)
sessions = pd.read_json(path_or_buf=f'{DIR_DATA}/sessions.jsonl', lines=True)
tracks = pd.read_json(path_or_buf=f'{DIR_DATA}/tracks.jsonl', lines=True)
users = pd.read_json(path_or_buf=f'{DIR_DATA}/users.jsonl', lines=True)

In [5]:
features = ['play_growth_previous', 'like_growth_previous', 
            'average_duration_ratio', 'explicit', 'danceability', 
            'energy', 'key', 'loudness', 'speechiness', 'acousticness', 
            'instrumentalness', 'liveness', 'valence', 'tempo',
            'average_genres_ratio']

# ==================== #
# play_growth_previous - przyrost sumy odtworzeń w poprzednim tygodniu
# like_growth_previous - przyrost sumy polubień w poprzednim tygodniu
# average_duration_ratio - średni stosunek czasu słuchania do długości utworu
# average_genres_ratio - średni stosunek ilości pokrywających się ulubionych /
## gatunków użytkownika z zadeklarowanymi gatunkami autora utworu do ich ilości
# ===================== #

target = ['play_growth', 'play_growth_next']

### Przygotowanie danych do użycia 
- podział sesji według tygodni i utworów
- policzenie danych takich jak liczba odtworzeń, liczba polubień, przyrost odtworzeń, średni czas odtwarzania 

In [46]:
tracks = tracks.rename(columns={'id': 'track_id'})

merged_df = pd.merge(tracks, sessions, on='track_id', how='inner')
merged_df['timestamp'] = pd.to_datetime(merged_df['timestamp'])
merged_df['week'] = merged_df['timestamp'].dt.strftime('%U')

start_date = pd.to_datetime('2022-12-05')

merged_df['week'] = (merged_df['timestamp'] - start_date).dt.days // 7
merged_df['week'] = merged_df['week'] + 1


#remove first and last week, because they're not complete
merged_df = merged_df[merged_df.week.isin([1, 53]) == False]
merged_df.info()

all_weeks = sorted(merged_df['week'].unique())
print(all_weeks)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1196511 entries, 0 to 1223971
Data columns (total 22 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   track_id          1196511 non-null  object        
 1   name              1196511 non-null  object        
 2   popularity        1196511 non-null  int64         
 3   duration_ms       1196511 non-null  int64         
 4   explicit          1196511 non-null  int64         
 5   id_artist         1196511 non-null  object        
 6   release_date      1196511 non-null  object        
 7   danceability      1196511 non-null  float64       
 8   energy            1196511 non-null  float64       
 9   key               1196511 non-null  int64         
 10  loudness          1196511 non-null  float64       
 11  speechiness       1196511 non-null  float64       
 12  acousticness      1196511 non-null  float64       
 13  instrumentalness  1196511 non-null  float6

Policzymy wartości odtworzeń, polubień i średnią długość odsłuchania piosenki

In [61]:
sessions_df = sessions
sessions_df['timestamp'] = pd.to_datetime(sessions_df['timestamp'])
sessions_df['week'] = sessions_df['timestamp'].dt.strftime('%U')

start_date = pd.to_datetime('2022-12-05')

sessions_df['week'] = (sessions_df['timestamp'] - start_date).dt.days // 7
sessions_df['week'] = sessions_df['week'] + 1

#remove first and last week, because they're not complete
sessions_df = sessions_df[sessions_df.week.isin([1, 53]) == False]

all_weeks = sorted(sessions_df['week'].unique())
all_tracks = tracks['track_id'].unique()


# create new data frame for output
df1 = pd.DataFrame({'track_id': all_tracks[:]})
df2 = pd.DataFrame({'week': all_weeks[:], 'play_count': 0, 'like_count': 0, 'playtime_ratio': 0})

df1 = pd.merge(df1, df2, how='cross')
print(df1.head)
# Iterate through each track and week combination

for track_id in df1['track_id'].unique():
    for week in df1['week'].unique():
        play_count = sessions_df[(sessions_df['track_id'] == track_id) & (sessions_df['week'] == week) & (sessions_df['event_type'] == 'play')].shape[0]
        



<bound method NDFrame.head of                        track_id  week  play_count  like_count  playtime_ratio
0        0RNxWy0PC3AyH4ThH3aGK6     2           0           0               0
1        0RNxWy0PC3AyH4ThH3aGK6     3           0           0               0
2        0RNxWy0PC3AyH4ThH3aGK6     4           0           0               0
3        0RNxWy0PC3AyH4ThH3aGK6     5           0           0               0
4        0RNxWy0PC3AyH4ThH3aGK6     6           0           0               0
...                         ...   ...         ...         ...             ...
1143007  27Y1N4Q4U3EfDU5Ubw8ws2    48           0           0               0
1143008  27Y1N4Q4U3EfDU5Ubw8ws2    49           0           0               0
1143009  27Y1N4Q4U3EfDU5Ubw8ws2    50           0           0               0
1143010  27Y1N4Q4U3EfDU5Ubw8ws2    51           0           0               0
1143011  27Y1N4Q4U3EfDU5Ubw8ws2    52           0           0               0

[1143012 rows x 5 columns]>
0
0
0

KeyboardInterrupt: 