In [1]:
# # Выполним препроцессинг сырых данных
# !python ../src/make_dataset.py

# Import

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import sys
sys.path.append('../')

In [3]:
from src.config import (
    path_tracks_temp,
    path_persons_temp,
    path_sessions_temp,
    
    path_sessions_train,
    path_sessions_test,
    path_sessions_val
)

# Чтение файлов

### Данные о треках (справочник `треки`-`исполнитель`-`число_проигрываний`)
Некоторые треки и исполнители имеют мало прослушиваний, но будем работать со всеми исполнителями.

In [4]:
# треки
tracks = pd.read_csv(path_tracks_temp)
print(tracks.shape)
tracks.head()

(4544643, 3)


Unnamed: 0,track_id,playcount,person_id
0,0,4.0,0
1,1,495.0,1
2,2,2.0,2
3,3,2.0,3
4,4,1.0,4


In [5]:
# расчет числа проигрываний каждого исполнителя.
persons_playcount = tracks.groupby(['person_id'], as_index=False)['playcount'].sum()
persons_playcount

Unnamed: 0,person_id,playcount
0,0,4.0
1,1,495.0
2,2,2.0
3,3,2.0
4,4,1.0
...,...,...
560922,595136,0.0
560923,595137,0.0
560924,595138,0.0
560925,595139,0.0


In [6]:
persons_playcount['playcount'].describe(percentiles=[
    0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.9725
])

count    5.609270e+05
mean     1.219149e+04
std      9.367164e+04
min     -6.000000e+00
2.5%     0.000000e+00
5%       0.000000e+00
25%      6.000000e+00
50%      1.160000e+02
75%      1.248000e+03
95%      3.520810e+04
97.2%    8.508707e+04
max      9.400856e+06
Name: playcount, dtype: float64

### Данные о сессиях с прослушиванием треков

In [7]:
# сессии с прослушиванием
sessions = pd.read_csv(path_sessions_temp)
print(sessions.shape)
sessions.head()

(28178697, 7)


Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio
0,287144,1390231051,4547,23,44361,4698874,
1,287144,1390231051,4547,23,44361,838286,1.01
2,287144,1390231051,4547,23,44361,2588097,1.01
3,287144,1390231051,4547,23,44361,2746740,1.44
4,287144,1390231051,4547,23,44361,3873988,1.01


# Переход к истории прослушивания исполнителей

In [8]:
# переход от истории прослушиваний треков к истории прослушивания исполнителей
sessions_extend = sessions.merge(
    tracks,
    how='inner',
    on=['track_id']
)

In [9]:
print(sessions_extend.shape)
sessions_extend.head()

(30732788, 9)


Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio,playcount,person_id
0,287144,1390231051,4547,23,44361,4698874,,,142266
1,287144,1390231051,4547,23,44361,838286,1.01,212.0,107103
2,982046,1405452797,6892,29,32894,838286,1.0,212.0,107103
3,982049,1405702985,15646,68,32894,838286,1.0,212.0,107103
4,1873088,1406217037,-1,1,23183,838286,,212.0,107103


In [10]:
mask_popular_persons = (persons_playcount['playcount'] >= 250000)
popular_persons = persons_playcount.loc[mask_popular_persons, 'person_id'].values

print(mask_popular_persons.sum() / persons_playcount.shape[0])
print(popular_persons.shape)
popular_persons

0.01054860971213723
(5917,)


array([   109,    129,    144, ..., 468053, 468284, 468998])

In [11]:
# возьмем сессии популярных исполнителей, отобранных выше
sessions_extend = sessions_extend[
    sessions_extend['person_id'].isin(popular_persons)
]

In [15]:
%%time
numpersons = (sessions_extend
    .groupby(['session_id'], as_index=False)['person_id']
    .nunique()
    .rename(columns={'person_id':'numpersons'})
)
numpersons = numpersons[numpersons['numpersons'] >= 4]
numpersons

CPU times: user 6.11 s, sys: 416 ms, total: 6.52 s
Wall time: 6.53 s


Unnamed: 0,session_id,numpersons
5,12,4
10,20,4
41,69,4
74,122,7
95,149,9
...,...,...
2031609,2764444,11
2031610,2764446,21
2031613,2764449,12
2031614,2764450,8


In [17]:
sessions_extend = pd.merge(
    sessions_extend,
    numpersons,
    how='right',
    on=['session_id']
)

In [19]:
print(sessions_extend.shape)
sessions_extend.head()

(9356443, 10)


Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio,playcount,person_id,numpersons
0,12,1405519516,5202,25,41504,1210840,1.0,353.0,154295,4
1,12,1405519516,5202,25,41504,1210840,1.0,94.0,154295,4
2,12,1405519516,5202,25,41504,1210766,1.0,1093.0,154295,4
3,12,1405519516,5202,25,41504,1210626,1.0,328.0,154295,4
4,12,1405519516,5202,25,41504,1210759,1.0,4.0,154295,4


In [20]:
percentile_70, percentile_85 = np.percentile(sessions_extend['timestamp'], [70, 85])

In [21]:
# разбиение данные на Train/Validation/Test
sessions_train = sessions_extend[
    sessions_extend['timestamp'] < percentile_70
]

sessions_test = sessions_extend[
    (sessions_extend['timestamp'] >= percentile_70)
    & (sessions_extend['timestamp'] < percentile_85)
]

sessions_val = sessions_extend[
    sessions_extend['timestamp'] >= percentile_85
]

In [22]:
print(sessions_train.shape)
print(sessions_test.shape)
print(sessions_val.shape)

(6549461, 10)
(1403506, 10)
(1403476, 10)


In [23]:
%%time
sessions_train.to_csv(
    path_sessions_train,
    encoding='utf-8-sig',
    index=False
)
sessions_test.to_csv(
    path_sessions_test,
    encoding='utf-8-sig',
    index=False
)
sessions_val.to_csv(
    path_sessions_val,
    encoding='utf-8-sig',
    index=False
)

CPU times: user 44.2 s, sys: 1.17 s, total: 45.3 s
Wall time: 1min 3s
