# Import

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import sys
sys.path.append('../')

In [2]:
from src.config import (
    path_tracks_temp,
    path_persons_temp,
    path_sessions_temp
)

In [3]:
def dict_unique_counter(arr):
    return dict(zip(*np.unique(arr, return_counts=True)))

# Чтение файлов

### Данные об исполнителях

In [4]:
# исполнители
persons = pd.read_csv(path_persons_temp)
print(persons.shape)
persons.head()

# поле "person_name" по фиксированному "person_id" может именть
# пример с "David Guetta"
# будем использовать для декодинга person_id
persons[persons['person_id'].isin([227])]

(595049, 3)


Unnamed: 0,person_id,person_name,person_MBID
4673,227,David+Guetta+&+Nicky+Romero,
12079,227,"David+Guetta,+Sam+Martin",1bb1eec6-88c3-4028-8920-a985c4b9081a
17306,227,David+Guetta+ft.+Chris+Willis,1bb1eec6-88c3-4028-8920-a985c4b9081a
31403,227,David+Guetta+-+Ne-Yo+-+Kelly+Rowland,1bb1eec6-88c3-4028-8920-a985c4b9081a
38019,227,David+Guetta+&+Chris+Willis+Feat.+Fergie+&+LMFAO,1bb1eec6-88c3-4028-8920-a985c4b9081a
...,...,...,...
570347,227,David+Guetta+&+Alesso+feat.+Tegan+&+Sara,
572905,227,David+Guetta+ft.+Kelly+Rowland,1bb1eec6-88c3-4028-8920-a985c4b9081a
578055,227,David+Guetta+Feat+Lil+Wayne+&+Chris+Brown,1bb1eec6-88c3-4028-8920-a985c4b9081a
588531,227,David+Guetta+&+Glowinthedark+feat.+Harrison+Shaw,


### Данные о треках (справочник `треки`-`исполнитель`-`число_проигрываний`)
Некоторые треки и исполнители имеют мало прослушиваний, но будем работать со всеми исполнителями.

In [5]:
# треки
tracks = pd.read_csv(path_tracks_temp)
print(tracks.shape)
tracks.head()

(4544643, 3)


Unnamed: 0,track_id,playcount,person_id
0,0,4.0,0
1,1,495.0,1
2,2,2.0,2
3,3,2.0,3
4,4,1.0,4


In [6]:
tracks['track_id'].nunique(), tracks['person_id'].nunique()

(4519105, 560927)

In [7]:
# расчет числа проигрываний каждого исполнителя.
persons_playcount = tracks.groupby(['person_id'], as_index=False)['playcount'].sum()
persons_playcount

Unnamed: 0,person_id,playcount
0,0,4.0
1,1,495.0
2,2,2.0
3,3,2.0
4,4,1.0
...,...,...
560922,595136,0.0
560923,595137,0.0
560924,595138,0.0
560925,595139,0.0


In [8]:
persons_playcount['playcount'].describe(percentiles=[
    0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.9725
])

count    5.609270e+05
mean     1.219149e+04
std      9.367164e+04
min     -6.000000e+00
2.5%     0.000000e+00
5%       0.000000e+00
25%      6.000000e+00
50%      1.160000e+02
75%      1.248000e+03
95%      3.520810e+04
97.2%    8.508707e+04
max      9.400856e+06
Name: playcount, dtype: float64

In [9]:
mask_popular_persons = (persons_playcount['playcount'] >= 140000)

print(mask_popular_persons.sum() / persons_playcount.shape[0])

popular_persons = persons_playcount.loc[mask_popular_persons, 'person_id'].values
print(popular_persons.shape)
popular_persons

0.018392767686347778
(10317,)


array([    78,    109,    129, ..., 468284, 468998, 472663])

### Данные о сессиях с прослушиванием треков

In [11]:
# сессии с прослушиванием
sessions = pd.read_csv(path_sessions_temp)
print(sessions.shape)
sessions.head()

(28178697, 7)


Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio
0,287144,1390231051,4547,23,44361,4698874,
1,287144,1390231051,4547,23,44361,838286,1.01
2,287144,1390231051,4547,23,44361,2588097,1.01
3,287144,1390231051,4547,23,44361,2746740,1.44
4,287144,1390231051,4547,23,44361,3873988,1.01


In [12]:
sessions.head(30)

Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio
0,287144,1390231051,4547,23,44361,4698874,
1,287144,1390231051,4547,23,44361,838286,1.01
2,287144,1390231051,4547,23,44361,2588097,1.01
3,287144,1390231051,4547,23,44361,2746740,1.44
4,287144,1390231051,4547,23,44361,3873988,1.01
5,287144,1390231051,4547,23,44361,808046,1.01
6,287144,1390231051,4547,23,44361,2588498,1.01
7,287144,1390231051,4547,23,44361,2492893,1.02
8,287144,1390231051,4547,23,44361,2168413,1.02
9,287144,1390231051,4547,23,44361,1778345,1.07


In [13]:
sessions['numtracks'].describe(percentiles=[
    0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.9725
])

count    2.817870e+07
mean     4.142387e+01
std      8.674240e+01
min      1.000000e+00
2.5%     2.000000e+00
5%       3.000000e+00
25%      1.000000e+01
50%      2.100000e+01
75%      4.500000e+01
95%      1.410000e+02
97.2%    2.000000e+02
max      4.914000e+03
Name: numtracks, dtype: float64

In [14]:
(sessions['numtracks'] > 30).sum() / sessions.shape[0]

0.3694823078583087

In [15]:
sessions[sessions['track_playratio'].isin([np.nan])]

Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio
0,287144,1390231051,4547,23,44361,4698874,
11,287144,1390231051,4547,23,44361,4698875,
13,287144,1390231051,4547,23,44361,4698876,
16,287144,1390231051,4547,23,44361,4698877,
17,287144,1390231051,4547,23,44361,4698878,
...,...,...,...,...,...,...,...
28178683,540638,1415960966,18220,78,24700,1916497,
28178688,540638,1415960966,18220,78,24700,1798283,
28178691,540638,1415960966,18220,78,24700,1174854,
28178692,2480032,1407938059,-1,1,33058,906373,


In [16]:
sessions['track_playratio'].describe(percentiles=[
    0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.9725
])

count    2.394273e+07
mean     1.131717e+00
std      2.026694e+00
min      0.000000e+00
2.5%     1.000000e-01
5%       5.000000e-01
25%      9.900000e-01
50%      1.000000e+00
75%      1.040000e+00
95%      1.910000e+00
97.2%    2.530000e+00
max      7.970000e+02
Name: track_playratio, dtype: float64

# Переход к истории прослушивания исполнителей

In [17]:
# переход от истории прослушиваний треков к истории прослушивания исполнителей
sessions_extend = sessions.merge(
    tracks,
    how='inner',
    on=['track_id']
)

In [18]:
print(sessions_extend.shape)
sessions_extend.head()

(30732788, 9)


Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio,playcount,person_id
0,287144,1390231051,4547,23,44361,4698874,,,142266
1,287144,1390231051,4547,23,44361,838286,1.01,212.0,107103
2,982046,1405452797,6892,29,32894,838286,1.0,212.0,107103
3,982049,1405702985,15646,68,32894,838286,1.0,212.0,107103
4,1873088,1406217037,-1,1,23183,838286,,212.0,107103


In [23]:
# sessions_extend.groupby(['timestamp'], as_index=False)['track_id'].count()

In [24]:
del sessions, tracks

In [25]:
# возьмем сессии популярных исполнителей, отобранных выше
# sessions_extend = 
sessions_extend[
    sessions_extend['person_id'].isin(popular_persons)
    & (sessions_extend['numtracks'] > 30)
]

Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio,playcount,person_id
3,982049,1405702985,15646,68,32894,838286,1.00,212.0,107103
5,961319,1416339976,8433,34,28231,838286,1.01,212.0,107103
6,2497483,1416525632,22703,88,37873,838286,1.01,212.0,107103
11,1706581,1413786198,28830,111,12776,838286,1.01,212.0,107103
13,386789,1406408030,7921,31,41047,838286,1.10,212.0,107103
...,...,...,...,...,...,...,...,...,...
30732747,2575972,1409144730,40506,187,29398,2376711,1.06,2434.0,297130
30732751,2575972,1409144730,40506,187,29398,2696663,0.84,87.0,338072
30732757,540634,1414662293,10173,43,24700,2123597,0.97,218.0,266608
30732758,540634,1414662293,10173,43,24700,2123592,1.23,151.0,266608


In [21]:
print(sessions_extend.shape)
sessions_extend.head()

(9195167, 9)


Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio,playcount,person_id
3,982049,1405702985,15646,68,32894,838286,1.0,212.0,107103
5,961319,1416339976,8433,34,28231,838286,1.01,212.0,107103
6,2497483,1416525632,22703,88,37873,838286,1.01,212.0,107103
11,1706581,1413786198,28830,111,12776,838286,1.01,212.0,107103
13,386789,1406408030,7921,31,41047,838286,1.1,212.0,107103


In [22]:
%%time
session_neighbours = sessions_extend.groupby(['session_id'])['person_id'].unique()

CPU times: user 16.9 s, sys: 323 ms, total: 17.3 s
Wall time: 17.1 s


In [23]:
%%time
session_neighbours = [
    (person, session[session != person])
    for session in session_neighbours
    for person in session
]

CPU times: user 7.94 s, sys: 1.68 s, total: 9.62 s
Wall time: 9.63 s


In [24]:
# %%time
# session_neighbours = [
#     (person, dict_unique_counter(session[session != person]))
#     for session in session_neighbours
#     for person in session
# ]

In [25]:
session_neighbours = pd.DataFrame(
    session_neighbours,
    columns=['person_id', 'session_neighbours']
)

In [26]:
session_neighbours['len'] = session_neighbours['session_neighbours'].apply(len)
session_neighbours = session_neighbours[session_neighbours['len'] != 0]

In [27]:
# (session_neighbours['len'] > 10).sum() / session_neighbours.shape[0]

In [28]:
session_neighbours.head(15)

Unnamed: 0,person_id,session_neighbours,len
0,325704,"[324951, 321482, 209910, 45072]",4
1,324951,"[325704, 321482, 209910, 45072]",4
2,321482,"[325704, 324951, 209910, 45072]",4
3,209910,"[325704, 324951, 321482, 45072]",4
4,45072,"[325704, 324951, 321482, 209910]",4
5,435780,"[247876, 372354, 31691]",3
6,247876,"[435780, 372354, 31691]",3
7,372354,"[435780, 247876, 31691]",3
8,31691,"[435780, 247876, 372354]",3
9,192567,[31691],1


In [29]:
# # df_lst = []
# for _, group in session_neighbours[session_neighbours['person_id'].isin([341684])].groupby(['person_id']):
#     print(np.concatenate(group['session_neighbours'].values))
#     break

In [30]:
session_neighbours['person_id'].unique()

array([325704, 324951, 321482, ..., 345400, 212554, 299374])

In [31]:
# # расчет около 45 минут
# df_lst = []
# for person in tqdm(session_neighbours['person_id'].unique()):
#     mask_person = session_neighbours['person_id'].isin([person])
#     neighbours = np.concatenate(
#         session_neighbours.loc[mask_person, 'session_neighbours'].values
#     )
#     neighbours_string = ' '.join(map(str, neighbours))
#     df_lst.append((person, neighbours_string))
# #     break

In [32]:
def dict_unique_counter(arr):
    return dict(zip(*np.unique(arr, return_counts=True)))

In [33]:
df_lst = []
for i, person in enumerate(tqdm(session_neighbours['person_id'].unique())):
    mask_person = session_neighbours['person_id'].isin([person])
    neighbours = np.concatenate(
        session_neighbours.loc[mask_person, 'session_neighbours'].values
    )
    neighbours = dict_unique_counter(neighbours)
    df_lst.append((person, neighbours))
#     if i > 100:
#         break

100%|██████████| 22275/22275 [02:08<00:00, 173.77it/s]


In [34]:
session_neighbours = pd.DataFrame(
    df_lst,
    columns=['person_id', 'session_neighbours']
)

In [35]:
# %%time
# session_neighbours.to_pickle('../data/temp/session_neighbours.pkl')

In [36]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [37]:
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(session_neighbours['session_neighbours'])
X

<22275x22275 sparse matrix of type '<class 'numpy.float64'>'
	with 28818056 stored elements in Compressed Sparse Row format>

In [38]:
from sklearn.decomposition import TruncatedSVD

In [39]:
tsvd = TruncatedSVD(n_components=1000)

In [40]:
X_tsvd = tsvd.fit_transform(X)

In [48]:
tsvd.components_

array([[ 3.90095707e-03,  3.08909882e-03,  1.13538446e-02, ...,
         2.59800441e-07,  9.00882023e-05,  5.06341267e-05],
       [ 7.85881694e-04,  9.30394956e-04, -1.30248867e-03, ...,
         2.57115968e-07, -7.21357397e-05, -1.03959507e-05],
       [-2.59863573e-03, -7.80258085e-04, -7.03425797e-03, ...,
        -2.40750953e-07,  1.13689253e-04,  8.39425401e-05],
       ...,
       [-7.86452418e-03,  3.17340769e-03, -2.68720522e-02, ...,
        -8.88258344e-05,  6.25100704e-04,  3.60528834e-04],
       [-1.46044778e-02,  3.93472795e-03,  6.33255740e-02, ...,
        -1.19814466e-04,  2.75576198e-04,  6.73703020e-04],
       [ 1.77641204e-02, -9.26178078e-04, -1.86980702e-02, ...,
        -1.18783116e-04, -5.51691742e-04,  1.66802672e-03]])

In [46]:
ebeddings = pd.concat([
    session_neighbours['person_id'],
    pd.DataFrame(X_tsvd)
], axis=1).set_index('person_id')
ebeddings

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
325704,166.002614,6.573576,-41.414301,2.244714,-28.489260,0.253409,-3.849836,1.928986,4.957353,-17.173485,...,2.481825,0.099077,-1.053812,-1.179852,2.233172,0.345858,2.008207,-0.756790,-0.113385,-0.444923
324951,132.861067,38.954129,-10.447080,-11.582624,19.816059,-9.987811,-10.942405,-13.587144,-9.747612,22.270913,...,0.952618,0.320554,-0.142172,1.139293,1.042123,2.049437,0.787838,-0.160299,0.022737,0.190225
321482,65.357117,-5.133094,-9.393205,-24.902278,16.969505,-14.350675,-9.814001,6.128002,-11.111390,11.775831,...,-2.699617,-0.321509,0.788891,-2.200686,-0.860055,-0.178337,-0.032785,2.211918,-0.155721,1.825814
209910,15.634815,-0.008990,-4.085451,-8.222346,3.304866,-4.064063,-1.140591,3.962524,-3.692036,2.084211,...,-1.423569,-0.757997,0.794564,-1.090794,-0.104120,0.540161,0.114869,0.670853,-0.165722,-0.047591
45072,33.761973,-1.265595,-10.571736,-16.344433,4.790306,-11.255527,0.724642,6.300351,-8.438057,8.059858,...,-0.257331,-0.717752,1.971345,-0.529604,-1.818806,-0.497168,-0.982844,2.399521,-0.373256,3.357787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277026,0.037502,-0.035583,0.025601,-0.029715,-0.014053,-0.014528,-0.037604,-0.012443,0.000730,0.021101,...,0.051447,-0.025509,0.023111,-0.003785,-0.019794,-0.007556,0.006162,0.004009,-0.032558,0.027473
185078,0.109220,-0.103648,0.076914,-0.108989,0.047778,0.044147,-0.163251,0.025050,-0.090340,0.064043,...,-0.000152,0.046933,-0.011860,0.121103,-0.115020,-0.035394,-0.060408,-0.079502,0.114709,-0.053723
345400,0.034339,-0.024479,-0.023742,0.093918,-0.002006,-0.019618,-0.023518,0.020015,-0.000108,-0.029962,...,-0.039639,-0.005406,0.002221,0.023458,-0.032905,-0.023366,0.016205,0.008100,0.028412,0.041258
212554,0.497735,-0.138605,-0.228052,-0.139281,0.148449,-0.193796,0.146380,0.026670,-0.093244,0.149415,...,-0.102378,0.000532,0.049269,-0.046504,-0.009925,-0.068488,-0.010377,-0.030665,0.043187,-0.007858


In [76]:
# persons.merge(session_neighbours[['person_id']], on=['person_id'])

persons[persons['person_id'].isin([281542, 56683, 276125, 200082])]

Unnamed: 0,person_id,person_name,person_MBID
1373,56683,Bowie,5441c29d-3602-4898-b1a1-b77fa23b8e50
21043,276125,Pink+Floyd+&+Floyd,83d91898-7763-47d7-b03b-b92132375c47
25139,56683,David+Bowie+&+Bing+Crosby,5441c29d-3602-4898-b1a1-b77fa23b8e50
27674,56683,David+Bowie+&+Pet+Shop+Boys,5441c29d-3602-4898-b1a1-b77fa23b8e50
32700,56683,David+Bowie+&+Phillip+Glass,
40008,56683,David+Bowie+&+Trent+Reznor,5441c29d-3602-4898-b1a1-b77fa23b8e50
40030,200082,Lil+Wayne+Feat.+Drake+&+Rick+Ross,
44602,276125,Pink+Floyd+&+The+Orb,83d91898-7763-47d7-b03b-b92132375c47
44799,56683,David+Bowie+&+Pat+Metheny,5441c29d-3602-4898-b1a1-b77fa23b8e50
49995,56683,David+Bowie+&+Freddie+Mercury,5441c29d-3602-4898-b1a1-b77fa23b8e50


In [75]:
most_popular_persons = persons_playcount.sort_values(['playcount'], ascending=False).head(15)
most_popular_persons

Unnamed: 0,person_id,playcount
373252,373260,9400856.0
281536,281542,8003401.0
56679,56683,7038295.0
31356,31359,6413547.0
276119,276125,6337974.0
200076,200082,6247765.0
459777,459788,5781531.0
139130,139135,5767201.0
429264,429274,5751087.0
447087,447098,5547182.0


In [79]:
from scipy.spatial.distance import cosine

In [114]:
dist = [(i, cosine(ebeddings.loc[200082], row)) for i, row in ebeddings.iterrows()]
dist = pd.DataFrame(dist, columns=['person_id', 'cosine_dist'])
dist = dist.sort_values(['cosine_dist'])
dist

Unnamed: 0,person_id,cosine_dist
359,200082,0.000000
384,449356,0.043523
409,374349,0.047748
394,161570,0.052074
391,47600,0.054110
...,...,...
21329,79789,1.002057
21768,384701,1.002142
20536,408928,1.002309
20847,399501,1.002712


In [115]:
top_20 = dist['person_id'].iloc[1:51].values

In [1]:
persons[persons['person_id'].isin(top_20)].head(60)

NameError: name 'persons' is not defined

In [2]:
# session_neighbours = pd.read_csv('../data/temp/session_neighbours_500.csv', chunksize=1000)

In [None]:
# df_lst = []
# for df in tqdm(session_neighbours):
#     df['neighbours_dct'] = df['session_neighbours'].str.split(' ').apply(dict_unique_counter)
#     df_lst.append(df[['person_id', 'neighbours_dct']])
# #     break

79it [02:52,  1.61it/s]

In [11]:
# vectorizer = CountVectorizer(max_features=10000)

# X = vectorizer.fit_transform(session_neighbours['session_neighbours'])

# X

# print(vectorizer.get_feature_names())

# print(X.toarray())

In [None]:
# session_neighbours['session_neighbours'] = session_neighbours['session_neighbours'].str.split(' ')

In [None]:
df_lst = []
for _, (person, neighbours) in tqdm(session_neighbours.iterrows()):
    df_lst.append(
        (person, dict_unique_counter(neighbours.split(' ')))
    )
#     break

49940it [04:35, 11.22s/it]  

In [13]:
df_lst

[(332749,
  {'100758': 1,
   '100889': 1,
   '100904': 2,
   '101590': 1,
   '101611': 1,
   '101774': 1,
   '101903': 2,
   '102300': 1,
   '102352': 1,
   '102835': 1,
   '102923': 12,
   '103018': 1,
   '103260': 2,
   '103498': 1,
   '103673': 1,
   '103871': 7,
   '104139': 7,
   '104207': 1,
   '104243': 1,
   '104521': 4,
   '104676': 4,
   '104885': 4,
   '105144': 1,
   '105256': 3,
   '10617': 1,
   '106222': 2,
   '106237': 1,
   '106350': 1,
   '107039': 3,
   '107103': 2,
   '107168': 1,
   '107364': 1,
   '10764': 1,
   '107743': 1,
   '107750': 1,
   '107846': 2,
   '107920': 1,
   '108114': 4,
   '108228': 2,
   '108267': 5,
   '108734': 1,
   '108742': 3,
   '108766': 1,
   '109145': 4,
   '109210': 1,
   '109301': 1,
   '109433': 8,
   '109444': 7,
   '109595': 1,
   '109597': 1,
   '109784': 1,
   '109854': 2,
   '11005': 1,
   '110264': 4,
   '110578': 1,
   '110729': 1,
   '110792': 1,
   '110855': 6,
   '11092': 1,
   '111203': 1,
   '111262': 2,
   '111494': 1,
 

In [4]:
print(session_neighbours.shape)
session_neighbours.head()

(190037, 2)


Unnamed: 0,person_id,session_neighbours
0,332749,288070 124041 11423 442798 291362 52354 70915 ...
1,288070,332749 269833 5399 88671 109097 313066 147287 ...
2,154295,288626 341684 325050 46425 390636 360406 10326...
3,288626,154295 341684 325050 17514 203180 100758 29122...
4,341684,154295 288626 325050 84554 219840 318259 21984...


In [5]:
# session_neighbours.loc[0, 'session_neighbours']

'288070 124041 11423 442798 291362 52354 70915 245488 417177 399353 50321 359758 455930 399329 42218 21950 249900 332244 286859 110855 159300 159611 62678 84244 161015 114966 258705 371043 196132 287752 67354 369381 338261 27165 441362 452099 262896 121972 165405 70070 106222 55207 368250 155863 379822 234386 227562 434646 392962 86376 354892 73357 291837 360509 204114 364910 199934 85842 12631 62568 363805 348947 364597 386855 163656 353623 318475 214743 398072 52023 55136 258927 82359 62444 448656 34180 452823 47307 284448 221307 208657 108267 203935 328397 174073 273651 415453 3452 449356 304049 297016 117020 433041 277494 211094 323879 83974 413521 461101 235831 57272 74545 47552 383822 159844 42456 385714 460815 49971 279548 431811 55438 444371 41200 273573 283744 449220 242068 149679 76404 304591 267416 433523 14125 114283 411805 47154 253248 371765 236107 393820 42032 400168 74637 261027 314219 8102 303270 349103 362541 154377 305019 203487 273495 426900 123386 40656 233878 2048

In [None]:
# %%time
# session_neighbours['neighbours_counts'] = session_neighbours['session_neighbours'].apply(dict_unique_counter)

In [55]:
df_lst = []
for _, (person, neighbours) in tqdm(session_neighbours.iterrows()):
    neighbours_counts = dict_unique_counter(neighbours)
    df_lst.append((person, neighbours_counts))
#     break

100it [00:01, 79.74it/s]


In [54]:
session_neighbours = pd.DataFrame(
    df_lst,
    columns=['person_id', 'neighbours_counts']
)

[(332749,
  {'100758': 1,
   '100889': 1,
   '100904': 2,
   '101590': 1,
   '101611': 1,
   '101774': 1,
   '101903': 2,
   '102300': 1,
   '102352': 1,
   '102835': 1,
   '102923': 12,
   '103018': 1,
   '103260': 2,
   '103498': 1,
   '103673': 1,
   '103871': 7,
   '104139': 7,
   '104207': 1,
   '104243': 1,
   '104521': 4,
   '104676': 4,
   '104885': 4,
   '105144': 1,
   '105256': 3,
   '10617': 1,
   '106222': 2,
   '106237': 1,
   '106350': 1,
   '107039': 3,
   '107103': 2,
   '107168': 1,
   '107364': 1,
   '10764': 1,
   '107743': 1,
   '107750': 1,
   '107846': 2,
   '107920': 1,
   '108114': 4,
   '108228': 2,
   '108267': 5,
   '108734': 1,
   '108742': 3,
   '108766': 1,
   '109145': 4,
   '109210': 1,
   '109301': 1,
   '109433': 8,
   '109444': 7,
   '109595': 1,
   '109597': 1,
   '109784': 1,
   '109854': 2,
   '11005': 1,
   '110264': 4,
   '110578': 1,
   '110729': 1,
   '110792': 1,
   '110855': 6,
   '11092': 1,
   '111203': 1,
   '111262': 2,
   '111494': 1,
 

In [39]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = [{11: 1, 12: 2}, {11: 3, 13: 1}]
X = v.fit_transform(D)
X

array([[1., 2., 0.],
       [3., 0., 1.]])

# DRAFT

In [None]:
# %%time
# # не хватает мощности
# session_neighbours = (session_neighbours#.head(1000)
#     .groupby(['person_id'], as_index=False)
#     .agg({'session_neighbours' : lambda x: list(np.concatenate(x.values))})
# )

In [16]:
# 
sessions_extend[sessions_extend['playtime'].isin([-1])]

Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio,playcount,person_id
4,1873088,1406217037,-1,1,23183,838286,,212.0,107103
19,361604,1400260299,-1,1,40718,838286,,212.0,107103
40,2700519,1405970598,-1,1,24690,838286,,212.0,107103
58,846,1411031704,-1,1,40433,838286,,212.0,107103
82,2604837,1410191003,-1,1,377,838286,,212.0,107103
...,...,...,...,...,...,...,...,...,...
30732660,2704365,1392347333,-1,1,25560,366126,,4.0,45178
30732664,373369,1408739136,-1,1,40091,4789796,1.0,,552076
30732704,2751633,1408490773,-1,1,36788,3517368,,166.0,439634
30732765,1885736,1408391273,-1,1,26603,2749494,,112.0,344789


In [17]:
sessions_extend[sessions_extend['track_playratio'].isin([np.nan])]

Unnamed: 0,session_id,timestamp,playtime,numtracks,user_id,track_id,track_playratio,playcount,person_id
0,287144,1390231051,4547,23,44361,4698874,,,142266
4,1873088,1406217037,-1,1,23183,838286,,212.0,107103
9,1591983,1421070443,6740,28,25592,838286,,212.0,107103
10,1591217,1399022203,3066,14,25458,838286,,212.0,107103
19,361604,1400260299,-1,1,40718,838286,,212.0,107103
...,...,...,...,...,...,...,...,...,...
30732763,1885731,1407078600,749,4,26603,3465265,,259.0,432719
30732765,1885736,1408391273,-1,1,26603,2749494,,112.0,344789
30732766,1885735,1408300690,-1,1,26603,3208862,,1297.0,397734
30732769,2480038,1407956764,940,5,33058,2014661,,1466.0,252514


In [25]:
sessions_extend['person_id'].nunique()

560926

In [18]:
(sessions_extend['numtracks'] >= 5).sum() / sessions_extend.shape[0]

0.9114987875489852

In [29]:
user_person_statistics = (sessions_extend
    .groupby(['user_id', 'person_id'], as_index=False)
    ['track_playratio'].median()
#     .agg({
#         'track_playratio':['count', 'sum', 'mean', 'median'],
#     })
)
user_person_statistics

Unnamed: 0,user_id,person_id,track_playratio
0,1,11467,1.00
1,1,11617,1.00
2,1,19627,1.01
3,1,28510,0.53
4,1,42218,1.43
...,...,...,...
6305840,45175,361085,2.18
6305841,45175,390280,1.03
6305842,45175,426475,0.88
6305843,45175,438476,


In [30]:
user_person_statistics.columns

Index(['user_id', 'person_id', 'track_playratio'], dtype='object')

In [32]:
from surprise import Dataset
# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /home/user/.surprise_data/ml-100k


In [31]:
from surprise import SVD

# Use the famous SVD algorithm.
svd = SVD()

In [20]:
users_listen_count = (sessions_extend
    .groupby(['user_id'], as_index=False)['track_playratio']
    .count()
    .rename(columns={'track_playratio':'listen_count'})
)
users_listen_count

Unnamed: 0,user_id,listen_count
0,1,404
1,2,672
2,3,1821
3,4,1013
4,5,173
...,...,...
45170,45171,174
45171,45172,16
45172,45173,364
45173,45174,351


In [24]:
(users_listen_count['listen_count'] >= 50).sum() / users_listen_count.shape[0]

0.8976646375207527