In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import json
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings('ignore')

In [2]:
DATA_PATH = './data/'

In [3]:
catalogue_feature_names = ['type', 'availability', 'duration',
                           'feature_1', 'feature_2', 'feature_3',
                           'feature_4', 'feature_5', 'attributes']

In [4]:
with open(os.path.join(DATA_PATH, 'catalogue.json'), 'r') as f:
    catalogue = json.load(f)
    
catalogue = {int(k): v for k, v in catalogue.items()}

In [5]:
catalogue[1]

{'type': 'movie',
 'availability': ['purchase', 'rent'],
 'duration': 120,
 'feature_1': 6610431.116079764,
 'feature_2': 0.7732243944,
 'feature_3': 3,
 'feature_4': 1.1120138405,
 'feature_5': 0.6547073468,
 'attributes': [2786, 385, 2799, 3730, 886, 7, 11700, 42, 20, 388, 1934]}

In [6]:
catalogue_df = pd.DataFrame(index=catalogue.keys(), columns=catalogue_feature_names).sort_index()

#Делаю из файла json DataFrame в pandas, также создаю множество с атрибутами
attributes = set()
for key, value in tqdm_notebook(catalogue.items()):
    for col_name in catalogue_feature_names:
        catalogue_df.loc[key][col_name] =  value[col_name]
        if col_name == 'attributes':
            attributes.update(value[col_name])

HBox(children=(IntProgress(value=0, max=10200), HTML(value='')))




In [7]:
catalogue_df['type'] = catalogue_df['type'].astype(str)
catalogue_df['duration'] = catalogue_df['duration'].astype(int)
catalogue_df['feature_1'] = catalogue_df['feature_1'].astype(float)
catalogue_df['feature_2'] = catalogue_df['feature_2'].astype(float)
catalogue_df['feature_3'] = catalogue_df['feature_3'].astype(int)
catalogue_df['feature_4'] = catalogue_df['feature_4'].astype(float)
catalogue_df['feature_5'] = catalogue_df['feature_5'].astype(float)

In [8]:
# Теперь переведу категориальные фичи в числовые признаки
type_mapper = {'series': 0, 'movie': 1, 'multipart_movie': 2}
#attributes_mapper = {key:value for key, value in zip(attributes, range(len(attributes)))}
availability_mapper = {
    (): 0,
    ('purchase',): 1,
    ('rent',): 2,
    ('subscription',): 3,
    ('purchase', 'rent',): 4,
    ('purchase', 'subscription',): 5,
    ('rent', 'subscription',): 6,
    ('purchase', 'rent', 'subscription',): 7,
}

catalogue_df['type'] = catalogue_df['type'].map(type_mapper)

catalogue_df['availability'] = catalogue_df['availability'].apply(lambda x: availability_mapper[tuple(x)])

In [10]:
with open(os.path.join(DATA_PATH, 'test_users.json'), 'r') as f:
    test_users = json.load(f)['users']

In [11]:
transactions = pd.read_csv(
    os.path.join(DATA_PATH, 'transactions.csv'),
    dtype={
        'element_uid': np.uint16,
        'user_uid': np.uint32,
        'consumption_mode': 'category',
        'ts': np.float64,
        'watched_time': np.uint64,
        'device_type': np.uint8,
        'device_manufacturer': np.uint8
    }
)

In [12]:
transactions.shape

(9643012, 7)

In [17]:
transactions.columns

Index(['element_uid', 'user_uid', 'consumption_mode', 'ts', 'watched_time',
       'device_type', 'device_manufacturer'],
      dtype='object')

In [18]:
transactions.head(2)

Unnamed: 0,element_uid,user_uid,consumption_mode,ts,watched_time,device_type,device_manufacturer
0,3336,5177,S,44305180.0,4282,0,50
1,481,593316,S,44305180.0,2989,0,11


In [19]:
transactions = transactions.join(catalogue_df, on='element_uid')

In [20]:
transactions.shape

(9643012, 16)

In [21]:
transactions.head(2)

Unnamed: 0,element_uid,user_uid,consumption_mode,ts,watched_time,device_type,device_manufacturer,type,availability,duration,feature_1,feature_2,feature_3,feature_4,feature_5,attributes
0,3336,5177,S,44305180.0,4282,0,50,1,7,90,41661080.0,0.739609,45,1.141929,0.654707,"[19924, 28181, 6732, 23032, 270, 24805, 43, 14..."
1,481,593316,S,44305180.0,2989,0,11,1,5,50,42934190.0,0.750161,11,1.119409,0.592716,"[30070, 30071, 30072, 30073, 51, 52, 30074, 42..."


In [22]:
transactions.columns

Index(['element_uid', 'user_uid', 'consumption_mode', 'ts', 'watched_time',
       'device_type', 'device_manufacturer', 'type', 'availability',
       'duration', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'attributes'],
      dtype='object')

In [24]:
transactions = transactions[['ts', 'user_uid', 'element_uid',
                         'availability', 'watched_time', 'device_type', 'device_manufacturer', 'duration'
                        ]]


transactions.columns = ['session_start_datetime', 'user_id',
                     'video_id', 'vod_type', 'session_duration', 'device_type', 'device_os', 
                      'video_duration']
transactions['watching_percentage'] = transactions['session_duration'] / (transactions['video_duration'] * 60)

In [26]:
transactions.head()

Unnamed: 0,session_start_datetime,user_id,video_id,vod_type,session_duration,device_type,device_os,video_duration,watching_percentage
0,44305180.0,5177,3336,7,4282,0,50,90,0.792963
1,44305180.0,593316,481,5,2989,0,11,50,0.996333
2,44305180.0,262355,4128,7,833,0,50,100,0.138833
3,44305180.0,74296,6272,7,2530,0,99,100,0.421667
4,44305180.0,340623,5543,4,6282,0,50,70,1.495714


In [29]:
#transactions.to_pickle('./data/df_neural.pkl')

In [18]:
transactions[transactions.type == 0]

Unnamed: 0,element_uid,user_uid,consumption_mode,ts,watched_time,device_type,device_manufacturer,type,availability,duration,feature_1,feature_2,feature_3,feature_4,feature_5,attributes
7,5651,490059,S,4.430518e+07,9390,0,50,0,0,20,4.193306e+07,0.677350,0,1.138604,0.654707,"[8165, 35309, 35310, 20267, 270, 34399, 43, 25]"
15,5951,408050,S,4.430517e+07,20886,0,11,0,5,40,4.330547e+07,0.744924,8,1.135231,0.592716,"[13236, 25922, 8704, 34318, 34319, 7, 34320, 3..."
16,2429,499047,S,4.430517e+07,368576,0,99,0,5,40,3.771280e+07,0.812768,16,1.117582,0.654707,"[33886, 33880, 33887, 19602, 6089, 7, 33570, 3..."
24,817,493902,S,4.430517e+07,4911,0,99,0,0,40,4.312661e+07,0.625982,0,1.126575,0.654707,"[27350, 22145, 12659, 24032, 270, 33755, 35433..."
29,2771,133315,S,4.430517e+07,10557,0,50,0,5,40,3.855457e+07,0.712737,12,1.138604,0.654707,"[3224, 34069, 34070, 34071, 270, 15686, 123, 4..."
31,10108,483549,S,4.430517e+07,6478,0,50,0,3,20,4.306765e+07,0.731484,8,1.138604,0.680410,"[35487, 35488, 35489, 35490, 35491, 7, 35492, ..."
33,5845,401587,S,4.430517e+07,39037,3,99,0,5,60,4.162279e+07,0.786378,17,1.138604,0.680410,"[548, 287, 1408, 2401, 7, 25852, 33650, 33651,..."
35,3417,412761,S,4.430517e+07,5929,0,50,0,5,50,1.659832e+07,0.812768,17,1.100457,0.000000,"[309, 33911, 33912, 33913, 9101, 52, 33914, 42..."
77,4650,265411,S,4.430515e+07,1265,0,50,0,0,20,4.193306e+07,0.600535,0,1.130076,0.654707,"[8165, 34477, 34478, 34479, 270, 15713, 34480,..."
88,5741,255313,S,4.430515e+07,8900,0,50,0,3,40,3.671941e+07,0.753610,6,1.136924,0.680410,"[1379, 19984, 33531, 16082, 52, 15384, 33514, ..."


In [19]:
target = ((transactions['consumption_mode'] == 'R') | \
          (transactions['consumption_mode'] == 'P') | \
          (((transactions['type'] == 1) | (transactions['type'] == 2)) & \
            (transactions['watched_time'] / 40 >= transactions['duration'])) | \
          ((transactions['type'] == 0) & (transactions['watched_time'] / 40 >= 300))).astype(np.int8)

In [16]:
target.sum()

NameError: name 'target' is not defined

In [21]:
transactions['target'] = target

In [22]:
transactions.head().T

Unnamed: 0,0,1,2,3,4
element_uid,3336,481,4128,6272,5543
user_uid,5177,593316,262355,74296,340623
consumption_mode,S,S,S,S,P
ts,4.43052e+07,4.43052e+07,4.43052e+07,4.43052e+07,4.43052e+07
watched_time,4282,2989,833,2530,6282
device_type,0,0,0,0,0
device_manufacturer,50,11,50,99,50
type,1,1,1,1,1
availability,7,5,7,7,4
duration,90,50,100,100,70


In [23]:
bookmarks = pd.read_csv(
    os.path.join(DATA_PATH, 'bookmarks.csv'),
    dtype={
        'element_uid': np.uint16,
        'user_uid': np.uint32,
        'ts': np.float64
    }
)

In [24]:
bookmarks.head(3)

Unnamed: 0,user_uid,element_uid,ts
0,301135,7185,44305160.0
1,301135,4083,44305160.0
2,301135,10158,44305160.0


In [25]:
bookmarks = bookmarks.set_index(['user_uid', 'element_uid'], verify_integrity=True)

In [26]:
bookmarks.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,ts
user_uid,element_uid,Unnamed: 2_level_1
524752,2557,41730790.0
524752,8919,41730770.0
5174,3637,41730760.0
161137,9700,41730760.0
26252,8460,41730680.0


In [27]:
transactions = transactions.join(bookmarks, on=['user_uid', 'element_uid'], rsuffix='_bookmarks', how='outer')

In [28]:
transactions.isna().sum()

element_uid                  0
user_uid                     0
consumption_mode        683494
ts                      683494
watched_time            683494
device_type             683494
device_manufacturer     683494
type                    683494
availability            683494
duration                683494
feature_1               683494
feature_2               683494
feature_3               683494
feature_4               683494
feature_5               683494
attributes              683494
target                  683494
ts_bookmarks           9378290
dtype: int64

In [29]:
transactions.shape

(10326506, 18)

In [30]:
transactions['is_bookmarks'] = transactions.ts_bookmarks.notna()

In [31]:
transactions['is_ts_more_ts_bookmarks'] = (transactions['ts'] > transactions['ts_bookmarks']).astype(np.int8)
transactions['is_ts_more_ts_bookmarks'].loc[transactions.ts_bookmarks.isna()] = 2

In [32]:
transactions_cat_features = ['device_type', 'device_manufacturer', 
                             'is_ts_more_ts_bookmarks', 'consumption_mode', 
                             'is_bookmarks']

In [33]:
transactions.head().T

Unnamed: 0,0,1,2,3,4
element_uid,3336,481,4128,6272,5543
user_uid,5177,593316,262355,74296,340623
consumption_mode,S,S,S,S,P
ts,4.43052e+07,4.43052e+07,4.43052e+07,4.43052e+07,4.43052e+07
watched_time,4282,2989,833,2530,6282
device_type,0,0,0,0,0
device_manufacturer,50,11,50,99,50
type,1,1,1,1,1
availability,7,5,7,7,4
duration,90,50,100,100,70


In [34]:
ratings = pd.read_csv(
    os.path.join(DATA_PATH, 'ratings.csv'),
    dtype={
        'element_uid': np.uint16,
        'user_uid': np.uint32,
        'ts': np.float64,
        'rating': np.uint8
    }
)

In [35]:
ratings.head(5)

Unnamed: 0,user_uid,element_uid,rating,ts
0,571252,1364,10,44305170.0
1,63140,3037,10,44305140.0
2,443817,4363,8,44305140.0
3,359870,1364,10,44305060.0
4,359870,3578,9,44305060.0


In [36]:
ratings.set_index(['user_uid', 'element_uid'], verify_integrity=True, inplace=True)

In [37]:
ratings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,ts
user_uid,element_uid,Unnamed: 2_level_1,Unnamed: 3_level_1
571252,1364,10,44305170.0
63140,3037,10,44305140.0
443817,4363,8,44305140.0
359870,1364,10,44305060.0
359870,3578,9,44305060.0


In [38]:
transactions = transactions.join(ratings, on=['user_uid', 'element_uid'], rsuffix='_raitings', how='outer')

In [39]:
transactions.shape

(10403324, 22)

In [40]:
transactions['is_ts_more_ts_raitings'] = (transactions['ts'] > transactions['ts_raitings']).astype(np.int8)
transactions['is_ts_more_ts_raitings'].loc[transactions.ts_raitings.isna()] = 2

In [41]:
transactions.head(3).T

Unnamed: 0,0,1,2
element_uid,3336,481,4128
user_uid,5177,593316,262355
consumption_mode,S,S,S
ts,4.43052e+07,4.43052e+07,4.43052e+07
watched_time,4282,2989,833
device_type,0,0,0
device_manufacturer,50,11,50
type,1,1,1
availability,7,5,7
duration,90,50,100


In [42]:
# !mkdir mydata

In [44]:
transactions.to_pickle('./data/joins.pkl')