In [1]:
import datetime
import numpy as np

from itertools import islice, cycle
from more_itertools import pairwise

import pandas as pd

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import seaborn as sns
sns.set(style='whitegrid')
sns.set(rc={'figure.figsize':(17, 9)})

from IPython.core.display import display, HTML, clear_output
display(HTML('<style>.container { width:80% !important; }</style>'))
display(HTML('<style>.prompt { min-width:10ex !important; }</style>'))
display(HTML('<style>div#notebook { font-size:12px !important; }</style>'))

from preprocessing import leave_last_out, transform_indices, reindex_data, generate_interactions_matrix
from datetime import datetime

  from IPython.core.display import display, HTML, clear_output
  from IPython.core.display import display, HTML, clear_output


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the preprocessed user, item and interaction tables from ./dataset.
# `last_watch_dt` is parsed into datetimes while reading.
users_df = pd.read_csv('./dataset/users_processed.csv')
items_df = pd.read_csv('./dataset/items_processed.csv')
interactions_df = pd.read_csv(
    './dataset/interactions_processed.csv',
    parse_dates=['last_watch_dt'],
)
#submission = pd.read_csv('sample_submission.csv')

# User features preprocessing

In [4]:
def ohe(features: list, df, items=False):
    """One-hot encode the given columns of ``df`` next to its id column.

    Parameters
    ----------
    features : list
        Names of the columns of ``df`` to encode. Each column is expanded with
        ``pd.get_dummies`` using the column name as the prefix.
    df : pandas.DataFrame
        Source table. It must expose an ``item_id`` column when ``items`` is
        true, otherwise a ``user_id`` column.
    items : bool, default False
        Selects which id column leads the result (``item_id`` vs ``user_id``).

    Returns
    -------
    pandas.DataFrame
        The id column followed by the dummy columns of every feature, in the
        order the features were given. When ``features`` is empty the bare id
        column (a Series) is returned, as it always was.
    """
    id_column = df.item_id if items else df.user_id
    dummy_blocks = [pd.get_dummies(df[feat], prefix=feat) for feat in features]
    if not dummy_blocks:
        # Nothing to encode: keep the historical return value (the id Series).
        return id_column
    # Concatenate once: growing the frame inside the loop re-copies every
    # previously encoded column on each iteration.
    return pd.concat([id_column, *dummy_blocks], axis=1)

In [5]:
# One-hot encode the categorical user features.
user_cat_features = [
    'age',
    'income',
    'sex',
    'kids_flg',
]
user_ohe_df = ohe(user_cat_features, users_df)

# Item features preprocessing

In [6]:
# One-hot encode the categorical item features, keyed by `item_id`.
item_cat_feats = [
    'content_type',
    'release_year_cat',
    'for_kids',
    'age_rating',
    'studios',
    'countries',
    'directors',
]
item_ohe_df = ohe(item_cat_feats, items_df, items=True)

# Interactions filtering

In [7]:
# Convert `last_watch_dt` from datetimes to integer Unix timestamps (seconds).
# The dtype guard keeps the cell re-runnable: once the column holds integers a
# second execution is a no-op instead of failing on `int.timestamp`.
last_watch = interactions_df['last_watch_dt']
if pd.api.types.is_datetime64_any_dtype(last_watch):
    # Naive datetimes are measured against a naive epoch (i.e. read as UTC, the
    # same interpretation pandas' `Timestamp.timestamp()` uses); tz-aware ones
    # are measured against the UTC epoch.
    unix_epoch = pd.Timestamp('1970-01-01', tz='UTC' if last_watch.dt.tz is not None else None)
    # Vectorised replacement for a per-row `apply(lambda x: int(x.timestamp()))`.
    interactions_df['last_watch_dt'] = (last_watch - unix_epoch) // pd.Timedelta(seconds=1)

In [8]:
# Number of interactions per user, most active users first.
num_interaction_pu = (
    interactions_df
    .groupby('user_id')['item_id']
    .count()
    .sort_values(ascending=False)
)

In [9]:
# Users with fewer than two interactions are treated as "cold".
cold_users = num_interaction_pu.loc[num_interaction_pu.lt(2)].index

In [10]:
# Interactions of every user that is NOT in the cold set.
warm_users_history = interactions_df.loc[
    ~interactions_df['user_id'].isin(cold_users)
]

In [11]:
# Preview the interactions kept for warm users (the original row index is retained).
warm_users_history

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,1620691200,4250,72
1,699317,1659,1622246400,8317,100
2,656683,7107,1620518400,10,0
3,864613,7638,1625443200,14483,100
4,964868,9506,1619740800,6725,100
...,...,...,...,...,...
5476244,438585,7829,1627862400,6804,100
5476245,786732,4880,1620777600,753,0
5476247,546862,9673,1618272000,2308,49
5476249,384202,16197,1618790400,6203,100


In [12]:
# Interactions that belong to cold users only (complement of the warm history).
cold_users_history = interactions_df.loc[
    interactions_df['user_id'].isin(cold_users)
]

In [13]:
# Preview the interactions of cold users (the original row index is retained).
cold_users_history

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
10,791466,8199,1627344000,713,9
26,181144,10440,1628380800,27408,40
37,161176,10440,1627516800,22,0
38,21723,1819,1618099200,8535,100
60,77216,8143,1620432000,660,11
...,...,...,...,...,...
5476222,1007900,9728,1628726400,416,6
5476225,882138,4716,1621555200,1594,2
5476238,805174,13125,1624752000,1,0
5476246,648596,12225,1628812800,76,0


In [14]:
# Split the warm-user history into a training part and a holdout part using the
# project helper. Presumably the holdout keeps each user's latest interaction by
# `last_watch_dt` — confirm against `preprocessing.leave_last_out`.
training, holdout = leave_last_out(
    warm_users_history,
    userid='user_id',
    timeid='last_watch_dt',
)

In [15]:
# Re-encode user and item ids of the training data; `data_index` carries the
# resulting id mappings and is reused below.
train_val, data_index = transform_indices(
    training,
    'user_id',
    'item_id',
)
# NOTE(review): only the "items" field is reindexed for the holdout, so its
# user ids presumably stay in the original id space while the training user ids
# were re-encoded — confirm this is intended.
holdout_val = reindex_data(
    holdout,
    data_index,
    fields="items",
)

# Generate interaction matrix

In [16]:
# Preview the training interactions returned by `transform_indices`.
train_val

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,97578,8939,1620691200,4250,72
1,386502,1563,1622246400,8317,100
2,362969,6683,1620518400,10,0
3,478343,7184,1625443200,14483,100
4,533634,8939,1619740800,6725,100
...,...,...,...,...,...
5476244,242535,7364,1627862400,6804,100
5476245,434911,4600,1620777600,753,0
5476247,302267,9096,1618272000,2308,49
5476249,212377,15241,1618790400,6203,100


In [17]:
# Describe the training data for the matrix builder: the names of the user and
# item indices, the feedback column, and the number of distinct users and items.
data_description = {
    'users': data_index['users'].name,
    'items': data_index['items'].name,
    'feedback': 'watched_pct',
    'n_users': len(data_index['users']),
    'n_items': len(data_index['items']),
}

In [18]:
# Build the interaction matrix from the re-encoded training data using the
# project helper; user rebasing is explicitly switched off.
train_matrix = generate_interactions_matrix(
    train_val,
    data_description,
    rebase_users=False,
)