# Imports

In [None]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_colwidth', None)


import scipy.sparse as sp
from itertools import islice, cycle
from more_itertools import pairwise
from tqdm.auto import tqdm

# Load data / preprocessing

In [None]:
# Single source of truth for the dataset folder, so relocating the data means editing one line.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/MTS_teta/Kaggle_recomend_sys'

# Raw tables: user-item interactions, user features and item features.
df = pd.read_csv(f'{DATA_DIR}/interactions.csv')
users = pd.read_csv(f'{DATA_DIR}/users.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')

## Interactions

----

In [None]:
# Summary of the raw interactions table: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1562617 entries, 0 to 1562616
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   user_id     1562617 non-null  int64  
 1   item_id     1562617 non-null  int64  
 2   progress    1562617 non-null  int64  
 3   rating      323571 non-null   float64
 4   start_date  1562617 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 59.6+ MB


In [None]:
# Parse start_date into datetimes so repeated interactions can be ordered chronologically.
df['start_date'] = pd.to_datetime(df['start_date'])

# Mark every row whose (user_id, item_id) pair occurs more than once
# (keep=False flags all copies, not only the later ones).
duplicates = df.duplicated(subset=['user_id', 'item_id'], keep=False)

# Set the repeated rows aside, ordered by user and start date ...
df_duplicates = df.loc[duplicates].sort_values(by=['user_id', 'start_date'])
# ... and leave only the rows with a unique pair in `df`.
df = df.loc[~duplicates]

In [None]:
# Collapse each duplicated (user_id, item_id) pair into a single row that keeps the
# highest progress, the highest rating and the earliest start_date.
df_duplicates = df_duplicates.groupby(['user_id', 'item_id']).agg({
    'progress': 'max',
    'rating': 'max',
    'start_date': 'min'
})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported way to stack the collapsed rows under the already-unique ones.
df = pd.concat([df, df_duplicates.reset_index()], ignore_index=True)

In [None]:
# Shrink the interactions table in one pass: progress becomes int8 and rating becomes
# a sparse float32 column with NaN as the fill value.
df = df.astype({
    'progress': np.int8,
    'rating': pd.SparseDtype(np.float32, np.nan),
})

In [None]:
# Re-inspect the interactions table after de-duplication and dtype conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1562505 entries, 0 to 1562504
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype               
---  ------      --------------    -----               
 0   user_id     1562505 non-null  int64               
 1   item_id     1562505 non-null  int64               
 2   progress    1562505 non-null  int8                
 3   rating      323563 non-null   Sparse[float32, nan]
 4   start_date  1562505 non-null  datetime64[ns]      
dtypes: Sparse[float32, nan](1), datetime64[ns](1), int64(2), int8(1)
memory usage: 39.7 MB


In [None]:
# Save the preprocessed interactions table to a pickle file.
df.to_pickle('/content/drive/MyDrive/Colab Notebooks/MTS_teta/Kaggle_recomend_sys/interactions_preprocessed.pickle')

## Users

----

In [None]:
# Preview the first rows of the user-features table.
users.head()

Unnamed: 0,user_id,age,sex
0,0,45_54,1.0
1,1,25_34,0.0
2,2,45_54,0.0
3,3,65_inf,0.0
4,4,18_24,0.0


In [None]:
# Column dtypes and non-null counts of the user-features table.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137254 entries, 0 to 137253
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  137254 non-null  int64  
 1   age      137244 non-null  object 
 2   sex      135640 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 3.1+ MB


In [None]:
# Number of distinct values per column of the user-features table.
users.nunique()

user_id    137254
age             6
sex             2
dtype: int64

In [None]:
# Compact the user features: age becomes a categorical column and sex becomes
# a sparse float32 column with NaN as the fill value.
users = users.astype({
    'age': 'category',
    'sex': pd.SparseDtype(np.float32, np.nan),
})

In [None]:
# Distinct user ids that appear in the interactions table.
interaction_users = df['user_id'].unique()

# Sizes of the three disjoint groups: ids present in both tables, ids found only in
# interactions, and ids found only in the user-features table.
common_users = np.intersect1d(interaction_users, users['user_id']).size
users_only_in_interaction = np.setdiff1d(interaction_users, users['user_id']).size
users_only_features = np.setdiff1d(users['user_id'], interaction_users).size
total_users = sum((common_users, users_only_in_interaction, users_only_features))

# Report each group as an absolute count and as a share of all known users.
print(f'Кол-во пользователей - {total_users}')
print(f'Кол-во пользователей c взаимодействиями и фичами - {common_users} ({common_users / total_users * 100:.2f}%)')
print(f'Кол-во пользователей только c взаимодействиями - {users_only_in_interaction} ({users_only_in_interaction / total_users * 100:.2f}%)')
print(f'Кол-во пользователей только c фичами - {users_only_features} ({users_only_features / total_users * 100:.2f}%)')

Кол-во пользователей - 164771
Кол-во пользователей c взаимодействиями и фичами - 130808 (79.39%)
Кол-во пользователей только c взаимодействиями - 27517 (16.70%)
Кол-во пользователей только c фичами - 6446 (3.91%)


In [None]:
# Save the preprocessed user-features table to a pickle file.
users.to_pickle('/content/drive/MyDrive/Colab Notebooks/MTS_teta/Kaggle_recomend_sys/users_preprocessed.pickle')

## Items

----

In [None]:
# Preview the first rows of the item-features table.
items.head()

Unnamed: 0,id,title,genres,authors,year
0,248031,Ворон-челобитчик,"Зарубежные детские книги,Сказки,Зарубежная классика,Литература 19 века,Русская классика",Михаил Салтыков-Щедрин,1886
1,256084,Скрипка Ротшильда,"Классическая проза,Литература 19 века,Русская классика",Антон Чехов,1894
2,134166,Испорченные дети,"Зарубежная классика,Классическая проза,Литература 19 века,Русская классика",Михаил Салтыков-Щедрин,1869
3,281311,Странный человек,"Пьесы и драматургия,Литература 19 века",Михаил Лермонтов,1831
4,213473,Господа ташкентцы,"Зарубежная классика,Классическая проза,Литература 19 века,Русская классика",Михаил Салтыков-Щедрин,1873


In [None]:
# 'deep' is the value that makes info() measure the real memory held by object columns;
# any other truthy value (such as the previous 'full') only yields the shallow,
# "+"-suffixed estimate.
# Note: `year` is loaded with object dtype rather than a numeric one.
items.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63758 entries, 0 to 63757
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       63758 non-null  int64 
 1   title    63758 non-null  object
 2   genres   63753 non-null  object
 3   authors  56700 non-null  object
 4   year     49508 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.4+ MB


In [None]:
def num_bytes_format(num_bytes, float_prec=4):
    """Render a byte count as a human-readable string with a decimal (1000-based) unit.

    Args:
        num_bytes: Size in bytes (int or float; may be negative).
        float_prec: Number of digits after the decimal point in the result.

    Returns:
        A string such as ``'30.0965 Mb'``. Values too large for ``'Pb'`` are
        expressed in ``'Eb'``, the largest supported unit.
    """
    units = ['bytes', 'Kb', 'Mb', 'Gb', 'Tb', 'Pb', 'Eb']
    for unit in units[:-1]:
        if abs(num_bytes) < 1000:
            return f'{num_bytes:.{float_prec}f} {unit}'
        num_bytes /= 1000
    # Fallback for the largest unit: honour float_prec here too instead of a fixed 4 digits.
    return f'{num_bytes:.{float_prec}f} {units[-1]}'

In [None]:
# Total memory of the item table, counting the contents of object columns (deep=True),
# shown in human-readable units.
num_bytes = items.memory_usage(deep=True).sum()
num_bytes_format(num_bytes)

'30.0965 Mb'

In [None]:
# Number of distinct values per column of the item-features table.
items.nunique()

id         63758
title      58093
genres     11091
authors    17024
year        1074
dtype: int64

In [None]:
# Show the 25 least frequent `year` values.
items['year'].value_counts().tail(25)

1892, 1921                            1
1860, 1866                            1
1938-2003                             1
1992–1993                             1
1872, 1896–1904                       1
1608,1623                             1
2005, 2008                            1
2010, 2013                            1
1962, 1964, 1966                      1
2016, 2017                            1
1969, 1974                            1
1932, 1943                            1
1961, 1989                            1
1892, 1895, 1901, 1902, 1903, 1916    1
1922, 1940                            1
1925–1928                             1
1929, 1928                            1
1910, 1919                            1
1898, 1897, 1901                      1
1968, 1996, 2003                      1
1902, 1914, 1901, 1892                1
1890-1951                             1
1890, 1893                            1
1943-45                               1
1888, 1897                            1


In [None]:
# Look at the item whose `year` field lists several years.
items[items['year'] == '1898, 1897, 1901']

Unnamed: 0,id,title,genres,authors,year
43260,86545,"«Мальчик, который рисовал кошек» и другие истории о вещах странных и примечательных","Ужасы,Мистика,Зарубежная классика,Литература 19 века,Литература 20 века",Лафкадио Хирн,"1898, 1897, 1901"


In [None]:
# Store the repetitive text columns as categoricals (each column gets its own categories).
items = items.astype({
    name: 'category'
    for name in ('genres', 'authors', 'year')
})

In [None]:
# Re-inspect the item table after the categorical conversion. 'deep' is the value that
# makes info() report real memory usage; the previous 'full' was not a recognised
# option and only produced the shallow, "+"-suffixed estimate.
items.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63758 entries, 0 to 63757
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   id       63758 non-null  int64   
 1   title    63758 non-null  object  
 2   genres   63753 non-null  category
 3   authors  56700 non-null  category
 4   year     49508 non-null  category
dtypes: category(3), int64(1), object(1)
memory usage: 2.5+ MB


In [None]:
# Deep memory usage of the item table after the categorical conversion,
# shown in human-readable units.
num_bytes = items.memory_usage(deep=True).sum()
num_bytes_format(num_bytes)

'18.6994 Mb'

In [None]:
# Distinct item ids that appear in the interactions table.
interaction_items = df['item_id'].unique()

# Sizes of the three disjoint groups: ids present in both tables, ids found only in
# interactions, and ids found only in the item-features table.
common_items = np.intersect1d(interaction_items, items['id']).size
items_only_in_interaction = np.setdiff1d(interaction_items, items['id']).size
items_only_features = np.setdiff1d(items['id'], interaction_items).size
total_items = sum((common_items, items_only_in_interaction, items_only_features))

# Report each group as an absolute count and as a share of all known items.
print(f'Кол-во книг - {total_items}')
print(f'Кол-во книг c взаимодействиями и фичами - {common_items} ({common_items / total_items * 100:.2f}%)')
print(f'Кол-во книг только c взаимодействиями - {items_only_in_interaction} ({items_only_in_interaction / total_items * 100:.2f}%)')
print(f'Кол-во книг только c фичами - {items_only_features} ({items_only_features / total_items * 100:.2f}%)')

Кол-во книг - 63758
Кол-во книг c взаимодействиями и фичами - 63758 (100.00%)
Кол-во книг только c взаимодействиями - 0 (0.00%)
Кол-во книг только c фичами - 0 (0.00%)


In [None]:
# Save the preprocessed item-features table to a pickle file.
items.to_pickle('/content/drive/MyDrive/Colab Notebooks/MTS_teta/Kaggle_recomend_sys/items_preprocessed.pickle')