# Preprocessing 

In [1]:
import pandas as pd
import numpy as np

In [2]:
interactions = pd.read_csv('../dataset/raw_datasets/interactions.csv')
items = pd.read_csv('../dataset/raw_datasets/items.csv')
users = pd.read_csv('../dataset/raw_datasets/users.csv')

## Первичный preprocessing датасета interactions 

In [3]:
interactions.head()

Unnamed: 0,user_id,item_id,progress,rating,start_date
0,126706,14433,80,,2018-01-01
1,127290,140952,58,,2018-01-01
2,66991,198453,89,,2018-01-01
3,46791,83486,23,5.0,2018-01-01
4,79313,188770,88,5.0,2018-01-01


In [4]:
print(interactions.info())
print(interactions.describe(include='all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1533078 entries, 0 to 1533077
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   user_id     1533078 non-null  int64  
 1   item_id     1533078 non-null  int64  
 2   progress    1533078 non-null  int64  
 3   rating      285356 non-null   float64
 4   start_date  1533078 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 58.5+ MB
None
             user_id       item_id      progress         rating  start_date
count   1.533078e+06  1.533078e+06  1.533078e+06  285356.000000     1533078
unique           NaN           NaN           NaN            NaN         730
top              NaN           NaN           NaN            NaN  2019-05-14
freq             NaN           NaN           NaN            NaN        3045
mean    7.981475e+04  1.604259e+05  4.765807e+01       4.583012         NaN
std     4.596514e+04  9.291314e+04  3.835227e+01       0.866997

In [5]:
interactions.isna().sum()

user_id             0
item_id             0
progress            0
rating        1247722
start_date          0
dtype: int64

In [6]:
interactions.shape

(1533078, 5)

Преобразование форматов данных с целью оптимизации памяти

In [7]:
interactions['user_id'] = interactions['user_id'].astype('int32')
interactions['item_id'] = interactions['item_id'].astype('int32')
interactions['progress'] = interactions['progress'].astype('int8')
interactions['rating'] = interactions['rating'].astype(pd.SparseDtype(np.float32, np.nan))
interactions['start_date'] = pd.to_datetime(interactions['start_date'])

In [8]:
interactions = interactions.sort_values('start_date')

In [9]:
print(interactions['rating'].min(), interactions['rating'].max())
print(interactions['progress'].min(), interactions['progress'].max())

1.0 5.0
0 100


Проверка на наличие полных дубликатов и очистка

In [10]:
duplicate_rows = interactions.duplicated().sum()
print(duplicate_rows)

0


In [11]:
duplicate_pairs = interactions.duplicated(subset=['user_id', 'item_id'], keep='last')
print(f"Дубликатов пар user-item: {duplicate_pairs.sum()}")

Дубликатов пар user-item: 80


In [12]:
interactions = interactions[~duplicate_pairs]

In [13]:
interactions.head()

Unnamed: 0,user_id,item_id,progress,rating,start_date
0,126706,14433,80,,2018-01-01
1437,97128,245669,1,,2018-01-01
1436,47427,46915,54,,2018-01-01
1435,95067,165632,6,,2018-01-01
1434,110053,141651,86,,2018-01-01


In [56]:
interactions.to_pickle('../dataset/preprocessed_datasets/preprocessed_interactions.pkl')

## Первичный preprocessing датасета users

In [15]:
users.head()

Unnamed: 0,user_id,age,sex
0,1,45_54,
1,2,18_24,0.0
2,3,65_inf,0.0
3,4,18_24,0.0
4,5,35_44,0.0


In [16]:
print(users.info())
print(users.describe(include='all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142888 entries, 0 to 142887
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  142888 non-null  int64  
 1   age      142742 non-null  object 
 2   sex      136626 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 3.3+ MB
None
              user_id     age            sex
count   142888.000000  142742  136626.000000
unique            NaN       6            NaN
top               NaN   18_24            NaN
freq              NaN   55181            NaN
mean     79763.077690     NaN       0.326695
std      46068.679277     NaN       0.469006
min          1.000000     NaN       0.000000
25%      39851.750000     NaN       0.000000
50%      79757.500000     NaN       0.000000
75%     119651.250000     NaN       1.000000
max     159611.000000     NaN       1.000000


In [36]:
users['age'] = users['age'].astype('category')
users['sex'] = users['sex'].astype(pd.SparseDtype(np.int8, np.nan))

In [57]:
users.to_pickle('../dataset/preprocessed_datasets/preprocessed_users.pkl')

## Первичный preprocessing датасета items

In [42]:
print(items.info())
print(items.describe(include='all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59599 entries, 0 to 59598
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       59599 non-null  int64 
 1   title    59599 non-null  object
 2   genres   59568 non-null  object
 3   authors  52714 non-null  object
 4   year     46720 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.3+ MB
None
                   id          title                          genres  \
count    59599.000000          59599                           59568   
unique            NaN          57358                           10769   
top               NaN  Стихотворения  Современная русская литература   
freq              NaN             47                            2272   
mean    160665.157402            NaN                             NaN   
std      92726.533331            NaN                             NaN   
min          1.000000            NaN                             NaN   
25

In [48]:
items.nunique()

id         59599
title      57358
genres     10769
authors    17265
year        1053
dtype: int64

In [52]:
for col in ['genres', 'authors', 'year']:
    items[col] = items[col].astype('category')

In [58]:
items.to_pickle('../dataset/preprocessed_datasets/preprocessed_items.pkl')