In [3]:
import os
import re

import pandas as pd
from tqdm import tqdm

In [180]:
PROJECT_FOLDER = os.path.abspath('..')
DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'local', 'prepared')

In [None]:
cleared_columns = ['readerID', 'catalogueRecordID', 'startDate']
books_circulations = set()
user_circulations = set()
cleared_circulations = pd.DataFrame(columns=cleared_columns)

for i in range(1, 17):
    circulation = pd.read_csv(f'{DATA_FOLDER}/circulaton_{i}.csv', sep=';', encoding='cp1251')
    circulation = circulation[circulation['state'] == 'На руках']
    cleared_circulations = cleared_circulations.append(circulation[circulation['state'] == 'На руках'][cleared_columns])
    books_circulations.update(circulation['catalogueRecordID'])
    user_circulations.update(circulation['readerID'])

cleared_circulations.columns = ['user_id', 'book_id', 'dt']
cleared_circulations.head()

cleared_circulations.to_csv(f'{PROJECT_FOLDER}/data/circulations.csv', index=False)

In [141]:
user_book_interactions = pd.read_csv(f'{DATA_FOLDER}/user_book_interaction_full.csv')
user_book_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4186 entries, 0 to 4185
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dt       4186 non-null   object
 1   user_id  4186 non-null   int64 
 2   book_id  4186 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 98.2+ KB


In [142]:
cleared_circulations = pd.read_csv(f'{DATA_FOLDER}/circulations.csv')
cleared_circulations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580305 entries, 0 to 580304
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  580305 non-null  int64 
 1   book_id  580305 non-null  int64 
 2   dt       580305 non-null  object
dtypes: int64(2), object(1)
memory usage: 13.3+ MB


In [143]:
user_book_interactions = pd.concat([cleared_circulations, user_book_interactions])
user_book_interactions['dt'] = pd.to_datetime(user_book_interactions['dt'])
user_book_interactions.sort_values(by='dt', inplace=True)
user_book_interactions.drop_duplicates(inplace=True)
user_book_interactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 583919 entries, 0 to 266675
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   user_id  583919 non-null  int64         
 1   book_id  583919 non-null  int64         
 2   dt       583919 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2)
memory usage: 17.8 MB


In [178]:
test_ids = user_book_interactions['user_id'].value_counts().index.tolist()[:1000]
user_book_interactions_train = pd.DataFrame(columns=user_book_interactions.columns)
user_book_interactions_test = pd.DataFrame(columns=['user_id', 'book_id_1', 'book_id_2', 'book_id_3', 'book_id_4', 'book_id_5'])

for user_id in test_ids:
    inter_train_data = user_book_interactions[user_book_interactions['user_id'] == user_id].reset_index(drop=True)
    inter_test_data = inter_train_data[-5:].reset_index(drop=True)
    test_data = pd.DataFrame({'user_id': inter_test_data['user_id'][0],
                              **{f'book_id_{i + 1}': book_id for (i, book_id) in enumerate(inter_test_data['book_id'])}}, index=[0])
    train_data = inter_train_data.drop(index=list(range(len(inter_train_data) - 1, len(inter_train_data) - 6, -1)))
    user_book_interactions_train = user_book_interactions_train.append(train_data)
    user_book_interactions_test = user_book_interactions_test.append(test_data)

user_book_interactions_train.reset_index(drop=True)
user_book_interactions_test.reset_index(drop=True)

Unnamed: 0,user_id,book_id_1,book_id_2,book_id_3,book_id_4,book_id_5
0,34,1866509,1812326,1866509,2523335,1205629
1,23,393777,393777,612654,1697148,2398298
2,51,508164,1554620,269798,820431,777209
3,64,1824182,1824122,237084,1942107,1609359
4,30,1143317,757226,6532,316778,316778
...,...,...,...,...,...,...
995,93349,62659,325012,251134,403597,382214
996,341588,1177013,75164,20815,1277551,1240811
997,243259,2172326,2089907,2522575,1853331,2105936
998,573560,1800029,1832712,1591474,1750522,1730614


In [179]:
user_book_interactions_train.to_csv(f'{PROJECT_FOLDER}/local/prepared/train_interactions.csv', index=False)
user_book_interactions_test.to_csv(f'{PROJECT_FOLDER}/local/prepared/test_interactions.csv', index=False)
user_book_interactions.to_csv(f'{PROJECT_FOLDER}/local/prepared/interactions.csv', index=False)

In [117]:
books = pd.read_csv(f'{PROJECT_FOLDER}/data/books.csv')
books['author_id'] = books['author_id'].fillna(0).astype('uint16')
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993023 entries, 0 to 993022
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                993023 non-null  int64  
 1   bbk               833572 non-null  object 
 2   year              862153 non-null  float64
 3   author            660735 non-null  object 
 4   annotation        299376 non-null  object 
 5   volume            788456 non-null  float64
 6   title_additional  42063 non-null   object 
 7   age_restriction   78098 non-null   float64
 8   rubrics           823637 non-null  object 
 9   author_id         993023 non-null  uint16 
 10  title             982677 non-null  object 
 11  available         993023 non-null  int64  
dtypes: float64(3), int64(2), object(6), uint16(1)
memory usage: 85.2+ MB


In [118]:
books.drop_duplicates(inplace=True)
books.drop_duplicates(subset=['id'], inplace=True)
books.sort_values(by=['title', 'author_id'], inplace=True)
books.reset_index(inplace=True, drop=True)

In [119]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977271 entries, 0 to 977270
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                977271 non-null  int64  
 1   bbk               818572 non-null  object 
 2   year              846413 non-null  float64
 3   author            648910 non-null  object 
 4   annotation        293366 non-null  object 
 5   volume            773215 non-null  float64
 6   title_additional  41732 non-null   object 
 7   age_restriction   75634 non-null   float64
 8   rubrics           808703 non-null  object 
 9   author_id         977271 non-null  uint16 
 10  title             967246 non-null  object 
 11  available         977271 non-null  int64  
dtypes: float64(3), int64(2), object(6), uint16(1)
memory usage: 83.9+ MB


In [120]:
sum(books.duplicated(subset=['title', 'author_id']))

382172

In [121]:
books_small = books[['id', 'author_id', 'available', 'title']]

In [122]:
books_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977271 entries, 0 to 977270
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         977271 non-null  int64 
 1   author_id  977271 non-null  uint16
 2   available  977271 non-null  int64 
 3   title      967246 non-null  object
dtypes: int64(2), object(1), uint16(1)
memory usage: 24.2+ MB


In [123]:
main_id_dict = {}
index_to_delete = []
curr_title = None
curr_author_id = None
curr_id = None
curr_available = None

for i in tqdm(range(len(books_small))):
    id, author_id, available, title = books_small.loc[i, ].to_list()

    if title == curr_title and author_id == curr_author_id:
        index_to_delete.append(i)
        if curr_id in main_id_dict:
            main_id_dict[curr_id][0].append(id)
            main_id_dict[curr_id][1].append(available)
        else:
            main_id_dict[curr_id] = ([id], [curr_available, available])
    else:
        curr_author_id = author_id
        curr_title = title
        curr_id = id
        curr_available = available


100%|██████████| 977271/977271 [02:30<00:00, 6480.75it/s]


In [124]:
books.drop(index=index_to_delete, inplace=True)
books.reset_index(drop=True, inplace=True)

In [127]:
sum(books.duplicated(subset=['title', 'author_id', 'rubrics']))

7232

In [182]:
books

Unnamed: 0,id,bbk,year,author,annotation,volume,title_additional,age_restriction,rubrics,author_id,title,available
0,1875408,84(7Сое),1937.0,Э. Б. Синклер,,159.0,,,Художественная литература,37927,!NO PASARAN!,1
1,1579386,84,1973.0,Edited and translated by Marquez R. Drawings b...,,223.0,,,Художественная литература,0,!Patria o Muerte!,1
2,1619574,32.972.1,2008.0,,,,,,Программное обеспечение,0,!С: Бухгалтерия 8.0: Практический самоучитель,1
3,1750954,84(2)6-5,1999.0,Упр. культуры Администрации Тамбов. обл. Тамбо...,,109.0,,,Художественная литература,0,""" И славен буду я""",1
4,149372,83.32Рос6,1986.0,,,319.0,,,Художественная литература,0,""" И тебе я в песне отзовусь...""",9
...,...,...,...,...,...,...,...,...,...,...,...,...
602853,10115,84(4Вел),2013.0,,,416.0,,,Художественная литература,65356,,57
602854,610314,84(2Рос=Рус)6,2003.0,,,191.0,,,Художественная литература,65376,,13
602855,644172,84(2Рос=Рус)6,2003.0,,,287.0,,,Художественная литература,65376,,13
602856,1391565,84(2Рос=Рус)6,1984.0,,,519.0,,,Художественная литература,65454,,8


In [184]:
books.to_csv(f'{PROJECT_FOLDER}/data/books_new.csv', index=False)

In [130]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602858 entries, 0 to 602857
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                602858 non-null  int64  
 1   bbk               518424 non-null  object 
 2   year              527194 non-null  float64
 3   author            420311 non-null  object 
 4   annotation        194681 non-null  object 
 5   volume            495596 non-null  float64
 6   title_additional  33563 non-null   object 
 7   age_restriction   40516 non-null   float64
 8   rubrics           513274 non-null  object 
 9   author_id         602858 non-null  uint16 
 10  title             592833 non-null  object 
 11  available         602858 non-null  int64  
dtypes: float64(3), int64(2), object(6), uint16(1)
memory usage: 51.7+ MB


In [166]:
books_small = books[['id']]

In [171]:
reverse_main_id_dict = {}

for i in tqdm(range(len(books_small))):
    id = books_small.loc[i, ].to_list()[0]

    if id in main_id_dict:
        duplicated, available = main_id_dict[id]
        books.loc[i, 'available'] = sum(available)

        for dup_id in duplicated:
            if dup_id in reverse_main_id_dict:
                raise ValueError()

            reverse_main_id_dict[dup_id] = id

100%|██████████| 602858/602858 [15:39<00:00, 641.49it/s] 


In [174]:
user_book_interactions.reset_index(drop=True, inplace=True)

In [175]:
for i, book_id in tqdm(enumerate(user_book_interactions['book_id'])):
    if book_id in reverse_main_id_dict:
        user_book_interactions.loc[i, 'book_id'] = reverse_main_id_dict[book_id]

583919it [11:26, 850.33it/s] 


In [173]:
len(set(user_book_interactions['book_id']))

193122

In [177]:
len(set(user_book_interactions['book_id']))

166821