In [1]:
import pandas as pd
import data_handler as dh
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
# Input location of the preprocessed Goodreads books table, and the output
# location (csv_path) that the cleaned table is written to at the end of the notebook.
source_csv = '../preprocessed_original/goodreads_books.csv'
csv_path = '../preprocessed_original/used/books.csv'

goodreads_books = pd.read_csv(source_csv)
# goodreads_books.head()

In [3]:
# Work on a deep copy so the frame loaded from disk stays untouched,
# then wrap that copy in the project's DataHandler helper.
goodreads_books_copy = goodreads_books.copy(deep=True)
books_handler = dh.DataHandler(df=goodreads_books_copy) if False else dh.DataHandler(goodreads_books_copy)


In [4]:
# Per-column count of missing values, as reported by the handler (see the output below).
books_handler.null_values()

book_id                     0
work_id                   524
isbn                   983373
series                1621280
title                      11
authors                   537
language_code         1060153
average_rating            524
publication_year       599625
text_reviews_count        524
ratings_count             524
image_url                 490
dtype: int64

# language_code preprocessing

In [5]:
# Occurrence table for language_code before consolidation: first the number of
# distinct codes, then the sum of all occurrence counts.
book_occurances = books_handler.column_occurrences('language_code')
distinct_code_count = len(book_occurances)
total_occurrences = book_occurances.values.sum()
print(distinct_code_count)
print(total_occurrences)

226
1300502


In [6]:
# Spot-check of a single language code: books tagged 'ben' that have at least
# one text review and more than 200 ratings.
# NOTE(review): the result is still called `ukr_books` although the filter selects
# 'ben'; the name is kept unchanged so any other reference to it keeps working.
is_selected_language = goodreads_books_copy['language_code'] == 'ben'
has_text_reviews = goodreads_books_copy['text_reviews_count'] > 0
has_enough_ratings = goodreads_books_copy['ratings_count'] > 200

ukr_books = goodreads_books_copy[is_selected_language & has_text_reviews & has_enough_ratings]
print(ukr_books.shape)

(371, 12)


In [7]:



# Language codes with more than 1000 occurrences are kept, grouped under a main code.
# Each key is a main code; its list holds the codes found in the language_code column
# that are folded into it (e.g. 'en-US' is replaced with 'eng').
language_groups = {
    'eng': ['eng', 'en-US', 'en-GB', 'en-CA'],
    'lat': ['spa', 'rum'],
    'ita': ['ita'],
    'ara': ['ara'],
    'fre': ['fre'],
    'ger': ['ger', 'nl'],
    'mkh': ['ind', 'msa'],
    'por': ['por'],
    'tur': ['tur'],
    'per': ['per'],
    'urj': ['fin', 'hun', 'est'],
    'gre': ['gre'],
    'gmq': ['swe', 'dan', 'nor'],
    'wen': ['cze', 'pol', 'slo'],
    'jpn': ['jpn'],
    'sla': ['rus', 'scr', 'srp', 'bul', 'ukr'],
    'vie': ['vie', 'tha', 'zho'],
    'bat': ['lit', 'lav'],
}

# Invert the grouping into the lookup used by the replace step:
# key = code as it appears in the data, value = main code it is replaced with.
language_code_dict = {}
for main_code, synonyms in language_groups.items():
    for synonym in synonyms:
        language_code_dict[synonym] = main_code


In [8]:
# Replace every code listed in language_code_dict (code in data -> main code)
# through the handler's replace helper.
books_handler.df = books_handler.column_replace_values('language_code', language_code_dict)

# Any value that is neither missing nor one of the main codes becomes 'other'.
# The set of main codes is built once and the membership test is vectorised, so
# language_code_dict.values() is not re-scanned for every row in a Python lambda.
main_language_codes = set(language_code_dict.values())
language_column = books_handler.df['language_code']
keep_as_is = language_column.isin(main_language_codes) | language_column.isna()
books_handler.df['language_code'] = language_column.where(keep_as_is, 'other')


In [9]:
# Occurrence table for language_code after consolidation: number of distinct
# codes, sum of all occurrence counts, and the smallest single count.
book_occurances = books_handler.column_occurrences('language_code')
occurrence_counts = book_occurances.values
print(len(book_occurances))
print(occurrence_counts.sum())
print(occurrence_counts.min())


19
1300502
4176


# work_id/book_id preprocessing

In [10]:
# Continue on an independent copy of the handler's frame and show how many
# values are missing in each column.
df_copy = books_handler.df.copy()
null_counts = df_copy.isna().sum()
null_counts

book_id                     0
work_id                   524
isbn                   983373
series                1621280
title                      11
authors                   537
language_code         1060153
average_rating            524
publication_year       599625
text_reviews_count        524
ratings_count             524
image_url                 490
dtype: int64

In [11]:
# Number of distinct work_id values (nunique ignores missing values by default).
distinct_work_ids = df_copy['work_id'].nunique()
print(distinct_work_ids)

1521962


In [12]:
# Every row whose work_id occurs more than once, ordered by work_id.
# Bound to a name because only the last expression of a cell is displayed;
# as a bare expression this frame would be computed and then thrown away.
duplicate_work_rows = df_copy[df_copy.duplicated(subset=['work_id'], keep=False)].sort_values(by=['work_id'])

# Example: all books recorded under work_id 40.
df_copy[df_copy['work_id'] == 40]

Unnamed: 0,book_id,work_id,isbn,series,title,authors,language_code,average_rating,publication_year,text_reviews_count,ratings_count,image_url
52099,2729856,40.0,067142503X,,Hidden Persuaders,2553.0,,3.91,,3.0,10.0,https://images.gr-assets.com/books/1440119792m...
740235,20696551,40.0,0671810359,,The Hidden Persuaders,2553.0,,3.91,,1.0,1.0,https://s.gr-assets.com/assets/nophoto/book/11...
946094,3730,40.0,097884310X,,The Hidden Persuaders,2553.0,,3.91,2007.0,46.0,615.0,https://images.gr-assets.com/books/1395145591m...
1578727,7826764,40.0,,,The Hidden Persuaders (C-288),2553.0,eng,3.91,1958.0,1.0,2.0,https://s.gr-assets.com/assets/nophoto/book/11...


In [13]:
# Keep one row per work_id: order the rows by ratings_count (highest first), so that
# keeping the first occurrence of each work_id retains the most-rated book.
ranked_by_ratings = df_copy.sort_values('ratings_count', ascending=False)
df_copy = ranked_by_ratings.drop_duplicates('work_id', keep='first')

# Re-check the example work: a single row should remain for work_id 40.
df_copy[df_copy['work_id'] == 40]



Unnamed: 0,book_id,work_id,isbn,series,title,authors,language_code,average_rating,publication_year,text_reviews_count,ratings_count,image_url
946094,3730,40.0,097884310X,,The Hidden Persuaders,2553.0,,3.91,2007.0,46.0,615.0,https://images.gr-assets.com/books/1395145591m...


In [14]:
# Re-check the null counts after de-duplication; nothing is dropped here
# (rows with a null work_id are removed in the next cell).
df_copy.isna().sum()


book_id                     0
work_id                     1
isbn                   615778
series                1151284
title                       9
authors                    13
language_code          733088
average_rating              1
publication_year       390207
text_reviews_count          1
ratings_count               1
image_url                   1
dtype: int64

In [15]:
# Keep only the rows that have a work_id, then confirm the remaining null counts.
has_work_id = df_copy['work_id'].notna()
df_copy = df_copy[has_work_id]
df_copy.isna().sum()

book_id                     0
work_id                     0
isbn                   615777
series                1151283
title                       9
authors                    12
language_code          733087
average_rating              0
publication_year       390206
text_reviews_count          0
ratings_count               0
image_url                   0
dtype: int64

In [16]:
# Write the de-duplicated books table to csv_path, without the DataFrame index column.
df_copy.to_csv(csv_path, index=False)