In [254]:
import time
from datetime import date

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# to ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Read and check datasets

In [180]:
# ISBNs are identifiers, not numbers: read them explicitly as text so that leading
# zeros and the 'X' check digit are preserved and type inference can never turn
# part of the column into integers.
books_df = pd.read_csv('../data/processed_data/Books.csv', dtype={'isbn': str})
users_df = pd.read_csv('../data/processed_data/Users.csv')
ratings_df = pd.read_csv('../data/processed_data/Ratings.csv', dtype={'isbn': str})

print(f"books_df.shape = {books_df.shape}")
print(f"users_df.shape = {users_df.shape}")
print(f"ratings_df.shape = {ratings_df.shape}")

books_df.shape = (271360, 8)
users_df.shape = (278858, 3)
ratings_df.shape = (1149780, 3)


In [150]:
def summarize_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return one row per column of ``df`` with its missing-value statistics.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        Columns 'column' (name of the inspected column), 'missing_count'
        (number of null cells) and 'missing_percentage' (share of all rows, in %).
    """
    missing_count = df.isnull().sum()  # computed once and reused for the percentage
    return pd.DataFrame({
        'missing_count': missing_count,
        'missing_percentage': (missing_count / len(df)) * 100
    }).reset_index().rename(columns={'index': 'column'})


missing_info_df = summarize_missing(books_df)

# NOTE(review): the original comment read "author - 0, image-url-l - 0"; presumably
# these two columns are expected to have no missing values - confirm.
missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,isbn,0,0.0
1,title,0,0.0
2,author,0,0.0
3,publication_year,0,0.0
4,publisher,2,0.000737
5,image-url-s,0,0.0
6,image-url-m,0,0.0
7,image-url-l,0,0.0


In [119]:
# Sanity check: no book should have a publication year later than the current
# year, so the frame displayed below is expected to be empty.
current_year = date.today().year
books_df.loc[books_df['publication_year'].gt(current_year)]

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l


In [6]:
books_df.head(2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [7]:
users_df.head(2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [8]:
ratings_df.head(2)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5


## Datasets Analysis for SQL tables

### books_df

In [181]:
books_df.head(2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [182]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   isbn              271360 non-null  object
 1   title             271360 non-null  object
 2   author            271360 non-null  object
 3   publication_year  271360 non-null  int64 
 4   publisher         271358 non-null  object
 5   image-url-s       271360 non-null  object
 6   image-url-m       271360 non-null  object
 7   image-url-l       271360 non-null  object
dtypes: int64(1), object(7)
memory usage: 16.6+ MB


In [183]:
# Missing values per column of books_df: absolute count and share of all rows (in %).
books_null_counts = books_df.isnull().sum()
missing_info_df = (
    pd.DataFrame({
        'missing_count': books_null_counts,
        'missing_percentage': (books_null_counts / len(books_df)) * 100,
    })
    .rename_axis('column')  # name the index so reset_index() yields a 'column' field
    .reset_index()
)

missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,isbn,0,0.0
1,title,0,0.0
2,author,0,0.0
3,publication_year,0,0.0
4,publisher,2,0.000737
5,image-url-s,0,0.0
6,image-url-m,0,0.0
7,image-url-l,0,0.0


For the book table of the database, all the fields of the books dataset will be used, except that the three image columns are reduced to a single field 'image_url', which will contain the values of the column 'image-url-m'.

In [184]:
# Keep a single cover-image column: drop the small/large variants and rename the
# medium one to 'image_url'. The frame is rebound instead of mutated in place, and
# errors='ignore' lets the cell be re-run without a KeyError once the columns are
# gone (rename already skips labels that are absent).
books_df = (
    books_df
    .drop(columns=['image-url-s', 'image-url-l'], errors='ignore')
    .rename(columns={'image-url-m': 'image_url'})
)
books_df.head(2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


#### Explore the isbn column

First, we should make sure that all entries in the 'isbn' field have the same length.

In [185]:
# Character count of every ISBN, computed with the vectorised string accessor
# instead of calling a Python lambda once per row.
books_df['isbn_len'] = books_df['isbn'].str.len()
books_df.head(2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,isbn_len
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,10
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,10


In [186]:
print(f"books_df['isbn_len'].unique() = {books_df['isbn_len'].unique()}")

books_df['isbn_len'].unique() = [10 11 13]


In [187]:
# Number of books for each of the ISBN lengths found in the previous cell.
for isbn_length in (10, 11, 13):
    rows_with_length = books_df[books_df['isbn_len'] == isbn_length]
    print(f"len(books_df[books_df['isbn_len'] == {isbn_length}]) = {len(rows_with_length)}")

len(books_df[books_df['isbn_len'] == 10]) = 271356
len(books_df[books_df['isbn_len'] == 11]) = 1
len(books_df[books_df['isbn_len'] == 13]) = 3


Let's consider books whose isbn contains more than 10 characters

In [188]:
books_df[books_df['isbn_len'] == 11]

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,isbn_len
111808,0486404242\t,War in Kind: And Other Poems (Dover Thrift Edi...,Stephen Crane,1998,Dover Publications,http://images.amazon.com/images/P/0486404242.0...,11


In [189]:
books_df[books_df['isbn_len'] == 13]

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,isbn_len
171206,3518365479<90,"Suhrkamp TaschenbÃ?Â¼cher, Nr.47, Frost",Thomas Bernhard,1972,Suhrkamp,http://images.amazon.com/images/P/3518365479.0...,13
251424,3442248027 3,Diamond Age. Die Grenzwelt.,Neal Stephenson,2000,Goldmann,http://images.amazon.com/images/P/3442248027.0...,13
251649,0385722206 0,Balzac and the Little Chinese Seamstress : A N...,DAI SIJIE,2002,Anchor,http://images.amazon.com/images/P/0385722206.0...,13


The obtained results show that such records with len(isbn) > 10 contain data incorrectly processed by the authors of the dataset.  
In order to understand what to do with such books, it is worth considering whether books with the specified isbn are found in the ratings dataset.

In [196]:
# Collect every book ISBN that is not exactly 10 characters long ...
has_wrong_length = books_df['isbn_len'] != 10
incorrect_isbn_set = set(books_df.loc[has_wrong_length, 'isbn'])
print(f"incorrect_isbn_set = {incorrect_isbn_set}\n")

# ... and show the ratings that refer to one of them (vectorised membership test
# instead of a per-row lambda).
ratings_df[ratings_df['isbn'].isin(incorrect_isbn_set)]

incorrect_isbn_set = {'0486404242\t', '0385722206  0', '3518365479<90', '3442248027  3'}



Unnamed: 0,user_id,isbn,rating
443229,106173,3518365479<90,0
624769,151546,3442248027 3,10
647512,156422,0385722206 0,9
680026,165507,3442248027 3,0


##### Additional checking

In [197]:
ratings_df[ratings_df['isbn'] == '0486404242\t']

Unnamed: 0,user_id,isbn,rating


In [198]:
ratings_df[ratings_df['isbn'] == '3518365479<90']

Unnamed: 0,user_id,isbn,rating
443229,106173,3518365479<90,0


In [199]:
ratings_df[ratings_df['isbn'] == '3442248027  3']

Unnamed: 0,user_id,isbn,rating
624769,151546,3442248027 3,10
680026,165507,3442248027 3,0


In [200]:
ratings_df[ratings_df['isbn'] == '0385722206  0']

Unnamed: 0,user_id,isbn,rating
647512,156422,0385722206 0,9


#### Solving the problem with isbn in books_df

To decide what to do with records with incorrect isbns, you need to check whether the books_df and ratings_df datasets have records with corrected isbn versions:
1) If they exist, it is enough to simply delete the corresponding record from the books_df dataset;
2) If not, then it is worth correcting the isbn in books_df and ratings_df.

In [201]:
books_df[books_df['isbn'] == '0486404242']

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,isbn_len
111653,486404242,War in Kind: And Other Poems (Dover Thrift Edi...,Stephen Crane,1998,Dover Publications,http://images.amazon.com/images/P/0486404242.0...,10


In [202]:
ratings_df[ratings_df['isbn'] == '0486404242']

Unnamed: 0,user_id,isbn,rating
50975,11676,486404242,8
262132,60277,486404242,0
262411,60283,486404242,0
389126,93910,486404242,0
444954,106751,486404242,0
608853,147280,486404242,0
617584,149422,486404242,6
835853,201898,486404242,0
932291,226245,486404242,7
944570,228913,486404242,0


Since there is already a book with the correct isbn in books_df and the content of the corresponding entry is exactly the same as the content of the entry with the incorrect isbn, such an entry can be deleted.  
The ratings_df dataset does not contain any records with 'isbn' = '0486404242\t', so it does not need to be changed.

In [203]:
# Remove the duplicate book whose ISBN carries a trailing tab character and
# report the row count before and after.
tab_suffixed_isbn = '0486404242\t'
print(f"len(books_df) before deleting '0486404242\t' = {len(books_df)}")
books_df = books_df.loc[books_df['isbn'].ne(tab_suffixed_isbn)]
print(f"len(books_df) after deleting '0486404242\t' = {len(books_df)}")

len(books_df) before deleting '0486404242	' = 271360
len(books_df) after deleting '0486404242	' = 271359


Books with isbn = '3518365479<90', '3442248027  3' and '0385722206  0' are also found in the ratings_df dataset.  
Let's check whether books with the corrected (10-character) versions of these isbns already exist in the books or ratings datasets.

In [204]:
books_df[books_df['isbn'] == '3518365479']

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,isbn_len


In [205]:
ratings_df[ratings_df['isbn'] == '3518365479']

Unnamed: 0,user_id,isbn,rating


In [206]:
books_df[books_df['isbn'] == '3442248027']

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,isbn_len


In [207]:
ratings_df[ratings_df['isbn'] == '3442248027']

Unnamed: 0,user_id,isbn,rating


In [208]:
books_df[books_df['isbn'] == '0385722206']

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,isbn_len
5248,385722206,Balzac and the Little Chinese Seamstress : A N...,DAI SIJIE,2002,Anchor,http://images.amazon.com/images/P/0385722206.0...,10


In [209]:
ratings_df[ratings_df['isbn'] == '0385722206']

Unnamed: 0,user_id,isbn,rating
1978,277453,0385722206,0
5967,278418,0385722206,0
17018,2977,0385722206,0
22204,4516,0385722206,8
24334,5680,0385722206,0
...,...,...,...
1132923,271705,0385722206,0
1133792,272207,0385722206,0
1143438,274808,0385722206,0
1146737,275970,0385722206,10


Since books with isbn='3518365479' and isbn='3442248027' do not exist in the books and ratings datasets, it is worth correcting the corresponding isbns.  
For the book with isbn='0385722206' there are already records in the books_df and ratings_df datasets, so:
- For the books_df dataset, the record with the incorrect isbn will simply be deleted;
- For ratings_df, the incorrect isbn value will be corrected.

In [210]:
print(f"len(books_df) before deleting '0385722206  0' = {len(books_df)}")
# .copy() makes the filtered result an independent frame: the following cells assign
# columns to books_df, which on a filtered slice triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warnings filter).
books_df = books_df[books_df['isbn'] != '0385722206  0'].copy()
print(f"len(books_df) after deleting '0385722206  0' = {len(books_df)}")

len(books_df) before deleting '0385722206  0' = 271359
len(books_df) after deleting '0385722206  0' = 271358


In [211]:
# Keep only the first 10 characters of every ISBN. Values that are already 10
# characters long are unchanged; the remaining malformed ones lose their suffix.
# Uses the vectorised string accessor instead of a per-row lambda.
books_df['isbn'] = books_df['isbn'].str[:10]
books_df.head(2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,isbn_len
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,10
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,10


In [212]:
# ISBN length after the correction, kept next to the original 'isbn_len' for comparison.
# Vectorised string accessor instead of a per-row lambda.
books_df['correct_isbn_len'] = books_df['isbn'].str.len()
books_df.head(2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,isbn_len,correct_isbn_len
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,10,10
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,10,10


In [215]:
print(f"books_df['correct_isbn_len'].unique() = {books_df['correct_isbn_len'].unique()}\n")
books_df[books_df['isbn_len'] > 10]

books_df['correct_isbn_len'].unique() = [10]



Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,isbn_len,correct_isbn_len
171206,3518365479,"Suhrkamp TaschenbÃ?Â¼cher, Nr.47, Frost",Thomas Bernhard,1972,Suhrkamp,http://images.amazon.com/images/P/3518365479.0...,13,10
251424,3442248027,Diamond Age. Die Grenzwelt.,Neal Stephenson,2000,Goldmann,http://images.amazon.com/images/P/3442248027.0...,13,10


The results show that the data in books_df was successfully corrected.

In [229]:
# The two length columns were only needed for the checks above; remove them again.
helper_columns = ['isbn_len', 'correct_isbn_len']
books_df = books_df.drop(columns=helper_columns)
books_df.head(2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


#### Solving the problem with isbn in ratings_df

In [216]:
# Character count of every ISBN in the ratings, via the vectorised string accessor
# instead of a per-row lambda.
ratings_df['isbn_len'] = ratings_df['isbn'].str.len()
ratings_df.head(2)

Unnamed: 0,user_id,isbn,rating,isbn_len
0,276725,034545104X,0,10
1,276726,0155061224,5,10


In [217]:
print(f"ratings_df['isbn_len'].unique() = {ratings_df['isbn_len'].unique()}")

ratings_df['isbn_len'].unique() = [10  9 11  8 12 13 14]


Since ratings_df has records with len(isbn) IN \[8, 9, 11, 12, 13, 14], the records with isbn IN incorrect_isbn_set will be corrected first, and then all remaining records whose isbn length is not 10 will be removed.

In [219]:
# Ratings that still reference one of the malformed book ISBNs (vectorised membership test).
ratings_df[ratings_df['isbn'].isin(incorrect_isbn_set)]

Unnamed: 0,user_id,isbn,rating,isbn_len
443229,106173,3518365479<90,0,13
624769,151546,3442248027 3,10,13
647512,156422,0385722206 0,9,13
680026,165507,3442248027 3,0,13


In [222]:
# Truncate only the known malformed ISBNs to their first 10 characters; every other
# value is kept as it is. Vectorised mask/.str operations replace the per-row lambdas.
# NOTE(review): truncation could create duplicate (user_id, isbn) pairs if a user also
# rated the correctly spelled ISBN - TODO confirm before loading into the database.
is_known_bad = ratings_df['isbn'].isin(incorrect_isbn_set)
ratings_df['isbn'] = ratings_df['isbn'].mask(is_known_bad, ratings_df['isbn'].str[:10])
ratings_df['correct_isbn_len'] = ratings_df['isbn'].str.len()
ratings_df.head(2)

Unnamed: 0,user_id,isbn,rating,isbn_len,correct_isbn_len
0,276725,034545104X,0,10,10
1,276726,0155061224,5,10,10


In [223]:
print(f"ratings_df['correct_isbn_len'].unique() = {ratings_df['correct_isbn_len'].unique()}")

ratings_df['correct_isbn_len'].unique() = [10  9 11  8 12 13 14]


In [224]:
# After the correction no rating should reference a malformed ISBN any more,
# so this frame is expected to be empty (vectorised membership test).
ratings_df[ratings_df['isbn'].isin(incorrect_isbn_set)]

Unnamed: 0,user_id,isbn,rating,isbn_len,correct_isbn_len


In [225]:
ratings_df[ratings_df['correct_isbn_len'] != ratings_df['isbn_len']]

Unnamed: 0,user_id,isbn,rating,isbn_len,correct_isbn_len
443229,106173,3518365479,0,13,10
624769,151546,3442248027,10,13,10
647512,156422,385722206,9,13,10
680026,165507,3442248027,0,13,10


In [226]:
print(f"len(ratings_df) before cleaning: {len(ratings_df)}")
# Keep only ratings whose ISBN is exactly 10 characters long. .copy() makes the result
# an independent frame, so the in-place column drop in a later cell does not act on a
# filtered slice (pandas' SettingWithCopyWarning is hidden by the global warnings filter).
ratings_df = ratings_df[ratings_df['correct_isbn_len'] == 10].copy()
print(f"len(ratings_df) after cleaning: {len(ratings_df)}\n")
print(f"ratings_df['correct_isbn_len'].unique(): {ratings_df['correct_isbn_len'].unique()}\n")
ratings_df.head(2)

len(ratings_df) before cleaning: 1149780
len(ratings_df) after cleaning: 1139367

ratings_df['correct_isbn_len'].unique(): [10]



Unnamed: 0,user_id,isbn,rating,isbn_len,correct_isbn_len
0,276725,034545104X,0,10,10
1,276726,0155061224,5,10,10


The obtained results show that the ratings_df dataframe cleanup was successful.

In [228]:
# The two length columns were only needed for the clean-up above; remove them again.
length_columns = ['isbn_len', 'correct_isbn_len']
ratings_df = ratings_df.drop(columns=length_columns)
ratings_df.head(2)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5


### users_df

In [230]:
users_df.head(2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [231]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   user_id   278858 non-null  int64  
 1   location  278858 non-null  object 
 2   age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [232]:
users_df.describe()

Unnamed: 0,user_id,age
count,278858.0,168096.0
mean,139429.5,34.751434
std,80499.51502,14.428097
min,1.0,0.0
25%,69715.25,24.0
50%,139429.5,32.0
75%,209143.75,44.0
max,278858.0,244.0


In [233]:
# Missing values per column of users_df: absolute count and share of all rows (in %).
users_null_counts = users_df.isnull().sum()
missing_info_df = (
    pd.DataFrame({
        'missing_count': users_null_counts,
        'missing_percentage': (users_null_counts / len(users_df)) * 100,
    })
    .rename_axis('column')  # name the index so reset_index() yields a 'column' field
    .reset_index()
)

missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,user_id,0,0.0
1,location,0,0.0
2,age,110762,39.719857


The following conclusions can be drawn from the obtained results:
- The number of unique values for the **user_id** field is equal to the number of elements in the dataset and starts with 1, which indicates the fact that the data of the dataset was obtained from a certain database.
- For the "Book Recommendation System" application, information about the place of residence of users is not important (at least for the 1st version of the system), so the database will not have a **location** field.
- Despite the fact that the **age** field has a large number of missing values (~40%), it can still be made a field in the user table (so the user.age field simply will not have the NOT NULL constraint). However, from the database point of view, the 'age' field in the 'user' table:
  - Is not required for recommender system;
  - In the database, it is better to store 'birth_date', not 'age'. Also, on the UI during registration, users enter data about their birthday much more often than their age.
  - Therefore, for the current version of the program, the 'user' table will not store data about the user's age.

### ratings_df

In [234]:
ratings_df.head(2)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5


In [239]:
# Missing values per column of ratings_df: absolute count and share of all rows (in %).
ratings_null_counts = ratings_df.isnull().sum()
missing_info_df = (
    pd.DataFrame({
        'missing_count': ratings_null_counts,
        'missing_percentage': (ratings_null_counts / len(ratings_df)) * 100,
    })
    .rename_axis('column')  # name the index so reset_index() yields a 'column' field
    .reset_index()
)

missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,user_id,0,0.0
1,isbn,0,0.0
2,rating,0,0.0


In [240]:
ratings_df.describe()

Unnamed: 0,user_id,rating
count,1139367.0,1139367.0
mean,140382.3,2.861988
std,80551.53,3.853216
min,2.0,0.0
25%,70379.0,0.0
50%,140974.0,0.0
75%,210959.0,7.0
max,278854.0,10.0


For ratings_df, it is worth checking whether this dataset contains any other users and books, in addition to those specified in the users_df and books_df dataframes.

#### Check users

In [241]:
users_df.head(2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [242]:
# Left-join the ratings onto the users; the '_merge' indicator column records for
# every rating whether its user_id was found in users_df.
merged_df = pd.merge(ratings_df, users_df, how='left', on='user_id', indicator=True)

# Ratings whose user_id has no counterpart in users_df
missing_users_df = merged_df.loc[merged_df['_merge'].eq('left_only')]

# Distinct ids of those users
missing_user_ids = missing_users_df['user_id'].unique()

print(f"User IDs in ratings_df but not in users_df: {missing_user_ids}")

User IDs in ratings_df but not in users_df: []


##### An alternative method of checking

In [243]:
unique_users_df_ids = set(users_df['user_id'].unique())

print(f"len(users_df) = {len(users_df)}")
print(f"len(unique_users_df_ids) = {len(unique_users_df_ids)}")

len(users_df) = 278858
len(unique_users_df_ids) = 278858


In [244]:
# Ratings whose user_id is not among the known users; expected to be empty.
# Vectorised membership test instead of a per-row lambda.
ratings_df[~ratings_df['user_id'].isin(unique_users_df_ids)]

Unnamed: 0,user_id,isbn,rating


The obtained results show that ratings_df contains ratings of only those users who are present in users_df.   
This is important, because when transferring data to the database, the rating table will have the user_id field as FK, and if there is no corresponding user in the user table, an error will be received.

#### Check books

In [245]:
# Left-join the ratings onto the books to find isbns in ratings_df that are not in
# books_df; the '_merge' indicator column records whether a match was found.
merged_df = pd.merge(ratings_df, books_df, how='left', on='isbn', indicator=True)

# Ratings whose isbn has no counterpart in books_df
missing_books_df = merged_df.loc[merged_df['_merge'].eq('left_only')]

# Distinct isbns of those books
missing_book_isbns = missing_books_df['isbn'].unique()

print(f"Book isbns in ratings_df but not in books_df: {missing_book_isbns}")

Book isbns in ratings_df but not in books_df: ['3257224281' '0600570967' '3442437407' ... '0140260676' '0385258259'
 '0670849871']


In [247]:
unique_ratings_df_isbns = ratings_df['isbn'].unique()

print(f"len(unique_ratings_df_isbns) = {len(unique_ratings_df_isbns)}")
print(f"len(missing_book_isbns) = {len(missing_book_isbns)}")

len(unique_ratings_df_isbns) = 332289
len(missing_book_isbns) = 62139


Let's check, using the example of several books, whether there are really books in the dataframe ratings_df, about which there is no data in books_df.

In [248]:
missing_book_isbns[:5]

array(['3257224281', '0600570967', '3442437407', '033390804X',
       '8440682697'], dtype=object)

In [249]:
books_df[books_df['isbn'] == '3257224281']

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url


In [251]:
books_df[books_df['isbn'] == '0600570967']

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url


In [250]:
# Books whose isbn appears among the "missing" isbns; expected to be empty.
# missing_book_isbns is a NumPy array, so a per-row `in` test scans the whole array for
# every book; Series.isin performs the same membership check in one vectorised pass.
books_df[books_df['isbn'].isin(missing_book_isbns)]

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url


The obtained results show that there are books in the dataframe ratings_df, about which there is no data in books_df.

##### An alternative method of checking

In [252]:
unique_books_df_isbns = set(books_df['isbn'].unique())

print(f"len(books_df) = {len(books_df)}")
print(f"len(unique_books_df_isbns) = {len(unique_books_df_isbns)}")

len(books_df) = 271358
len(unique_books_df_isbns) = 271358


In [257]:
print(f"len(ratings_df) = {len(ratings_df)}")
print(f"len(ratings_df[ratings_df['isbn'].isin(unique_books_df_isbns)]) = {len(ratings_df[ratings_df['isbn'].isin(unique_books_df_isbns)])}")

len(ratings_df) = 1139367
len(ratings_df[ratings_df['isbn'].isin(unique_books_df_isbns)]) = 1031136


In [259]:
ratings_df[~ratings_df['isbn'].isin(unique_books_df_isbns)].head()

Unnamed: 0,user_id,isbn,rating
6,276736,3257224281,8
7,276737,0600570967,6
25,276748,3442437407,0
26,276751,033390804X,0
30,276760,8440682697,10


In [262]:
books_df[books_df['isbn'] == '3257224281']

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url


In [261]:
missing_books_df = ratings_df[~ratings_df['isbn'].isin(unique_books_df_isbns)]

# Extract the unique isbn values
missing_book_isbns = missing_books_df['isbn'].unique()

print(f"len(missing_book_isbns): {len(missing_book_isbns)}")
print(f"Book isbns in ratings_df but not in books_df: {missing_book_isbns}")

len(missing_book_isbns): 62139
Book isbns in ratings_df but not in books_df: ['3257224281' '0600570967' '3442437407' ... '0140260676' '0385258259'
 '0670849871']


#### Solve the problem with books

Since in the database the rating table will have the isbn field as a FK to the isbn field of the book table, it is necessary that the ratings_df dataframe contains only those books that are available in books_df.  
Therefore, we need to edit the ratings_df dataframe accordingly.

In [263]:
unique_books_df_isbns = set(books_df['isbn'].unique())

print(f"len(books_df) = {len(books_df)}")
print(f"len(unique_books_df_isbns) = {len(unique_books_df_isbns)}")

len(books_df) = 271358
len(unique_books_df_isbns) = 271358


In [264]:
# Filter ratings_df to include only rows where isbn is present in books_df.
# .copy() makes the result an independent frame, so the column assignment in the
# validation cell below does not act on a filtered slice (pandas'
# SettingWithCopyWarning is hidden by the global warnings filter).
print(f"len(ratings_df) before filtering: {len(ratings_df)}")
ratings_df = ratings_df[ratings_df['isbn'].isin(unique_books_df_isbns)].copy()
print(f"len(ratings_df) after filtering: {len(ratings_df)}")

len(ratings_df) before filtering: 1139367
len(ratings_df) after filtering: 1031136


#### Additional check of the 'rating' column

In [265]:
ratings_df.head(2)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5


Let's check the range of values in which the values of the 'rating' column lie

In [267]:
print(f"ratings_df['rating'].min() = {ratings_df['rating'].min()}; ratings_df['rating'].max() = {ratings_df['rating'].max()}")

ratings_df['rating'].min() = 0; ratings_df['rating'].max() = 10


### Save results into separate files

#### Additional validation of datasets before saving

In [271]:
# Final ISBN-length check. The lengths are computed on the fly, so the frames that are
# about to be saved are not modified by adding and dropping a temporary column.
books_isbn_lengths = books_df['isbn'].str.len().unique()
ratings_isbn_lengths = ratings_df['isbn'].str.len().unique()

# There should be only 1 value - 10
print(f"books_df['isbn_len'].unique() = {books_isbn_lengths}")
print(f"ratings_df['isbn_len'].unique() = {ratings_isbn_lengths}\n")

# Enforce the expectation instead of relying on a visual check of the printed output.
assert set(books_isbn_lengths) == {10}, "books_df contains isbns that are not 10 characters long"
assert set(ratings_isbn_lengths) == {10}, "ratings_df contains isbns that are not 10 characters long"

# len(ratings_df) should be equal to 1031136
print(f"len(ratings_df) = {len(ratings_df)}")

books_df['isbn_len'].unique() = [10]
ratings_df['isbn_len'].unique() = [10]

len(ratings_df) = 1031136


In [272]:
books_df.head(2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [273]:
ratings_df.head(2)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5


#### Save results into separate files

In [274]:
books_df.to_csv('../data/db_data/Books.csv', index=False)
# Read the file back with ISBNs as text, so the round-trip check compares like with like
# and type inference cannot turn the column (or part of it) into integers.
check_books_df = pd.read_csv('../data/db_data/Books.csv', dtype={'isbn': str})

print(f"books_df.shape = {books_df.shape}; check_books_df.shape = {check_books_df.shape}")
check_books_df.head(3)

books_df.shape = (271358, 6); check_books_df.shape = (271358, 6)


Unnamed: 0,isbn,title,author,publication_year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...


In [275]:
# Write the users to the database input folder and read the file back to compare shapes.
# NOTE(review): users_df still contains 'location' and 'age', although the analysis above
# concludes that the user table will not store them - confirm whether they should be exported.
users_csv_path = '../data/db_data/Users.csv'
users_df.to_csv(users_csv_path, index=False)
check_users_df = pd.read_csv(users_csv_path)

shape_report = f"users_df.shape = {users_df.shape}; check_users_df.shape = {check_users_df.shape}"
print(shape_report)
check_users_df.head(3)

users_df.shape = (278858, 3); check_users_df.shape = (278858, 3)


Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [276]:
ratings_df.to_csv('../data/db_data/Ratings.csv', index=False)
# Read the file back with ISBNs as text, so the round-trip check compares like with like
# and type inference cannot turn the column (or part of it) into integers.
check_ratings_df = pd.read_csv('../data/db_data/Ratings.csv', dtype={'isbn': str})

print(f"ratings_df.shape = {ratings_df.shape}; check_ratings_df.shape = {check_ratings_df.shape}")
check_ratings_df.head(3)

ratings_df.shape = (1031136, 3); check_ratings_df.shape = (1031136, 3)


Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [277]:
# Repeat the ISBN-length check on the frames read back from disk, using the vectorised
# string accessor instead of a per-row lambda.
check_books_df['isbn_len'] = check_books_df['isbn'].str.len()
check_ratings_df['isbn_len'] = check_ratings_df['isbn'].str.len()

# There should be only 1 value - 10
print(f"check_books_df['isbn_len'].unique() = {check_books_df['isbn_len'].unique()}")
print(f"check_ratings_df['isbn_len'].unique() = {check_ratings_df['isbn_len'].unique()}\n")

# len(ratings_df) should be equal to 1031136
print(f"len(check_ratings_df) = {len(check_ratings_df)}")

check_books_df['isbn_len'].unique() = [10]
check_ratings_df['isbn_len'].unique() = [10]

len(check_ratings_df) = 1031136
