In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# to ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Context

## Functions

### Fix dataset errors

In [38]:
# Function to fix the erroneous rows
def fix_row(row):
    """Repair a books record whose title and author were merged into 'title'.

    In the malformed records the 'title' field holds the title, a backslash,
    a double quote, a ';', the author and a closing double quote, and every
    following value sits one column too far to the left.

    The repair:
      1. gives 'publication_year', 'publisher' and the three image-url
         columns the value of the column immediately before them in ``row``;
      2. splits 'title' on ';' -- the first part without its last two
         characters becomes the title, the second part without its last
         character becomes the author.

    Parameters
    ----------
    row : pandas.Series
        One record, as passed by ``DataFrame.apply(fix_row, axis=1)``.

    Returns
    -------
    pandas.Series
        A repaired copy of ``row``; the input Series is left unchanged.

    Raises
    ------
    IndexError
        If 'title' contains no ';' (for example, an already repaired row).
    """
    # pandas does not support functions that mutate the object handed to
    # apply(), so every change is made on a copy.
    fixed = row.copy()

    # Shift once, on the untouched input, so the result does not depend on
    # the order in which the columns are listed below.
    shifted = row.shift(1)
    for col in ['image-url-l', 'image-url-m', 'image-url-s', 'publisher', 'publication_year']:
        fixed[col] = shifted[col]

    title_author_split = row['title'].split(';')
    # Drop the two trailing characters (backslash + quote) left after the title.
    fixed['title'] = title_author_split[0][:-2]
    # Drop the closing quote left after the author.
    fixed['author'] = title_author_split[1][:-1]
    return fixed

## Read the datasets, view several records and rename columns

In [6]:
# Load the three raw CSV files (paths are relative to the notebook's location).
books_df, users_df, ratings_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/original_data/Books.csv',
        '../data/original_data/Users.csv',
        '../data/original_data/Ratings.csv',
    )
)

# Report the (rows, columns) size of each frame right after loading.
for frame_name, frame in (
    ("books_df", books_df),
    ("users_df", users_df),
    ("ratings_df", ratings_df),
):
    print(f"{frame_name}.shape = {frame.shape}")

books_df.shape = (271360, 8)
users_df.shape = (278858, 3)
ratings_df.shape = (1149780, 3)


### books_df

In [7]:
books_df.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [8]:
books_df.tail(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...
271359,0767409752,A Guided Tour of Rene Descartes' Meditations o...,Christopher Biffle,2000,McGraw-Hill Humanities/Social Sciences/Languages,http://images.amazon.com/images/P/0767409752.0...,http://images.amazon.com/images/P/0767409752.0...,http://images.amazon.com/images/P/0767409752.0...


#### Rename columns

Since the original column names are quite long, they are inconvenient to use in code, so the columns were renamed to short lower-case names. Most of them follow the 'snake_case' naming convention; the image URL columns are only lower-cased and keep their hyphens (e.g. 'image-url-s').

In [9]:
# Original Books.csv header -> short lower-case name used in the rest of the notebook.
books_column_names = {
    "ISBN": "isbn",
    "Book-Title": "title",
    "Book-Author": "author",
    "Year-Of-Publication": "publication_year",
    "Publisher": "publisher",
    "Image-URL-S": "image-url-s",
    "Image-URL-M": "image-url-m",
    "Image-URL-L": "image-url-l",
}
books_df.rename(columns=books_column_names, inplace=True)
books_df.head(3)

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


### users_df

In [10]:
users_df.head(3)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [11]:
users_df.tail(3)

Unnamed: 0,User-ID,Location,Age
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",
278857,278858,"dublin, n/a, ireland",


In [12]:
# Original Users.csv header -> short lower-case name used in the rest of the notebook.
users_column_names = {
    "User-ID": "user_id",
    "Location": "location",
    "Age": "age",
}
users_df.rename(columns=users_column_names, inplace=True)
users_df.head(3)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


### ratings_df

In [13]:
ratings_df.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [14]:
ratings_df.tail(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
1149777,276709,515107662,10
1149778,276721,590442449,10
1149779,276723,5162443314,8


In [15]:
# Original Ratings.csv header -> short lower-case name used in the rest of the notebook.
ratings_column_names = {
    "User-ID": "user_id",
    "ISBN": "isbn",
    "Book-Rating": "rating",
}
ratings_df.rename(columns=ratings_column_names, inplace=True)
ratings_df.head(3)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


# Data quality assessment

## Missing data examination

### Determine the number of missing values for each column

#### books_df

In [16]:
print(f"books_df.shape = {books_df.shape}")
books_df.info()

books_df.shape = (271360, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   isbn              271360 non-null  object
 1   title             271360 non-null  object
 2   author            271358 non-null  object
 3   publication_year  271360 non-null  object
 4   publisher         271358 non-null  object
 5   image-url-s       271360 non-null  object
 6   image-url-m       271360 non-null  object
 7   image-url-l       271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [17]:
# Per-column null summary for books_df: absolute count and share of all rows (in %).
books_null_counts = books_df.isnull().sum()
missing_info_df = (
    pd.DataFrame({
        'missing_count': books_null_counts,
        'missing_percentage': books_null_counts / len(books_df) * 100,
    })
    .reset_index()
    .rename(columns={'index': 'column'})
)

missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,isbn,0,0.0
1,title,0,0.0
2,author,2,0.000737
3,publication_year,0,0.0
4,publisher,2,0.000737
5,image-url-s,0,0.0
6,image-url-m,0,0.0
7,image-url-l,3,0.001106


##### Fill NaN values for 'author' column 

In [18]:
books_df[books_df['author'].isnull()]

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
118033,751352497,A+ Quiz Masters:01 Earth,,1999,Dorling Kindersley,http://images.amazon.com/images/P/0751352497.0...,http://images.amazon.com/images/P/0751352497.0...,http://images.amazon.com/images/P/0751352497.0...
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...


<img src="books_dataset_EDA_images/fillna_books_A_quiz_masters.png">

In [19]:
# 118033 is the index label of the 'A+ Quiz Masters:01 Earth' record listed above with a
# missing author; the placeholder string 'Unknown' is stored instead of NaN.
books_df.loc[118033, 'author'] = 'Unknown'

<img src="books_dataset_EDA_images/fillna_books_The_Credit_Suisse_Guide.png">

In [20]:
books_df[books_df['author'].str.contains('Larissa Anne', case=False, na=False)]

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l


In [21]:
# 187689 is the index label of the second record listed above with a missing author
# (isbn 9627982032); its author is filled in by hand.
books_df.loc[187689, 'author'] = 'Downes, Larissa Anne'

In [22]:
books_df[books_df['author'].isnull()]

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l


In [23]:
# Re-display the two records whose author was filled in above, selected by ISBN.
books_df[books_df['isbn'].isin(['0751352497', '9627982032'])]

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
118033,751352497,A+ Quiz Masters:01 Earth,Unknown,1999,Dorling Kindersley,http://images.amazon.com/images/P/0751352497.0...,http://images.amazon.com/images/P/0751352497.0...,http://images.amazon.com/images/P/0751352497.0...
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,"Downes, Larissa Anne",1995,Edinburgh Financial Publishing,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...


#### users_df

In [24]:
print(f"users_df.shape = {users_df.shape}")
users_df.info()

users_df.shape = (278858, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   user_id   278858 non-null  int64  
 1   location  278858 non-null  object 
 2   age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [25]:
# Per-column null summary for users_df: absolute count and share of all rows (in %).
users_null_counts = users_df.isnull().sum()
missing_info_df = (
    pd.DataFrame({
        'missing_count': users_null_counts,
        'missing_percentage': users_null_counts / len(users_df) * 100,
    })
    .reset_index()
    .rename(columns={'index': 'column'})
)

missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,user_id,0,0.0
1,location,0,0.0
2,age,110762,39.719857


Given the fact that the 'age' column contains ~40% missing values and is not intended to be used to build a recommendation model, it can be removed from the dataset.

#### ratings_df

In [26]:
print(f"ratings_df.shape = {ratings_df.shape}")
ratings_df.info()

ratings_df.shape = (1149780, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1149780 non-null  int64 
 1   isbn     1149780 non-null  object
 2   rating   1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [27]:
# Per-column null summary for ratings_df: absolute count and share of all rows (in %).
ratings_null_counts = ratings_df.isnull().sum()
missing_info_df = (
    pd.DataFrame({
        'missing_count': ratings_null_counts,
        'missing_percentage': ratings_null_counts / len(ratings_df) * 100,
    })
    .reset_index()
    .rename(columns={'index': 'column'})
)

missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,user_id,0,0.0
1,isbn,0,0.0
2,rating,0,0.0


## Explore the data type and contents of each field

### Assess data type of each field

#### books_df

In [28]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   isbn              271360 non-null  object
 1   title             271360 non-null  object
 2   author            271360 non-null  object
 3   publication_year  271360 non-null  object
 4   publisher         271358 non-null  object
 5   image-url-s       271360 non-null  object
 6   image-url-m       271360 non-null  object
 7   image-url-l       271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [29]:
# For every object-dtype column, take the Python type of its first value as the
# reference and report whether all values (and, failing that, all non-null values)
# are instances of that type.
for column in books_df.select_dtypes(include='object').columns:
    column_values = books_df[column]
    column_type = type(column_values.iloc[0])
    print(f"type({column}) = {column_type}")

    all_entries_are_same_type = column_values.apply(lambda value: isinstance(value, column_type)).all()
    print(f"All entries of the column {column} have the type {column_type}: {all_entries_are_same_type}")

    if not all_entries_are_same_type:
        # Nulls are not instances of the reference type, so repeat the check without them.
        non_null_values = column_values[column_values.notnull()]
        all_entries_non_null_are_same_type = non_null_values.apply(lambda value: isinstance(value, column_type)).all()
        print(f"All non-null entries of the column {column} have the type {column_type}: {all_entries_non_null_are_same_type}")
    print("")

type(isbn) = <class 'str'>
All entries of the column isbn have the type <class 'str'>: True

type(title) = <class 'str'>
All entries of the column title have the type <class 'str'>: True

type(author) = <class 'str'>
All entries of the column author have the type <class 'str'>: True

type(publication_year) = <class 'int'>
All entries of the column publication_year have the type <class 'int'>: False
All non-null entries of the column publication_year have the type <class 'int'>: False

type(publisher) = <class 'str'>
All entries of the column publisher have the type <class 'str'>: False
All non-null entries of the column publisher have the type <class 'str'>: True

type(image-url-s) = <class 'str'>
All entries of the column image-url-s have the type <class 'str'>: True

type(image-url-m) = <class 'str'>
All entries of the column image-url-m have the type <class 'str'>: True

type(image-url-l) = <class 'str'>
All entries of the column image-url-l have the type <class 'str'>: False
All no

The results show that every column has the object data type.  
For the **'publication_year'** column, the **int** data type would be more appropriate than **object**; the check above also shows that its values are not all of the same Python type.  
For the other columns, the results are as expected, because each of them stores string data.

##### Explore 'publication_year' column

Let's look at the records of the 'publication_year' column (originally 'Year-Of-Publication') to find out why not all of its non-null entries have the type 'int'.

In [30]:
books_df.shape

(271360, 8)

In [31]:
# Rows whose 'publication_year' value is not a Python int.
year_is_python_int = books_df['publication_year'].apply(lambda value: isinstance(value, int))
books_df[~year_is_python_int]

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
196608,0425176967,Self-Portrait With Ghosts,Kelly Dwyer,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176967.0...,http://images.amazon.com/images/P/0425176967.0...,http://images.amazon.com/images/P/0425176967.0...
196609,0373239971,Baby'S First Christmas (The Baby Of The Month ...,Marie Ferrarella,1995,Silhouette,http://images.amazon.com/images/P/0373239971.0...,http://images.amazon.com/images/P/0373239971.0...,http://images.amazon.com/images/P/0373239971.0...
196610,0373201613,Man She Married (By Request) (By Request),Ann Major,1999,Harlequin,http://images.amazon.com/images/P/0373201613.0...,http://images.amazon.com/images/P/0373201613.0...,http://images.amazon.com/images/P/0373201613.0...
196611,159071024X,The Gilded Chamber,Rebecca Kohn,2004,Rugged Land,http://images.amazon.com/images/P/159071024X.0...,http://images.amazon.com/images/P/159071024X.0...,http://images.amazon.com/images/P/159071024X.0...
196612,0066210747,The Bielski Brothers: The True Story of Three ...,Peter Duffy,2003,HarperCollins,http://images.amazon.com/images/P/0066210747.0...,http://images.amazon.com/images/P/0066210747.0...,http://images.amazon.com/images/P/0066210747.0...
...,...,...,...,...,...,...,...,...
262139,0897332032,"The Age of Agony: The Art of Healing, 1700-1800",Guy R. Williams,1986,Academy Chicago Publishers,http://images.amazon.com/images/P/0897332032.0...,http://images.amazon.com/images/P/0897332032.0...,http://images.amazon.com/images/P/0897332032.0...
262140,0897332857,The Age of Miracles,Guy Williams,1987,Academy Chicago Publishers,http://images.amazon.com/images/P/0897332857.0...,http://images.amazon.com/images/P/0897332857.0...,http://images.amazon.com/images/P/0897332857.0...
262141,0897333918,The Common Stream: Two Thousand Years of the E...,Rowland Parker,1994,Academy Chicago Publishers,http://images.amazon.com/images/P/0897333918.0...,http://images.amazon.com/images/P/0897333918.0...,http://images.amazon.com/images/P/0897333918.0...
262142,0906540631,The Essene Odyssey: The Mystery of the True Te...,Hugh J. Schonfield,1993,Element Books,http://images.amazon.com/images/P/0906540631.0...,http://images.amazon.com/images/P/0906540631.0...,http://images.amazon.com/images/P/0906540631.0...


From the output, we can see that some records contain an int value for the 'publication_year' column, but have a data type of str.  
Before casting the 'publication_year' column to type int, we need to check that all records contain numeric values in that column.

In [39]:
# Use the string form of every year so the .str accessor works on the whole column.
books_df['publication_year'] = books_df['publication_year'].astype(str)

# Keep the records whose year string contains anything other than digits.
year_is_all_digits = books_df['publication_year'].str.isdigit()
non_integer_records = books_df[~year_is_all_digits]
print(f"len(non_integer_records) = {len(non_integer_records)}")

len(non_integer_records) = 3


In [40]:
non_integer_records

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,


In [41]:
# Print the full (untruncated) title of each malformed record with its index label and ISBN.
for index_label, isbn_value, title_value in zip(
    non_integer_records.index,
    non_integer_records['isbn'],
    non_integer_records['title'],
):
    print(f"{index_label}) isbn={isbn_value}; title={title_value}")

209538) isbn=078946697X; title=DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\";Michael Teitelbaum"
220731) isbn=2070426769; title=Peuple du ciel, suivi de 'Les Bergers\";Jean-Marie Gustave Le ClÃ?Â©zio"
221678) isbn=0789466953; title=DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\";James Buckley"


From the obtained results, it can be concluded that the dataset contains errors (for 3 records, 'title' and 'author' are separated by the sign ';', not ',').  
Therefore, the 'title' column for these records contains both the title of the book and its author.

In [42]:
# Preview the repair on the extracted rows only; this cell does not assign to books_df.
# The name non_integer_records is rebound here: from this point on it holds the repaired
# rows, and later cells read its index to locate the same records in books_df.
non_integer_records = non_integer_records.apply(fix_row, axis=1)
non_integer_records

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",Michael Teitelbaum,2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers",Jean-Marie Gustave Le ClÃ?Â©zio,2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",James Buckley,2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...


In [43]:
# Print the full title of each record again, now after fix_row has been applied.
for index_label, isbn_value, title_value in zip(
    non_integer_records.index,
    non_integer_records['isbn'],
    non_integer_records['title'],
):
    print(f"{index_label}) isbn={isbn_value}; title={title_value}")

209538) isbn=078946697X; title=DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)
220731) isbn=2070426769; title=Peuple du ciel, suivi de 'Les Bergers
221678) isbn=0789466953; title=DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)


In [44]:
error_indices = non_integer_records.index
error_indices

Index([209538, 220731, 221678], dtype='int64')

In [45]:
# Repair the malformed records and write them back into books_df.
# Re-running this cell on already repaired rows raises IndexError in fix_row,
# because their titles no longer contain ';'.
repaired_rows = books_df.loc[error_indices].apply(fix_row, axis=1)
books_df.loc[error_indices] = repaired_rows

# Repeat the digit check: no record should have a non-numeric year any more.
year_is_all_digits = books_df['publication_year'].str.isdigit()
non_integer_records = books_df[~year_is_all_digits]
print(f"len(non_integer_records) = {len(non_integer_records)}")

len(non_integer_records) = 0


In [48]:
books_df.head(3)

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [49]:
books_df.loc[error_indices]

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",Michael Teitelbaum,2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers",Jean-Marie Gustave Le ClÃ?Â©zio,2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",James Buckley,2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...


From the obtained results, it can be assumed that all values of the 'publication_year' column can now be cast to the int data type.

In [46]:
# Cast the year strings (all digits after the repair above) to integers.
# NOTE(review): astype(int) picks the default integer width -- int32 in the output
# recorded below; presumably int64 in other environments, confirm if the width matters.
books_df['publication_year'] = books_df['publication_year'].astype(int)
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   isbn              271360 non-null  object
 1   title             271360 non-null  object
 2   author            271360 non-null  object
 3   publication_year  271360 non-null  int32 
 4   publisher         271358 non-null  object
 5   image-url-s       271360 non-null  object
 6   image-url-m       271360 non-null  object
 7   image-url-l       271360 non-null  object
dtypes: int32(1), object(7)
memory usage: 15.5+ MB


In [47]:
# Recompute the per-column null summary for books_df after the repair and the int cast.
books_null_counts = books_df.isnull().sum()
missing_info_df = (
    pd.DataFrame({
        'missing_count': books_null_counts,
        'missing_percentage': books_null_counts / len(books_df) * 100,
    })
    .reset_index()
    .rename(columns={'index': 'column'})
)

missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,isbn,0,0.0
1,title,0,0.0
2,author,0,0.0
3,publication_year,0,0.0
4,publisher,2,0.000737
5,image-url-s,0,0.0
6,image-url-m,0,0.0
7,image-url-l,0,0.0


The results show that solving the problem with the 'publication_year' column also solved the problem with the missing values of the 'image-url-l' column.

#### users_df

In [50]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   user_id   278858 non-null  int64  
 1   location  278858 non-null  object 
 2   age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


#### ratings_df

In [51]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1149780 non-null  int64 
 1   isbn     1149780 non-null  object
 2   rating   1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


### Check for duplication

#### books_df

In [52]:
print(f"books_df.shape = {books_df.shape}")
books_df.nunique()

books_df.shape = (271360, 8)


isbn                271360
title               242135
author              102021
publication_year       116
publisher            16804
image-url-s         271044
image-url-m         271044
image-url-l         271044
dtype: int64

From the obtained results it can be concluded that:
1) The dataset contains books that have the same title but different ISBNs.
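2) The image URL columns have fewer unique values (271,044) than there are records (271,360), so some records share the same cover image URL; the count alone does not show whether a cover image is actually missing.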
3) Since each record is characterized by a unique ISBN value, it could be set as a dataframe index. However, in the future, this column will be used to join dataframes, so making it a dataframe index makes no sense.

#### users_df

In [53]:
print(f"users_df.shape = {users_df.shape}")
users_df.nunique()

users_df.shape = (278858, 3)


user_id     278858
location     57339
age            165
dtype: int64

Since each record in users_df is characterized by a unique 'user_id' value, it could be set as a dataframe index.  
However, in the future, this column will be used to join dataframes, so making it a dataframe index makes no sense.

#### ratings_df 

In [54]:
print(f"ratings_df.shape = {ratings_df.shape}")
ratings_df.nunique()

ratings_df.shape = (1149780, 3)


user_id    105283
isbn       340556
rating         11
dtype: int64

The obtained results show that:
1) The number of unique 'user_id' values in ratings_df (105,283) is smaller than in users_df (278,858). This means that not all users rated books.
2) The number of unique ISBNs in ratings_df (340,556) is greater than in books_df (271,360). This means that ratings_df contains ratings for books that are not present in books_df.

### Analyse statistics summary

#### books_df

In [55]:
books_df.describe(include='all')

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
count,271360.0,271360,271360,271360.0,271358,271360,271360,271360
unique,271360.0,242135,102021,,16804,271044,271044,271044
top,195153448.0,Selected Poems,Agatha Christie,,Harlequin,http://images.amazon.com/images/P/185326119X.0...,http://images.amazon.com/images/P/185326119X.0...,http://images.amazon.com/images/P/185326119X.0...
freq,1.0,27,632,,7535,2,2,2
mean,,,,1959.761273,,,,
std,,,,257.992836,,,,
min,,,,0.0,,,,
25%,,,,1989.0,,,,
50%,,,,1995.0,,,,
75%,,,,2000.0,,,,


The following conclusions can be drawn from the obtained results:
1) The fact that the title 'Selected Poems' appears 27 times in the books_df dataframe confirms the fact that it contains books with the same title but different ISBNs.
2) The largest number of books in the dataset is by Agatha Christie (632 books).
3) The fact that a certain URL occurs 2 times may indicate the presence of duplicates or errors in the dataset, since each book should have an individual cover image.

#### users_df

In [56]:
users_df.describe(include='all')

Unnamed: 0,user_id,location,age
count,278858.0,278858,168096.0
unique,,57339,
top,,"london, england, united kingdom",
freq,,2506,
mean,139429.5,,34.751434
std,80499.51502,,14.428097
min,1.0,,0.0
25%,69715.25,,24.0
50%,139429.5,,32.0
75%,209143.75,,44.0


#### ratings_df

In [57]:
ratings_df.describe(include='all')

Unnamed: 0,user_id,isbn,rating
count,1149780.0,1149780.0,1149780.0
unique,,340556.0,
top,,971880107.0,
freq,,2502.0,
mean,140386.4,,2.86695
std,80562.28,,3.854184
min,2.0,,0.0
25%,70345.0,,0.0
50%,141010.0,,0.0
75%,211028.0,,7.0


The obtained results show that:
1) The value of the 'rating' column varies in the range \[0; 10].
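2) The average value of the 'rating' column is ~2.87/10. At first glance this suggests that users mostly gave poor ratings, but the median is 0, i.e. at least half of the records have a rating of 0, so the low mean is driven by zeros. If 0 denotes an implicit interaction rather than an explicit score (as stated in the Book-Crossing dataset description — to be confirmed for this copy of the data), the conclusion should be re-checked on the non-zero ratings only.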