In [1]:
import time
from datetime import date

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# to ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Read and check datasets

In [2]:
books_df = pd.read_csv('../data/db_data/Books.csv')
users_df = pd.read_csv('../data/db_data/Users.csv')
ratings_df = pd.read_csv('../data/db_data/Ratings.csv')

print(f"books_df.shape = {books_df.shape}")
print(f"users_df.shape = {users_df.shape}")
print(f"ratings_df.shape = {ratings_df.shape}")

books_df.shape = (271358, 6)
users_df.shape = (278858, 3)
ratings_df.shape = (1031136, 3)


In [3]:
books_df['isbn_len'] = books_df['isbn'].apply(lambda x: len(x))
ratings_df['isbn_len'] = ratings_df['isbn'].apply(lambda x: len(x))

# There should be only 1 value - 10
print(f"books_df['isbn_len'].unique() = {books_df['isbn_len'].unique()}")
print(f"ratings_df['isbn_len'].unique() = {ratings_df['isbn_len'].unique()}\n")

books_df.drop('isbn_len', axis=1, inplace=True)
ratings_df.drop('isbn_len', axis=1, inplace=True)

# len(ratings_df) should be equal to 1031136
print(f"len(ratings_df) = {len(ratings_df)}")

books_df['isbn_len'].unique() = [10]
ratings_df['isbn_len'].unique() = [10]

len(ratings_df) = 1031136


## Datasets Analysis for SQL tables

### books_df

In [4]:
books_df.head(2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [5]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271358 entries, 0 to 271357
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   isbn              271358 non-null  object
 1   title             271358 non-null  object
 2   author            271358 non-null  object
 3   publication_year  271358 non-null  int64 
 4   publisher         271356 non-null  object
 5   image_url         271358 non-null  object
dtypes: int64(1), object(5)
memory usage: 12.4+ MB


In [11]:
books_df_filled = books_df.fillna('')

# Determine the maximum length of the record in each column with object data type
max_length_series = books_df_filled.select_dtypes(include=['object']).applymap(len).max()

max_length_series

isbn          10
title        256
author       143
publisher    134
image_url     60
dtype: int64

In [13]:
books_df[books_df['title'].apply(lambda title: len(title) >= 100)]

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url
22,1879384493,If I'd Known Then What I Know Now: Why Not Lea...,J. R. Parrish,2003,Cypress House,http://images.amazon.com/images/P/1879384493.0...
101,0446677450,"Rich Dad, Poor Dad: What the Rich Teach Their ...",Robert T. Kiyosaki,2000,Warner Books,http://images.amazon.com/images/P/0446677450.0...
106,0684822733,"Love, Miracles, and Animal Healing : A heartwa...",Pam Proctor,1996,Fireside,http://images.amazon.com/images/P/0684822733.0...
159,0061030147,Cybill Disobedience: How I Survived Beauty Pag...,Cybill Shepherd,2001,Avon Books,http://images.amazon.com/images/P/0061030147.0...
276,0380730448,The Adrian Mole Diaries : The Secret Diary of ...,Sue Townsend,1997,Perennial,http://images.amazon.com/images/P/0380730448.0...
...,...,...,...,...,...,...
271241,0321196775,A Brief History of Western Civilization : The ...,Mark Kishlansky,2004,Longman,http://images.amazon.com/images/P/0321196775.0...
271283,1584857447,True Stories: Girls' Inspiring Stories of Cour...,Trula Magruder,2003,American Girl,http://images.amazon.com/images/P/1584857447.0...
271319,3453047192,Die amerikanische Zumutung: PlÃ¤doyers gegen d...,Rolf Winter,1990,W. Heyne,http://images.amazon.com/images/P/3453047192.0...
271327,8420614556,Lewis Carroll: A Traves Del Espejo Y Lo Que Al...,Lewis Carroll,1986,Lectorum Pubns (Adult),http://images.amazon.com/images/P/8420614556.0...


In [20]:
example_titles_no = 10
example_long_titles = list(books_df[books_df['title'].apply(lambda title: len(title) >= 200)][:example_titles_no]['title'])

[print(f"{index + 1}) title length: {len(example_long_titles[index])}; title: {example_long_titles[index]}") for index in range(example_titles_no)]

1) title length: 200; title: Frankenstein: Complete, Authoritative Text With Biographical, Historical, and Cultural Contexts, Critical History, and Essays from Contemporary Critic ... tive (Case Studies in Contemporary Criticism)
2) title length: 200; title: The Girlfriends' Guide to Surviving the First Year of Motherhood: Wise and Witty Advice on Everything from Coping With Postpartum Mood Swings to Salva ...  to  Fitting into That Favorite Pair of Jeans
3) title length: 200; title: The House of Mirth: Complete, Authoritative Text With Biographical and Historical Contexts, Critical History, and Essays from Five Contemporary Critic ... ives (Case Studies in Contemporary Criticism)
4) title length: 201; title: The American Country Inn And Bed &amp; Breakfast Cookbook, Volume I : More than 1,700 crowd-pleasing recipes from 500 American Inns (American Country Inn &amp; Bed &amp; Breakfast Cookbook (Hardcover))
5) title length: 204; title: The New Strong's exhaustive concordance of the Bib

[None, None, None, None, None, None, None, None, None, None]

In [22]:
example_authors_no = 5

example_long_authors = list(books_df[books_df['author'].apply(lambda author: len(author) >= 100)][:example_authors_no]['author'])
example_authors_no = len(example_long_authors)

[print(f"{index + 1}) author name length: {len(example_long_authors[index])}; author name: {example_long_authors[index]}") for index in range(example_authors_no)]

1) author name length: 122; author name: Based on the Larger Work'the Book of Garden Flowers'by G.a.R.Phillips Compiled by David Pycraft Illustrated by Joan Lupton
2) author name length: 107; author name: Staceyann Chin, Dot Antoniades, Aileen Reyes, Meaghan Williams, Miriam Stanley, Amy Ouzoonian Kyrce Swenson
3) author name length: 143; author name: Dale D. Johnson, Theodore Clymer, Roselmina Indrisano, Richard L. Venezky, James F. Baumann, Elfrieda Hiebert, and Marian Toth P. David Pearson


[None, None, None]

In [27]:
example_publishers_no = 5

example_long_publishers = list(books_df[books_df['publisher'].apply(lambda publisher: len(publisher) >= 100 if type(publisher) != float else False)][:example_publishers_no]['publisher'])
example_publishers_no = len(example_long_publishers)

[print(f"{index + 1}) publisher name length: {len(example_long_publishers[index])}; publisher name: {example_long_publishers[index]}") for index in range(example_publishers_no)]

1) publisher name length: 121; publisher name: Instituto Nacional de InvestigaÃ§Ã£o CientÃ­fica, Centro de Estudos ClÃ¡ssicos e HumanÃ­sticos da Universidade de Coimbra
2) publisher name length: 134; publisher name: Published by Natural Heritage/Natural History Inc. for the Ontario Heritage Foundation, Ontario Ministry of Culture and Communications
3) publisher name length: 115; publisher name: Narcissus Publications in association with Central Europe Review and Central and East European New Media Initiative


[None, None, None]