# Goodreads EDA and Recommendations Algorithm Development

## Setup
---

In [70]:
import getpass
import pandas as pd
import tensorflow as tf
from pymongo import MongoClient

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [71]:
password = getpass.getpass("MongoDB password: ")

MongoDB password:  ········


## Pull Goodreads Data
---

In [72]:
client = MongoClient(f'mongodb://book_group:{password}@macragge.reika.io:47017/?authSource=books')

In [73]:
db = client['books']
collection = db['books']

In [74]:
# Fetch data from MongoDB
data = list(collection.find(limit=10000))  # Retrieve all documents as a list of dictionaries

In [75]:
# Convert to Pandas DataFrame
df = pd.DataFrame(data)

In [76]:
client.close()

## Data Preprocessing & Exploration
---
### Understand the Data

In [77]:
# data preview
df.head()

Unnamed: 0,_id,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,66da49047084538b3e00f9c2,312853122.0,1,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",,False,4.0,...,9.0,,1984.0,https://www.goodreads.com/book/show/5333265-w-...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film
1,66da49047084538b3e00f9c3,743509986.0,6,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",,False,3.23,...,10.0,Abridged,2001.0,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,1333909,10,1323437,Good Harbor,Good Harbor
2,66da49047084538b3e00f9c4,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,...,,Book Club Edition,1987.0,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,8948723,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."
3,66da49047084538b3e00f9c5,743294297.0,3282,[],US,eng,"[{'count': '7615', 'name': 'to-read'}, {'count...",,False,3.49,...,7.0,,2009.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184,6243154,Best Friends Forever,Best Friends Forever
4,66da49047084538b3e00f9c6,850308712.0,5,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,3.4,...,,,,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,287140,15,278577,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...


In [78]:
# available columns
for c in df.columns:
    print(c)

_id
isbn
text_reviews_count
series
country_code
language_code
popular_shelves
asin
is_ebook
average_rating
kindle_asin
similar_books
description
format
link
authors
publisher
num_pages
publication_day
isbn13
publication_month
edition_information
publication_year
url
image_url
book_id
ratings_count
work_id
title
title_without_series


### Data Cleaning

Drop
- _id - identifier
- isbn - identifier
- link - URL link to Goodread's entry of the book
- url
- image_url
- book_id
- work_id

Features
- text_reviews_count
- series
- country_code
- language_code
- popular_shelves
- is_ebook
- average_rating
- description
- format
- authors
- publisher
- num_pages
- publication_day
- publication_month
- edition_information
- publication_year
- ratings_count
- title
- title_without_series

Target
- needs to be added? user preference?
- similar_books - see if we can make our model match goodreads?

Unknown
- asin - unknown
- kindle_asin - unknown
- isbn13

In [80]:
# Ensure numerical columns are present in DataFrame

numerical_columns = [
    'text_reviews_count',
    'average_rating',
    'num_pages',
    'ratings_count'
]
df[numerical_columns] = df[numerical_columns].fillna(0) # Handle missing values appropriately

In [81]:
# Check data types in df_numerical
print(df.dtypes)

_id                     object
isbn                    object
text_reviews_count      object
series                  object
country_code            object
language_code           object
popular_shelves         object
asin                    object
is_ebook                object
average_rating          object
kindle_asin             object
similar_books           object
description             object
format                  object
link                    object
authors                 object
publisher               object
num_pages               object
publication_day         object
isbn13                  object
publication_month       object
edition_information     object
publication_year        object
url                     object
image_url               object
book_id                 object
ratings_count           object
work_id                 object
title                   object
title_without_series    object
dtype: object


In [82]:
# Check for missing values
print(df.isnull().sum())

# Example of filling missing values
#df = df.fillna(0)  # Or use another appropriate value or method

# Or drop rows/columns with missing values
#df = df.dropna()  # Be cautious with dropping data

_id                     0
isbn                    0
text_reviews_count      0
series                  0
country_code            0
language_code           0
popular_shelves         0
asin                    0
is_ebook                0
average_rating          0
kindle_asin             0
similar_books           0
description             0
format                  0
link                    0
authors                 0
publisher               0
num_pages               0
publication_day         0
isbn13                  0
publication_month       0
edition_information     0
publication_year        0
url                     0
image_url               0
book_id                 0
ratings_count           0
work_id                 0
title                   0
title_without_series    0
dtype: int64


In [83]:
# collapse the lists in the series column to a string

df['series'] = df['series'].astype(str)

df['series'].value_counts()

series
[]             6884
['1029237']       4
['644928']        3
['150218']        3
['1086611']       3
               ... 
['349220']        1
['928472']        1
['603363']        1
['990720']        1
['167714']        1
Name: count, Length: 3051, dtype: int64

In [84]:
# collapse the lists in the series column to a string

df['similar_books'] = df['similar_books'].astype(str)

df['similar_books'].value_counts()

similar_books
[]                                                                                                                                                                                                                        5276
['31242', '374380', '20564', '383206', '7891', '6335178', '31175', '372811', '77395', '856190', '686278', '5797', '32110', '3102', '264', '99329', '31667']                                                                  5
['8359929', '723742', '297130', '7570244', '397904', '22889', '89395', '1688926', '64694', '89115', '126816']                                                                                                                4
['87580', '837422', '429024', '12923', '588747', '472966', '207313', '175516', '1137702', '1275404', '6138', '733957', '29981', '1153738', '189746', '2677', '272751', '535856']                                             3
['160010', '16810', '3102', '606805', '517188', '18799', '91494', '7628', '11013', '11230', '2

In [85]:
# flatten the nested structure in df['popular_shelves']

# Explode the 'popular_shelves' list column
df = df.explode('popular_shelves')

# Extract 'name' and 'count' into new columns
df['shelf_name'] = df['popular_shelves'].apply(lambda x: x['name'] if isinstance(x, dict) else None)
df['shelf_count'] = df['popular_shelves'].apply(lambda x: x['count'] if isinstance(x, dict) else None)

# Drop the original 'popular_shelves' column
df = df.drop('popular_shelves', axis=1)

In [86]:
# flatten the nested structure in df['authors']

# Explode the 'popular_shelves' list column
df = df.explode('authors')

# Extract 'name' and 'count' into new columns
df['author_id'] = df['authors'].apply(lambda x: x['author_id'] if isinstance(x, dict) else None)
df['author_role'] = df['authors'].apply(lambda x: x['role'] if isinstance(x, dict) else None)

# Drop the original 'popular_shelves' column
df = df.drop('authors', axis=1)

### Explore and Analyze the Data

In [87]:
df.head()

Unnamed: 0,_id,isbn,text_reviews_count,series,country_code,language_code,asin,is_ebook,average_rating,kindle_asin,...,image_url,book_id,ratings_count,work_id,title,title_without_series,shelf_name,shelf_count,author_id,author_role
0,66da49047084538b3e00f9c2,312853122,1,[],US,,,False,4.0,,...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,to-read,3,604031,
0,66da49047084538b3e00f9c2,312853122,1,[],US,,,False,4.0,,...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,p,1,604031,
0,66da49047084538b3e00f9c2,312853122,1,[],US,,,False,4.0,,...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,collection,1,604031,
0,66da49047084538b3e00f9c2,312853122,1,[],US,,,False,4.0,,...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,w-c-fields,1,604031,
0,66da49047084538b3e00f9c2,312853122,1,[],US,,,False,4.0,,...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,biography,1,604031,


In [88]:
df['author_role'].value_counts()

author_role
                                  623010
Translator                         44245
Contributor                        27683
Illustrator                        22489
Narrator                           16388
                                   ...  
shrH wt`lyq/ s`d s`yd ldywh jy         1
Author, Illustrator                    1
Author, Editor                         1
curatore                               1
Vu                                     1
Name: count, Length: 179, dtype: int64

In [89]:
df['similar_books'].value_counts()

similar_books
[]                                                                                                                                                                                                                 175553
['7455727', '7352447', '12377549', '2412563', '11570454', '597659', '13127347', '2865290', '3369511', '6034660', '6987625', '17734542', '7015378']                                                                   3200
['148409', '249203', '2136064', '738180', '12053207', '679497', '773020', '147759', '10332278', '3330933', '18770233', '723198', '2046835', '9978110']                                                               2900
['47197', '914487', '30018', '1827529', '318389', '252230', '4734404', '261494', '1528674', '95562', '798127']                                                                                                       2700
['6614259', '10711562', '955513', '6882995', '14164155', '231514', '450161', '300411', '23299963', '621638', '1355

In [90]:
df['average_rating'].value_counts()

average_rating
4.00    14983
3.98    13065
3.77    12594
3.79    12312
3.83    12109
        ...  
1.09        3
1.75        3
2.17        2
1.33        2
            1
Name: count, Length: 272, dtype: int64

## Recommendation System
---
Implement the Recommendation System with TensorFlow

### Prepare the Data for TensorFlow

In [92]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [None]:
# Make sure df contains only the numerical columns
print(df.head())

In [94]:
df.columns

Index(['_id', 'isbn', 'text_reviews_count', 'series', 'country_code',
       'language_code', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'publisher',
       'num_pages', 'publication_day', 'isbn13', 'publication_month',
       'edition_information', 'publication_year', 'url', 'image_url',
       'book_id', 'ratings_count', 'work_id', 'title', 'title_without_series',
       'shelf_name', 'shelf_count', 'author_id', 'author_role'],
      dtype='object')

In [98]:
feature_columns = [
    'text_reviews_count', 'series', 'country_code',
    'language_code', 'is_ebook', 'average_rating',
    'description', 'format', 'publisher',
    'num_pages', 'publication_day', 'publication_month',
    'edition_information', 'publication_year',
    'ratings_count', 'title', 'title_without_series',
    'shelf_name', 'shelf_count', 'author_id', 'author_role'
]

# Remove similar_books target from features data
y = df['similar_books'].values
X = df[feature_columns]

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

### Build and Train the Model

### Evaluate the Model

### Make Recommendations