In [1]:
#Data wrangling and cleanup of the GoodReads books dataset


In [2]:
#import libraries
import pandas as pd
import numpy as np
import os
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
#read the raw GoodReads export into a DataFrame
#NOTE(review): the preview printed below shows garbled accents in 'desc' (row 4),
#so the text may have been mis-decoded upstream - confirm the file's encoding
file = 'GoodReads_100k_books.csv'
book_data = pd.read_csv(filepath_or_buffer=file)

In [4]:
#look at the original data: first rows, dimensions, column names, dtypes and summary statistics
print(book_data.head())
print(book_data.shape)
print(book_data.columns)
#info() writes its report to stdout itself and returns None,
#so wrapping it in print() only adds a stray "None" line
book_data.info()
print(book_data.describe(include='all'))

                             author bookformat  \
0              Laurence M. Hauptman  Hardcover   
1  Charlotte Fiell,Emmanuelle Dirix  Paperback   
2                     Andy Anderson  Paperback   
3              Carlotta R. Anderson  Hardcover   
4                     Jean Leveille        NaN   

                                                desc  \
0  Reveals that several hundred thousand Indians ...   
1  Fashion Sourcebook - 1920s is the first book i...   
2  The seminal history and analysis of the Hungar...   
3  "All-American Anarchist" chronicles the life a...   
4  Aujourdâ€™hui, lâ€™oiseau nous invite Ã  sa ta...   

                                               genre  \
0  History,Military History,Civil War,American Hi...   
1          Couture,Fashion,Historical,Art,Nonfiction   
2                                   Politics,History   
3                                      Labor,History   
4                                                NaN   

                         

In [5]:
#review columns and drop the ones that are not relevant. Need genre, rating, title and desc for the recommender.
book_drop_col=book_data.drop(columns=['bookformat', 'img', 'pages', 'reviews', 'isbn', 'isbn13', 'totalratings', 'link'])

#reorder the columns; .copy() makes book_reorder an independent frame rather than a
#selection derived from book_drop_col, because a later cell assigns into its 'desc' column
book_reorder=book_drop_col[['title', 'author', 'desc', 'genre', 'rating']].copy()

print(book_reorder.head())

                                               title  \
0  Between Two Fires: American Indians in the Civ...   
1                           Fashion Sourcebook 1920s   
2                                         Hungary 56   
3  All-American Anarchist: Joseph A. Labadie and ...   
4                              Les oiseaux gourmands   

                             author  \
0              Laurence M. Hauptman   
1  Charlotte Fiell,Emmanuelle Dirix   
2                     Andy Anderson   
3              Carlotta R. Anderson   
4                     Jean Leveille   

                                                desc  \
0  Reveals that several hundred thousand Indians ...   
1  Fashion Sourcebook - 1920s is the first book i...   
2  The seminal history and analysis of the Hungar...   
3  "All-American Anarchist" chronicles the life a...   
4  Aujourdâ€™hui, lâ€™oiseau nous invite Ã  sa ta...   

                                               genre  rating  
0  History,Military History,

In [6]:
#count the missing entries in each column
print(book_reorder.isna().sum())

title         1
author        0
desc       6772
genre     10467
rating        0
dtype: int64


In [7]:
#books without a description get the placeholder text 'No description available.'
book_reorder['desc'] = book_reorder['desc'].fillna(value='No description available.')

In [8]:
#discard every row that lacks a title or a genre
book_reorder = book_reorder.dropna(axis='index', how='any', subset=['title', 'genre'])

In [9]:
#drop rows that are identical in every column, keeping the first occurrence
#.copy() gives books_nodup its own data: the following cells overwrite its 'genre' and
#'rating' columns in place, which should not act on a row selection of book_reorder
books_nodup=book_reorder.drop_duplicates().copy()

In [10]:
#clean up the genre text: every character outside a-z/A-Z becomes a space,
#then the result is lower-cased
books_nodup.loc[:, 'genre'] = (
    books_nodup['genre']
    .str.replace('[^a-zA-Z]', ' ', regex=True)
    .str.lower()
)

In [11]:
#standardise the 'rating' column with StandardScaler (default settings)
scaler = StandardScaler()
scaler.fit(books_nodup[['rating']])
books_nodup.loc[:, 'rating'] = scaler.transform(books_nodup[['rating']])

In [12]:
#write the cleaned frame to disk (without the index column) for the next portion
books_nodup.to_csv(path_or_buf='books_nodup.csv', index=False)