In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the raw book metadata; the path is relative to the notebook's working directory.
books = pd.read_csv(filepath_or_buffer='data/book_data.csv')

In [3]:
# Show the dtype pandas inferred for each column. In the saved output, book_pages
# and book_isbn are both `object`, i.e. they were not parsed as numbers.
books.dtypes

book_authors          object
book_desc             object
book_edition          object
book_format           object
book_isbn             object
book_pages            object
book_rating          float64
book_rating_count      int64
book_review_count      int64
book_title            object
genres                object
image_url             object
dtype: object

In [7]:
# Preview the first two rows.
# NOTE(review): the saved output of this cell already contains a
# `cleaned_description` column, which is only created by the "cleaning
# descriptions" cell further down -- the cells were executed out of order.
# Restart the kernel and run all cells to confirm the notebook is reproducible.
books.head(n=2)

Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,cleaned_description
0,Suzanne Collins,Winning will make you famous. Losing means cer...,,hardcover,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,winning will make you famous losing means cert...
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,US Edition,paperback,9780440000000.0,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...,there is a door at the end of a silent corrido...


In [5]:
# Normalise book_format: trim surrounding whitespace, then lower-case,
# so that differently-cased / padded spellings collapse to one value.
books['book_format'] = (
    books['book_format']
    .str.strip()
    .str.lower()
)

In [6]:
#cleaning descriptions
# Hyphens are replaced by a space (rather than deleted like other punctuation),
# then every character outside [a-zA-Z0-9_ ] is removed and the text is lower-cased.
# `regex` is passed explicitly on both calls: the default of Series.str.replace
# changed from True to False in pandas 2.0, and with regex=False the
# character-class pattern would only match its own literal text, leaving all
# punctuation in place.
books['cleaned_description'] = (
    books.book_desc
    .str.replace('-', ' ', regex=False)
    .str.replace(r'([^a-zA-Z0-9_ ])', '', regex=True)
    .str.lower()
)

In [8]:
# Convert book_pages from text such as "374 pages" to a number: capture the
# digits in front of " page"/" pages" and cast to float. Rows whose text does
# not match the pattern come back as NaN.
books['book_pages'] = (
    books['book_pages']
    .str.extract(r'(\d+?) pages?', expand=False)
    .astype(float)
)

In [11]:
# Subset of columns exported for the description data set:
# identifying fields plus the raw and cleaned description text.
description_columns = [
    'book_title',
    'book_authors',
    'book_desc',
    'genres',
    'image_url',
    'cleaned_description',
]
descriptions = books[description_columns]

In [12]:
# Sanity-check the selection by displaying its first row.
descriptions.head(n=1)

Unnamed: 0,book_title,book_authors,book_desc,genres,image_url,cleaned_description
0,The Hunger Games,Suzanne Collins,Winning will make you famous. Losing means cer...,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,winning will make you famous losing means cert...


In [13]:
# Persist the description subset. `index` is not overridden, so the row index
# is written as the first (unnamed) column of the CSV.
descriptions.to_csv(path_or_buf='data/descriptions.csv')

#### Separating genres and authors into individual rows

In [25]:
#cleaning author names
# book_authors holds '|'-separated names. Split each value into a list and
# create a new row for every listed author.
# Series.explode() replaces the earlier row-wise apply(pd.Series).stack(): it
# avoids building one Series per book (very slow on large frames) and does not
# rely on stack() silently discarding the NaN padding. dropna() keeps the
# result identical to the stacked version, which omitted missing authors;
# books without an author still survive the left join below with author = NaN.
authors = books.book_authors.str.split('|').explode().dropna()
authors.name = 'author'
# Joining on the (now repeated) index duplicates each book once per author.
books = books.drop('book_authors', axis=1).join(authors).drop_duplicates()

In [26]:
#cleaning genre
# genres holds '|'-separated labels. Split each value into a list and create
# a new row for every listed genre.
# Series.explode() replaces the earlier row-wise apply(pd.Series).stack(): it
# avoids building one Series per row (very slow on large frames) and does not
# rely on stack() silently discarding the NaN padding. dropna() keeps the
# result identical to the stacked version, which omitted missing genres;
# rows without a genre still survive the left join below with genre = NaN.
genres = books.genres.str.split('|').explode().dropna()
genres.name = 'genre'
# If the index of `books` is already non-unique at this point (the author cell
# above repeats it once per author), this index join is many-to-many;
# drop_duplicates() collapses the repeated (author, genre) combinations.
books = books.drop('genres', axis=1).join(genres).drop_duplicates()

In [27]:
# Keep only the columns needed in the final data set, in output order,
# then display how many rows remain.
final_columns = [
    'book_title',
    'author',
    'genre',
    'book_rating',
    'book_pages',
    'book_format',
    'book_desc',
    'image_url',
]
books = books[final_columns]
books.shape[0]

360186

In [11]:
# Preview the exploded frame (first five rows); the index repeats because each
# book now spans several rows.
# NOTE(review): in the saved output, book_desc shows lower-cased, punctuation-free
# text, yet no visible cell writes cleaned text into book_desc -- the output
# appears to come from a different execution. Confirm whether book_desc or
# cleaned_description is the intended column here.
books.head(n=5)

Unnamed: 0,book_title,author,genre,book_rating,book_pages,book_format,book_desc,image_url
0,The Hunger Games,Suzanne Collins,Young Adult,4.33,374.0,hardcover,winning will make you famous losing means cert...,https://images.gr-assets.com/books/1447303603l...
0,The Hunger Games,Suzanne Collins,Fiction,4.33,374.0,hardcover,winning will make you famous losing means cert...,https://images.gr-assets.com/books/1447303603l...
0,The Hunger Games,Suzanne Collins,Science Fiction,4.33,374.0,hardcover,winning will make you famous losing means cert...,https://images.gr-assets.com/books/1447303603l...
0,The Hunger Games,Suzanne Collins,Dystopia,4.33,374.0,hardcover,winning will make you famous losing means cert...,https://images.gr-assets.com/books/1447303603l...
0,The Hunger Games,Suzanne Collins,Fantasy,4.33,374.0,hardcover,winning will make you famous losing means cert...,https://images.gr-assets.com/books/1447303603l...


In [13]:
# Write the cleaned, exploded frame to disk.
# The frame has 360,186 rows because every book is repeated for each of its
# genres and each of its authors.
books.to_csv(path_or_buf='data/clean_books.csv')