# Goodreads Data Cleaning

Performing data cleaning tasks to ensure data is ready for modelling

### Imports

In [1]:
import json
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling. The matplotlib style sheet is applied after the
# seaborn theme, so its settings win wherever the two overlap.
# NOTE(review): matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-*' - confirm 'seaborn-darkgrid' still resolves on the
# installed matplotlib version.
sns.set_style('whitegrid')
plt.style.use('seaborn-darkgrid')
# Render figures inline in the notebook output.
%matplotlib inline

### Directories

Ensuring the output data directory is created

In [3]:
# Output data directory; created below when it does not exist yet.
OUTPUT_DATA_DIR = "./output_data/"

# `exist_ok=True` makes this cell idempotent on re-runs and removes the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)

In [4]:
def construct_data_path(dataset_name):
    """Builds the relative path of the JSON file for `dataset_name`.

    Parameters
    ----------
    dataset_name: str
        The dataset's name, without the `.json` extension.

    Returns
    -------
    str
        The path `data/<dataset_name>.json`, joined with the
        platform's path separator.

    """
    file_name = '{}.json'.format(dataset_name)
    return os.path.join('data', file_name)

In [5]:
def load_and_sample_dataset(data_path, sample_percentage):
    """Loads roughly `sample_percentage`% of the JSON-lines file at `data_path`.

    Sampling is deterministic: with ``step = 100 // sample_percentage``,
    every ``step``-th record is kept, starting with the first one. The
    effective rate is therefore ``1 / step``, which equals
    `sample_percentage`% only when `sample_percentage` divides 100
    (e.g. 10 keeps every 10th record, whereas 30 keeps every 3rd).

    Parameters
    ----------
    data_path: str
        A string representing the path to the data. The file is read
        as UTF-8 text with one JSON document per line; blank lines
        are ignored.
    sample_percentage: float
        A number in the interval (0, 100] representing the percentage
        of the data to sample.

    Returns
    -------
    pd.DataFrame
        A pandas dataframe object built from the sampled records of
        `data_path`.

    Raises
    ------
    ValueError
        If `sample_percentage` is not in the interval (0, 100].

    """
    # Reject values that would otherwise surface as an opaque
    # ZeroDivisionError (0 directly, >100 via a zero step).
    if not 0 < sample_percentage <= 100:
        raise ValueError(
            'sample_percentage must be in the interval (0, 100], got {!r}'.format(
                sample_percentage
            )
        )

    step = 100 // sample_percentage
    records = []
    # Explicit UTF-8: the platform default encoding is not guaranteed to
    # decode non-ASCII text in the records.
    with open(data_path, 'r', encoding='utf-8') as data_file:
        # Blank lines (e.g. a trailing newline) are not records; they are
        # dropped before counting so they do not shift the sample.
        non_blank_rows = (row for row in data_file if row.strip())
        for idx, row in enumerate(non_blank_rows):
            if idx % step == 0:
                records.append(json.loads(row))
    return pd.DataFrame(records)

In [6]:
# With sample_percentage=10 the loader keeps every 10th interaction record.
interactions_df = load_and_sample_dataset(
    data_path=construct_data_path('goodreads_interactions_poetry'),
    sample_percentage=10,
)

In [7]:
# Book metadata is loaded in full (sample_percentage=100 keeps every record).
books_df = load_and_sample_dataset(
    data_path=construct_data_path('goodreads_books_poetry'),
    sample_percentage=100,
)

In [8]:
# Reviews are loaded in full (sample_percentage=100 keeps every record).
reviews_df = load_and_sample_dataset(
    data_path=construct_data_path('goodreads_reviews_poetry'),
    sample_percentage=100,
)

In [27]:
# Left join on `book_id`: every sampled interaction is kept and the book
# metadata columns are attached wherever a matching book exists.
all_interactions_df = interactions_df.merge(books_df, how='left', on='book_id')

In [10]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

### Pre-Processing Data

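We clean the data with the following steps:
* removing incorrect records, i.e. interactions that carry a rating (`rating > 0`) although the book is marked as unread (`is_read = False`)
* constructing the monotonic chain `shelve -> read -> rate -> recommend`
    * `recommend` corresponds to a rating greater than 3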

In [14]:
def remove_incorrect_records(data_df):
    """Removes records that do not properly follow the chain.

    A record is considered incorrect when it has been rated
    (`rating > 0`) although it is marked as not read
    (`is_read = False`).

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame from which incorrect records will be removed.
        Must contain the columns `is_read` and `rating`. It is not
        modified.

    Returns
    -------
    pd.DataFrame
        The rows of `data_df` that are not incorrect records, with
        their original index labels.

    """
    # Both conditions are evaluated on `data_df` itself so the mask is
    # always aligned with the frame being filtered (no dependency on a
    # notebook-level global).
    unread_but_rated = (data_df['is_read'] == False) & (data_df['rating'] > 0)
    return data_df[~unread_but_rated]

In [28]:
def create_chain(data_df):
    """Creates variables for the chain in `data_df`.

    Adds 4 indicator (0/1) columns for the actions shelved, read,
    rated, recommended:

    * `shelved` is 1 for every row,
    * `read` is 1 where `is_read` is True,
    * `rated` is 1 where `rating > 0`,
    * `recommended` is 1 where `rating > 3`.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame to augment with the chain variables. Must
        contain the columns `is_read` and `rating`. The columns are
        added to this object in place.

    Returns
    -------
    pd.DataFrame
        The same `data_df` object, returned for convenience.

    """
    data_df['shelved'] = 1
    # Vectorised comparisons replace per-row Python lambdas; the explicit
    # int64 cast keeps the indicators integer-typed even for empty input.
    data_df['read'] = (data_df['is_read'] == True).astype('int64')
    data_df['rated'] = (data_df['rating'] > 0).astype('int64')
    data_df['recommended'] = (data_df['rating'] > 3).astype('int64')
    return data_df

In [29]:
# Append the shelved/read/rated/recommended indicator columns.
# NOTE(review): `remove_incorrect_records` is not applied in the cells
# visible above this point - confirm whether it should run before this step.
all_interactions_df = all_interactions_df.pipe(create_chain)

In [30]:
# Inspect the merged frame; the four chain indicator columns appear last.
# NOTE(review): consider `.head()` here to keep the stored output small.
all_interactions_df

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,similar_books,description,format,link,authors,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,ratings_count,work_id,title,title_without_series,shelved,read,rated,recommended
0,8842281e1d1347389f2ab93d60773d4d,1384,1bad0122cebb4aa9213f9fe1aa281f66,True,4,,Wed May 09 09:33:44 -0700 2007,Wed May 09 09:33:44 -0700 2007,,,0140449116,135,[],US,eng,"[{'count': '246940', 'name': 'to-read'}, {'cou...",,false,3.73,B01JGOEE2A,"[12914, 904843, 235719, 279456, 1536, 27417, 1...",The Odysseyis literature's grandest evocation ...,Paperback,https://www.goodreads.com/book/show/1384.The_O...,"[{'author_id': '903', 'role': ''}, {'author_id...",Penguin Classics,324,30,9780140449112,1,,2003,https://www.goodreads.com/book/show/1384.The_O...,https://images.gr-assets.com/books/1465683853m...,2114,3356006,The Odyssey,The Odyssey,1,1,1,1
1,06316bec7a49286f1f98d5acce24f923,2696,da74a6ad1ddedbd36dc512f5c3aa6435,True,3,,Tue Jun 05 09:17:50 -0700 2012,Tue Jun 05 09:17:50 -0700 2012,,,0140424385,2071,[],US,eng,"[{'count': '3097', 'name': 'to-read'}, {'count...",,false,3.48,B018HCIHXY,"[3049, 449589, 119079, 51799, 765427, 429679, ...",The procession that crosses Chaucer's pages is...,Paperback,https://www.goodreads.com/book/show/2696.The_C...,"[{'author_id': '1838', 'role': ''}, {'author_i...",Penguin Classics,504,30,9780140424386,1,,2003,https://www.goodreads.com/book/show/2696.The_C...,https://images.gr-assets.com/books/1261208589m...,150099,986234,The Canterbury Tales,The Canterbury Tales,1,1,1,0
2,220ef9c058a2132e6a9827f93a821d87,52820,fa2b40573025d0ac40d81f973342a005,True,5,,Sun Jun 10 09:12:01 -0700 2012,Wed Jun 13 07:30:54 -0700 2012,,,0156605171,51,[],US,eng,"[{'count': '364', 'name': 'to-read'}, {'count'...",,false,3.95,B003WUYQ00,"[553832, 1565, 264365, 275472, 1468, 3258, 773...",Critics have heralded Richard Wilbur's transla...,Paperback,https://www.goodreads.com/book/show/52820.The_...,"[{'author_id': '29837', 'role': ''}, {'author_...",Mariner Books,336,20,9780156605175,10,,1965,https://www.goodreads.com/book/show/52820.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,1119,15585694,The Misanthrope/ Tartuffe,The Misanthrope/ Tartuffe,1,1,1,1
3,01ec1a320ffded6b2dd47833f2c8e4fb,1724365,e32f1d54e90b027989d57130c8277254,False,0,,Mon Feb 18 20:45:40 -0800 2013,Mon Feb 18 20:45:40 -0800 2013,,,0061430226,762,[],US,en-US,"[{'count': '4391', 'name': 'to-read'}, {'count...",,false,3.89,B0012OYBRG,"[2987912, 389098, 7517712, 1777371, 7824102, 5...",An ancient race of lycanthropes has survived t...,Hardcover,https://www.goodreads.com/book/show/1724365.Sh...,"[{'author_id': '391385', 'role': ''}]",Harper,312,29,9780061430220,1,,2008,https://www.goodreads.com/book/show/1724365.Sh...,https://s.gr-assets.com/assets/nophoto/book/11...,3672,1721769,Sharp Teeth,Sharp Teeth,1,0,0,0
4,96a4eae3201cf9eb0fd36a900611c925,22151696,ecb99bfa47f905b664ae9882a998bfb5,False,0,,Tue Oct 27 06:28:29 -0700 2015,Tue Oct 27 06:28:29 -0700 2015,,,1449461077,1186,[],US,eng,"[{'count': '28986', 'name': 'to-read'}, {'coun...",,false,4.05,B00KI7OP24,"[23627418, 20821097, 22618173, 21327039, 19230...","A sequel to the hugely popular, best-selling L...",Paperback,https://www.goodreads.com/book/show/22151696-l...,"[{'author_id': '7012565', 'role': ''}]",Andrews McMeel Publishing,248,16,9781449461072,9,,2014,https://www.goodreads.com/book/show/22151696-l...,https://images.gr-assets.com/books/1404192714m...,15314,41496977,Lullabies,Lullabies,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273430,9176ecf11d54ab4be384a12353a73e24,1420,84f2d6098a66ad3633a6fdb16717c5c7,True,5,,Mon Jun 04 11:45:53 -0700 2012,Mon Jun 04 11:45:53 -0700 2012,,,0521618746,5737,[],US,eng,"[{'count': '1659', 'name': 'school'}, {'count'...",,false,4.01,B008TVM1JU,"[383337, 92250, 52823, 18545, 12287, 752900, 1...","One of the greatest plays of all time, the com...",Paperback,https://www.goodreads.com/book/show/1420.Hamlet,"[{'author_id': '947', 'role': ''}, {'author_id...",Cambridge University Press,289,1,9780521618748,8,,2005,https://www.goodreads.com/book/show/1420.Hamlet,https://images.gr-assets.com/books/1351051208m...,526122,1885548,Hamlet,Hamlet,1,1,1,1
273431,3f17a8e0da315e199b8ee13044307b0d,5289,4e0f168317dfa1893ed302ecc9bb7709,True,4,,Mon Jun 04 14:24:19 -0700 2012,Sat Oct 20 06:03:54 -0700 2012,,,0007144350,162,[],US,eng,"[{'count': '15421', 'name': 'to-read'}, {'coun...",,false,4.45,,"[18526, 5294, 79936, 1577108, 926156, 7954157,...","In print since 1948, this is a single-volume c...",Mass Market Paperback,https://www.goodreads.com/book/show/5289.Compl...,"[{'author_id': '3565', 'role': ''}, {'author_i...",HarperCollins Publishers,1246,4,9780007144358,8,,2003,https://www.goodreads.com/book/show/5289.Compl...,https://s.gr-assets.com/assets/nophoto/book/11...,10992,21905772,Complete Works of Oscar Wilde,Complete Works of Oscar Wilde,1,1,1,1
273432,bfc558b791304f0ce74ad1c3a6ab08f7,1371,641f69bd80baba71c2b9ca2f0e1a34bc,True,4,,Mon Jun 04 15:04:07 -0700 2012,Mon Jun 04 15:04:07 -0700 2012,,,0140275363,2755,[],US,eng,"[{'count': '8816', 'name': 'classics'}, {'coun...",,false,3.83,,"[229575, 1468, 12914, 1519, 1291673, 1536, 209...",The Iliadis one of the two great epics of Home...,,https://www.goodreads.com/book/show/1371.The_I...,"[{'author_id': '903', 'role': ''}, {'author_id...",,683,29,9780140275360,4,Deluxe Edition,1999,https://www.goodreads.com/book/show/1371.The_I...,https://s.gr-assets.com/assets/nophoto/book/11...,247108,3293141,The Iliad,The Iliad,1,1,1,1
273433,148e49f7ed14137ff13a9a31c770487d,6003865,6fa7f7adb165a73f409c3e57cb231737,False,0,,Tue Nov 20 14:18:49 -0800 2012,Tue Nov 20 14:18:49 -0800 2012,,,9770148105,47,[],US,ara,"[{'count': '2936', 'name': 'to-read'}, {'count...",,false,4.06,,"[6748418, 6718801, 6698834, 7139681, 6348484, ...",,,https://www.goodreads.com/book/show/6003865,"[{'author_id': '1349868', 'role': ''}]",'jml m ktb,116,,,,,,https://www.goodreads.com/book/show/6003865,https://s.gr-assets.com/assets/nophoto/book/11...,660,6178797,أجمل ما كتب,أجمل ما كتب,1,0,0,0
