# Goodreads Data Cleaning

Performing data cleaning tasks to ensure data is ready for modelling

### Imports

In [1]:
import json
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Plot styling for the rest of the notebook.
# NOTE(review): newer matplotlib releases renamed the bundled seaborn
# styles to 'seaborn-v0_8-*'; confirm 'seaborn-darkgrid' is still accepted
# by the installed version.
sns.set_style('whitegrid')
plt.style.use('seaborn-darkgrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

### Directories

Ensuring the output data directory is created

In [3]:
# Directory that receives the cleaned output data.
OUTPUT_DATA_DIR = "./output_data/"

# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of testing os.path.exists() first. It also fails loudly if the path
# exists but is not a directory, instead of silently continuing.
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)

In [4]:
def construct_data_path(dataset_name):
    """Builds the relative path of the JSON file for `dataset_name`.

    Parameters
    ----------
    dataset_name: str
        The name of the dataset (the file name without its extension).

    Returns
    -------
    str
        The path ``data/<dataset_name>.json``, joined with the
        platform's path separator.

    """
    file_name = '{}.json'.format(dataset_name)
    return os.path.join('data', file_name)

In [5]:
def load_and_sample_dataset(data_path, sample_percentage):
    """Loads roughly `sample_percentage`% of the records at `data_path`.

    The file is read as JSON Lines (one JSON document per line) and
    every k-th line is kept, where ``k = 100 // sample_percentage``.
    Sampling is therefore systematic rather than random, always keeps
    the first line, and is exact only when `sample_percentage` divides
    100 (for example, 30 keeps every 3rd line, i.e. about 33%).

    Parameters
    ----------
    data_path: str
        A string representing the path to the data
    sample_percentage: float
        A number in the interval (0, 100] representing the percentage
        of the data to sample.

    Returns
    -------
    pd.DataFrame
        A pandas dataframe object with one row per sampled line of
        `data_path`.

    Raises
    ------
    ValueError
        If `sample_percentage` is not in the interval (0, 100].

    """
    # Outside (0, 100] the step below would be 0 (values above 100) or the
    # division itself would fail (0); reject these with a clear message.
    if not 0 < sample_percentage <= 100:
        raise ValueError(
            "sample_percentage must be in the interval (0, 100], got {!r}".format(sample_percentage)
        )
    step = 100 // sample_percentage
    # JSON text is UTF-8 by specification, so do not rely on the platform's
    # default encoding.
    with open(data_path, 'r', encoding='utf-8') as data_file:
        records = [
            json.loads(line)
            for line_no, line in enumerate(data_file)
            if line_no % step == 0
        ]
    return pd.DataFrame(records)

In [6]:
# Keep every 10th interaction record (a 10% systematic sample).
interactions_df = load_and_sample_dataset(construct_data_path('goodreads_interactions_poetry'), 10)

In [7]:
# Book metadata is loaded in full (100%).
books_df = load_and_sample_dataset(construct_data_path('goodreads_books_poetry'), 100)

In [8]:
# Reviews are loaded in full (100%).
# NOTE(review): reviews_df is not referenced by the cleaning steps visible
# in this notebook section — confirm it is used later.
reviews_df = load_and_sample_dataset(construct_data_path('goodreads_reviews_poetry'), 100)

In [85]:
# Left join on book_id: every sampled interaction is kept, and book columns
# are attached where a matching book_id exists in books_df.
all_interactions_df = pd.merge(interactions_df, books_df, how='left', left_on="book_id", right_on="book_id")

In [10]:
# Remove the column limit so DataFrame displays show every column.
pd.set_option('display.max_columns', None)

### Pre-Processing Data

We clean the data with the following steps:
* remove incorrect records (interactions marked as not read that nevertheless carry a rating greater than 0)
* construct the monotonic chain `shelve->read->rate->recommend`
  * recommend corresponds to a rating that is greater than 3
* add formatted date columns in the form 'yyyy-mm'
* add a flag to indicate if the book was translated
* add a column indicating the primary author's id
* add a column indicating if the book is part of a series
* add a column with how many books are in the series (1 indicates a solo publication - not part of a series)
* add a column that combines the book title and description for easier text processing later

In [14]:
def remove_incorrect_records(data_df):
    """Removes records that do not properly follow the chain.

    A record is considered incorrect when `is_read` is False but it
    has nevertheless been rated (`rating > 0`).

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame from which incorrect records will be removed.
        Must contain the columns `is_read` and `rating`.

    Returns
    -------
    pd.DataFrame
        A new DataFrame containing the rows of `data_df` that are not
        incorrect. `data_df` itself is left unmodified.

    """
    # Both conditions are evaluated on `data_df` itself, so the function no
    # longer depends on a global DataFrame being defined and index-aligned.
    unread_but_rated = data_df['is_read'].eq(False) & data_df['rating'].gt(0)
    # A row-wise mask drops exactly the offending rows, even when index
    # labels repeat. The explicit copy gives later steps an independent
    # frame to add columns to.
    return data_df[~unread_but_rated].copy()

In [28]:
def create_chain(data_df):
    """Creates variables for the chain in `data_df`.

    Adds 4 indicator (0/1) columns for the actions shelved, read,
    rated and recommended:

    * `shelved` is 1 for every record,
    * `read` is 1 where `is_read` is True,
    * `rated` is 1 where `rating > 0`,
    * `recommended` is 1 where `rating > 3`.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame augmented with the chain variables. Must
        contain the columns `is_read` and `rating`. It is modified
        in place.

    Returns
    -------
    pd.DataFrame
        The same `data_df` object, with the 4 columns added.

    """
    data_df['shelved'] = 1
    # Vectorized comparisons replace the per-row Python lambdas; the explicit
    # int64 cast keeps the indicators as integers rather than booleans.
    data_df['read'] = data_df['is_read'].eq(True).astype('int64')
    data_df['rated'] = data_df['rating'].gt(0).astype('int64')
    data_df['recommended'] = data_df['rating'].gt(3).astype('int64')
    return data_df

In [31]:
# Maps three-letter English month abbreviations to two-digit month numbers.
month_map = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
             'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}


def _to_year_month(raw_date):
    """Converts one raw date string to the form 'yyyy-mm'.

    The month abbreviation is read from the second whitespace-separated
    token and the year from the last token.
    """
    tokens = raw_date.split()
    return "{}-{}".format(tokens[-1], month_map[tokens[1]])


def reformat_date_cols(data_df, date_cols):
    """Reformats date columns in `data_df` for easier analysis.

    For each column in `date_cols` a new column named
    ``year_month_<suffix>`` is added, where ``<suffix>`` is the part of
    the column name after the first underscore (``date_added`` gives
    ``year_month_added``). The new column has the format 'yyyy-mm'.

    Parameters
    ----------
    data_df: pd.DataFrame
        A pandas DataFrame to which date columns will be added. It is
        modified in place.
    date_cols: collection
        A collection of date columns to be reformatted. Each value is
        expected to be a string whose second whitespace-separated
        token is a month abbreviation from `month_map` and whose last
        token is the year. Missing values are propagated unchanged.

    Returns
    -------
    pd.DataFrame
        The same `data_df` object, augmented with one reformatted date
        column per entry of `date_cols`.

    Raises
    ------
    KeyError
        If a non-missing date has a month abbreviation that is not in
        `month_map`.

    """
    for date_col in date_cols:
        col_name = date_col.split("_")[1]
        # Build the result directly instead of via temporary month/year
        # columns, so no helper columns are left on the caller's DataFrame.
        # na_action='ignore' leaves missing dates missing instead of failing
        # on `.split()`.
        data_df['year_month_{}'.format(col_name)] = data_df[date_col].map(
            _to_year_month, na_action='ignore'
        )
    return data_df

In [32]:
def format_publication_date(data_df):
    """Formats the publication date in `data_df`.

    Adds a `pub_date` column built as ``<publication_year>-<publication_month>``
    with the month left-padded with zeros to two characters, i.e. the
    form 'yyyy-mm' for well-formed inputs.

    Parameters
    ----------
    data_df: pd.DataFrame
        The pandas DataFrame for which the publication date will be formatted.
        Must contain the columns `publication_month` and
        `publication_year`. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same `data_df` object after adding the `pub_date` column.

    """
    # Values are stringified as-is: an empty month becomes '00', and a
    # missing (NaN) value becomes the text 'nan'.
    # NOTE(review): confirm downstream code expects these placeholders rather
    # than a missing pub_date.
    pub_month = data_df['publication_month'].apply(lambda value: str(value).zfill(2))
    pub_year = data_df['publication_year'].apply(str)
    # Intermediate series stay local so no helper columns are left on the
    # caller's DataFrame.
    data_df['pub_date'] = pub_year + "-" + pub_month
    return data_df

In [33]:
def format_date_cols(data_df):
    """Applies all date formatting steps to `data_df`.

    First the `date_added` and `date_updated` columns are reformatted
    with `reformat_date_cols`, then the publication date is formatted
    with `format_publication_date`.

    Parameters
    ----------
    data_df: pd.DataFrame
        The pandas DataFrame whose date columns are formatted.

    Returns
    -------
    pd.DataFrame
        The DataFrame produced by running both formatting steps
        on `data_df`.

    """
    interaction_date_cols = ['date_added', 'date_updated']
    with_interaction_dates = reformat_date_cols(data_df, interaction_date_cols)
    return format_publication_date(with_interaction_dates)

In [58]:
def is_translated(authors_lst):
    """Reports whether `authors_lst` lists a translation credit.

    Parameters
    ----------
    authors_lst: list
        A list of dictionaries, one per credited author, each with
        a 'role' key.

    Returns
    -------
    bool
        True if any entry has the role 'Translation'. Otherwise, False
        (including for an empty list).

    """
    # NOTE(review): only the exact role string 'Translation' is matched;
    # confirm other spellings (e.g. 'Translator') do not occur in the data.
    return any(credit['role'] == 'Translation' for credit in authors_lst)

In [59]:
def get_author_id(authors_lst):
    """Looks up the id of the main author in `authors_lst`.

    The main author is the first entry whose 'role' is the empty
    string.

    Parameters
    ----------
    authors_lst: list
        A list of dictionaries, one per credited author, each with
        'role' and 'author_id' keys.

    Returns
    -------
    str
        The 'author_id' of the first entry with an empty role, or the
        empty string if there is no such entry.

    """
    main_author_ids = (
        credit['author_id'] for credit in authors_lst if credit['role'] == ''
    )
    return next(main_author_ids, '')

In [63]:
def add_author_info(data_df):
    """Derives author-related columns from the `authors` column.

    Two columns are added to `data_df` in place:

    * `is_translated`: 1 if `is_translated` reports a translation
      credit for the record's authors, otherwise 0,
    * `main_author`: the id returned by `get_author_id`.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame to which author information will be added.
        Must contain an `authors` column.

    Returns
    -------
    pd.DataFrame
        The same `data_df` object with the two columns added.

    """
    authors_col = data_df['authors']
    data_df['is_translated'] = authors_col.apply(lambda credits: int(is_translated(credits)))
    data_df['main_author'] = authors_col.apply(get_author_id)
    return data_df

In [78]:
def add_series_info(data_df):
    """Derives series-related columns from the `series` column.

    Two columns are added to `data_df` in place:

    * `is_in_series`: 1 if the record's `series` value is non-empty,
      otherwise 0,
    * `series_length`: the number of entries in `series` plus one, so
      a record with an empty `series` gets 1.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame to which series info is added. Must contain a
        `series` column whose values support `len`.

    Returns
    -------
    pd.DataFrame
        The same `data_df` object with the two columns added.

    """
    # NOTE(review): series_length is derived from the number of entries in
    # `series`, not from a count of books — confirm this is the intended
    # meaning.
    series_col = data_df['series']
    data_df['is_in_series'] = series_col.apply(lambda entries: int(len(entries) > 0))
    data_df['series_length'] = series_col.apply(lambda entries: 1 + len(entries))
    return data_df

In [81]:
def fix_text_col(data_df, text_col):
    """Returns the values of `text_col` with nulls replaced by "".

    `data_df` is not modified.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame holding the text column.
    text_col: str
        The name of the column in `data_df` to be fixed.

    Returns
    -------
    np.array
        An array with the values of `text_col`, where every null
        entry has been replaced by the empty string.

    """
    column = data_df[text_col]
    is_missing = column.isnull()
    return np.where(is_missing, "", column)

In [82]:
def combine_title_description(data_df):
    """Combines the title and description columns in `data_df`.

    A new column `title_description` is added containing the book
    title, a single space, and the description, so that both can be
    used in a language model. Null titles or descriptions are treated
    as the empty string.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame to which the operation is applied. Must contain
        the columns `title` and `description`. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same `data_df` object after adding the extra column with
        the book title followed by the description.

    """
    # Keep the null-free text in local Series instead of temporary columns,
    # so no helper columns are left on the caller's DataFrame. Re-attaching
    # the original index keeps the rows aligned on assignment.
    title = pd.Series(fix_text_col(data_df, 'title'), index=data_df.index)
    description = pd.Series(fix_text_col(data_df, 'description'), index=data_df.index)
    data_df['title_description'] = title + " " + description
    return data_df

In [87]:
def pre_process_data(data_df, processing_funcs):
    """Runs `data_df` through a sequence of preprocessing steps.

    The functions are called in the given order; each one receives
    the value returned by the previous one.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame passed to the first step.
    processing_funcs: list
        A list of preprocessing functions, each taking one argument
        and returning the input for the next step.

    Returns
    -------
    pd.DataFrame
        The value returned by the last step, or `data_df` itself when
        `processing_funcs` is empty.

    """
    processed = data_df
    for step in processing_funcs:
        processed = step(processed)
    return processed

In [90]:
# Cleaning steps, applied in the listed order; each step receives the
# DataFrame returned by the previous one.
pre_processing_funcs = [remove_incorrect_records, create_chain, format_date_cols, 
                        add_author_info, add_series_info, combine_title_description]
# Replace the merged interactions table with its cleaned version.
all_interactions_df = pre_process_data(all_interactions_df, pre_processing_funcs)