# Language Baseline Models

These language models serve as a baseline and will be leverage the book title and description in order to hopefully enhance the predictive power.

In [1]:
import json
import os
import random
import numpy as np
import pandas as pd

random.seed(42)
np.random.seed(42)

### Load Training Data

In [2]:
OUTPUT_DATA_DIR = "./output_data/"

train_df = pd.read_csv(OUTPUT_DATA_DIR+"interactions_training.csv")

### Load Validation Data

In [3]:
val_df = pd.read_csv(OUTPUT_DATA_DIR+"interactions_validation.csv")

In [4]:
pd.set_option('display.max_columns', None)

### Sample Data For Prototyping

In [5]:
train_df = train_df.sample(frac=0.25)

### Clean Data For Textual Analysis

In [6]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def process_book_text(book_text, exclude_text, ps):
    """Pre-processes the text given by `review_text`.
    
    Parameters
    ----------
    book_text: str
        The book text to be processed.
    exclude_text: collection
        A collection of words to be excluded.
    ps: PorterStemmer
        The PorterStemmer used to perform word stemming.
    
    Returns
    -------
    str
        A string representing the processed version of `review_text`.
    
    """
    book = re.sub('[^a-zA-Z0-9]', ' ', book_text).lower().split()
    book = [ps.stem(word) for word in book if not word in exclude_text]
    return ' '.join(book)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Matthew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
def preprocess_for_classification(data_df):
    """Preprocesses `data_df` to be used in classification.
    
    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame to be processed.
    
    Returns
    -------
    pd.DataFrame
        The DataFrame obtained from `data_df` after processing.
    
    """
    # flags for most popular formats
    data_df['format'] = data_df['format'].apply(lambda x: str(x).lower())
    data_df['is_paperback'] = data_df['format'].apply(lambda x: int("paper" in x))
    data_df['is_hardcover'] = data_df['format'].apply(lambda x: int("hard" in x))
    data_df['is_audio'] = data_df['format'].apply(lambda x: int("audio" in x))
    data_df['is_other_format'] = (data_df['is_paperback'] + data_df['is_hardcover'] + 
                                  data_df['is_audio'] + data_df['is_ebook'])
    data_df['is_other_format'] = data_df['is_other_format'].apply(lambda x: 0 if x > 0 else 1)
    
    #flags for most popular publishers
    data_df['publisher'] = data_df['publisher'].apply(lambda x: str(x).lower())
    data_df['from_penguin'] = data_df['publisher'].apply(lambda x: int("penguin" in x))
    data_df['from_harpercollins'] = data_df['publisher'].apply(lambda x: int("harpercollins" in x or "harper collins" in x))
    data_df['from_university_press'] = data_df['publisher'].apply(lambda x: int("university press" in x))
    data_df['from_vintage'] = data_df['publisher'].apply(lambda x: int("vintage" in x))
    data_df['from_createspace'] = data_df['publisher'].apply(lambda x: int("createspace" in x or "create space" in x))
    data_df['other_publisher'] = (data_df['from_penguin'] + data_df['from_harpercollins'] + 
                                  data_df['from_university_press'] + data_df['from_vintage'] + data_df['from_createspace'])
    data_df['other_publisher'] = data_df['other_publisher'].apply(lambda x: 0 if x > 0 else 1)
    
    # ensuring columns are not missing
    train_df['average_rating'] = train_df['average_rating'].apply(lambda x: 0.0 if pd.isnull(x) else x)
    train_df['text_reviews_count'] = train_df['text_reviews_count'].apply(lambda x: 0 if pd.isnull(x) else x)
    train_df['ratings_count'] = train_df['ratings_count'].apply(lambda x: 0 if pd.isnull(x) else x)
    
    median_page_count = train_df['num_pages'].median()
    train_df['num_pages'] = train_df['num_pages'].apply(lambda x: median_page_count if pd.isnull(x) else x)
    
    # flags for most popular authors
    train_df['main_author'] = train_df['main_author'].astype(str)
    train_df['author_a'] = train_df['main_author'].apply(lambda x: int(x == "435477.0"))
    train_df['author_b'] = train_df['main_author'].apply(lambda x: int(x == "903.0"))
    train_df['author_c'] = train_df['main_author'].apply(lambda x: int(x == "947.0"))
    train_df['author_d'] = train_df['main_author'].apply(lambda x: int(x == "4624490.0"))
    train_df['author_e'] = train_df['main_author'].apply(lambda x: int(x == "18540.0"))
    train_df['author_f'] = train_df['main_author'].apply(lambda x: int(x == "8075577.0"))
    train_df['author_other'] = (train_df['author_a'] + train_df['author_b'] + 
                                train_df['author_c'] + train_df['author_d'] + 
                                train_df['author_e'] +train_df['author_f'])
    train_df['author_other'] = train_df['author_other'].apply(lambda x: 0 if x > 0 else 1)
    return train_df

In [23]:
def preprocess_all_book_text(data_df, id_col, text_col, exclude_text, ps):
    """Preprocesses the book text in `data_df` for `text_col`.
    
    The dataframe is restricted to `id_col` and `text_col` and then the
    unique ids are chosen. This is so that we only preprocess the text
    for a book once. Then we join the resulting text back to `data_df`.
    
    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame containing the data to be preprocessed.
    id_col: str
        The column from which unique ids are chosen.
    text_col: str
        The column to be pre-processed.
    exclude_text: collection
        A collection of words to remove
    ps: PorterStemmer
        The PorterStemmer used for word stemming.
    
    Returns
    -------
    pd.DataFrame
        The DataFrame obtained from `data_df` after adding a column
        with the processed text.
    
    """
    book_df = train_df[[id_col, text_col]]
    book_df = book_df.drop_duplicates(subset=[id_col])
    book_df['cleaned_text'] = book_df[text_col].apply(lambda x: process_book_text(x, exclude_text, ps))
    final_df = pd.merge(train_df, book_df[[id_col, "cleaned_text"]], how="inner", on=[id_col])
    return final_df

In [24]:
def run_preprocess_pipeline(data_df, exclude_text, ps):
    """Runs the full pre-processing pipeline on `data_df`.
    
    Parameters
    ----------
    data_df: pd.DataFrame
    
    """
    processed_df = preprocess_for_classification(data_df)
    return preprocess_all_book_text(processed_df, "book_id", "title_description", exclude_text, ps)

In [25]:
format_counts = train_df.groupby(train_df['format'])['user_id'].count()
format_counts.sort_values()

format
hardcover and paperback                           1
print                                             1
hardcover first edition                           1
trade paperback                                   1
letterpressed chapbook                            1
hand-stitched chapbook                            1
turtleback                                        1
hardcover/paperback                               1
chapbook/ebook                                    1
chapbook/pamphlet                                 1
saddle stitched chapbook                          1
cards in envelope                                 1
paper, color photo and black twine                1
paperback, e-book                                 1
japanese stab bound                               1
casebound                                         2
newsprint stapled folio                           2
slipcased hardcover                               2
audio                                             2
chapb

In [26]:
publisher_counts = train_df.groupby(train_df['publisher'])['user_id'].count()
publisher_counts = publisher_counts.sort_values()
publisher_counts[-20:]

publisher
mariner books                     517
simon & schuster                  520
city lights                       528
graywolf press                    564
harpercollins                     566
penguin books                     699
penguin                           718
rupa & co                         743
everyman's library                756
new directions                    802
oxford university press           821
doubleday & company, inc.         858
andrews mcmeel publishing         905
dover publications                938
vintage                          1018
createspace                      1586
cambridge university press       2442
harpercollins childrens books    4448
nan                              5536
penguin classics                 5920
Name: user_id, dtype: int64

In [27]:
print(len(train_df[pd.isnull(train_df['average_rating'])]))
print(len(train_df[pd.isnull(train_df['text_reviews_count'])]))
print(len(train_df[pd.isnull(train_df['num_pages'])]))
print(len(train_df[pd.isnull(train_df['ratings_count'])]))
train_df['num_pages'].median()

0
0
0
0


203.0

In [28]:
author_counts = train_df.groupby(train_df['main_author'])['user_id'].count()
author_counts = author_counts.sort_values()
print(author_counts.values[-10:])
print(author_counts.index[-6:])

[ 862  952  972 1104 1169 1283 1814 3626 4118 4528]
Index(['nan', '18540.0', '4624490.0', '947.0', '903.0', '435477.0'], dtype='object', name='main_author')


In [29]:
book_df = train_df[['book_id', 'title_description']]
book_df = book_df.drop_duplicates(subset=['book_id'])
book_df

Unnamed: 0,book_id,title_description
113914,18295863,Letter Composed During a Lull in the Fighting ...
224359,18003300,Love & Misadventure Lang Leav is a poet and in...
90635,46199,"Letters to a Young Poet In 1903, a student at ..."
113523,30119,Where the Sidewalk Ends Where the Sidewalk End...
196420,5289,Complete Works of Oscar Wilde In print since 1...
...,...,...
38204,35498776,The Sky Threw Stars the storm. the strike. the...
172382,874604,Collected Verse of Edgar A. Guest
173754,31122069,The Lay of Aotrou and Itroun The Lay of Aotrou...
67472,22736736,September First


In [30]:
exclude_english = set(stopwords.words('english'))
ps = PorterStemmer()
train_df_processed = run_preprocess_pipeline(train_df, exclude_english, ps)
val_df_processed = run_preprocess_pipeline(val_df, exclude_english, ps)

In [36]:
from sklearn.utils import shuffle

def shuffle_dataset(data_df):
    """Randomly shuffles `df`.
    
    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to be shuffled.
    
    Returns
    -------
    pd.DataFrame
        A shuffled dataframe obtained from `df`.
    
    """
    data_df = shuffle(data_df)
    data_df.reset_index(inplace=True, drop=True)
    return data_df

We reshuffle the datasets as they were sorted for the merge

In [37]:
train_df_processed = shuffle_dataset(train_df_processed)
val_df_processed = shuffle_dataset(val_df_processed)

We take a set of non text features which we will add to our textual model

In [51]:
columns_to_keep = ['text_reviews_count', 'is_ebook', 'average_rating', 'num_pages', 
                   'publication_year', 'ratings_count', 'is_translated', 'is_in_series',
                   'series_length', 'is_paperback', 'is_hardcover', 'is_audio', 'is_other_format',
                   'from_penguin', 'from_harpercollins', 'from_university_press', 'from_vintage',
                   'from_createspace', 'other_publisher', 'author_a', 'author_b', 'author_c',
                   'author_d', 'author_e', 'author_f', 'author_other']
X_train_reg = train_df_processed[columns_to_keep]
X_val_reg = val_df_processed[columns_to_keep]

In [52]:
def log_transform_columns(data_df, cols):
    """Applies a log transform to `cols` in `data_df`.
    
    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame in which the columns will be transformed.
    cols: collection
        The columns in `data_df` to be log scaled.
    
    Returns
    -------
    pd.DataFrame
        The DataFrame obtained from `data_df` after log scaling
        the columns `cols`.
    
    """
    for col in cols:
        data_df[col] = data_df[col].apply(lambda x: np.log(x) if x > 0 else 0)
    return data_df

In [53]:
log_transform_cols = ['text_reviews_count', 'ratings_count']
X_train_reg = log_transform_columns(X_train_reg, log_transform_cols)
X_val_reg = log_transform_columns(X_val_reg, log_transform_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df[col] = data_df[col].apply(lambda x: np.log(x) if x > 0 else 0)


In [54]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()

X_train_reg = min_max_scaler.fit_transform(X_train_reg)
X_val_reg = min_max_scaler.transform(X_val_reg)

### Building the Textual Model

We apply TF-IDF only to the unique corpus of book descriptions. We only want to apply it to the unique descriptions because we do not want to overweight the books that are frequently read.

We start by getting the book corpus.

In [56]:
book_df = train_df_processed[['book_id', 'cleaned_text']]
book_df = book_df.drop_duplicates(subset=['book_id'])

So we fit TF-IDF to the book corpus and then use it to transform the training and validation text to a sparse matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_model = TfidfVectorizer()

tfidf_model.fit(book_df['cleaned_text'])

train_tfidf = tfidf_model.transform(train_df_processed['cleaned_text'])
val_tfidf = tfidf_model.transform(val_df_processed['cleaned_text'])