# Meta Models

In this notebook we prototype a set of Meta Models. In particular, we use the s-values trained using the chainRec algorithm and use these as input features to our model. These are combined with book level features in the hopes to enhance the predictive accuracy of the model.

In [1]:
import json
import os
import random
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import scipy.sparse as sp

random.seed(42)
np.random.seed(42)

### Loading S-Values

We load the S-Values from the ChainRec model.

In [44]:
MAPPING_DIR = './mappings/'

s_values_df = pd.read_csv(MAPPING_DIR+"goodreads_s_values.csv")

In [45]:
cols_to_keep = ['user_number', 'item_number', 's1', 's2', 's3', 's4']

s_values_df = s_values_df[cols_to_keep]

In [46]:
s_values_df['user_number'] = s_values_df['user_number'].apply(lambda x: str(x))
s_values_df['item_number'] = s_values_df['item_number'].apply(lambda x: str(x))
s_values_df['user_item_id'] = s_values_df['user_number'] + "-" + s_values_df['item_number']

In [5]:
OUTPUT_DATA_DIR = "./output_data/"

train_df_processed = pd.read_csv(OUTPUT_DATA_DIR+"text_processed_training.csv")
val_df_processed = pd.read_csv(OUTPUT_DATA_DIR+"text_processed_validation.csv")
test_df_processed = pd.read_csv(OUTPUT_DATA_DIR+"text_processed_testing.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [47]:
def load_mapping(mapping_file):
    """Loads the mapping from `mapping_file`.
    
    Parameters
    ----------
    mapping_file: str
        The name of the mapping file to import.
    
    Returns
    -------
    pd.DataFrame
        The DataFrame created from the mapping.
    
    """
    return pd.read_csv(os.path.join("mappings", "{}.csv".format(mapping_file)))

In [48]:
user_map = load_mapping("user_map")
book_map = load_mapping("book_map")

In [49]:
book_map['book_id'] = book_map['book_id'].apply(lambda x: str(x))

In [50]:
def create_user_item_id(data_df, u_map, i_map):
    data_df['book_id'] = data_df['book_id'].apply(lambda x: str(x))
    data_df = pd.merge(data_df, u_map, how="left", on=["user_id"])
    data_df = pd.merge(data_df, i_map, how="left", on=["book_id"])
    data_df['user_number'] = data_df['user_number'].apply(lambda x: str(x))
    data_df['book_number'] = data_df['book_number'].apply(lambda x: str(x))
    data_df['user_item_id'] = data_df['user_number'] + "-" + data_df['book_number']
    return data_df.drop(columns=['user_number', 'book_number'])

In [51]:
train_df = create_user_item_id(train_df_processed, user_map, book_map)
val_df = create_user_item_id(val_df_processed, user_map, book_map)
test_df = create_user_item_id(test_df_processed, user_map, book_map)

In [52]:
s_values_df.drop(columns=['user_number', 'item_number'], inplace=True)

In [53]:
train_df_s = pd.merge(train_df, s_values_df, how='left', on=['user_item_id'])
val_df_s = pd.merge(val_df, s_values_df, how='left', on=['user_item_id'])
test_df_s = pd.merge(test_df, s_values_df, how='left', on=['user_item_id'])

In [13]:
train_df_s.drop(columns=['user_item_id'], inplace=True)
val_df_s.drop(columns=['user_item_id'], inplace=True)
test_df_s.drop(columns=['user_item_id'], inplace=True)

In [14]:
train_df_s.to_csv(OUTPUT_DATA_DIR+"training_s_vals.csv", index=False)
val_df_s.to_csv(OUTPUT_DATA_DIR+"validation_s_vals.csv", index=False)
test_df_s.to_csv(OUTPUT_DATA_DIR+"testing_s_vals.csv", index=False)

In [15]:
columns_to_keep = ['text_reviews_count', 'is_ebook', 'average_rating', 'num_pages',
                   'ratings_count', 'is_translated', 'is_in_series', 'series_length', 
                   'is_paperback', 'is_hardcover', 'is_audio', 'is_other_format', 'from_penguin', 
                   'from_harpercollins', 'from_university_press', 'from_vintage',
                   'from_createspace', 'other_publisher', 'author_a', 'author_b', 'author_c',
                   'author_d', 'author_e', 'author_f', 'author_other', 's1', 's2', 's3', 's4']
X_train_reg = train_df_s[columns_to_keep]
X_val_reg = val_df_s[columns_to_keep]
X_test_reg = test_df_s[columns_to_keep]

In [16]:
def log_transform_columns(data_df, cols):
    """Applies a log transform to `cols` in `data_df`.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame in which the columns will be transformed.
    cols: collection
        The columns in `data_df` to be log scaled.

    Returns
    -------
    pd.DataFrame
        The DataFrame obtained from `data_df` after log scaling
        the columns `cols`.

    """
    for col in cols:
        data_df[col] = data_df[col].apply(lambda x: np.log(x) if x > 0 else 0)
    return data_df

In [17]:
log_transform_cols = ['text_reviews_count', 'ratings_count']
X_train_reg = log_transform_columns(X_train_reg, log_transform_cols)
X_val_reg = log_transform_columns(X_val_reg, log_transform_cols)
X_test_reg = log_transform_columns(X_test_reg, log_transform_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df[col] = data_df[col].apply(lambda x: np.log(x) if x > 0 else 0)


### Basic Regression

In [18]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()

X_train_reg1 = min_max_scaler.fit_transform(X_train_reg)
X_val_reg1 = min_max_scaler.transform(X_val_reg)
X_test_reg1 = min_max_scaler.transform(X_test_reg)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

reg_model = LogisticRegression(max_iter=200)
reg_model.fit(X_train_reg1, train_df_s['recommended'])

train_AUC = roc_auc_score(train_df_s['recommended'], reg_model.predict(X_train_reg1))
val_AUC = roc_auc_score(val_df_s['recommended'], reg_model.predict(X_val_reg1))

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training AUC: 0.6770555855780461
Validation AUC: 0.5590056441175538


In [58]:
from xgboost import XGBClassifier

xg_cls = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1,
    max_depth=1, n_estimators=10)

xg_cls.fit(X_train_reg1, train_df_processed['recommended'])
train_AUC = roc_auc_score(
    train_df_processed['recommended'], xg_cls.predict(X_train_reg1))
val_AUC = roc_auc_score(
    val_df_processed['recommended'], xg_cls.predict(X_val_reg1))

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

Training AUC: 0.6635181386123419
Validation AUC: 0.5960578200432686


In [24]:
def sigmoid(val):
    """Applies the sigmoid function to `val`.
    
    The sigmoid function has the form
    f(x) = 1 / (1 + exp(-x))
    
    Parameters
    ----------
    val: float
        The operand to the sigmoid function.
    
    Returns
    -------
    float
        The result of applying the sigmoid
        function to `val`.
    
    """
    return 1 / (1 + np.exp(-val))

In [25]:
def scale_s_values(data_df):
    """Applies the sigmoid function to the s values in `data_df`.
    
    Parameters
    ---------
    data_df: pd.DataFrame
        The DataFrame for which the operation is performed.
    
    Returns
    -------
    pd.DataFrame
        The DataFrame that results from scaling the s values
        in `data_df`.
    
    """
    for s_col in ["s1", "s2", "s3", "s4"]:
        data_df[s_col] = data_df[s_col].apply(lambda x: sigmoid(x))
    return data_df

In [26]:
X_train_reg2 = scale_s_values(X_train_reg)
X_val_reg2 = scale_s_values(X_val_reg)
X_test_reg2 = scale_s_values(X_test_reg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df[s_col] = data_df[s_col].apply(lambda x: sigmoid(x))


In [27]:
reg_model = LogisticRegression(max_iter=200)
reg_model.fit(X_train_reg2, train_df_s['recommended'])

train_AUC = roc_auc_score(train_df_s['recommended'], reg_model.predict(X_train_reg2))
val_AUC = roc_auc_score(val_df_s['recommended'], reg_model.predict(X_val_reg2))

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training AUC: 0.6598480261682507
Validation AUC: 0.5752980498317812


In [38]:
from xgboost import XGBClassifier

xg_cls = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1,
    max_depth=1, n_estimators=10)

xg_cls.fit(X_train_reg2, train_df_processed['recommended'])
train_AUC = roc_auc_score(
    train_df_processed['recommended'], xg_cls.predict(X_train_reg2))
val_AUC = roc_auc_score(
    val_df_processed['recommended'], xg_cls.predict(X_val_reg2))

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

Training AUC: 0.6635181386123419
Validation AUC: 0.5960578200432686


In [31]:
book_df = train_df_s[['book_id', 'cleaned_text']]
book_df = book_df.drop_duplicates(subset=['book_id'])

book_df['cleaned_text'] = book_df['cleaned_text'].apply(lambda x: "" if pd.isnull(x) else x)

w2v = Word2Vec(list(book_df['cleaned_text']), size=200, window=10, min_count=1)

In [32]:
def create_book_vector(book_text, vec_length):
    """Creates a vector for the book given by `book_text`.

    The word vectors for each word in `book_text` are
    averaged to build a vector for the book.

    Parameters
    ----------
    book_text: str
        The book text for which the vector is generated.

    Returns
    -------
    vector
        A vector for the book.

    """
    text_vecs = [word for word in str(book_text) if word in w2v.wv.vocab]
    if len(text_vecs) > 0:
        return np.mean(w2v[text_vecs], axis=0)
    return np.zeros(vec_length)

In [33]:
train_df_s['book_vector'] = train_df_s['cleaned_text'].apply(lambda x: create_book_vector(x, 200))
val_df_s['book_vector'] = val_df_s['cleaned_text'].apply(lambda x: create_book_vector(x, 200))
test_df_s['book_vector'] = test_df_s['cleaned_text'].apply(lambda x: create_book_vector(x, 200))

  return np.mean(w2v[text_vecs], axis=0)


In [34]:
def create_book_vec_df(book_vecs, indices):
    """Creates a dataframe from `book_vecs`.

    Each numpy array in `book_vecs` is converted to a
    row in the resulting dataframe.

    Parameters
    ----------
    book_vecs: list
        A list of numpy arrays where each array corresponds
        to the book vector for a book.
    indicies: np.array
        A numpy array of indices for the DataFrame

    Returns
    -------
    pd.DataFrame
        The DataFrame obtained from converting `review_vecs`
        to a dataframe.

    """
    book_vec_df = pd.DataFrame(np.vstack(book_vecs))
    book_vec_df.columns = ["word" + str(col) for col in book_vec_df.columns]
    book_vec_df.index = indices
    return book_vec_df

In [35]:
train_wv = create_book_vec_df(train_df_s['book_vector'], train_df_s.index)
val_wv = create_book_vec_df(val_df_s['book_vector'], val_df_s.index)
test_wv = create_book_vec_df(test_df_s['book_vector'], test_df_s.index)

In [39]:
min_max_scaler = MinMaxScaler()

X_train_reg3 = min_max_scaler.fit_transform(X_train_reg2)
X_val_reg3 = min_max_scaler.transform(X_val_reg2)
X_test_reg3 = min_max_scaler.transform(X_test_reg2)

In [40]:
X_train_reg_df = pd.DataFrame(np.vstack(X_train_reg3))
X_train_reg_df.index = train_df_s.index

X_val_reg_df = pd.DataFrame(np.vstack(X_val_reg3))
X_val_reg_df.index = val_df_s.index

X_test_reg_df = pd.DataFrame(np.vstack(X_test_reg3))
X_test_reg_df.index = test_df_s.index

X_train_wv_reg = sp.csr_matrix(pd.concat([train_wv, X_train_reg_df], axis=1))
X_val_wv_reg = sp.csr_matrix(pd.concat([val_wv, X_val_reg_df], axis=1))
X_test_wv_reg = sp.csr_matrix(pd.concat([test_wv, X_test_reg_df], axis=1))

In [43]:
xg_cls = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1,
    max_depth=1, n_estimators=10)

xg_cls.fit(X_train_wv_reg, train_df_processed['recommended'])
train_AUC = roc_auc_score(
    train_df_processed['recommended'], xg_cls.predict(X_train_wv_reg))
val_AUC = roc_auc_score(
    val_df_processed['recommended'], xg_cls.predict(X_val_wv_reg))

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

Training AUC: 0.6635181386123419
Validation AUC: 0.5960578200432686


In [58]:
s_cols = ['s1', 's2', 's3', 's4', 'recommended']

s_train = train_df_s[s_cols]
s_val = val_df_s[s_cols]
s_test = test_df_s[s_cols]

In [60]:
X_train_s = s_train.drop(columns=['recommended'])
X_val_s = s_val.drop(columns=['recommended'])
X_test_s = s_test.drop(columns=['recommended'])

In [61]:
reg_model = LogisticRegression(max_iter=200)
reg_model.fit(X_train_s, s_train['recommended'])

train_AUC = roc_auc_score(s_train['recommended'], reg_model.predict(X_train_s))
val_AUC = roc_auc_score(s_val['recommended'], reg_model.predict(X_val_s))

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

Training AUC: 0.6538201668630219
Validation AUC: 0.5327062016726037


In [77]:
xg_cls = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1,
    max_depth=1, n_estimators=10)

xg_cls.fit(X_train_s, s_train['recommended'])
train_AUC = roc_auc_score(
    s_train['recommended'], xg_cls.predict(X_train_s))
val_AUC = roc_auc_score(
    s_val['recommended'], xg_cls.predict(X_val_s))

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

Training AUC: 0.645297665355641
Validation AUC: 0.579378306728969


In [74]:
from sklearn.ensemble import RandomForestClassifier

ranfor_model = RandomForestClassifier(n_estimators=500, max_depth=2)
ranfor_model.fit(X_train_s, s_train['recommended'])

train_AUC = roc_auc_score(
    s_train['recommended'], ranfor_model.predict(X_train_s))
val_AUC = roc_auc_score(
    s_val['recommended'], ranfor_model.predict(X_val_s))

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

Training AUC: 0.652354968917165
Validation AUC: 0.5567303402118982


In [78]:
xg_cls = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1,
    max_depth=1, n_estimators=10)

xg_cls.fit(X_train_s, s_train['recommended'])

s_train_preds = xg_cls.predict(X_train_s)
s_val_preds = xg_cls.predict(X_val_s)
s_test_preds = xg_cls.predict(X_test_s)

In [79]:
columns_to_keep = ['text_reviews_count', 'is_ebook', 'average_rating', 'num_pages',
                   'ratings_count', 'is_translated', 'is_in_series', 'series_length', 
                   'is_paperback', 'is_hardcover', 'is_audio', 'is_other_format', 'from_penguin', 
                   'from_harpercollins', 'from_university_press', 'from_vintage',
                   'from_createspace', 'other_publisher', 'author_a', 'author_b', 'author_c',
                   'author_d', 'author_e', 'author_f', 'author_other']
X_train_reg = train_df_s[columns_to_keep]
X_val_reg = val_df_s[columns_to_keep]
X_test_reg = test_df_s[columns_to_keep]

In [80]:
log_transform_cols = ['text_reviews_count', 'ratings_count']
X_train_reg = log_transform_columns(X_train_reg, log_transform_cols)
X_val_reg = log_transform_columns(X_val_reg, log_transform_cols)
X_test_reg = log_transform_columns(X_test_reg, log_transform_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df[col] = data_df[col].apply(lambda x: np.log(x) if x > 0 else 0)


In [81]:
min_max_scaler = MinMaxScaler()

X_train_reg1 = min_max_scaler.fit_transform(X_train_reg)
X_val_reg1 = min_max_scaler.transform(X_val_reg)
X_test_reg1 = min_max_scaler.transform(X_test_reg)

In [82]:
X_train_reg_df = pd.DataFrame(np.vstack(X_train_reg1))
X_train_reg_df.index = train_df_s.index

X_val_reg_df = pd.DataFrame(np.vstack(X_val_reg1))
X_val_reg_df.index = val_df_s.index

X_test_reg_df = pd.DataFrame(np.vstack(X_test_reg1))
X_test_reg_df.index = test_df_s.index

X_train_wv_reg = sp.csr_matrix(pd.concat([train_wv, X_train_reg_df], axis=1))
X_val_wv_reg = sp.csr_matrix(pd.concat([val_wv, X_val_reg_df], axis=1))
X_test_wv_reg = sp.csr_matrix(pd.concat([test_wv, X_test_reg_df], axis=1))

In [83]:
xg_cls = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1,
    max_depth=2, n_estimators=2000)

xg_cls.fit(X_train_wv_reg, train_df_processed['recommended'])

xgb_train_preds = xg_cls.predict(X_train_wv_reg)
xgb_val_preds = xg_cls.predict(X_val_wv_reg)
xgb_test_preds = xg_cls.predict(X_test_wv_reg)

train_AUC = roc_auc_score(
    train_df_processed['recommended'], xgb_train_preds)
val_AUC = roc_auc_score(
    val_df_processed['recommended'], xgb_val_preds)

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

Training AUC: 0.6742530422009975
Validation AUC: 0.6135375347776293


In [87]:
train_meta = pd.DataFrame({'s_preds': s_train_preds,
                           'xgb_preds': xgb_train_preds})

val_meta = pd.DataFrame({'s_preds': s_val_preds,
                           'xgb_preds': xgb_val_preds})

test_meta = pd.DataFrame({'s_preds': s_test_preds,
                          'xgb_preds': xgb_test_preds})

In [88]:
reg_model = LogisticRegression(max_iter=200)
reg_model.fit(train_meta, s_train['recommended'])

train_AUC = roc_auc_score(s_train['recommended'], reg_model.predict(train_meta))
val_AUC = roc_auc_score(s_val['recommended'], reg_model.predict(val_meta))

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

Training AUC: 0.6742530422009975
Validation AUC: 0.6135375347776293


In [89]:
xg_cls = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1,
    max_depth=1, n_estimators=10)

xg_cls.fit(train_meta, s_train['recommended'])
train_AUC = roc_auc_score(
    s_train['recommended'], xg_cls.predict(train_meta))
val_AUC = roc_auc_score(
    s_val['recommended'], xg_cls.predict(val_meta))

print("Training AUC: {}".format(train_AUC))
print("Validation AUC: {}".format(val_AUC))

Training AUC: 0.6742530422009975
Validation AUC: 0.6135375347776293
